# Bag of Words

In [None]:
# Brown corpus categories whose sentences are preprocessed and vectorized below.
first_category, second_category = 'government', 'romance'

## CountVectorizer

### Text preprocessing

In [None]:
import nltk

# Download the corpus and the stop-word lists used in this notebook.
for nltk_package in ('brown', 'stopwords'):
  nltk.download(nltk_package)

from nltk.corpus import brown
from nltk.corpus import stopwords

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# English stop words from NLTK; the slice previews the first five entries.
english_stopwords = stopwords.words('english')
english_stopwords[:5]

['i', 'me', 'my', 'myself', 'we']

In [None]:
import re

# A token is kept only if it consists entirely of ASCII letters.
regex_to_match = r'^[a-zA-Z]+$'


def prepare_tokens(tokens, stop_words=None):
  """Lowercase tokens and keep only purely alphabetic, non-stop-word ones.

  Args:
    tokens: iterable of token strings (for example one tokenized sentence).
    stop_words: optional collection of lowercase words to drop. When omitted,
      the notebook-level ``english_stopwords`` is used.

  Returns:
    A list with the surviving lowercased tokens in their original order.
  """
  if stop_words is None:
    stop_words = english_stopwords
  # Set membership is constant-time; scanning a list for every token is not.
  if not isinstance(stop_words, (set, frozenset)):
    stop_words = frozenset(stop_words)
  # Resolve the pattern once per call rather than once per token.
  word_pattern = re.compile(regex_to_match)

  prepared = []
  for token in tokens:
    lowered = token.lower()
    if word_pattern.fullmatch(lowered) and lowered not in stop_words:
      prepared.append(lowered)
  return prepared

def prepare_corpus(tokenized_corpus):
  """Return one space-joined string of prepared tokens per tokenized sentence."""
  return [' '.join(prepare_tokens(words)) for words in tokenized_corpus]

# Preprocess every sentence of the first category and preview three of them.
first_sentences = brown.sents(categories=[first_category])
first_corpus = prepare_corpus(first_sentences)
first_corpus[:3]

['office business economics obe department commerce provides basic measures national economy current analysis changes economic situation business outlook',
 'develops analyzes national income balance international payments many business indicators',
 'measures essential job presenting business government facts required meet objective expanding business improving operation economy']

In [None]:
# Preprocess every sentence of the second category and report how many there are.
second_sentences = brown.sents(categories=[second_category])
second_corpus = prepare_corpus(second_sentences)
len(second_corpus)

4431

### Vectorize
The text is now preprocessed, so we can apply vectorization on top of it.

In [None]:
import pandas as pd

In [None]:
# One labelled frame per category; the scalar category is broadcast to every row.
df1 = pd.DataFrame({'text': first_corpus, 'category': first_category})
df2 = pd.DataFrame({'text': second_corpus, 'category': second_category})
# ignore_index gives the combined frame a fresh 0..n-1 index.
df = pd.concat([df1, df2], ignore_index=True)
# Number of sentences per category.
df.groupby('category').count()

Unnamed: 0_level_0,text
category,Unnamed: 1_level_1
government,3032
romance,4431


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words vectorizer created with scikit-learn's default settings.
cv = CountVectorizer()

In [None]:
# Preprocessed sentences of both categories (the 'text' column of df).
whole_corpus = df['text']

In [None]:
# First sentence of the combined corpus.
whole_corpus[0]

'office business economics obe department commerce provides basic measures national economy current analysis changes economic situation business outlook'

In [None]:
# Learn the vocabulary and build the document-term matrix in a single pass
# instead of tokenizing the corpus twice with fit() followed by transform().
X = cv.fit_transform(whole_corpus)
# fit() returns the vectorizer itself, so `cvfit` remains an alias of `cv`
# for the cells that use the fitted vectorizer later.
cvfit = cv
# Printing one sparse row lists its (row, column) coordinates and counts.
print(X[0])

  (0, 381)	1
  (0, 842)	1
  (0, 1321)	2
  (0, 1555)	1
  (0, 1864)	1
  (0, 2402)	1
  (0, 2631)	1
  (0, 3138)	1
  (0, 3140)	1
  (0, 3143)	1
  (0, 6057)	1
  (0, 6443)	1
  (0, 6614)	1
  (0, 6675)	1
  (0, 6798)	1
  (0, 7645)	1
  (0, 8975)	1


In [None]:
# (number of sentences, number of vocabulary terms)
X.shape

(7463, 11112)

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# replaces it and returns an array, so list() keeps the plain-list display.
list(cvfit.get_feature_names_out()[:5])

['aa', 'aback', 'abandon', 'abandoning', 'abernathy']

In [None]:
# Column 1321 is the term counted twice in the first sentence.
# get_feature_names_out() replaces get_feature_names(), removed in scikit-learn 1.2.
cvfit.get_feature_names_out()[1321]

'business'

In [None]:
# Look up the term behind column 6614 of the count matrix.
# get_feature_names_out() replaces get_feature_names(), removed in scikit-learn 1.2.
cvfit.get_feature_names_out()[6614]

'obe'

## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Fit a TF-IDF vectorizer (default settings) on the same corpus used for the counts.
tfidfv = TfidfVectorizer()
tfidfvfit = tfidfv.fit(whole_corpus)

In [None]:
# TF-IDF weights for the whole corpus; print the non-zero entries of the first sentence.
Xt = tfidfvfit.transform(whole_corpus)
print(Xt[0])

  (0, 8975)	0.22632206862865534
  (0, 7645)	0.23740129161468576
  (0, 6798)	0.262496733865745
  (0, 6675)	0.20510493898107457
  (0, 6614)	0.3037472109213802
  (0, 6443)	0.18962884780354536
  (0, 6057)	0.24474887306072352
  (0, 3143)	0.23327996979525453
  (0, 3140)	0.27357595685177544
  (0, 3138)	0.21349792884379376
  (0, 2631)	0.183539799138351
  (0, 2402)	0.22332658636747968
  (0, 1864)	0.22332658636747968
  (0, 1555)	0.23740129161468576
  (0, 1321)	0.33841164023210485
  (0, 842)	0.2180468878336781
  (0, 381)	0.25809986567424625


In [None]:
# Look the column up in the TF-IDF vectorizer this section is about, using
# get_feature_names_out() (get_feature_names() was removed in scikit-learn 1.2).
tfidfvfit.get_feature_names_out()[6443]

'national'

# Embeddings

In [None]:
import gensim.downloader as api
# Load the "glove-twitter-25" word vectors through gensim's downloader API.
# NOTE(review): the name suggests 25-dimensional GloVe vectors trained on
# Twitter data — confirm against the gensim-data catalogue.
wv = api.load("glove-twitter-25")



In [None]:
# Five entries most similar to 'car' in the embedding space.
print(wv.most_similar(positive=['car'], topn=5))

[('front', 0.936506986618042), ('on', 0.9070020318031311), ('table', 0.8939012885093689), ('truck', 0.8898833394050598), ('place', 0.8800071477890015)]


In [None]:
# Five entries most similar to the combination of 'news' and 'information'.
print(wv.most_similar(positive=['news', 'information'], topn=5))

[('business', 0.9177083373069763), ('report', 0.9114616513252258), ('uk', 0.9089304804801941), ('details', 0.9047592282295227), ('management', 0.9039708971977234)]


In [None]:
# Analogy query: entries close to 'woman' + 'king' - 'man'.
wv.most_similar(positive=['woman', 'king'], negative=['man'])

[('meets', 0.8841923475265503),
 ('prince', 0.832163393497467),
 ('queen', 0.8257461190223694),
 ('’s', 0.8174097537994385),
 ('crow', 0.8134994506835938),
 ('hunter', 0.8131038546562195),
 ('father', 0.811583399772644),
 ('soldier', 0.8111359477043152),
 ('mercy', 0.8082392811775208),
 ('hero', 0.8082262873649597)]

In [None]:
# Analogy query: entries close to 'computer' + 'programmer' - 'expert'.
wv.most_similar(positive=['computer', 'programmer'], negative=['expert'])

[('server', 0.8650869131088257),
 ('scanner', 0.8362976312637329),
 ('wireless', 0.8300669193267822),
 ('computers', 0.8194390535354614),
 ('hotspot', 0.812617301940918),
 ('desktop', 0.810374915599823),
 ('engine', 0.8094812631607056),
 ('charging', 0.8057352900505066),
 ('sql', 0.8054704666137695),
 ('gps', 0.8033141493797302)]