## Dataset preparation

In [119]:
# Toy review dataset: free-text reviews, a categorical user status,
# and a numeric star rating (one entry per review in each list).
data = {
    "reviews": [
        "I love this movie although is't not a movie",
        "The worst movie about homomorphisms ever",
        "I didn't like the movie",
        "Not the worst film but still bad",
    ],
    "user_status": ["premium", "free", "incognito", "premium"],
    "stars": [5, 1, 3, 2],
}

In [120]:
import pandas as pd
# Build the working DataFrame from the `data` dict defined in the previous cell
# (each dict key becomes a column, as the rendered table below shows).
df = pd.DataFrame(data)

In [121]:
df

Unnamed: 0,reviews,user_status,stars
0,I love this movie although is't not a movie,premium,5
1,The worst movie about homomorphisms ever,free,1
2,I didn't like the movie,incognito,3
3,Not the worst film but still bad,premium,2


## One-hot encoding

In [122]:
# One-hot encode the categorical "user_status" column.
# The frame is rebuilt from `data` rather than read from the current `df`, so
# re-running this cell does not fail on the already-removed "user_status" column.
# `dtype` is pinned because newer pandas versions default to booleans instead of
# the 0/1 integers shown in the output below.
df = pd.get_dummies(pd.DataFrame(data), columns=["user_status"], dtype="uint8")
df

Unnamed: 0,reviews,stars,user_status_free,user_status_incognito,user_status_premium
0,I love this movie although is't not a movie,5,0,0,1
1,The worst movie about homomorphisms ever,1,1,0,0
2,I didn't like the movie,3,0,1,0
3,Not the worst film but still bad,2,0,0,1


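See also scikit-learn's `OneHotEncoder`, which performs the same encoding as a fitted transformer that can be reused on new data:
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html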

## Lemmatization / Stemming

In [54]:
text = "I am loving dogs!"

In [55]:
from nltk import word_tokenize
# Split `text` into tokens; as the output below shows, punctuation ("!") comes
# back as its own token.
# NOTE(review): no nltk.download(...) call is visible in this notebook — the
# tokenizer presumably needs NLTK data installed beforehand; confirm setup.
word_tokenize(text)

['I', 'am', 'loving', 'dogs', '!']

In [56]:
# Keep purely alphabetic tokens only (drops "!") and lowercase them.
words = [token.lower() for token in word_tokenize(text) if token.isalpha()]
words

['i', 'am', 'loving', 'dogs']

In [103]:
from nltk.stem import WordNetLemmatizer

# Lemmatize each token with the lemmatizer's default part of speech; in the
# output below only the plural "dogs" changes, while "loving" is left as-is.
lemmatizer = WordNetLemmatizer()
list(map(lemmatizer.lemmatize, words))

['i', 'am', 'loving', 'dog']

In [104]:
from nltk.corpus import wordnet

# Lemmatize every token as a verb. The named WordNet constant replaces the bare
# 'v' tag (and gives the `wordnet` import a purpose); the result is unchanged.
[lemmatizer.lemmatize(word, pos=wordnet.VERB) for word in words]

['i', 'be', 'love', 'dog']

In [57]:
from nltk.stem import PorterStemmer

# Stem the same tokens with the Porter stemmer, for comparison with the
# lemmatizer outputs above.
stemmer = PorterStemmer()
list(map(stemmer.stem, words))

['i', 'am', u'love', u'dog']

## Count Vectorizer / Bag of Words

In [108]:
class LemmaTokenizer:
    """Callable that tokenizes a document and lemmatizes every token.

    Intended to be passed as the ``tokenizer`` argument of a vectorizer:
    calling the instance with a string returns a list of lemmas.
    """

    def __init__(self):
        # One lemmatizer instance is created up front and reused for every call.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the lemma of each token produced by ``word_tokenize(doc)``."""
        return list(map(self.wnl.lemmatize, word_tokenize(doc)))

In [124]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words model over the review texts, tokenized with LemmaTokenizer and
# filtered with scikit-learn's built-in English stop-word list.
# `fit` returns the vectorizer itself, so the fitted object is what gets bound.
vectorizer = CountVectorizer(
    stop_words='english',
    tokenizer=LemmaTokenizer(),
).fit(df["reviews"])
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<__main__.LemmaTokenizer object at 0x000002A4B5A9AC18>,
        vocabulary=None)

In [125]:
vectorizer.get_stop_words()

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [126]:
vectorizer.vocabulary_

{'love': 6,
 'movie': 7,
 "is't": 4,
 'worst': 9,
 'homomorphism': 3,
 'did': 1,
 "n't": 8,
 'like': 5,
 'film': 2,
 'bad': 0}

In [127]:
reviews = vectorizer.transform(df.reviews)
reviews

<4x10 sparse matrix of type '<class 'numpy.int64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [128]:
reviews.toarray()

array([[0, 0, 0, 0, 1, 0, 1, 2, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 1, 0, 1],
       [0, 1, 0, 0, 0, 1, 0, 1, 1, 0],
       [1, 0, 1, 0, 0, 0, 0, 0, 0, 1]], dtype=int64)

## TF-IDF

In [137]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Rebinds `vectorizer`: from this cell on it is a TF-IDF model that uses the
# vectorizer's default tokenization (no LemmaTokenizer) with English stop words.
vectorizer = TfidfVectorizer(stop_words='english').fit(df["reviews"])
vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [138]:
vectorizer.vocabulary_

{'love': 5,
 'movie': 6,
 'worst': 7,
 'homomorphisms': 3,
 'didn': 1,
 'like': 4,
 'film': 2,
 'bad': 0}

In [140]:
reviews = vectorizer.transform(df.reviews)

In [141]:
reviews

<4x8 sparse matrix of type '<class 'numpy.float64'>'
	with 11 stored elements in Compressed Sparse Row format>

In [142]:
reviews.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.61666846, 0.78722298, 0.        ],
       [0.        , 0.        , 0.        , 0.70203482, 0.        ,
        0.        , 0.44809973, 0.55349232],
       [0.        , 0.64450299, 0.        , 0.        , 0.64450299,
        0.        , 0.41137791, 0.        ],
       [0.61761437, 0.        , 0.61761437, 0.        , 0.        ,
        0.        , 0.        , 0.48693426]])

## Combining together

In [155]:
# Target vector: the star rating of each review.
y = df["stars"].values
# Feature block 1: the three one-hot user-status columns as a NumPy array.
X_user_status = df.loc[
    :, ["user_status_free", "user_status_incognito", "user_status_premium"]
].values

In [156]:
y

array([5, 1, 3, 2], dtype=int64)

In [158]:
X_user_status

array([[0, 0, 1],
       [1, 0, 0],
       [0, 1, 0],
       [0, 0, 1]], dtype=uint8)

In [175]:
# NumPy is not imported anywhere earlier in this notebook, so import it here;
# without it this cell raises NameError on a fresh kernel.
import numpy as np

# Final feature matrix: one-hot user-status columns followed by the densified
# TF-IDF columns, joined column-wise (axis=1) so each row stays one review.
X = np.concatenate([X_user_status, reviews.toarray()], axis=1)

In [176]:
X

array([[0.        , 0.        , 1.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.61666846, 0.78722298,
        0.        ],
       [1.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.70203482, 0.        , 0.        , 0.44809973,
        0.55349232],
       [0.        , 1.        , 0.        , 0.        , 0.64450299,
        0.        , 0.        , 0.64450299, 0.        , 0.41137791,
        0.        ],
       [0.        , 0.        , 1.        , 0.61761437, 0.        ,
        0.61761437, 0.        , 0.        , 0.        , 0.        ,
        0.48693426]])

In [177]:
X.shape

(4, 11)

In [27]:
import gensim.downloader as api
# Load the 'text8' dataset through gensim's downloader API; `corpus` is used as
# the training input for Word2Vec in the next cell.
# NOTE(review): presumably this fetches the dataset over the network on first
# use — confirm before running offline.
corpus = api.load('text8')



In [28]:
from gensim.models.word2vec import Word2Vec

# Train word embeddings on the text8 corpus using the library's default
# hyperparameters (no other arguments are passed).
model = Word2Vec(sentences=corpus)

In [46]:
# Nearest neighbours of "queen" in the embedding space.
# Similarity queries are made on the trained word vectors (`model.wv`); calling
# `most_similar` on the model itself is deprecated and was removed in gensim 4.
print(model.wv.most_similar('queen'))

[(u'princess', 0.7655839920043945), (u'elizabeth', 0.7323757410049438), (u'prince', 0.7205917835235596), (u'mary', 0.6880574226379395), (u'king', 0.685602605342865), (u'consort', 0.6744344830513), (u'duchess', 0.6717782020568848), (u'crown', 0.6609401106834412), (u'lord', 0.6212639808654785), (u'aragon', 0.6199183464050293)]


  """Entry point for launching an IPython kernel.


In [47]:
from scipy.spatial.distance import cosine

# Classic analogy check: king - man + woman should land close to queen.
# Vectors are looked up through `model.wv`; indexing the model directly is
# deprecated and was removed in gensim 4.
# scipy's `cosine` is a distance, so 1 - distance gives the cosine similarity.
similarity = 1 - cosine(
    model.wv['king'] - model.wv['man'] + model.wv['woman'],
    model.wv['queen'],
)

  


In [48]:
similarity

0.6973008513450623