## Dataset preparation

In [None]:
# Toy dataset: four reviews, each paired with a user status and a star count.
data = dict(
    reviews=[
        "I love this movie although is't not a movie",
        "The worst movie about homomorphisms ever",
        "I didn't like the movie",
        "Not the worst film but still bad",
    ],
    user_status=["premium", "free", "incognito", "premium"],
    stars=[5, 1, 3, 2],
)

In [None]:
import pandas as pd
# One row per review; the keys of `data` become the column names.
df = pd.DataFrame(data=data)

In [8]:
# Show the frame built from `data`.
df

Unnamed: 0,reviews,stars,user_status
0,I love this movie although is't not a movie,5,premium
1,The worst movie about homomorphisms ever,1,free
2,I didn't like the movie,3,incognito
3,Not the worst film but still bad,2,premium


## One-hot encoding

In [9]:
# One-hot encode `user_status` into one indicator column per distinct value.
# The guard keeps this cell re-runnable: after the first run the original
# column has been replaced by its indicator columns, so an unconditional
# second call would fail because `user_status` no longer exists.
if "user_status" in df.columns:
    df = pd.get_dummies(df, columns=["user_status"])
df

Unnamed: 0,reviews,stars,user_status_free,user_status_incognito,user_status_premium
0,I love this movie although is't not a movie,5,0,0,1
1,The worst movie about homomorphisms ever,1,1,0,0
2,I didn't like the movie,3,0,1,0
3,Not the worst film but still bad,2,0,0,1


See also scikit-learn's `OneHotEncoder`:
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

## Lemmatization / Stemming

In [10]:
# Sample sentence used by the tokenization cells below.
text = "I am loving dogs!"

In [11]:
from nltk import word_tokenize
# Split the sentence into tokens; punctuation comes back as its own token.
word_tokenize(text)

['I', 'am', 'loving', 'dogs', '!']

In [12]:
# Tokenize, keep only purely alphabetic tokens (this drops the "!") and lowercase them.
words = [token.lower() for token in word_tokenize(text) if token.isalpha()]
words

['i', 'am', 'loving', 'dogs']

In [13]:
from nltk.stem import WordNetLemmatizer 
# Lemmatize each word; no part of speech is passed, so the lemmatizer's default applies.
lemmatizer = WordNetLemmatizer()
list(map(lemmatizer.lemmatize, words))

['i', 'am', 'loving', u'dog']

In [14]:
from nltk.corpus import wordnet
# Lemmatize every word as a verb. `wordnet.VERB` is the named constant for the
# 'v' part-of-speech tag, which replaces the magic string and puts the
# `wordnet` import of this cell to use.
[lemmatizer.lemmatize(word, wordnet.VERB) for word in words]

['i', u'be', u'love', u'dog']

In [15]:
from nltk.stem import PorterStemmer
# Reduce each word to its Porter stem.
stemmer = PorterStemmer()
list(map(stemmer.stem, words))

['i', 'am', u'love', u'dog']

## Count Vectorizer / Bag of Words

In [17]:
class LemmaTokenizer():
    """Callable tokenizer: splits a document with `word_tokenize` and lemmatizes every token."""

    def __init__(self):
        # A single lemmatizer is created up front and reused on every call.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmatized tokens produced from `doc`."""
        lemmatize = self.wnl.lemmatize
        tokens = word_tokenize(doc)
        return [lemmatize(token) for token in tokens]

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer that tokenizes with LemmaTokenizer and uses the
# built-in English stop-word list; fitting learns the vocabulary from the reviews.
vectorizer = CountVectorizer(
    stop_words='english',
    tokenizer=LemmaTokenizer(),
)
vectorizer.fit(df["reviews"])

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=<__main__.LemmaTokenizer instance at 0x0000000011D86148>,
        vocabulary=None)

In [19]:
# Inspect the stop-word set in effect for this vectorizer (configured with stop_words='english').
vectorizer.get_stop_words()

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [20]:
# Mapping from each term kept during fitting to its integer index.
vectorizer.vocabulary_

{u'bad': 0,
 u'did': 1,
 u'film': 2,
 u'homomorphism': 3,
 u"is't": 4,
 u'like': 5,
 u'love': 6,
 u'movie': 7,
 u"n't": 8,
 u'worst': 9}

In [21]:
# Encode every review as one row of the sparse document-term matrix.
reviews = vectorizer.transform(df["reviews"])
reviews

<4x10 sparse matrix of type '<type 'numpy.int64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [22]:
# Densify the sparse matrix so the individual entries can be read: one row per review.
reviews.toarray()

array([[0, 0, 0, 0, 1, 0, 1, 2, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 1, 0, 1],
       [0, 1, 0, 0, 0, 1, 0, 1, 1, 0],
       [1, 0, 1, 0, 0, 0, 0, 0, 0, 1]], dtype=int64)

In [23]:
# Original review texts, for comparison with the matrix above.
df["reviews"]

0    I love this movie although is't not a movie
1       The worst movie about homomorphisms ever
2                        I didn't like the movie
3               Not the worst film but still bad
Name: reviews, dtype: object

In [25]:
# Rewrite the "n't" contraction before tokenizing so that "not" becomes its own token.
doc = "I didn't like the movie".replace("n't", " not")
word_tokenize(doc)

['I', 'did', 'not', 'like', 'the', 'movie']

## TF-IDF

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Re-bind `vectorizer` to a TF-IDF vectorizer (built-in English stop words, no
# custom tokenizer) fitted on the reviews; `fit` returns the estimator itself.
vectorizer = TfidfVectorizer(stop_words='english').fit(df["reviews"])
vectorizer

TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [27]:
# Mapping from each term kept by the TF-IDF vectorizer to its integer index.
vectorizer.vocabulary_

{u'bad': 0,
 u'didn': 1,
 u'film': 2,
 u'homomorphisms': 3,
 u'like': 4,
 u'love': 5,
 u'movie': 6,
 u'worst': 7}

In [28]:
# Re-bind `reviews` to the sparse matrix produced by the TF-IDF vectorizer.
reviews = vectorizer.transform(df["reviews"])

In [29]:
# Show the sparse matrix summary (shape, dtype, number of stored elements).
reviews

<4x8 sparse matrix of type '<type 'numpy.float64'>'
	with 11 stored elements in Compressed Sparse Row format>

In [30]:
# Dense view of the TF-IDF weights: one row per review.
reviews.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.61666846, 0.78722298, 0.        ],
       [0.        , 0.        , 0.        , 0.70203482, 0.        ,
        0.        , 0.44809973, 0.55349232],
       [0.        , 0.64450299, 0.        , 0.        , 0.64450299,
        0.        , 0.41137791, 0.        ],
       [0.61761437, 0.        , 0.61761437, 0.        , 0.        ,
        0.        , 0.        , 0.48693426]])

In [32]:
# Original review texts, for comparison with the weights above.
df["reviews"]

0    I love this movie although is't not a movie
1       The worst movie about homomorphisms ever
2                        I didn't like the movie
3               Not the worst film but still bad
Name: reviews, dtype: object

## Combining the features

In [33]:
# Features: the three one-hot user-status columns. Target: the `stars` column.
X_user_status = df.loc[:, ["user_status_free", "user_status_incognito", "user_status_premium"]].values
y = df.loc[:, "stars"].values

In [34]:
# Target values: the `stars` column as a NumPy array.
y

array([5, 1, 3, 2], dtype=int64)

In [35]:
# One-hot user-status features as a NumPy array, one row per review.
X_user_status

array([[0, 0, 1],
       [1, 0, 0],
       [0, 1, 0],
       [0, 0, 1]], dtype=uint8)

In [37]:
import numpy as np
# Place the one-hot columns and the densified review columns side by side
# (axis=1 joins along columns, so the row count stays the same).
X = np.concatenate((X_user_status, reviews.toarray()), axis=1)

In [38]:
# Combined feature matrix: user-status columns first, then the review columns.
X

array([[0.        , 0.        , 1.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.61666846, 0.78722298,
        0.        ],
       [1.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.70203482, 0.        , 0.        , 0.44809973,
        0.55349232],
       [0.        , 1.        , 0.        , 0.        , 0.64450299,
        0.        , 0.        , 0.64450299, 0.        , 0.41137791,
        0.        ],
       [0.        , 0.        , 1.        , 0.61761437, 0.        ,
        0.61761437, 0.        , 0.        , 0.        , 0.        ,
        0.48693426]])

In [39]:
# (rows, columns) of the combined feature matrix.
X.shape

(4L, 11L)

In [40]:
import gensim.downloader as api
# Load the 'text8' dataset through gensim's downloader API.
# NOTE(review): presumably this fetches the data over the network when it is
# not already cached locally — confirm before running offline.
corpus = api.load('text8')

In [41]:
from gensim.models.word2vec import Word2Vec
# Train a Word2Vec model on the corpus; no hyperparameters are passed, so the library defaults apply.
# NOTE(review): no seed is set here, so the similarity values shown below will
# likely differ between runs — confirm and pin a seed if reproducibility matters.
model = Word2Vec(corpus)

In [42]:
# Nearest neighbours of "queen" in the embedding space. The query goes through
# `model.wv` (the trained word vectors): calling `most_similar` directly on the
# model is deprecated and has been removed in gensim 4.
print(model.wv.most_similar('queen'))

[(u'elizabeth', 0.7420587539672852), (u'princess', 0.7322125434875488), (u'king', 0.7214112281799316), (u'prince', 0.7202872037887573), (u'crown', 0.6830958127975464), (u'duchess', 0.6664050221443176), (u'consort', 0.6534654498100281), (u'lord', 0.6516781449317932), (u'mary', 0.6270979642868042), (u'scotland', 0.6161083579063416)]


  """Entry point for launching an IPython kernel.


In [43]:
from scipy.spatial.distance import cosine
# Analogy check: cosine similarity between (king - man + woman) and queen.
# Vectors are read from `model.wv`; indexing the model object itself
# (`model['king']`) is deprecated and has been removed in gensim 4.
similarity = 1 - cosine(
    model.wv['king'] - model.wv['man'] + model.wv['woman'],
    model.wv['queen'],
)

  


In [44]:
# Cosine similarity between (king - man + woman) and queen, computed in the previous cell.
similarity

0.7103353142738342

In [47]:
# Raw embedding vector for "man", read from `model.wv`; indexing the model
# object itself is deprecated and has been removed in gensim 4.
model.wv['man']

  """Entry point for launching an IPython kernel.


array([-0.39321202, -0.82979596,  0.5366214 , -0.51586336,  0.01703729,
        2.3186903 ,  1.9217904 ,  0.7300166 , -1.6900206 , -0.02804155,
       -0.35286266,  0.25595567, -0.03586963, -1.5847023 , -0.04313298,
        0.4324705 ,  1.8898307 , -0.06880635, -1.7258778 ,  1.3760555 ,
        0.14075226, -0.8342037 , -0.24810284,  0.76952535,  0.6901221 ,
        1.1811479 ,  0.7810249 , -0.05300916, -0.43695885,  1.5687736 ,
       -0.82605004,  1.2219188 ,  0.33462164, -1.212732  , -3.8276238 ,
        0.21853316,  1.0441166 , -1.0442098 ,  2.2259822 , -0.7789739 ,
       -3.2994199 , -1.1204485 , -1.767611  ,  0.78989303, -1.060451  ,
       -2.310821  ,  0.89852375,  0.97229236, -1.2660741 ,  1.1673027 ,
       -1.6573262 ,  0.85546345,  0.37193745, -1.551588  ,  0.7949051 ,
        0.33036926, -1.5941076 , -1.9791772 ,  0.02571503,  1.0324751 ,
        2.7636142 ,  0.9749916 , -0.5964365 ,  0.64952445,  0.75418764,
        1.1445526 ,  0.25784975, -2.0969234 ,  1.7010124 ,  2.30