<a href="https://colab.research.google.com/github/mjahanshahi/intermediate-nlp/blob/master/Using_Embeddings_and_NLP_For_Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data processing

In [21]:
import spacy
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
import pickle
#!python -m spacy download en_core_web_md en
import en_core_web_md
nlp = en_core_web_md.load()

In [23]:
def basic_tokenizer(doc, model=nlp):
 
    parsed_doc = model(doc)

    # Tokens are those that are comprised of alphabetic characters and not urls and not stop words  
    return [t.lemma_ for t in parsed_doc if (t.is_alpha)&(not t.like_url)&(not t.is_stop)]

In [26]:
# Here we use scikit learn's count vectorizer with our tokenizer
cv = CountVectorizer(tokenizer=basic_tokenizer)

# Our mini corpus
text_data = ["A friend gave me these when I was recently diagnosed with breast cancer. I bought another pair because they are the best pjs I’ve ever owned. As soon as I came out of surgery, I asked the nurses to help me change into them. This is me wearing them on my first walk down the hospital hall. Now I only take them off to wash them and then put them immediately back on. They’ve survived 100 washes and still look new. They’re the best gift I’ve received during my breast cancer treatment",
            "While as others claimed these do run a little on the longer side, at 5'7'' and a size small, I found the length luxurious rather than sloppy. Other than the length, the fit is pretty true to size. They wash and wear well, don't get stretched out and have an extra button at the neck for when you need a little extra warmth. The elastic waist band is thick and just generous enough not to be tight, but just in case you're tiny waisted, there is a drawstring as well. These have taken over as my new favorite!",
            "I bought these pjs in navy and white stripes a few years ago and have been in love with them forever, so soft and dreamy just like the name. I decided to finally splurge on a second pair recently and it seems like the quality has gotten cheaper. The seams are EXTREMELY itchy. There’s a weird plastic piece that runs along all the seams and is constantly scratching my skin....not the best feeling in bed. Please get rid of the weird plastic seam!!",
            "These are great soft pajamas, except the size small pants I received have a 33' inseam, which is obviously crazy long. I'm not sure if the pair I received is flawed, given the description says they have a 27' inseam"]

v = cv.fit_transform(text_data).toarray()
print(v)


[[0 1 0 0 2 0 1 2 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 2
  0 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 1 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0
  0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 1 0 0 0 1 0 2 1 0 0 0]
 [0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 1 0 1 2 0 1 0 0 1 1 0 0 0 1 0 0 0 0
  0 0 0 0 0 0 0 2 0 2 1 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0
  0 2 0 1 1 0 0 0 1 0 0 0 0 1 1 1 1 0 1 1 1 0 1 1 1 0 0 0]
 [1 0 0 1 0 0 1 0 0 0 1 0 0 1 0 1 0 0 0 1 0 0 1 0 1 1 0 0 0 1 0 0 1 0 0 1
  0 0 0 0 0 0 1 0 2 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 1 1 2 0 1 0 1 1 1 0 1 3
  1 0 1 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 1 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0
  1 0 0 0 0 2 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 1 1 0 0 0 0 0 2 0 0 0 1 0 0
  0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [27]:
# Make these arrays human readable with feature names 
dict(zip(cv.get_feature_names(), v.sum(axis=0)))

{'ago': 1,
 'ask': 1,
 'band': 1,
 'bed': 1,
 'breast': 2,
 'button': 1,
 'buy': 2,
 'cancer': 2,
 'case': 1,
 'change': 1,
 'cheap': 1,
 'claim': 1,
 'come': 1,
 'constantly': 1,
 'crazy': 1,
 'decide': 1,
 'description': 1,
 'diagnose': 1,
 'drawstring': 1,
 'dreamy': 1,
 'elastic': 1,
 'extra': 2,
 'extremely': 1,
 'favorite': 1,
 'feeling': 1,
 'finally': 1,
 'find': 1,
 'fit': 1,
 'flawed': 1,
 'forever': 1,
 'friend': 1,
 'generous': 1,
 'get': 1,
 'gift': 1,
 'give': 2,
 'good': 3,
 'great': 1,
 'hall': 1,
 'help': 1,
 'hospital': 1,
 'immediately': 1,
 'inseam': 2,
 'itchy': 1,
 'length': 2,
 'like': 2,
 'little': 2,
 'long': 2,
 'look': 1,
 'love': 1,
 'luxurious': 1,
 'navy': 1,
 'neck': 1,
 'need': 1,
 'new': 2,
 'nurse': 1,
 'obviously': 1,
 'own': 1,
 'pair': 3,
 'pajama': 1,
 'pant': 1,
 'piece': 1,
 'pjs': 2,
 'plastic': 2,
 'pretty': 1,
 'quality': 1,
 'receive': 3,
 'recently': 2,
 'rid': 1,
 'run': 2,
 'say': 1,
 'scratch': 1,
 'seam': 3,
 'second': 1,
 'size': 3,
 's

## Dataframe Cleanup

In [92]:
# Now let's apply to the review dataset
DATASET_LINK = "https://raw.githubusercontent.com/AFAgarap/ecommerce-reviews-analysis/master/Womens%20Clothing%20E-Commerce%20Reviews.csv"
df = pd.read_csv(DATASET_LINK, usecols=["Clothing ID", "Title", "Review Text", "Rating"])
df.shape

(23486, 4)

In [91]:
df.sample(15)

Unnamed: 0,Clothing ID,Title,Review Text,Rating,Review Tokens
4163,1078,Great dress!,This dress is a great casual outfit. tha fabri...,4,
2743,984,Fun jacket,Love the fabric and the casual way this jacket...,5,
8799,907,Cute but color different than pictured,I purchased this sweater in the grey color and...,4,
17369,1048,Flattering and simply perfect,I tried these on in the store and instantly fe...,5,
12195,875,Perfect fall piece,I saw this and had to have it! it is so beaut...,4,
15466,1068,Pants oversized,I wanted to love these pants since i hate supe...,3,
11243,1022,Pleasantly surprised,I ordered these jsut coz they were on sale... ...,5,
11927,868,Cozy elegance!,"First off, this top is super cozy! i adore an ...",5,
10362,1067,Beautiful,I love this jumpsuit. i've worn it twice and r...,5,
7782,936,Love it!,This sweater is super cozy and comfy and my ne...,5,


In [93]:
# Lets remove any review without a text review or a rating
df.dropna(how = "any", subset=['Review Text', 'Rating'], inplace=True)
df.reset_index(drop=True, inplace=True)
df.shape
# Remove this sampling to ensure broader reach
df = df.sample(2000)

In [94]:
df.sample(15)

Unnamed: 0,Clothing ID,Title,Review Text,Rating
12917,1072,,This dress is one of my recent faves from reta...,5
9982,1056,Very nice and versatile pants,I'm typically a 27. this 27 seems snug but the...,4
15160,865,Stylish & comfortable,I ordered the green top in store after trying ...,4
651,1087,Simple but different.,I bought this dress in the cream color. it was...,4
18641,867,Cute but no,This shirt ran small and was more sheer than i...,2
36,1002,,This is a comfortable skirt that can span seas...,4
21470,451,Just don't wash it,I loved this dress...until i washed it. the la...,3
4669,940,Like wearing a hug,"Great sweater, beautiful detail, warm and cozy...",5
10890,829,Soft and swinging,The color and fabric are really soft and lovel...,4
12548,1081,"Perfect fit, forgiving belly","This dress is so soft, and fits like a dream. ...",5


## From Tokens to Vectors

In [101]:
count = CountVectorizer(tokenizer=basic_tokenizer)
count_vecs = count.fit_transform(df['Review Text'])
count_df = pd.DataFrame(count_vecs.toarray(), columns=count.get_feature_names())

In [104]:
count_df.head()

Unnamed: 0,ab,abby,abck,abdomen,able,absolute,absolutely,abstract,abt,abundance,ac,accent,accented,accentuate,accentuatea,accentuated,accept,acceptable,acceptably,access,accessorize,accessorizing,accessory,accidentally,acco,accommodate,accomodate,accompany,accomplish,accord,accumulate,accuracy,accurate,accurately,achieve,acknowledge,acrylic,act,actual,actuallly,...,wrinkle,wrinkled,wrinkling,wrinkly,wrist,write,wrong,x,xl,xs,xsmall,xsp,xspetite,xxs,xxsp,y,yank,yarn,yay,year,yellow,yellowed,yellowy,yes,yesterday,yikes,yo,yoga,yogi,yolk,young,yr,yuck,yummy,zero,zip,ziploc,zipped,zipper,zoom
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [102]:
tfidf = TfidfVectorizer(tokenizer=basic_tokenizer)
tfidf_vecs = tfidf.fit_transform(df['Review Text'])
tfidf_df = pd.DataFrame(tfidf_vecs.toarray(), columns=tfidf.get_feature_names())

In [103]:
tfidf_df.head()

Unnamed: 0,ab,abby,abck,abdomen,able,absolute,absolutely,abstract,abt,abundance,ac,accent,accented,accentuate,accentuatea,accentuated,accept,acceptable,acceptably,access,accessorize,accessorizing,accessory,accidentally,acco,accommodate,accomodate,accompany,accomplish,accord,accumulate,accuracy,accurate,accurately,achieve,acknowledge,acrylic,act,actual,actuallly,...,wrinkle,wrinkled,wrinkling,wrinkly,wrist,write,wrong,x,xl,xs,xsmall,xsp,xspetite,xxs,xxsp,y,yank,yarn,yay,year,yellow,yellowed,yellowy,yes,yesterday,yikes,yo,yoga,yogi,yolk,young,yr,yuck,yummy,zero,zip,ziploc,zipped,zipper,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.113478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.221989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [136]:
def top_tfidf_words(tfidf_df):
    return(tfidf_df[tfidf_df>0.3].mean(axis=0))
top_tfidf_words(tfidf_df)

ab         0.327036
abby       0.322623
abck            NaN
abdomen         NaN
able       0.301735
             ...   
zip        0.376426
ziploc          NaN
zipped     0.319633
zipper     0.376748
zoom            NaN
Length: 3613, dtype: float64

In [135]:
# Get similarities
count_sims = cosine_similarity(count_vecs)
tfidf_sims = cosine_similarity(tfidf_vecs)

count_sims

array([[1.        , 0.04351941, 0.07106691, ..., 0.04065578, 0.        ,
        0.07312724],
       [0.04351941, 1.        , 0.20412415, ..., 0.21408721, 0.16556654,
        0.14002801],
       [0.07106691, 0.20412415, 1.        , ..., 0.2224746 , 0.10814761,
        0.17149859],
       ...,
       [0.04065578, 0.21408721, 0.2224746 , ..., 1.        , 0.12373764,
        0.29433147],
       [0.        , 0.16556654, 0.10814761, ..., 0.12373764, 1.        ,
        0.05564149],
       [0.07312724, 0.14002801, 0.17149859, ..., 0.29433147, 0.05564149,
        1.        ]])

### Exercise Time!

How do we use these arrays of similarities to identify documents that are similar to the first review?

## From Review to Document Vectors

In [137]:
# Use spaCy's vectors
toy_df = df.head(10)

for index, row in toy_df.iterrows():
  rating = row["Rating"]
  doc = nlp(row["Review Text"])

  # A 1D numpy array representing the document’s semantics.
  doc_vector = doc.vector
  # The L2 norm of the vector representation.
  doc_vector_norm = doc.vector_norm


### Exercise Time!
Generate a 2D-array / dataframe with the 300d vectors + Rating (so 301 columns, with 10 rows for the toy dataframe)

## Future Work
Add vectors

## Decide which machine learning algorithm to use


![](https://scikit-learn.org/stable/_static/ml_map.png)
[Reference](https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html)


In [None]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(word2vec.itervalues().next())

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

class TfidfEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = len(word2vec.itervalues().next())

    def fit(self, X, y):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of 
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier

etree_w2v = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)),
    ("extra trees", ExtraTreesClassifier(n_estimators=200))])
etree_w2v_tfidf = Pipeline([
    ("word2vec vectorizer", TfidfEmbeddingVectorizer(w2v)),
    ("extra trees", ExtraTreesClassifier(n_estimators=200))])

## Doc2Vec
This is the gensim implementation of doc2vec. 

In [None]:
# Init the Doc2Vec model
doc2vec_model = gensim.models.doc2vec.Doc2Vec(vector_size=20, min_count=4, epochs=20)

# Build the Volabulary
doc2vec_.build_vocab(train_data)

# Train the Doc2Vec model
doc2vec_.train(train_data, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)