In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords

import gensim
from gensim import models
import gensim.test.utils
from gensim.models import Word2Vec,word2vec

# Read csv

In [2]:
amazon=pd.read_csv("amazon.csv")
amazon

Unnamed: 0,Text,Score
0,The description and photo on this product need...,3
1,This was a great book!!!! It is well thought t...,5
2,"I am a first year teacher, teaching 5th grade....",5
3,I got the book at my bookfair at school lookin...,5
4,Hi! I'm Martine Redman and I created this puzz...,5
...,...,...
49995,Stays on continuously without shutting off! It...,4
49996,these look great in our 10 gallon tank- colors...,4
49997,"This works great, but needs a better way to at...",4
49998,she absolutely LOVES this thing. I dice up gre...,5


# Removing stop words and symbols from reviews

In [3]:
amazon.drop(labels=6930, axis=0,inplace=True)
amazon.drop(labels=15126, axis=0,inplace=True)
amazon.drop(labels=21703, axis=0,inplace=True)
amazon.drop(labels=45232, axis=0,inplace=True)
amazon.drop(labels=47652, axis=0,inplace=True)

In [4]:
spec_chars = ["!",'"',"#","%","&","'","(",")",
              "*","+",",","-",".","/",":",";","<",
              "=",">","?","@","[","\\","]","^","_",
              "`","{","|","}","~","–","th"]
for char in spec_chars:
    amazon['Text']= amazon['Text'].str.replace(char, '')

In [5]:
stop_words = stopwords.words('english')
amazon['Text']=amazon['Text'].str.lower()
amazon['Text'] = amazon['Text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words )]))
amazon['Text']

0        description photo product needs changed indica...
1        great book well ought rough easily imagine e e...
2        first year teacher teaching 5 grade special re...
3        got e book bookfair school looking someing e s...
4        hi im martine redman created puzzle briarpatch...
                               ...                        
49995    stays continuously wiout shutting however reac...
49996    ese look great 10 gallon tank colors nice fitt...
49997    works great needs better way attach made ow wi...
49998    absolutely loves ing dice green beans carrots ...
49999    hurt neck went rehab exercises using ese eraba...
Name: Text, Length: 49995, dtype: object

# Preparing data for word2vec

In [6]:
review = amazon.Text.apply(gensim.utils.simple_preprocess)

In [7]:
review[0]

['description',
 'photo',
 'product',
 'needs',
 'changed',
 'indicate',
 'product',
 'buffalos',
 'version',
 'beef',
 'jerky']

# Word2Vec

In [13]:
model = Word2Vec(
    window=1,
    min_count=1,
   size=30
)

In [14]:
model.build_vocab(review, progress_per=10000)

In [15]:
model.corpus_count

49995

In [16]:
model.train(review, total_examples=model.corpus_count, epochs=model.epochs)

(9361152, 9944130)

In [17]:
model['buffalos']

  model['buffalos']


array([ 0.02844503,  0.03905932,  0.01299202, -0.006973  ,  0.01492871,
       -0.01844316,  0.01625709, -0.02798548, -0.00056799, -0.01690986,
       -0.02750277, -0.00336712, -0.03455596, -0.02725443,  0.01327674,
        0.00692735,  0.00451391,  0.00728074, -0.01626475, -0.02994874,
       -0.0077282 ,  0.01901851, -0.00153956, -0.00236461,  0.00873401,
       -0.00244642, -0.03493404,  0.01976734, -0.01522306, -0.02120811],
      dtype=float32)

# Dataframe for random forest and sentence embenddings

In [18]:
data=[]
for i in review:
    #print(i)
    data.append(model[i])  

  data.append(model[i])


In [19]:
data[0]

array([[ 1.36311615e+00,  2.33265519e+00,  7.38686919e-01,
        -3.33364278e-01,  2.20142916e-01, -1.52771011e-01,
         2.39272666e+00, -1.32103062e+00, -4.41159755e-01,
        -6.68473989e-02, -9.11748230e-01,  1.22988307e+00,
         6.63443446e-01, -1.41512120e+00,  9.92511749e-01,
         6.53236806e-01, -5.71447313e-01, -8.97217035e-01,
        -1.33602023e-01, -1.13896348e-01, -5.80222383e-02,
         5.52219510e-01, -2.22887620e-01, -2.25475058e-01,
         3.67450863e-01,  1.34195292e+00, -1.97339058e-01,
         1.94138503e+00, -2.89860219e-01,  1.73268533e+00],
       [ 5.11402547e-01,  5.76397300e-01,  4.31425333e-01,
        -2.42484689e-01,  3.77824366e-01, -2.89658844e-01,
         1.25829113e+00, -1.26901507e+00, -2.94939559e-02,
        -2.63377547e-01, -7.53781557e-01,  6.81210101e-01,
         2.96793759e-01, -1.12556279e+00,  6.48210883e-01,
         5.53578675e-01, -3.27411413e-01, -5.25753975e-01,
         7.25641176e-02, -5.42322457e-01,  1.10151336e-

In [20]:
rf=[]
for i in range(len(data)):
    rf.append(np.mean(data[i], axis=0))

In [21]:
rf

[array([ 0.53176785,  0.5349631 ,  0.2597369 , -0.04155971,  0.77268595,
        -0.33978358,  0.94462603, -0.71545225, -0.07297313, -0.41309813,
        -0.55902267,  0.44622278, -0.17316778, -1.1567222 ,  0.7020261 ,
         0.7344494 , -0.3572258 , -0.16383599, -0.15149643, -0.10739697,
         0.17182037,  0.15596722, -0.19014062, -0.35191092,  0.8029956 ,
         0.7481846 , -0.68971187,  0.8299488 , -0.16571619,  0.8791373 ],
       dtype=float32),
 array([ 6.8680102e-01,  1.0919112e-05,  3.8896173e-01, -5.4637078e-02,
         2.2397906e-01, -2.2513707e-01,  1.0163717e+00, -8.3774197e-01,
         1.5664410e-03, -2.1532527e-01, -3.7323922e-01,  4.7566554e-01,
        -1.7622225e-01, -9.6059382e-01,  8.7329811e-01,  1.0687647e+00,
        -5.2184159e-01, -1.5250975e-01,  2.3987176e-01, -2.0257504e-01,
        -3.8945457e-01,  2.1309916e-02,  2.6350564e-01, -8.0594534e-01,
         1.1456157e+00,  7.1747613e-01, -1.2097412e+00,  4.4159475e-01,
         8.2441516e-02,  7.6833665

# Random Forest

In [22]:
from sklearn.model_selection import train_test_split


X = rf #Features
y = amazon['Score']   # Target Variables
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=7)

In [23]:
X_train

[array([ 0.6907923 ,  0.15605867,  0.25254646, -0.08108079,  0.2522195 ,
        -0.43951178,  0.959229  , -0.70753485, -0.20979518, -0.40543017,
        -0.49457234,  0.5501713 , -0.5330401 , -0.8758646 ,  0.72409093,
         1.1340557 , -0.38481757, -0.18302831,  0.5969708 , -0.24722682,
        -0.13485214, -0.1528591 ,  0.12334066, -0.7497411 ,  1.4669765 ,
         1.0005455 , -1.4538743 ,  0.68693006, -0.22094977,  0.8797793 ],
       dtype=float32),
 array([ 0.732754  ,  0.14963265,  0.07839935, -0.39186475,  0.6053326 ,
        -0.1248517 ,  0.72665256, -0.5766821 , -0.09672884, -0.19444168,
        -0.3825979 ,  0.19590305, -0.2518582 , -0.7593697 ,  0.6533622 ,
         0.91453457, -0.6364168 , -0.30822316,  0.25675052, -0.09876599,
        -0.23378888, -0.15389977, -0.1081825 , -0.5520812 ,  1.285071  ,
         0.8462738 , -0.82858825,  0.34584936, -0.35501465,  0.66951317],
       dtype=float32),
 array([ 0.6480144 ,  0.0883607 ,  0.6992525 , -0.03132074,  0.585425  ,
   

In [24]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=400)
rfc.fit(X_train, y_train)

RandomForestClassifier(n_estimators=400)

In [25]:
rfc_pred = rfc.predict(X_test)

In [26]:
from sklearn.metrics import classification_report, accuracy_score

print(classification_report(y_test,rfc_pred))

              precision    recall  f1-score   support

           1       0.53      0.17      0.25      1016
           2       1.00      0.01      0.02       581
           3       0.86      0.02      0.05       793
           4       0.42      0.03      0.05      1709
           5       0.61      0.99      0.76      5900

    accuracy                           0.61      9999
   macro avg       0.68      0.24      0.23      9999
weighted avg       0.61      0.61      0.49      9999



In [27]:
print(accuracy_score(y_test,rfc_pred)*100)

60.83608360836084
