<b> Clean and Pre-Process Data </b>

In [30]:
import pandas as pd
import gensim

reviews = pd.read_csv("amazon.csv")

# Clean data using the built in cleaner in gensim 
# remove all punctuation and stop words and tokenize the given sentence
reviews['Text'] = reviews['Text'].apply(lambda x: gensim.utils.simple_preprocess(x))

<b> Create Categories based on Scores </b>

In [31]:
reviews.Score = reviews.Score.astype(int) #convert Score to Integer

#function to create labels
def create_label(row):

    if row['Score'] < 3:
        return 'Negative'

    elif row['Score'] > 3:
        return 'Positive'

    else:
        return 'Neutral'

#create the labels
reviews['Score'] = reviews.apply(create_label, axis=1)

#show label distribution
reviews['Score'].value_counts()

Positive    38159
Negative     7939
Neutral      3902
Name: Score, dtype: int64

<b> Train and Test Sets </b>

In [32]:
from sklearn.model_selection import train_test_split

y = reviews['Score']
x = reviews['Text']

train_reviews, test_reviews, train_scores, test_scores = train_test_split(x, y, test_size=0.2)

<b> Word Embeddings </b>

In [33]:
from gensim.models import Word2Vec, word2vec
import os

# Set values for various word2vec parameters                    
min_word_count = 40   # Minimum word count                        
num_workers = 3       # Number of threads to run in parallel
downsampling = 1e-3   # Downsample setting for frequent words
num_features = 300    # Word vector dimensionality 
context = 10          # Context window size
    
model_name = 'train_model'

if not os.path.exists(model_name): 

    model = gensim.models.Word2Vec(train_reviews, workers = num_workers, vector_size = num_features, 
            min_count = min_word_count, window = context, sample = downsampling)

    model.save(model_name)

else:
    model = gensim.models.Word2Vec.load(model_name)

<b> Vectors and Average </b>

In [34]:
import numpy as np

index2key_set = set(model.wv.index_to_key) 

train_reviews_vec = np.array([np.array([model.wv[word] for word in review if word in index2key_set]) for review in train_reviews])

test_reviews_vec = np.array([np.array([model.wv[word] for word in review if word in index2key_set]) for review in test_reviews])

In [6]:
# Each sentance have different number of array vectors 
# which may cause an error while we training the model

for i, v in enumerate(train_reviews_vec):
    print(len(train_reviews.iloc[i]), len(v))

131 126
50 47
113 104
1 1
45 40
38 38
58 49
21 18
40 38
66 54
73 69
21 19
82 77
36 36
36 36
45 44
44 38
17 16
37 33
60 57
121 113
19 18
25 25
159 152
57 50
72 62
96 88
4 2
33 31
96 87
54 48
53 51
19 18
80 77
50 49
50 44
78 76
272 260
24 22
66 61
116 109
59 46
324 311
66 50
44 44
149 126
69 60
84 74
37 35
86 84
67 62
44 41
21 19
22 22
44 41
52 47
42 38
20 20
31 31
112 106
67 61
20 20
128 121
110 108
17 17
73 67
79 64
1 1
43 41
54 52
17 17
53 53
140 133
36 36
43 39
80 76
140 131
144 139
38 38
36 35
106 96
85 79
110 109
32 31
42 39
53 52
114 108
205 191
37 34
80 79
216 191
111 102
73 71
7 7
102 98
60 55
20 20
82 76
65 61
228 198
30 27
25 24
49 48
241 231
55 54
24 23
33 32
297 257
73 70
33 33
96 94
95 93
60 54
38 37
124 120
54 50
56 50
30 30
87 73
53 53
209 195
388 362
40 35
49 45
55 52
87 81
62 61
50 49
286 272
37 36
43 42
98 90
78 74
23 16
25 23
115 112
91 88
34 30
410 326
199 183
145 135
33 29
21 20
38 35
130 122
77 74
152 147
90 87
22 22
24 24
57 53
119 115
20 20
120 106
35 34
52 52
69

In [11]:
# average the first element across the word vectors
# and store that as the first entry in our final vector

# Compute sentence vectors by averaging the word vectors 
# for the words contained in the sentence

train_reviews_vec_avg = []

for v in train_reviews_vec:
    if v.size:
        train_reviews_vec_avg.append(v.mean(axis=0))
    else:
        train_reviews_vec_avg.append(np.zeros(num_features, dtype=float)) # create an array of zero's
        
test_reviews_vec_avg = []

for v in test_reviews_vec:
    if v.size:
        test_reviews_vec_avg.append(v.mean(axis=0))
    else:
        test_reviews_vec_avg.append(np.zeros(num_features, dtype=float))

In [8]:
# all the sentence vector lengths are consistent
for i, v in enumerate(train_reviews_vec_avg):
    print(len(train_reviews.iloc[i]), len(v))

131 300
50 300
113 300
1 300
45 300
38 300
58 300
21 300
40 300
66 300
73 300
21 300
82 300
36 300
36 300
45 300
44 300
17 300
37 300
60 300
121 300
19 300
25 300
159 300
57 300
72 300
96 300
4 300
33 300
96 300
54 300
53 300
19 300
80 300
50 300
50 300
78 300
272 300
24 300
66 300
116 300
59 300
324 300
66 300
44 300
149 300
69 300
84 300
37 300
86 300
67 300
44 300
21 300
22 300
44 300
52 300
42 300
20 300
31 300
112 300
67 300
20 300
128 300
110 300
17 300
73 300
79 300
1 300
43 300
54 300
17 300
53 300
140 300
36 300
43 300
80 300
140 300
144 300
38 300
36 300
106 300
85 300
110 300
32 300
42 300
53 300
114 300
205 300
37 300
80 300
216 300
111 300
73 300
7 300
102 300
60 300
20 300
82 300
65 300
228 300
30 300
25 300
49 300
241 300
55 300
24 300
33 300
297 300
73 300
33 300
96 300
95 300
60 300
38 300
124 300
54 300
56 300
30 300
87 300
53 300
209 300
388 300
40 300
49 300
55 300
87 300
62 300
50 300
286 300
37 300
43 300
98 300
78 300
23 300
25 300
115 300
91 300
34 300
410 300
1

<b> Random Forest </b>

In [9]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest to the training data, using 100 trees
forest = RandomForestClassifier(n_estimators = 100)

forest_model = forest.fit(train_reviews_vec_avg, train_scores.values.ravel())

In [12]:
# Use the trained model to make predictions on the test data
y_pred = forest_model.predict(test_reviews_vec_avg)

In [18]:
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score

print(classification_report(test_scores, y_pred))

recall = recall_score(test_scores, y_pred, average='micro')
precision = precision_score(test_scores, y_pred, average='micro')
f1 = f1_score(test_scores, y_pred, average='micro')

print('Recall: ', recall)
print('\nPrecision: ', precision)
print('\nF1-Score: ', f1)


Recall:  0.6264

Precision:  0.6264

F1-Score:  0.6264
