<p style="color:#153462; 
          font-weight: bold; 
          font-size: 30px; 
          font-family: Gill Sans, sans-serif; 
          text-align: center;">
          Building a Random Forest Model with Word2Vec</p>

### Importing Required Modules

In [43]:
import pandas as pd
import gensim
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import numpy as np

### Reading Data

In [38]:
# The tokenised messages are stored as pickles because that keeps each row a
# Python list; the labels are plain CSV files.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files that come from a trusted source.
X_train, X_test = (
    pd.read_pickle(pkl_path)
    for pkl_path in ("data/X_train.pkl", "data/X_test.pkl")
)
y_train, y_test = (
    pd.read_csv(csv_path)
    for csv_path in ("data/y_train.csv", "data/y_test.csv")
)

In [39]:
# Peek at the first training message to confirm each row is a list of tokens.
X_train[0]

['go',
 'jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'got',
 'amore',
 'wat']

### Vectorizing the Data

In [40]:
# Train Word2Vec embeddings on the training messages only, so nothing from the
# test set influences the learned vectors.
#
# Reproducibility: gensim trains with several worker threads by default, and
# thread scheduling makes the resulting vectors differ from run to run.
# A single worker plus an explicit seed keeps the embeddings (and therefore
# every downstream metric) stable across "Restart & Run All".
# NOTE(review): full determinism across interpreter launches additionally
# needs the PYTHONHASHSEED environment variable to be fixed -- see gensim docs.
w2v_model = gensim.models.Word2Vec(
    sentences=X_train,
    vector_size=100,  # dimensionality of each word vector
    window=3,         # max distance between the current and predicted word within a sentence
    min_count=2,      # ignore all words with total frequency lower than this
    seed=1,           # explicit RNG seed for the embedding initialisation
    workers=1,        # single thread: multi-threaded training is not deterministic
)

In [41]:
# Show the first ten words of the vocabulary the model learned.
w2v_model.wv.index_to_key[:10]

['u', 'call', '2', 'im', 'ur', 'get', 'free', 'å', 'go', '4']

In [46]:
# Ordered vocabulary list; kept under its original name in case later cells use it.
words = w2v_model.wv.index_to_key
# Dict view of the same vocabulary: membership tests are O(1) here, whereas
# `token in index_to_key` scans the whole list for every token.
vocab = w2v_model.wv.key_to_index


def tokens_to_vectors(docs):
    """Look up the Word2Vec vector of every in-vocabulary token of each message.

    Parameters
    ----------
    docs : iterable of list of str
        Tokenised messages.

    Returns
    -------
    list of numpy.ndarray
        One array per message holding the vectors of its known tokens, in
        token order. A message with no known tokens yields an empty array.
    """
    return [
        np.array([w2v_model.wv[token] for token in tokens if token in vocab])
        for tokens in docs
    ]


X_train_vect = tokens_to_vectors(X_train)
X_test_vect = tokens_to_vectors(X_test)

In [47]:
def average_word_vectors(doc_vectors, dim):
    """Reduce each message's stack of word vectors to a single mean vector.

    Parameters
    ----------
    doc_vectors : iterable of numpy.ndarray
        One array of word vectors per message (possibly empty).
    dim : int
        Length of the zero vector used for messages with no word vectors.

    Returns
    -------
    list of numpy.ndarray
        One vector per message: the column-wise mean of its word vectors, or
        ``np.zeros(dim)`` when the message has none (the mean of an empty
        array would be NaN).
    """
    return [
        vect.mean(axis=0) if len(vect) != 0 else np.zeros(dim)
        for vect in doc_vectors
    ]


# Read the embedding width from the trained model instead of repeating the
# literal 100, so the fallback vector stays correct if vector_size changes.
EMBEDDING_DIM = w2v_model.wv.vector_size

X_train_vect_avg = average_word_vectors(X_train_vect, EMBEDDING_DIM)
X_test_vect_avg = average_word_vectors(X_test_vect, EMBEDDING_DIM)

In [50]:
# Sanity check: there should be one averaged vector per message in each split.
len(X_train_vect_avg), len(X_test_vect_avg)

(4457, 1115)

In [52]:
# Per-word vectors of the first training message (one row per in-vocabulary token).
X_train_vect[0]

array([[-6.69803545e-02,  1.02860115e-01,  2.17516553e-02,
         1.32015173e-03, -4.82793525e-03, -1.28149644e-01,
         3.31592038e-02,  1.59097254e-01, -4.87229973e-02,
        -3.01212296e-02, -4.52283174e-02, -1.08836077e-01,
        -6.65801065e-03,  2.22745612e-02, -7.72668427e-05,
        -5.98038025e-02,  1.78443659e-02, -7.70218223e-02,
        -1.06462808e-02, -1.52409285e-01,  5.72751053e-02,
         6.41649291e-02,  5.13491295e-02, -2.54002586e-02,
        -3.22458297e-02,  1.34149659e-02, -6.22793287e-02,
        -6.78288266e-02, -6.49586767e-02,  1.61025785e-02,
         9.89502668e-02,  4.73650694e-02,  4.30650897e-02,
        -4.59256545e-02, -5.29546700e-02,  1.03328601e-01,
        -7.69436266e-03, -6.29767925e-02, -4.39025573e-02,
        -1.41546652e-01,  2.59117968e-02, -7.69384876e-02,
        -2.63931360e-02,  2.83053168e-03,  6.33048639e-02,
        -5.07848114e-02, -7.55221620e-02, -1.57285575e-02,
         2.91331913e-02,  6.29634783e-02,  5.65952919e-0

In [51]:
# Averaged (message-level) vector of the first test message.
X_test_vect_avg[0]

array([-0.12942263,  0.17597687,  0.03171194, -0.0127439 , -0.02371626,
       -0.2331    ,  0.06882467,  0.3134724 , -0.08379792, -0.07236389,
       -0.09798687, -0.19297709, -0.01566335,  0.04329884,  0.00925785,
       -0.10843015,  0.02198656, -0.15612675, -0.00080967, -0.29926273,
        0.09199122,  0.12773265,  0.0930573 , -0.03509374, -0.05741769,
        0.02295233, -0.09939837, -0.13883917, -0.10759931,  0.01537984,
        0.17733431,  0.07516875,  0.06037512, -0.09840945, -0.08572774,
        0.2039517 , -0.00238057, -0.11347193, -0.09018958, -0.28189018,
        0.03971195, -0.14564332, -0.03493845,  0.01026732,  0.14099069,
       -0.11275298, -0.13549386, -0.04071792,  0.05404897,  0.10176396,
        0.09356767, -0.0922479 , -0.03853101, -0.02643051, -0.07900909,
        0.06065756,  0.10860988, -0.00364809, -0.11652074,  0.01662985,
        0.06663826,  0.04090013, -0.03826622, -0.03890239, -0.15443702,
        0.11308748,  0.04518036,  0.08523154, -0.18573488,  0.18

### Building Random Forest with Word2Vec

In [60]:
# Fit a Random Forest on the averaged message vectors.
# random_state pins the forest's internal randomness so that re-running the
# notebook reproduces the same model and the same reported metrics.
rf = RandomForestClassifier(random_state=42)
# y_train comes from read_csv as a DataFrame; .values.ravel() flattens it to
# the 1-D label array that fit() expects.
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())

In [61]:
# Predict a label for every test message from its averaged Word2Vec vector.
y_pred = rf_model.predict(X_test_vect_avg)

In [62]:
# Per-class precision / recall / F1 of the predictions against the test labels.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96       976
           1       0.99      0.49      0.65       139

    accuracy                           0.94      1115
   macro avg       0.96      0.74      0.81      1115
weighted avg       0.94      0.94      0.93      1115

