# Compare NLP Techniques: Build Model On word2vec Vectors

### Read In Cleaned Text

In [1]:
# Load the cleaned training and test sets
import gensim
import numpy as np
import pandas as pd

X_train = pd.read_csv('./data/X_train.csv')
X_test = pd.read_csv('./data/X_test.csv')
y_train = pd.read_csv('./data/y_train.csv')
y_test = pd.read_csv('./data/y_test.csv')

### Create word2vec Vectors

In [2]:
# Train a basic word2vec model
w2v_model = gensim.models.Word2Vec(X_train,
                                   size=100,
                                   window=5,
                                   min_count=2)

In [7]:
# Replace the words in each text message with the learned word vector
words = set(w2v_model.wv.index2word)

X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_train['clean_text']])
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_test['clean_text']])

  X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
  X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])


In [8]:
# Average the word vectors for each sentence (and assign a vector of zeros if the model
# did not learn any of the words in the text message during training
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

In [10]:
# What does the unaveraged version look like?
X_train_vect[0]

array([[ 5.9639569e-04,  7.3035160e-04, -3.8827457e-03,  5.8789278e-04,
        -4.2779152e-03, -6.8040134e-04,  4.9140695e-03,  3.9929179e-03,
        -6.0071843e-04,  5.6044763e-04, -4.2864536e-03, -2.5919455e-03,
        -6.7205669e-04, -8.9368370e-04,  4.8545515e-03,  1.0402094e-03,
        -1.7739738e-03,  2.6903355e-03, -4.3217693e-03,  1.7335152e-03,
        -4.1503562e-03, -7.0833758e-04, -3.8211327e-03,  3.9190855e-03,
         1.3686051e-03, -2.1762033e-03,  2.2074300e-05,  4.6246266e-03,
         2.9554672e-03, -2.2591446e-03, -1.8773340e-03,  4.1531795e-03,
         1.1565351e-03,  8.5715420e-04, -1.9132494e-03,  6.8562466e-04,
         4.2995992e-03, -4.8008002e-03, -3.7418474e-03,  1.6722732e-03,
        -1.6568202e-04,  4.6566953e-03,  3.0433354e-03, -7.9261292e-05,
         4.5219203e-03,  3.0972885e-03,  3.4422444e-03, -2.5187277e-03,
        -4.5196275e-04,  4.0694904e-03,  2.9836893e-03,  2.6207906e-03,
        -4.1318708e-03,  1.0546823e-03, -7.9928992e-05,  3.79577

In [11]:
# What does the averaged version look like?
X_train_vect_avg[0]

array([ 5.4272678e-04, -1.5297036e-03, -3.1510997e-03, -2.0462256e-03,
       -6.8847928e-04,  1.4440207e-03,  6.0950581e-04,  2.2677360e-03,
       -2.6647714e-03,  2.6886708e-03, -2.5065949e-03, -3.2517365e-03,
       -5.7582150e-04,  1.4874248e-03,  3.5149299e-03, -1.8657855e-04,
        8.1190869e-05,  2.5700219e-03,  9.4861723e-05, -1.3684977e-03,
       -2.2762862e-04, -1.0003566e-03, -4.1132811e-03,  4.9238640e-04,
        2.6610540e-03, -3.4013302e-03,  8.2918175e-04,  4.1657281e-03,
        1.3195022e-03, -1.7748127e-03,  7.1233278e-04, -1.1018617e-04,
       -2.1567169e-04, -9.6434070e-04,  1.3809546e-03, -1.1269862e-04,
        1.8898126e-03, -2.5859007e-03, -4.1592838e-03,  7.4108841e-04,
        8.4729301e-04, -3.1692674e-05,  2.5866497e-03, -8.2523812e-04,
        4.7518490e-03,  2.8385362e-03,  2.7966280e-03, -3.2381150e-03,
       -2.5861126e-03,  4.4759358e-03, -8.7965396e-04,  1.2106941e-03,
       -1.1716789e-03,  2.2768169e-03,  1.0357732e-03,  1.6360080e-03,
      

### Fit RandomForestClassifier On Top Of Word Vectors

In [12]:
# Instantiate and fit a basic Random Forest model on top of the vectors
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())

In [13]:
# Use the trained model to make predictions on the test data
y_pred = rf_model.predict(X_test_vect_avg)

In [14]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test['label']).sum()/len(y_pred), 3)))

Precision: 0.548 / Recall: 0.244 / Accuracy: 0.859
