<p style="color:#153462; 
          font-weight: bold; 
          font-size: 30px; 
          font-family: Gill Sans, sans-serif; 
          text-align: center;">
          Building a Random Forest Model with Doc2Vec</p>

### Importing Required Modules

In [1]:
import pandas as pd
import gensim
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
import numpy as np

### Reading Data

In [2]:
# The message token lists are loaded from pickle so each entry keeps its
# Python list type; the labels are read from CSV.
X_train, X_test = (
    pd.read_pickle("data/X_train.pkl"),
    pd.read_pickle("data/X_test.pkl"),
)
y_train, y_test = (
    pd.read_csv("data/y_train.csv"),
    pd.read_csv("data/y_test.csv"),
)

In [3]:
def _tag_messages(messages):
    """Wrap each message in a TaggedDocument tagged with its position in `messages`."""
    return [
        gensim.models.doc2vec.TaggedDocument(tokens, [position])
        for position, tokens in enumerate(messages)
    ]


# Both splits are tagged the same way, each starting from tag 0.
tagged_docs_train = _tag_messages(X_train)
tagged_docs_test = _tag_messages(X_test)

In [5]:
# Peek at the first five tagged training documents to check the words/tags pairing.
tagged_docs_train[:5]

[TaggedDocument(words=['nothing', 'really', 'making', 'sure', 'everybodys', 'speed'], tags=[0]),
 TaggedDocument(words=['urgent', 'urgent', '800', 'free', 'flights', 'europe', 'give', 'away', 'call', 'b4', '10th', 'sept', 'take', 'friend', '4', 'free', 'call', 'claim', '09050000555', 'ba128nnfwfly150ppm'], tags=[1]),
 TaggedDocument(words=['cashbincouk', 'get', 'lots', 'cash', 'weekend', 'wwwcashbincouk', 'dear', 'welcome', 'weekend', 'got', 'biggest', 'best', 'ever', 'cash', 'give', 'away'], tags=[2]),
 TaggedDocument(words=['thinking', 'going', 'reg', 'pract', 'lessons', 'flung', 'advance', 'haha', 'wat', 'time', 'u', 'going'], tags=[3]),
 TaggedDocument(words=['moji', 'informed', 'saved', 'lives', 'thanks'], tags=[4])]

### Training the Doc2Vec Model

In [6]:
# Fit Doc2Vec on the tagged training messages only.
# `seed` pins the model's random number generator and `workers=1` removes
# thread-scheduling jitter, so a Restart & Run All yields comparable
# embeddings. (gensim notes that exact repeatability across interpreter
# launches additionally requires PYTHONHASHSEED to be set.)
d2v_model = gensim.models.Doc2Vec(tagged_docs_train,
                                  vector_size=100,  # dimensionality of each document vector
                                  window=5,         # context window size
                                  min_count=2,      # ignore words seen fewer than 2 times
                                  seed=42,          # arbitrary fixed value, for reproducibility
                                  workers=1
                                 )

In [11]:
# Embed every training message with the fitted model; only the token list
# (`.words`) of each tagged document is passed to `infer_vector`.
train_vectors = [
    d2v_model.infer_vector(doc.words)
    for doc in tagged_docs_train
]

In [13]:
# Embed every test message with the model that was fitted on the training
# messages; only the token list (`.words`) of each tagged document is used.
test_vectors = [
    d2v_model.infer_vector(doc.words)
    for doc in tagged_docs_test
]

### Building Random Forest Model with Doc2Vec

In [14]:
# Fit a random forest on the inferred training vectors.
# `random_state` fixes the forest's internal randomness so that the same
# input vectors always produce the same fitted model.
rf = RandomForestClassifier(random_state=42)  # 42 is an arbitrary fixed seed
# The labels were loaded as a DataFrame; flatten them to a 1-D array for `fit`.
rf_model = rf.fit(train_vectors, y_train.values.ravel())

In [15]:
# Predict a label for every inferred test vector.
y_pred = rf_model.predict(test_vectors)

In [16]:
# Per-class precision, recall and F1 of the predictions against the test labels.
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       976
           1       0.97      0.73      0.84       139

    accuracy                           0.96      1115
   macro avg       0.97      0.87      0.91      1115
weighted avg       0.96      0.96      0.96      1115

