# Compare NLP Techniques: Build Model On TF-IDF Vectors

### Read In Cleaned Text

In [1]:
# Load the cleaned training and test sets
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Feature frames first, then label frames; each CSV is read exactly once,
# in the same order as before (X_train, X_test, y_train, y_test).
X_train, X_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/X_train.csv', './data/X_test.csv')
)
y_train, y_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/y_train.csv', './data/y_test.csv')
)

# Preview the first few cleaned messages
X_train.head()

Unnamed: 0,clean_text
0,"['let', 'want', 'house', '8am']"
1,"['thats', 'necessarily', 'respectful']"
2,"['lol', 'im', 'hot', 'air', 'balloon']"
3,"['sorry', 'never', 'hear', 'unless', 'book', '..."
4,"['ta', 'jobs', 'available', 'let', 'know', 'pl..."


### Create TF-IDF Vectors

In [2]:
# Learn the vocabulary and IDF weights from the training messages only
# (.fit() returns the vectorizer itself, so it can be chained onto the constructor)
tfidf_vect = TfidfVectorizer().fit(X_train['clean_text'])

# Apply that single fitted vectorizer to both splits, training set first,
# so train and test share the same feature columns
X_train_vect, X_test_vect = (
    tfidf_vect.transform(split['clean_text'])
    for split in (X_train, X_test)
)

In [6]:
# What words did the vectorizer learn?
# `vocabulary_` is the fitted vectorizer's dict mapping each learned term to an
# integer id (per scikit-learn, the term's column index in the TF-IDF matrix).
# NOTE(review): this displays the entire vocabulary, which is thousands of
# entries for this corpus -- consider showing only a slice when sharing.
tfidf_vect.vocabulary_

{'let': 4327,
 'want': 7755,
 'house': 3693,
 '8am': 759,
 'thats': 7165,
 'necessarily': 4981,
 'respectful': 6051,
 'lol': 4428,
 'im': 3806,
 'hot': 3685,
 'air': 940,
 'balloon': 1281,
 'sorry': 6663,
 'never': 5013,
 'hear': 3529,
 'unless': 7541,
 'book': 1492,
 'one': 5232,
 'kinda': 4155,
 'jokethet': 4051,
 'really': 5919,
 'looking': 4442,
 'skinny': 6530,
 'white': 7873,
 'girls': 3283,
 'lineyou': 4377,
 'much': 4897,
 'camera': 1717,
 'something': 6636,
 'like': 4358,
 'theyre': 7193,
 'casting': 1778,
 'look': 4438,
 'ta': 7020,
 'jobs': 4036,
 'available': 1216,
 'know': 4189,
 'please': 5555,
 'cos': 2131,
 'need': 4986,
 'start': 6785,
 'working': 7975,
 'uhhhhrmm': 7496,
 'isnt': 3946,
 'tb': 7076,
 'test': 7130,
 'bad': 1266,
 'youre': 8167,
 'sick': 6467,
 'thank': 7151,
 'youve': 8173,
 'wonderful': 7955,
 'would': 7996,
 'still': 6824,
 'cozy': 2159,
 'exhausted': 2834,
 'last': 4263,
 'nightnobody': 5041,
 'went': 7839,
 'school': 6259,
 'work': 7971,
 'everythin

In [7]:
# How are these vectors stored?
# Selecting one row of the transformed test set still yields a sparse matrix:
# its repr reports the shape and how many non-zero entries are actually stored.
X_test_vect[0]

<1x8216 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [8]:
# Can we convert the vectors to arrays?
# .toarray() expands the sparse row into a dense array with one slot per
# vocabulary term, so every term absent from the message becomes an explicit 0.
X_test_vect[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

### Fit RandomForestClassifier On Top Of Vectors

In [9]:
# Fit a basic Random Forest model on these vectors
from sklearn.ensemble import RandomForestClassifier

# Random forests draw bootstrap samples and random feature subsets, so without
# a fixed seed every Restart & Run All trains a different model and reports
# slightly different metrics. Pin the seed so results are reproducible.
rf = RandomForestClassifier(random_state=42)

# y_train is loaded as a DataFrame; .values.ravel() flattens it to the 1-D
# label array that fit() expects.
rf_model = rf.fit(X_train_vect, y_train.values.ravel())

In [10]:
# Use the trained model to make predictions on the test data
# (X_test_vect was produced by the vectorizer fitted on the training set, so
# its columns match the features the forest was trained on)
y_pred = rf_model.predict(X_test_vect)

In [11]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Score every metric against the same 1-D label column. Previously precision
# and recall received the whole y_test DataFrame while accuracy was hand-rolled
# against y_test['label'].
y_true = y_test['label']

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
# accuracy_score is the fraction of predictions that exactly match the labels,
# i.e. the same quantity as (y_pred == y_true).sum() / len(y_pred)
accuracy = accuracy_score(y_true, y_pred)

print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round(accuracy, 3)))

Precision: 1.0 / Recall: 0.744 / Accuracy: 0.962
