# Compare NLP Techniques: Build Model On TF-IDF Vectors

### Read In Cleaned Text

In [1]:
# Load the cleaned training and test sets
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Feature splits: the message text lives in the `clean_text` column
X_train, X_test = pd.read_csv('data/X_train.csv'), pd.read_csv('data/X_test.csv')
# Label splits are stored in their own CSV files
y_train, y_test = pd.read_csv('data/y_train.csv'), pd.read_csv('data/y_test.csv')

# Preview the first few training rows
X_train.head()

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Unnamed: 0,clean_text
0,"['spare', 'power', 'supplies']"
1,"['urgent', 'trying', 'contact', 'u', 'todays',..."
2,"['1st', 'wk', 'free', 'gr8', 'tones', 'str8', ..."
3,"['call', 'meet']"
4,"['haha', 'really', 'oh', 'deduct', 'lesson', '..."


### Create TF-IDF Vectors

In [2]:
# Learn the vocabulary and IDF weights from the training messages only, then
# reuse that fitted vectorizer on the test set so both matrices share the same
# columns and the test data has no influence on the features.
#
# NOTE: `clean_text` holds each token list as a string (e.g. "['call', 'meet']"),
# so the vectorizer's default analyzer re-tokenizes that string: brackets and
# quotes are discarded, and one-character tokens such as 'u' never make it
# into the vocabulary.
tfidf_vect = TfidfVectorizer()
# fit_transform() is equivalent to fit() followed by transform(), but it
# analyzes the training text once instead of twice.
X_train_vect = tfidf_vect.fit_transform(X_train['clean_text'])
X_test_vect = tfidf_vect.transform(X_test['clean_text'])

In [3]:
# What words did the vectorizer learn?
# `vocabulary_` is a dict mapping each learned term to its column index in the
# TF-IDF matrices built above.
tfidf_vect.vocabulary_

{'spare': 6728,
 'power': 5672,
 'supplies': 7002,
 'urgent': 7611,
 'trying': 7465,
 'contact': 2077,
 'todays': 7336,
 'draw': 2567,
 'shows': 6492,
 '800': 694,
 'prize': 5751,
 'guaranteed': 3416,
 'call': 1678,
 '09050001808': 163,
 'land': 4234,
 'line': 4360,
 'claim': 1920,
 'm95': 4534,
 'valid12hrs': 7659,
 '1st': 373,
 'wk': 7982,
 'free': 3122,
 'gr8': 3363,
 'tones': 7362,
 'str8': 6896,
 'txt': 7500,
 'nokia': 5095,
 '8007': 697,
 'classic': 1927,
 'hit': 3601,
 'polys': 5624,
 'nokia150p': 5096,
 'poly200p': 5618,
 '16': 344,
 'meet': 4673,
 'haha': 3449,
 'really': 5938,
 'oh': 5221,
 'deduct': 2342,
 'lesson': 4317,
 'tmr': 7325,
 'sounds': 6712,
 'great': 3386,
 'home': 3638,
 'login': 4415,
 'dat': 2280,
 'time': 7297,
 'dad': 2248,
 'fetching': 2954,
 'probably': 5757,
 'still': 6868,
 'going': 3321,
 'stuff': 6930,
 'hey': 3578,
 'guy': 3434,
 'know': 4184,
 'breathing': 1564,
 'neck': 5004,
 'get': 3262,
 'bud': 1612,
 'anyway': 1060,
 'youd': 8204,
 'able': 817,


In [4]:
# How are these vectors stored?
# Indexing one row of the transformed test set returns a sparse matrix with a
# single row and one column per vocabulary term; only non-zero weights are kept.
X_test_vect[0]

<1x8263 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [5]:
# Can we convert the vectors to arrays?
# .toarray() expands the sparse row into a dense NumPy array, materializing
# every zero entry explicitly.
X_test_vect[0].toarray()

array([[0., 0., 0., ..., 0., 0., 0.]])

### Fit RandomForestClassifier On Top Of Vectors

In [6]:
# Fit a basic Random Forest model on these vectors
from sklearn.ensemble import RandomForestClassifier

# A random forest draws bootstrap samples and random feature subsets, so an
# unseeded model produces slightly different metrics on every run. Pinning
# random_state makes the fit, and every score derived from it, repeatable
# under Restart & Run All.
rf = RandomForestClassifier(random_state=42)
# y_train comes from read_csv as a DataFrame; .values.ravel() flattens it to
# the 1-D label array that fit() expects.
rf_model = rf.fit(X_train_vect, y_train.values.ravel())

In [7]:
# Use the trained model to make predictions on the test data.
# X_test_vect was produced by the vectorizer fitted on the training messages,
# so its columns match the features the model was trained on.
y_pred = rf_model.predict(X_test_vect)

In [8]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import accuracy_score, precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
# Use the library scorer instead of a hand-rolled (y_pred == labels).sum() / n
# so that all three metrics come from the same validated code path.
accuracy = accuracy_score(y_test['label'], y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round(accuracy, 3)))

Precision: 1.0 / Recall: 0.795 / Accuracy: 0.97
