# NLP Modeling Exercise

In [1]:
import pandas as pd
import re

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB

from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Read the labelled messages from the local CSV and preview the first rows.
spam = pd.read_csv(filepath_or_buffer="./spam_clean.csv")
spam.head(n=5)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Split messages and labels into train/test partitions. Stratifying on the
# label keeps the ham/spam ratio the same in both partitions, and the fixed
# seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    spam["text"],
    spam["label"],
    stratify=spam["label"],
    random_state=123,
)

In [4]:
# Peek at the first few training messages.
X_train.head(n=5)

3520    Hey... are you going to quit soon? Xuhui and i...
2279                    Sorry, I'll call later in meeting
2568    Hey. For me there is no leave on friday. Wait ...
5537    Want explicit SEX in 30 secs? Ring 02073162414...
3443         Yes but I don't care cause I know its there!
Name: text, dtype: object

In [5]:
# Peek at the labels that go with the first few training messages.
y_train.head(n=5)

3520     ham
2279     ham
2568     ham
5537    spam
3443     ham
Name: label, dtype: object

## TF-IDF

In [6]:
# Learn the vocabulary and IDF weights from the training split only and
# vectorise that split in the same pass. `fit_transform` gives the same matrix
# as `fit` followed by `transform`, without tokenising the corpus twice.
tfidf = TfidfVectorizer()
train_tfidf_values = tfidf.fit_transform(X_train)

train_tfidf_values

<4179x7467 sparse matrix of type '<class 'numpy.float64'>'
	with 55404 stored elements in Compressed Sparse Row format>

### Logistic Regression Model

In [7]:
# Logistic regression with default settings on the TF-IDF features.
# `fit` returns the estimator itself, so construction and fitting are chained.
model = LogisticRegression().fit(train_tfidf_values, y_train)

# In-sample predictions (on the same rows the model was fitted on).
preds = model.predict(train_tfidf_values)



In [8]:
# Pair each true training label with the model's prediction for it.
train = pd.DataFrame({"actual": y_train, "predicted": preds})

# Confusion matrix: rows are predicted labels, columns are actual labels.
pd.crosstab(index=train["predicted"], columns=train["actual"])

actual,ham,spam
predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,3618,110
spam,1,450


In [9]:
# Per-class precision / recall / F1 on the training split.
print(
    classification_report(
        y_true=train["actual"],
        y_pred=train["predicted"],
    )
)

              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      3619
        spam       1.00      0.80      0.89       560

   micro avg       0.97      0.97      0.97      4179
   macro avg       0.98      0.90      0.94      4179
weighted avg       0.97      0.97      0.97      4179



In [10]:
# Vectorise the held-out messages with the already-fitted vectoriser
# (transform only, no refit), then predict with the fitted model.
test_tfidf_values = tfidf.transform(X_test)
test_preds = model.predict(test_tfidf_values)

# Confusion matrix on the test split: rows are predictions, columns are truth.
test = pd.DataFrame({"actual": y_test, "predicted": test_preds})
pd.crosstab(index=test["predicted"], columns=test["actual"])

actual,ham,spam
predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,1203,28
spam,3,159


In [11]:
# Per-class precision / recall / F1 on the held-out test split.
print(
    classification_report(
        y_true=y_test,
        y_pred=test_preds,
    )
)

              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1206
        spam       0.98      0.85      0.91       187

   micro avg       0.98      0.98      0.98      1393
   macro avg       0.98      0.92      0.95      1393
weighted avg       0.98      0.98      0.98      1393



### Naive Bayes Model

In [12]:
# GaussianNB needs dense input, so the sparse TF-IDF matrix must be densified.
# `.toarray()` returns a plain ndarray, whereas `.todense()` returns an
# `np.matrix`, which scikit-learn >= 1.2 rejects with a TypeError.
# Densify once and reuse the array for fit, predict and score instead of
# rebuilding it three times.
train_tfidf_dense = train_tfidf_values.toarray()

gnb = GaussianNB()
gnb.fit(train_tfidf_dense, y_train)
y_train_pred = gnb.predict(train_tfidf_dense)
print(
    "Accuracy of GNB classifier on training set: {:.2f}".format(
        gnb.score(train_tfidf_dense, y_train)
    )
)
print("Confusion Matrix")
# Rows are predicted labels, columns are actual labels.
print(pd.crosstab(y_train_pred, y_train))
print("Classification Report")
print(classification_report(y_train, y_train_pred))

# Release the dense copy (4179 x 7467 float64 values) now that it is unused.
del train_tfidf_dense

Accuracy of GNB classifier on training set: 0.95
Confusion Matrix
label   ham  spam
row_0            
ham    3419     0
spam    200   560
Classification Report
              precision    recall  f1-score   support

         ham       1.00      0.94      0.97      3619
        spam       0.74      1.00      0.85       560

   micro avg       0.95      0.95      0.95      4179
   macro avg       0.87      0.97      0.91      4179
weighted avg       0.96      0.95      0.96      4179



In [13]:
# GaussianNB needs dense input. Use `.toarray()` (plain ndarray) rather than
# `.todense()` (`np.matrix`, rejected by scikit-learn >= 1.2), and densify the
# test matrix once for both predict and score.
test_tfidf_dense = test_tfidf_values.toarray()

y_test_pred = gnb.predict(test_tfidf_dense)
print(
    "Accuracy of GNB classifier on test set: {:.2f}".format(
        gnb.score(test_tfidf_dense, y_test)
    )
)
print("Confusion Matrix")
# Rows are predicted labels, columns are actual labels.
print(pd.crosstab(y_test_pred, y_test))
print("Classification Report")
print(classification_report(y_test, y_test_pred))

# Release the dense copy now that it is no longer needed.
del test_tfidf_dense

Accuracy of GNB classifier on test set: 0.90
Confusion Matrix
label   ham  spam
row_0            
ham    1084    15
spam    122   172
Classification Report
              precision    recall  f1-score   support

         ham       0.99      0.90      0.94      1206
        spam       0.59      0.92      0.72       187

   micro avg       0.90      0.90      0.90      1393
   macro avg       0.79      0.91      0.83      1393
weighted avg       0.93      0.90      0.91      1393



**On the test set, Naive Bayes produces many times more false spams than logistic regression (122 vs. 3); on the other hand, it has fewer false hams (15 vs. 28). I think the LR model is preferable: this model would send so many legitimate messages to the spam folder that the user would have to check it constantly, which defeats the purpose of a spam filter.**

## TF

In [14]:
# X_train.head()

In [15]:
# entire_text = " ".join(X_train)
# words = re.sub(r"[^\w\s]", "", entire_text.lower()).split()

In [16]:
# tf = pd.Series(words).value_counts(normalize=True)
# tf.head()

In [17]:
# Bag-of-words term counts: learn the vocabulary on the training split and
# build its document-term count matrix in a single pass. `fit_transform` gives
# the same matrix as `fit` followed by `transform`, without tokenising twice.
tf = CountVectorizer()
train_tf_values = tf.fit_transform(X_train)

### Logistic Regression Model

In [18]:
# Logistic regression with default settings, this time on the raw term counts.
# NOTE: this rebinds `model` and `preds`, replacing the TF-IDF versions.
model = LogisticRegression().fit(train_tf_values, y_train)

# In-sample predictions (on the same rows the model was fitted on).
preds = model.predict(train_tf_values)



In [19]:
# Pair each true training label with the model's prediction for it.
train = pd.DataFrame({"actual": y_train, "predicted": preds})

# Confusion matrix: rows are predicted labels, columns are actual labels.
pd.crosstab(index=train["predicted"], columns=train["actual"])

actual,ham,spam
predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,3619,8
spam,0,552


In [20]:
# Per-class precision / recall / F1 on the training split.
print(
    classification_report(
        y_true=train["actual"],
        y_pred=train["predicted"],
    )
)

              precision    recall  f1-score   support

         ham       1.00      1.00      1.00      3619
        spam       1.00      0.99      0.99       560

   micro avg       1.00      1.00      1.00      4179
   macro avg       1.00      0.99      1.00      4179
weighted avg       1.00      1.00      1.00      4179



In [21]:
# Count-vectorise the held-out messages with the already-fitted vectoriser
# (transform only, no refit), then predict with the fitted model.
test_tf_values = tf.transform(X_test)
test_preds = model.predict(test_tf_values)

# Confusion matrix on the test split: rows are predictions, columns are truth.
test = pd.DataFrame({"actual": y_test, "predicted": test_preds})
pd.crosstab(index=test["predicted"], columns=test["actual"])

actual,ham,spam
predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,1202,17
spam,4,170


In [22]:
# Per-class precision / recall / F1 on the held-out test split.
print(
    classification_report(
        y_true=y_test,
        y_pred=test_preds,
    )
)

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1206
        spam       0.98      0.91      0.94       187

   micro avg       0.98      0.98      0.98      1393
   macro avg       0.98      0.95      0.97      1393
weighted avg       0.98      0.98      0.98      1393



**This model does better than the TF-IDF LR model on the test set: it has fewer false hams (17 vs. 28) and a comparable number of false spams (4 vs. 3).**

### Naive Bayes Model

In [23]:
# GaussianNB needs dense input, so the sparse count matrix must be densified.
# `.toarray()` returns a plain ndarray, whereas `.todense()` returns an
# `np.matrix`, which scikit-learn >= 1.2 rejects with a TypeError.
# Densify once and reuse the array for fit, predict and score.
train_tf_dense = train_tf_values.toarray()

gnb = GaussianNB()
gnb.fit(train_tf_dense, y_train)
y_train_pred = gnb.predict(train_tf_dense)
print(
    "Accuracy of GNB classifier on training set: {:.2f}".format(
        gnb.score(train_tf_dense, y_train)
    )
)

# Release the dense copy now that it is no longer needed.
del train_tf_dense

Accuracy of GNB classifier on training set: 0.95


In [24]:
# Pair each true training label with the Naive Bayes prediction for it.
train = pd.DataFrame({"actual": y_train, "predicted": y_train_pred})

# Confusion matrix: rows are predicted labels, columns are actual labels.
pd.crosstab(index=train["predicted"], columns=train["actual"])

actual,ham,spam
predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,3419,0
spam,200,560


In [25]:
# Per-class precision / recall / F1 for Naive Bayes on the training split.
print("Classification Report")
print(
    classification_report(
        y_true=y_train,
        y_pred=y_train_pred,
    )
)

Classification Report
              precision    recall  f1-score   support

         ham       1.00      0.94      0.97      3619
        spam       0.74      1.00      0.85       560

   micro avg       0.95      0.95      0.95      4179
   macro avg       0.87      0.97      0.91      4179
weighted avg       0.96      0.95      0.96      4179



In [26]:
# GaussianNB needs dense input. Use `.toarray()` (plain ndarray) rather than
# `.todense()` (`np.matrix`, rejected by scikit-learn >= 1.2), and densify the
# test count matrix once for both predict and score.
test_tf_dense = test_tf_values.toarray()

y_test_pred = gnb.predict(test_tf_dense)
print(
    "Accuracy of GNB classifier on test set: {:.2f}".format(
        gnb.score(test_tf_dense, y_test)
    )
)

# Release the dense copy now that it is no longer needed.
del test_tf_dense

Accuracy of GNB classifier on test set: 0.91


In [27]:
# Pair each true test label with the Naive Bayes prediction for it.
test = pd.DataFrame({"actual": y_test, "predicted": y_test_pred})

# Confusion matrix: rows are predicted labels, columns are actual labels.
pd.crosstab(index=test["predicted"], columns=test["actual"])

actual,ham,spam
predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
ham,1084,9
spam,122,178


In [28]:
# Per-class precision / recall / F1 for Naive Bayes on the held-out test split.
print("Classification Report")
print(
    classification_report(
        y_true=y_test,
        y_pred=y_test_pred,
    )
)

Classification Report
              precision    recall  f1-score   support

         ham       0.99      0.90      0.94      1206
        spam       0.59      0.95      0.73       187

   micro avg       0.91      0.91      0.91      1393
   macro avg       0.79      0.93      0.84      1393
weighted avg       0.94      0.91      0.91      1393



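**Naive Bayes still lags behind LR: it again flags far too many legitimate messages as spam (122 false spams on the test set, versus 4 for LR on the same features).**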