In [1]:
from pprint import pprint
import pandas as pd
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

def clean(text: str) -> list:
    'A simple function to cleanup text data'
    wnl = nltk.stem.WordNetLemmatizer()
    stopwords = set(nltk.corpus.stopwords.words('english'))
    text = (text.encode('ascii', 'ignore')
             .decode('utf-8', 'ignore')
             .lower())
    words = re.sub(r'[^\w\s]', '', text).split() # tokenization
    return [wnl.lemmatize(word) for word in words if word not in stopwords]

def split(df, stratify=None):
    '''
    splitting our data into train, validate, test w/ straitfying should we choose.
    '''
    train_validate, test = train_test_split(df, test_size=.2, 
                                        random_state=123, 
                                        stratify=df[stratify])
    train, validate = train_test_split(train_validate, test_size=.3, 
                                   random_state=123, 
                                   stratify=train_validate[stratify])
    return train, validate, test

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

df = pd.read_csv('spam_clean.csv')

In [3]:
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Take the work we did in the lessons further

In [4]:
# lesson example
cv = CountVectorizer()
X = cv.fit_transform(df.text.apply(clean).apply(' '.join))
y = df.label

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=12, stratify = y)

tree = DecisionTreeClassifier(max_depth=5)
tree.fit(X_train, y_train)

tree.score(X_train, y_train)

train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))


train['predicted'] = tree.predict(X_train)
test['predicted'] = tree.predict(X_test)

In [5]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.predicted, train.actual))
print('---')
print(classification_report(train.actual, train.predicted))

Accuracy: 93.07%
---
Confusion Matrix
actual      ham  spam
predicted            
ham        3691   141
spam        168   457
---
              precision    recall  f1-score   support

         ham       0.96      0.96      0.96      3859
        spam       0.73      0.76      0.75       598

    accuracy                           0.93      4457
   macro avg       0.85      0.86      0.85      4457
weighted avg       0.93      0.93      0.93      4457



In [6]:
tree.score(X_test, y_test)


0.9246636771300448

# What other types of models (i.e. different classifcation algorithms) could you use?

## Logistic Regression

In [11]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df.text)
y = df.label

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2)

train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

lr = LogisticRegression().fit(X_train, y_train)

train['predicted'] = lr.predict(X_train)
test['predicted'] = lr.predict(X_test)

In [12]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.predicted, train.actual))
print('---')
print(classification_report(train.actual, train.predicted))

Accuracy: 97.49%
---
Confusion Matrix
actual      ham  spam
predicted            
ham        3856   109
spam          3   489
---
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99      3859
        spam       0.99      0.82      0.90       598

    accuracy                           0.97      4457
   macro avg       0.98      0.91      0.94      4457
weighted avg       0.98      0.97      0.97      4457



In [13]:
lr.score(X_test, y_test)


0.9641255605381166

In [14]:
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.predicted, test.actual))
print('---')
print(classification_report(test.actual, test.predicted))

Accuracy: 96.41%
---
Confusion Matrix
actual     ham  spam
predicted           
ham        965    39
spam         1   110
---
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       966
        spam       0.99      0.74      0.85       149

    accuracy                           0.96      1115
   macro avg       0.98      0.87      0.91      1115
weighted avg       0.97      0.96      0.96      1115



## KNN

In [17]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df.text)
y = df.label

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2)

train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

knn = KNeighborsClassifier().fit(X_train, y_train)

train['predicted'] = knn.predict(X_train)
test['predicted'] = knn.predict(X_test)

In [19]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.predicted, train.actual))
print('---')
print(classification_report(train.actual, train.predicted))

Accuracy: 92.30%
---
Confusion Matrix
actual      ham  spam
predicted            
ham        3859   343
spam          0   255
---
              precision    recall  f1-score   support

         ham       0.92      1.00      0.96      3859
        spam       1.00      0.43      0.60       598

    accuracy                           0.92      4457
   macro avg       0.96      0.71      0.78      4457
weighted avg       0.93      0.92      0.91      4457



In [20]:
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.predicted, test.actual))
print('---')
print(classification_report(test.actual, test.predicted))


Accuracy: 90.40%
---
Confusion Matrix
actual     ham  spam
predicted           
ham        966   107
spam         0    42
---
              precision    recall  f1-score   support

         ham       0.90      1.00      0.95       966
        spam       1.00      0.28      0.44       149

    accuracy                           0.90      1115
   macro avg       0.95      0.64      0.69      1115
weighted avg       0.91      0.90      0.88      1115



## Random Forest

In [21]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df.text)
y = df.label

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2)

train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

rf = RandomForestClassifier().fit(X_train, y_train)

train['predicted'] = rf.predict(X_train)
test['predicted'] = rf.predict(X_test)

In [22]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.predicted, train.actual))
print('---')
print(classification_report(train.actual, train.predicted))


Accuracy: 100.00%
---
Confusion Matrix
actual      ham  spam
predicted            
ham        3859     0
spam          0   598
---
              precision    recall  f1-score   support

         ham       1.00      1.00      1.00      3859
        spam       1.00      1.00      1.00       598

    accuracy                           1.00      4457
   macro avg       1.00      1.00      1.00      4457
weighted avg       1.00      1.00      1.00      4457



In [23]:
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.predicted, test.actual))
print('---')
print(classification_report(test.actual, test.predicted))

Accuracy: 97.85%
---
Confusion Matrix
actual     ham  spam
predicted           
ham        966    24
spam         0   125
---
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       966
        spam       1.00      0.84      0.91       149

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



## Count Vectorizer

In [24]:
cv = CountVectorizer()
X = cv.fit_transform(df.text)
y = df.label

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2)

train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

knn = KNeighborsClassifier().fit(X_train, y_train)

train['predicted'] = rf.predict(X_train)
test['predicted'] = rf.predict(X_test)

In [25]:
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.predicted, train.actual))
print('---')
print(classification_report(train.actual, train.predicted))

Accuracy: 99.44%
---
Confusion Matrix
actual      ham  spam
predicted            
ham        3858    24
spam          1   574
---
              precision    recall  f1-score   support

         ham       0.99      1.00      1.00      3859
        spam       1.00      0.96      0.98       598

    accuracy                           0.99      4457
   macro avg       1.00      0.98      0.99      4457
weighted avg       0.99      0.99      0.99      4457



# How do the models compare when trained on term frequency data alone, instead of TF-IDF values?

TF-IDF models have higher accuracy scores than only term frequency ones.
