In [1]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re

In [2]:
#https://archive.ics.uci.edu/ml/datasets/sms+spam+collection --- dataset link
data = pd.read_csv('SMSSpamCollection',sep='\t',names=['labels','text'])

In [3]:
# 0 = no-spam or ham,,,, 1 = spam
data.labels = data.labels.astype('category').cat.codes

In [4]:
data.head()

Unnamed: 0,labels,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
def clean(row):
    text = re.sub(r'\W',' ',row)
    text = re.sub(r'\s+',' ',text)
    return text

In [6]:
lem = WordNetLemmatizer()
def lema(row):
    text = [lem.lemmatize(word) for word in nltk.word_tokenize(row) if word not in stopwords.words('english')]
    return ' '.join(text)

In [7]:
data['clean'] = data['text'].apply(clean)
data.clean = data.clean.apply(lema)

In [8]:
data.head()

Unnamed: 0,labels,text,clean
0,0,"Go until jurong point, crazy.. Available only ...",Go jurong point crazy Available bugis n great ...
1,0,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry 2 wkly comp win FA Cup final tkts 2...
3,0,U dun say so early hor... U c already then say...,U dun say early hor U c already say
4,0,"Nah I don't think he goes to usf, he lives aro...",Nah I think go usf life around though


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn import model_selection
from sklearn.utils import class_weight
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

  from pandas import MultiIndex, Int64Index


In [11]:
X_train, X_test, y_train, y_test = train_test_split(data.clean, data.labels, test_size=0.25, random_state=8675309)
x_train_tfidf = pd.DataFrame(tfidf.fit_transform(X_train).toarray())
x_test_tfidf = pd.DataFrame(tfidf.transform(X_test).toarray())

In [24]:
def run_exps(X_train , y_train, X_test, y_test):
    dfs = []
    models = [
              ('LogReg', LogisticRegression()), 
              ('RF', RandomForestClassifier()),
              ('KNN', KNeighborsClassifier()),
              ('SVM', SVC()), 
              ('GNB', GaussianNB()),
              ('XGB', XGBClassifier())
            ]
    results = []
    names = []
    scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted', 'roc_auc']
    target_names = ['spam', 'notspam']
    for name, model in models:
        print(name)
        kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=90210)
        cv_results = model_selection.cross_validate(model, X_train, y_train, cv=kfold, scoring=scoring)
        clf = model.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        results.append(cv_results)
        names.append(name)
        this_df = pd.DataFrame(cv_results)
        this_df['model'] = name
        dfs.append(this_df)
        print(name)
        print(classification_report(y_test, y_pred, target_names=target_names))
    final = pd.concat(dfs, ignore_index=True)
    return final

In [26]:
a = run_exps(x_train_tfidf,y_train,x_test_tfidf,y_test)

LogReg
LogReg
              precision    recall  f1-score   support

        spam       0.96      1.00      0.98      1200
     notspam       0.99      0.75      0.85       193

    accuracy                           0.96      1393
   macro avg       0.97      0.87      0.92      1393
weighted avg       0.96      0.96      0.96      1393

RF
RF
              precision    recall  f1-score   support

        spam       0.97      1.00      0.99      1200
     notspam       1.00      0.84      0.91       193

    accuracy                           0.98      1393
   macro avg       0.99      0.92      0.95      1393
weighted avg       0.98      0.98      0.98      1393

KNN
KNN
              precision    recall  f1-score   support

        spam       0.90      1.00      0.95      1200
     notspam       1.00      0.30      0.46       193

    accuracy                           0.90      1393
   macro avg       0.95      0.65      0.70      1393
weighted avg       0.91      0.90      0.88   

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):




  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


XGB
              precision    recall  f1-score   support

        spam       0.97      0.99      0.98      1200
     notspam       0.95      0.82      0.88       193

    accuracy                           0.97      1393
   macro avg       0.96      0.91      0.93      1393
weighted avg       0.97      0.97      0.97      1393

