In [32]:
import kaggle
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import wordnet as wn
from spacy.cli import download
from spacy import load
import time

In [33]:
kaggle.api.dataset_download_files('uciml/sms-spam-collection-dataset', path='./', unzip=True)

In [34]:
df = pd.read_csv('./spam.csv', encoding='latin-1')

In [35]:
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [36]:
nltk.download('wordnet')
nltk.download('wordnet2022')
nltk.download('omw-1.4')
download('en_core_web_sm')
nlp = load('en_core_web_sm')


[nltk_data] Downloading package wordnet to /home/querriqe/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet2022 to
[nltk_data]     /home/querriqe/nltk_data...
[nltk_data]   Package wordnet2022 is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/querriqe/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 117.6 kB/s eta 0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [37]:
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)

In [38]:
df['v1'].value_counts()

v1
ham     4825
spam     747
Name: count, dtype: int64

In [39]:
from sklearn.preprocessing import LabelEncoder
le= LabelEncoder()
df['v1'] = le.fit_transform(df['v1'])
df

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [40]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemmatizer = WordNetLemmatizer() #lemmatizer (cats -> cat, dogs -> dog, etc.)
stop_words = stopwords.words('english') #stopwords (is, at, etc.)
tokenizer = nltk.RegexpTokenizer(r'\w+') 

def token_list(massage):
    tokens = tokenizer.tokenize(massage)
    lc_tokens = [t.lower() for t in tokens] # lowercased tokens
    lm_tokens = [lemmatizer.lemmatize(t) for t in lc_tokens] #lemmatized tokens
    tokens = [t for t in lm_tokens if t not in stop_words] #get rid of stopwords(is, at, etc.)
    
    return tokens

In [41]:
df = df.sample(frac=1, random_state=42)
df = df.reset_index(drop=True)

train_test_rat = int(len(df) * 0.8)
train_df, test_df = df[:train_test_rat], df[train_test_rat:]

train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

train_df['v1'].value_counts()

v1
0    3867
1     590
Name: count, dtype: int64

In [42]:
token_counter = {}

for message in train_df['v2']:
    tokenlist = token_list(message)
    
    for token in tokenlist:
        if token in token_counter:
            token_counter[token] +=1
        else:
            token_counter[token] = 1

len(token_counter)

7080

In [43]:
def keep_token(token,threshold):
    if token not in token_counter:
        return False
    else:
        return token_counter[token] > threshold

In [44]:
features = set()

for token in token_counter:
    if keep_token(token,100):
        features.add(token)

In [45]:
token_map = {t:i for t,i in zip(features, range(len(features)))}
print(len(token_map))

57


In [46]:
def message_to_count_vector(message):
    count_vector = np.zeros(len(features))

    processed_list_of_tokens = token_list(message)

    for token in processed_list_of_tokens:
        if token not in features:
            continue
        index = token_map[token]
        count_vector[index] += 1
        
    return count_vector

In [47]:
def to_X_y(dff):
    y = dff['v1'].to_numpy().astype(int)
    
    mess_col = dff['v2']
    count_vectors = []
    
    for message in mess_col:
        count_vector = message_to_count_vector(message)
        count_vectors.append(count_vector)
        
    X= np.array(count_vectors).astype(int)
    
    return X, y

In [48]:
X_train, y_train = to_X_y(train_df)

X_test, y_test = to_X_y(test_df)

In [49]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [50]:
from sklearn.model_selection import train_test_split 

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [51]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score
import time


init_models = { 'Logistic Regression' :  LogisticRegression(),
                'SVC': SVC(),
                'K-Neighbors Classifier': KNeighborsClassifier(),
                'Decision Tree Classifier': DecisionTreeClassifier(),
                'Random Forest Classifier' : RandomForestClassifier(),
                'Gradient Boosting Classifier': GradientBoostingClassifier(),
                'XGB Classifier' : XGBClassifier(),
               }

timer=[]
acc = []
models_names = []
for i, (key,model) in enumerate(init_models.items()):
    start_time = time.time()
    model.fit(X_train, y_train)
    end_time = time.time()
    models_names.append(key)
    acc.append(np.mean(cross_val_score(model, X_train, y_train, cv=5)))
    timer.append(end_time-start_time)
models_scores = pd.DataFrame({'model name': models_names, 'cross-val accuracy score': acc, 'time': timer})
models_scores.head(7)

Unnamed: 0,model name,cross-val accuracy score,time
0,Logistic Regression,0.940533,0.09995
1,SVC,0.948948,0.162918
2,K-Neighbors Classifier,0.930715,0.000842
3,Decision Tree Classifier,0.941094,0.012634
4,Random Forest Classifier,0.953436,0.224706
5,Gradient Boosting Classifier,0.952595,0.195025
6,XGB Classifier,0.954278,0.027442


In [53]:
from sklearn.model_selection import GridSearchCV

model = KNeighborsClassifier()
n_neighbors = [3,4,5,6]
leaf_size = [20,30,40,50]
params = { 'n_neighbors': n_neighbors, 'leaf_size': leaf_size }

boost_grid = GridSearchCV(estimator = model, param_grid = params, 
                                cv = 5, verbose=2, scoring='accuracy',n_jobs = -1)
boost_grid.fit(X_train,y_train)
print(boost_grid.best_params_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] END ........................leaf_size=20, n_neighbors=3; total time=   0.0s
[CV] END ........................leaf_size=20, n_neighbors=3; total time=   0.0s[CV] END ........................leaf_size=20, n_neighbors=3; total time=   0.0s

[CV] END ........................leaf_size=20, n_neighbors=3; total time=   0.0s
[CV] END ........................leaf_size=20, n_neighbors=4; total time=   0.0s
[CV] END ........................leaf_size=20, n_neighbors=3; total time=   0.0s
[CV] END ........................leaf_size=20, n_neighbors=4; total time=   0.0s
[CV] END ........................leaf_size=20, n_neighbors=4; total time=   0.0s
[CV] END ........................leaf_size=20, n_neighbors=5; total time=   0.0s
[CV] END ........................leaf_size=20, n_neighbors=5; total time=   0.0s
[CV] END ........................leaf_size=20, n_neighbors=5; total time=   0.0s
[CV] END ........................leaf_size=20, n

In [54]:
from sklearn.metrics import classification_report

start_time = time.time()

model = KNeighborsClassifier(n_neighbors=3, leaf_size=20)
model.fit(X_train, y_train)
prediction = model.predict(X_val)

end_time = time.time()

print(classification_report(y_val, prediction), '\n')
print('time: ', end_time-start_time)    

              precision    recall  f1-score   support

           0       0.95      0.99      0.97       775
           1       0.92      0.67      0.77       117

    accuracy                           0.95       892
   macro avg       0.93      0.83      0.87       892
weighted avg       0.95      0.95      0.94       892
 

time:  0.14032626152038574


In [55]:
KNN_test_df = pd.DataFrame({'message': test_df['v2'], 'actual': y_test, 'prediction': model.predict(X_test)})
KNN_test_df

Unnamed: 0,message,actual,prediction
0,Thank you. I like you as well...,0,0
1,"New Tones This week include: 1)McFly-All Ab..,...",1,0
2,I am not sure about night menu. . . I know onl...,0,0
3,Hope ur head doesn't hurt 2 much ! Am ploughin...,0,0
4,Hey what how about your project. Started aha da.,0,0
...,...,...,...
1110,I came hostel. I m going to sleep. Plz call me...,0,0
1111,"Sorry, I'll call later",0,0
1112,Prabha..i'm soryda..realy..frm heart i'm sory,0,0
1113,Nt joking seriously i told,0,0


In [56]:
model = RandomForestClassifier(random_state=42)
n_estimators = [50,100,200,500,1000]
max_depth = [None,1,2,3]
params = { 'n_estimators': n_estimators, 'max_depth': max_depth }

boost_grid = GridSearchCV(estimator = model, param_grid = params, 
                                cv = 5, verbose=2, scoring='accuracy',n_jobs = -1)
boost_grid.fit(X_train,y_train)
print(boost_grid.best_params_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END ....................max_depth=None, n_estimators=50; total time=   0.2s
[CV] END ....................max_depth=None, n_estimators=50; total time=   0.2s
[CV] END ....................max_depth=None, n_estimators=50; total time=   0.2s
[CV] END ....................max_depth=None, n_estimators=50; total time=   0.2s
[CV] END ....................max_depth=None, n_estimators=50; total time=   0.2s
[CV] END ...................max_depth=None, n_estimators=100; total time=   0.3s
[CV] END ...................max_depth=None, n_estimators=100; total time=   0.4s
[CV] END ...................max_depth=None, n_estimators=100; total time=   0.4s
[CV] END ...................max_depth=None, n_estimators=100; total time=   0.4s
[CV] END ...................max_depth=None, n_estimators=100; total time=   0.4s
[CV] END .......................max_depth=1, n_estimators=50; total time=   0.1s
[CV] END .......................max_depth=1, n_

In [57]:
start_time = time.time()
model = RandomForestClassifier(n_estimators=200)
model.fit(X_train, y_train)
prediction = model.predict(X_val)
end_time = time.time()

print(classification_report(y_val, prediction), '\n')
print('time: ', end_time-start_time)

              precision    recall  f1-score   support

           0       0.97      0.98      0.98       775
           1       0.85      0.82      0.83       117

    accuracy                           0.96       892
   macro avg       0.91      0.90      0.91       892
weighted avg       0.96      0.96      0.96       892
 

time:  0.4513063430786133


In [58]:
RFC_test_df = pd.DataFrame({'message': test_df['v2'], 'actual': y_test, 'prediction': model.predict(X_test)})

In [59]:
print(KNN_test_df.loc[KNN_test_df['actual'] != KNN_test_df['prediction']])
print(1 - (len(KNN_test_df.loc[KNN_test_df['actual'] != KNN_test_df['prediction']])/len(KNN_test_df)))

                                                message  actual  prediction
1     New Tones This week include: 1)McFly-All Ab..,...       1           0
10    PRIVATE! Your 2003 Account Statement for <fone...       1           0
16              Call me da, i am waiting for your call.       0           1
25    SMSSERVICES. for yourinclusive text credits, p...       1           0
27    BangBabes Ur order is on the way. U SHOULD rec...       1           0
...                                                 ...     ...         ...
966   Get your garden ready for summer with a FREE s...       1           0
999   How come it takes so little time for a child w...       1           0
1048  Bloomberg -Message center +447797706009 Why wa...       1           0
1056  You will be receiving this week's Triple Echo ...       1           0
1067  Fantasy Football is back on your TV. Go to Sky...       1           0

[65 rows x 3 columns]
0.9417040358744395


In [60]:
print(RFC_test_df.loc[RFC_test_df['actual'] != RFC_test_df['prediction']])
print(1 - (len(RFC_test_df.loc[RFC_test_df['actual'] != RFC_test_df['prediction']])/len(RFC_test_df)))

                                                message  actual  prediction
1     New Tones This week include: 1)McFly-All Ab..,...       1           0
21                       I'm in a movie. Call me 4 wat?       0           1
35             RCT' THNQ Adrian for U text. Rgds Vatian       1           0
56    You are now unsubscribed all services. Get ton...       1           0
66    Please CALL 08712402972 immediately as there i...       1           0
69    Bored housewives! Chat n date now! 0871750.77....       1           0
77    I take it the post has come then! You must hav...       0           1
88    You won't believe it but it's true. It's Incre...       1           0
104   Hello darlin ive finished college now so txt m...       0           1
115   LIFE has never been this much fun and great un...       1           0
117   Adult 18 Content Your video will be with you s...       1           0
138   Want explicit SEX in 30 secs? Ring 02073162414...       1           0
145   U were

**Summary**

I chosed fastest and most accurate models. 
- KNN: %94,17 accuracy within 0.141373872756958 seconds
- RFC: %95,06 accuracy within 0.4513063430786133 seconds
