#### Using supervised learning for identifing the spam messages: There are a series of messages, some of which are spam and some are not. 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score, recall_score
from sklearn.metrics import precision_score, accuracy_score

# Load data:

In [2]:
data = pd.read_csv('spam.csv')
data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
message = data['Message']
label = data['Category']

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(message).toarray()
Y = label.map({'spam':1 , 'ham':0})

In [4]:
vectorizer.get_feature_names_out()[-100:]

array(['xxuk', 'xxx', 'xxxmobilemovieclub', 'xxxx', 'xxxxx', 'xxxxxx',
       'xxxxxxx', 'xxxxxxxx', 'xxxxxxxxxxxxxx', 'xy', 'y87', 'ya', 'yah',
       'yahoo', 'yalrigu', 'yalru', 'yam', 'yan', 'yar', 'yarasu',
       'yards', 'yavnt', 'yaxx', 'yaxxx', 'yay', 'yck', 'yeah', 'year',
       'years', 'yeesh', 'yeh', 'yelling', 'yellow', 'yelow', 'yen',
       'yeovil', 'yep', 'yer', 'yes', 'yest', 'yesterday', 'yet', 'yetty',
       'yetunde', 'yhl', 'yi', 'yifeng', 'yijue', 'ym', 'ymca', 'yo',
       'yoga', 'yogasana', 'yor', 'yorge', 'you', 'youdoing', 'youi',
       'young', 'younger', 'youphone', 'your', 'youre', 'yourinclusive',
       'yourjob', 'yours', 'yourself', 'youuuuu', 'youwanna', 'yoville',
       'yowifes', 'yoyyooo', 'yr', 'yrs', 'ystrday', 'ything', 'yummmm',
       'yummy', 'yun', 'yunny', 'yuo', 'yuou', 'yup', 'yupz', 'zac',
       'zaher', 'zealand', 'zebra', 'zed', 'zeros', 'zhong', 'zindgi',
       'zoe', 'zogtorius', 'zoom', 'zouk', 'zyada', 'èn', 'ú1', '〨ud'],
 

In [5]:
X_train , X_test, y_train , y_test = train_test_split(X , Y)
print(X_test.shape , y_test.shape)
print(X_train.shape , y_train.shape)

(1393, 8709) (1393,)
(4179, 8709) (4179,)


# Logistic Regression:

In [6]:
from sklearn.linear_model import LogisticRegression

In [7]:
clf1 = LogisticRegression()
clf1.fit(X_train, y_train)

In [8]:
print('Logistic Regression F1score:' ,format(f1_score(y_true= y_test , y_pred= clf1.predict(X_test))) )

Logistic Regression F1score: 0.8410596026490066


# Bagging Classifier-Logistic Regression:

In [9]:
from sklearn.ensemble import BaggingClassifier

In [10]:
clf2 = BaggingClassifier(estimator=LogisticRegression(),
                        n_estimators=5)
clf2.fit(X_train,y_train)

In [11]:
print('Bagging Classifier with Logistic Regression F1score:' ,format(f1_score(y_true= y_test , y_pred= clf2.predict(X_test))) )

Bagging Classifier with Logistic Regression F1score: 0.8135593220338984


# KNN:

In [12]:
from sklearn.neighbors import KNeighborsClassifier as KNN

In [13]:
clf3 = KNN(n_neighbors = 15)
clf3.fit(X_train,y_train)

In [14]:
print('KNN F1score:' ,format(f1_score(y_true= y_test , y_pred= clf3.predict(X_test))) )

KNN F1score: 0.8448844884488449


# BaggingClassifier : with KNN

In [15]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier as KNN

In [16]:
clf4 = BaggingClassifier(estimator= KNN(n_neighbors=15),
                        n_estimators=5)
clf4.fit(X_train,y_train)

In [17]:
print('Bagging Classifier with KNN F1score:' ,format(f1_score(y_true= y_test , y_pred= clf4.predict(X_test))) )

Bagging Classifier with KNN F1score: 0.8333333333333334


# K-Fold cross validation: 

In [18]:
from sklearn.model_selection import KFold

In [19]:
kf = KFold(n_splits=5)
clfs = [clf1,clf2,clf3,clf4]
result = []
for i , (train_index,test_index) in enumerate(kf.split(X)):
    x_train , y_train = X[train_index] , Y[train_index]
    x_test , y_test = X[test_index] , Y[test_index]
    result.append([])
    for clf in clfs:
        clf.fit(x_train , y_train)
        y_pred = clf.predict(x_test)
        result[i].append(f1_score(y_true = y_test , y_pred = y_pred))
        print(i , clf)

0 LogisticRegression()
0 BaggingClassifier(estimator=LogisticRegression(), n_estimators=5)
0 KNeighborsClassifier(n_neighbors=15)
0 BaggingClassifier(estimator=KNeighborsClassifier(n_neighbors=15),
                  n_estimators=5)
1 LogisticRegression()
1 BaggingClassifier(estimator=LogisticRegression(), n_estimators=5)
1 KNeighborsClassifier(n_neighbors=15)
1 BaggingClassifier(estimator=KNeighborsClassifier(n_neighbors=15),
                  n_estimators=5)
2 LogisticRegression()
2 BaggingClassifier(estimator=LogisticRegression(), n_estimators=5)
2 KNeighborsClassifier(n_neighbors=15)
2 BaggingClassifier(estimator=KNeighborsClassifier(n_neighbors=15),
                  n_estimators=5)
3 LogisticRegression()
3 BaggingClassifier(estimator=LogisticRegression(), n_estimators=5)
3 KNeighborsClassifier(n_neighbors=15)
3 BaggingClassifier(estimator=KNeighborsClassifier(n_neighbors=15),
                  n_estimators=5)
4 LogisticRegression()
4 BaggingClassifier(estimator=LogisticRegression(

In [21]:
result = np.array(result)
for i , clf in enumerate(clfs):
    #print('f1-score of {} model: \n mean= {} , std= {}'.format(clf, np.mean(result[:,i])*100, np.std(result[:,i])))
    print('clf{} f1-score: mean= {} , std= {}'.format(i+1,np.mean(result[:,i])*100, np.std(result[:,i])))

clf1 f1-score: mean= 83.00166496656865 , std= 0.009123615023716617
clf2 f1-score: mean= 80.54506035060015 , std= 0.016073512072990178
clf3 f1-score: mean= 81.23772008056352 , std= 0.04032354830379908
clf4 f1-score: mean= 81.47485029170006 , std= 0.04666204263601691


# Result
#### it seams that Logistic Regression model is the best model for this problem. The next model that can be suitable for this problem is 'BaggingClassifier with KNN estimator'. Because its standard deviation gives us more stable results along with its average efficiency.