In [1]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

## Data Pre-processing 

In [2]:
print("Data Loading")
df = pd.read_csv('spam.csv')
display(df.head())

print("Null columns Deleted")
df.drop(columns=["Unnamed: 2",	'Unnamed: 3',	"Unnamed: 4"], inplace=True)
display(df.head())

print("Renamed Columns")
df.rename(columns={"v1" : 'target', 'v2' : 'text'}, inplace=True)
df.head()

Data Loading


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


Null columns Deleted


Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Renamed Columns


Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
print("Before removing Duplicates")
display(df.target.value_counts())

print("After Deleting Duplicates")
df = df.drop_duplicates(keep='first')
display(df.target.value_counts())

print("Label encoding on target column")
le = LabelEncoder()
df.target = le.fit_transform(df.target)
df.head()

Before removing Duplicates


target
ham     4825
spam     747
Name: count, dtype: int64

After Deleting Duplicates


target
ham     4516
spam     653
Name: count, dtype: int64

Label encoding on target column


Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


## Feature engineering

In [4]:
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from spacy.cli import download
download('en_core_web_sm')
nlp = spacy.load("en_core_web_sm")
tfid = TfidfVectorizer(max_features = 500)

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [5]:
def transform_text(text): 
    doc = nlp(text)
    y = [token.lemma_.lower() for token in doc if not token.is_punct and not token.is_stop and token.is_alpha]
    return " ".join(y)

transform_text("The movie is very good")

'movie good'

In [6]:
df['transformed_text'] = df.text.apply(transform_text)
df.head()

Unnamed: 0,target,text,transformed_text
0,0,"Go until jurong point, crazy.. Available only ...",jurong point crazy available bugis n great wor...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts tex...
3,0,U dun say so early hor... U c already then say...,u dun early hor u c
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think go usf live


In [7]:
X = tfid.fit_transform(df["transformed_text"]).toarray()
y = df.target.values

print(f'{type(X)}, {type(y)}')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

<class 'numpy.ndarray'>, <class 'numpy.ndarray'>


## Model Training

In [8]:
from sklearn.naive_bayes import MultinomialNB 
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

mnb = MultinomialNB()
svc = SVC(kernel='sigmoid', gamma=1)
rf = RandomForestClassifier(random_state=42)

clfs = {
    'SVC' : svc,
    "NB" : mnb,
    "RF" : rf
}

In [9]:
from sklearn.metrics import accuracy_score, precision_score, classification_report
def train_classifier(clf, X_train, y_train, X_test, y_test): 
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    return report, accuracy, precision

In [10]:
accuracy_scores = []
precision_scores = []

for name, clf in clfs.items(): 
    report, accuracy, precision = train_classifier(clf, X_train, y_train, X_test, y_test)
    print("For : ", name)
    print('accuracy_score', accuracy)
    print('precision', precision)
    print(report)
    accuracy_scores.append(accuracy)
    precision_scores.append(precision)


For :  SVC
accuracy_score 0.9700193423597679
precision 0.9310344827586207
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       903
           1       0.93      0.82      0.87       131

    accuracy                           0.97      1034
   macro avg       0.95      0.91      0.93      1034
weighted avg       0.97      0.97      0.97      1034

For :  NB
accuracy_score 0.9729206963249516
precision 0.9557522123893806
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       903
           1       0.96      0.82      0.89       131

    accuracy                           0.97      1034
   macro avg       0.97      0.91      0.93      1034
weighted avg       0.97      0.97      0.97      1034

For :  RF
accuracy_score 0.9709864603481625
precision 0.904
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       903
           1       0.90      0