In [1]:
import pandas as pd
import re
from sklearn.utils import resample
from sklearn import metrics as mt

In [2]:
base = pd.read_csv(r'spam.csv', encoding='ISO-8859-1')
base.shape

(5572, 5)

In [3]:
base.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
base.describe()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
count,5572,5572,50,12,6
unique,2,5169,43,10,5
top,ham,"Sorry, I'll call later","bt not his girlfrnd... G o o d n i g h t . . .@""","MK17 92H. 450Ppw 16""","GNT:-)"""
freq,4825,30,3,2,2


In [5]:
base.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [6]:
columns_to_drop = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
base.drop(columns_to_drop, axis=1, inplace=True)
base.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
base = base[['v1', 'v2']]
base.columns = ['label', 'message']

In [8]:
base['label'] = base['label'].map({'ham': 0, 'spam': 1})


base.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    return text

base['message'] = base['message'].apply(clean_text)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')

X = vectorizer.fit_transform(base['message'])
y = base['label']

In [11]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Naive Bayes
from sklearn.naive_bayes import MultinomialNB

model_nave = MultinomialNB()
model_nave.fit(x_train, y_train)

In [13]:
predict_naive = model_nave.predict(x_test)
print(mt.accuracy_score(y_test, predict_naive))
print(mt.recall_score(y_test, predict_naive))
print(mt.precision_score(y_test, predict_naive))
print(mt.f1_score(y_test, predict_naive))

0.968609865470852
0.7666666666666667
1.0
0.8679245283018869


In [14]:
# Treino naive
predict_naive_train = model_nave.predict(x_train)
print(mt.accuracy_score(y_train, predict_naive_train))
print(mt.recall_score(y_train, predict_naive_train))
print(mt.precision_score(y_train, predict_naive_train))
print(mt.f1_score(y_train, predict_naive_train))

0.9762171864482836
0.8224455611390284
1.0
0.9025735294117647


In [15]:
# Support vector machine (SVM)
from sklearn.svm import SVC

model_svm = SVC()
model_svm.fit(x_train, y_train)

In [16]:
predict_svm = model_svm.predict(x_test)
print(mt.accuracy_score(y_test, predict_svm))
print(mt.recall_score(y_test, predict_svm))
print(mt.precision_score(y_test, predict_svm))
print(mt.f1_score(y_test, predict_svm))

0.968609865470852
0.78
0.9831932773109243
0.8698884758364313


In [17]:
# Treino svm
predict_svm_train = model_svm.predict(x_train)
print(mt.accuracy_score(y_train, predict_svm_train))
print(mt.recall_score(y_train, predict_svm_train))
print(mt.precision_score(y_train, predict_svm_train))
print(mt.f1_score(y_train, predict_svm_train))

0.9977563383441777
0.983249581239531
1.0
0.9915540540540541


In [18]:
# random forest
from sklearn.ensemble import RandomForestClassifier

model_forest = RandomForestClassifier()
model_forest.fit(x_train, y_train)

In [19]:
predict_forest = model_forest.predict(x_test)
print(mt.accuracy_score(y_test, predict_forest))
print(mt.recall_score(y_test, predict_forest))
print(mt.precision_score(y_test, predict_forest))
print(mt.f1_score(y_test, predict_forest))

0.9721973094170404
0.7933333333333333
1.0
0.8847583643122676


In [20]:
# Treino forest
predict_forest_train = model_forest.predict(x_train)
print(mt.accuracy_score(y_train, predict_forest_train))
print(mt.recall_score(y_train, predict_forest_train))
print(mt.precision_score(y_train, predict_forest_train))
print(mt.f1_score(y_train, predict_forest_train))

1.0
1.0
1.0
1.0


In [21]:
# K-Nearest Neighbors (KNN)
from sklearn.neighbors import KNeighborsClassifier

model_knn = KNeighborsClassifier()
model_knn.fit(x_train, y_train)

In [22]:
predict_knn = model_knn.predict(x_test)
print(mt.accuracy_score(y_test, predict_knn))
print(mt.recall_score(y_test, predict_knn))
print(mt.precision_score(y_test, predict_knn))
print(mt.f1_score(y_test, predict_knn))

0.9174887892376682
0.38666666666666666
1.0
0.5576923076923077


In [23]:
predict_knn_train = model_knn.predict(x_train)
print(mt.accuracy_score(y_train, predict_knn_train))
print(mt.recall_score(y_train, predict_knn_train))
print(mt.precision_score(y_train, predict_knn_train))
print(mt.f1_score(y_train, predict_knn_train))

0.9221449405429661
0.4204355108877722
0.996031746031746
0.591283863368669


In [24]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'alpha': [0.1, 0.5, 1.0, 2.0],
    'fit_prior': [True, False]
}

grid_search = GridSearchCV(model_nave, param_grid, cv=5, scoring='f1', n_jobs=-1)

grid_search.fit(X, y)

best_model = grid_search.best_estimator_

print("best params:", grid_search.best_params_)
print("best F1 Score:", grid_search.best_score_)

from joblib import dump
dump(best_model, 'naive_bayes_tunado.pkl')
dump(vectorizer, 'vectorizer.pkl')

best params: {'alpha': 0.1, 'fit_prior': True}
best F1 Score: 0.8971256099169993


['vectorizer.pkl']