In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('spam.csv', encoding = "ISO-8859-1")

In [3]:
df.sample(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
2044,ham,I can send you a pic if you like :),,,
651,ham,I'm gonna say no. Sorry. I would but as normal...,,,
43,ham,WHO ARE YOU SEEING?,,,
3493,spam,You are being contacted by our dating service ...,,,
2208,spam,URGENT!! Your 4* Costa Del Sol Holiday or å£50...,,,


In [4]:
df.shape

(5572, 5)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [6]:
df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace=True)

In [7]:
df.rename(columns={'v1':'target','v2':'text'},inplace=True)
df.sample(5)

Unnamed: 0,target,text
1870,ham,"Aight, I'll text you when I'm back"
912,ham,Can't. I feel nauseous. I'm so pissed. I didn'...
1237,ham,Is ur paper in e morn or aft tmr?
5294,spam,XMAS iscoming & ur awarded either å£500 CD gif...
3694,ham,"Hello, As per request from &lt;#&gt; Rs.5 ha..."


In [8]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df['target'] = encoder.fit_transform(df['target'])

In [9]:
df = df.drop_duplicates(keep='first')

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5169 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   target  5169 non-null   int32 
 1   text    5169 non-null   object
dtypes: int32(1), object(1)
memory usage: 101.0+ KB


In [11]:
import nltk
print(nltk.__version__)
nltk.download('punkt')
nltk.download('stopwords')

3.8.1


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
from nltk.corpus import stopwords
import string
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [13]:
def transform_text(text): 
    text = text.lower() # chuyển thành chữ thường
    text = nltk.word_tokenize(text) # chia text thành mảng các từ riêng lẻ
    
    y = []
    for i in text:
        if i.isalnum(): # kiểm tra từ chỉ bao gồm chữ và số
            y.append(i)
    
    text = y[:] # copy y to text
    y.clear()
    
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation: # kiểm tra xem từ không phải là từ dừng và dấu câu
            y.append(i)
            
    text = y[:]
    y.clear()
    for i in text:
        y.append(ps.stem(i)) # cắt giảm từ trở về dạng gốc
            
    return " ".join(y)

In [14]:
transform_text('Whoever has money, please donate to me')

'whoever money pleas donat'

In [15]:
df['transformed_text'] = df['text'].apply(transform_text)

In [16]:
df.head()

Unnamed: 0,target,text,transformed_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [17]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
cv = CountVectorizer()   #Vector hóa văn bản
tfidf = TfidfVectorizer(max_features = 3000) # chuyển đổi văn bản thành vectơ TF-IDF giữ lại 3000 đặc trưng

In [18]:
X = tfidf.fit_transform(df['transformed_text']).toarray() # chuyển đổi cột 'transformed_text' thành ma trận các vectơ TF-IDF
X.shape

(5169, 3000)

In [19]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
scaler = MinMaxScaler()
X = scaler.fit_transform(X) # chuẩn hóa ma trận về khoảng giá trị [0, 1]
X.shape

(5169, 3000)

In [20]:
y = df['target'].values

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, classification_report, roc_curve, auc, precision_recall_curve
import matplotlib.pyplot as plt
import seaborn as sns
from plotly.subplots import make_subplots

# Cross Validation - KFold

In [22]:
from sklearn.model_selection import KFold, LeaveOneOut, GridSearchCV
from sklearn.model_selection import cross_val_score
kfold = KFold(n_splits = 10, shuffle=True, random_state=42)

In [24]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=2)
param_grid_knn = {'n_neighbors': range(1,20,1)}
grid_search_knn = GridSearchCV(knn, param_grid_knn, cv=kfold, scoring='accuracy')
grid_search_knn.fit(X, y)

In [26]:
best_params = grid_search_knn.best_params_  # Best hyperparameters
best_score = grid_search_knn.best_score_   # Best cross-validation score
best_estimator = grid_search_knn.best_estimator_  # Model with best parameters

In [27]:
print(best_params)
print(best_score)
print(best_estimator)

{'n_neighbors': 1}
0.9500899644640366
KNeighborsClassifier(n_neighbors=1)


In [28]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(max_depth=20)
param_grid_dtc = {'max_depth': range(1,30,1)}
grid_search_dtc = GridSearchCV(dtc, param_grid_dtc, cv=kfold, scoring='accuracy')
grid_search_dtc.fit(X, y)

In [29]:
best_params = grid_search_dtc.best_params_  # Best hyperparameters
best_score = grid_search_dtc.best_score_   # Best cross-validation score
best_estimator = grid_search_dtc.best_estimator_  # Model with best parameters

In [31]:
print(best_params)
print(best_score)
print(best_estimator)

{'max_depth': 28}
0.9605344638867646
DecisionTreeClassifier(max_depth=28)


In [32]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=40)
param_grid_rfc = {'n_estimators': range(10,120,10)}
grid_search_rfc = GridSearchCV(rfc, param_grid_rfc, cv=kfold, scoring='accuracy')
grid_search_rfc.fit(X, y)

In [33]:
best_params = grid_search_rfc.best_params_  # Best hyperparameters
best_score = grid_search_rfc.best_score_   # Best cross-validation score
best_estimator = grid_search_rfc.best_estimator_  # Model with best parameters

In [34]:
print(best_params)
print(best_score)
print(best_estimator)

{'n_estimators': 100}
0.9758190514746674
RandomForestClassifier()


In [35]:
from sklearn.svm import SVC
svc = SVC(kernel='sigmoid')
param_grid_svc = {'kernel' : ['linear', 'poly', 'rbf', 'sigmoid']}
grid_search_svc = GridSearchCV(svc, param_grid_svc, cv=kfold, scoring='accuracy')
grid_search_svc.fit(X, y)

In [36]:
best_params = grid_search_svc.best_params_  # Best hyperparameters
best_score = grid_search_svc.best_score_   # Best cross-validation score
best_estimator = grid_search_svc.best_estimator_  # Model with best parameters

In [37]:
print(best_params)
print(best_score)
print(best_estimator)

{'kernel': 'linear'}
0.9810422383158652
SVC(kernel='linear')


In [38]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

In [39]:
gnb = GaussianNB()
scores = cross_val_score(gnb, X, y, cv=kfold, scoring='accuracy')

In [40]:
print(scores)
print("Mean accuracy:", scores.mean())

[0.87427466 0.85686654 0.8762089  0.87814313 0.86847195 0.84526112
 0.85880077 0.87234043 0.82978723 0.84496124]
Mean accuracy: 0.8605115979188221


In [41]:
mnb = MultinomialNB()
scores = cross_val_score(mnb, X, y, cv=kfold, scoring='accuracy')

In [42]:
print(scores)
print("Mean accuracy:", scores.mean())

[0.9787234  0.9787234  0.97485493 0.97678917 0.97678917 0.98646035
 0.98839458 0.98259188 0.98065764 0.98643411]
Mean accuracy: 0.9810418634639317


In [43]:
bnb = BernoulliNB()
scores = cross_val_score(bnb, X, y, cv=kfold, scoring='accuracy')

In [44]:
print(scores)
print("Mean accuracy:", scores.mean())

[0.97678917 0.97485493 0.98259188 0.97678917 0.98065764 0.98452611
 0.98646035 0.97678917 0.98646035 0.99224806]
Mean accuracy: 0.9818166824104478


In [45]:
from sklearn.linear_model import LogisticRegression

In [46]:
lg_model = LogisticRegression(max_iter=200)
scores = cross_val_score(lg_model, X, y, cv=kfold, scoring='accuracy')

In [47]:
print(scores)
print("Mean accuracy:", scores.mean())

[0.97098646 0.97678917 0.9729207  0.96711799 0.9787234  0.9787234
 0.98065764 0.97098646 0.97098646 0.98255814]
Mean accuracy: 0.9750449822320185
