In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the raw dataset from 'spam.csv' in the working directory.
# The file is decoded as ISO-8859-1 (Latin-1) rather than pandas' default UTF-8.
df = pd.read_csv('spam.csv', encoding = "ISO-8859-1")

In [None]:
# Preview five randomly chosen rows (no seed is set, so the rows differ between runs).
df.sample(5)

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Column dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Remove the three 'Unnamed' columns, which are not used anywhere later in the notebook.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this cell
# safe to re-run: the in-place version raised KeyError on a second execution because
# the columns had already been removed.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [None]:
# Give the two remaining columns descriptive names: 'v1' becomes the label
# column 'target' and 'v2' becomes the message column 'text'.
df = df.rename(columns={'v1': 'target', 'v2': 'text'})

# Preview five random rows under the new column names.
df.sample(5)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace the class labels in 'target' with the integer codes assigned by a
# LabelEncoder; the fitted encoder stays available as `encoder`.
encoder = LabelEncoder()
df = df.assign(target=encoder.fit_transform(df['target']))

In [None]:
# Remove rows that are exact duplicates across all columns, keeping the first
# occurrence of each. The index is not reset, so it may contain gaps afterwards.
df = df.drop_duplicates(keep='first')

In [None]:
# Re-check dtypes and row counts after the cleaning steps above.
df.info()

In [None]:
import nltk
print(nltk.__version__)

# Resources needed by transform_text: Punkt tokenizer models for
# nltk.word_tokenize and the English stop-word list.
nltk.download('punkt')
# Recent NLTK releases (3.9 and later) load the Punkt models from the
# 'punkt_tab' package instead of the pickled 'punkt' one; without it
# word_tokenize raises LookupError. Downloading both keeps the notebook
# working on old and new NLTK versions.
nltk.download('punkt_tab')
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords
import string
from nltk.stem.porter import PorterStemmer
# Single shared Porter stemmer instance; transform_text uses it to stem tokens.
ps = PorterStemmer()

In [None]:
def transform_text(text):
    """Normalize a raw message into a space-separated string of stemmed tokens.

    Processing steps, in order:
      1. lowercase the text;
      2. split it into tokens with ``nltk.word_tokenize``;
      3. keep only tokens made up entirely of letters and digits;
      4. drop English stop words;
      5. reduce each remaining token to its stem with the Porter stemmer ``ps``.

    Args:
        text: The message to normalize (a ``str``).

    Returns:
        The surviving stems joined by single spaces; an empty string when no
        token survives the filters.
    """
    # Read the stop-word list once per call and use a set for O(1) lookups.
    # The previous version called stopwords.words('english') for every token,
    # re-reading the corpus and scanning a list each time.
    stop_words = set(stopwords.words('english'))

    tokens = nltk.word_tokenize(text.lower())

    # A token that passes isalnum() contains no punctuation characters, so the
    # former separate check against string.punctuation could never reject
    # anything and is not needed.
    kept = [token for token in tokens if token.isalnum() and token not in stop_words]

    return " ".join(ps.stem(token) for token in kept)

In [None]:
# Quick sanity check of transform_text on a sample sentence.
transform_text('Whoever has money, please donate to me')

In [None]:
# Run every message through transform_text and store the normalized result
# in a new 'transformed_text' column.
df['transformed_text'] = df['text'].map(transform_text)

In [None]:
# Inspect the first rows, now including the 'transformed_text' column.
df.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
# Bag-of-words vectorizer. It is not referenced by any other cell visible here.
cv = CountVectorizer()
# TF-IDF vectorizer whose vocabulary is capped at 3000 features.
tfidf = TfidfVectorizer(max_features = 3000)

In [None]:
# Fit the TF-IDF vectorizer on the 'transformed_text' column and convert the
# result to a dense array of TF-IDF vectors, one row per message.
# NOTE(review): the vectorizer is fit on all rows, before the train/test split
# that happens later in the notebook, so test messages influence the
# vocabulary and IDF weights.
X = tfidf.fit_transform(df['transformed_text']).toarray()
X.shape

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# Standardize every feature column with StandardScaler (centered, scaled to
# unit variance). The values are NOT confined to [0, 1]; that range would be
# produced by MinMaxScaler, which is imported above but not used here.
# NOTE(review): the scaler is fit on the full matrix before the train/test
# split that happens later in the notebook, so test-set statistics leak into
# the training features.
scaler = StandardScaler()
X = scaler.fit_transform(X)
X.shape

In [None]:
# Label vector: the integer-encoded 'target' column as a NumPy array.
y = df['target'].values

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, classification_report, roc_curve, auc, precision_recall_curve
import matplotlib.pyplot as plt
import seaborn as sns
from plotly.subplots import make_subplots

In [None]:
from sklearn.model_selection import KFold, LeaveOneOut
from sklearn.model_selection import cross_val_score
# Hold out 20% of the rows as a test set; random_state=42 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Baseline k-nearest-neighbours model: two neighbours, votes weighted by
# distance. Fit on the training split, then score on the held-out test split.
knn = KNeighborsClassifier(n_neighbors=2, weights='distance').fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

print("accuracy score:", accuracy_score(y_true=y_test, y_pred=y_pred_knn))
print("confusion matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred_knn))
print("precision score:", precision_score(y_true=y_test, y_pred=y_pred_knn))
print("\n", classification_report(y_true=y_test, y_pred=y_pred_knn))

In [None]:
# Search n_neighbors over 1..19 with 5-fold cross-validation on the training
# split, selecting by accuracy. Other k-NN settings come from `knn`.
param_grid_knn = {'n_neighbors': range(1, 20)}
grid_search_knn = GridSearchCV(
    estimator=knn,
    param_grid=param_grid_knn,
    scoring='accuracy',
    cv=5,
)
grid_search_knn.fit(X_train, y_train)

In [None]:
# Show the k-NN configuration that scored best in the grid search.
grid_search_knn.best_estimator_

In [None]:
# Keep a handle on the best k-NN estimator found by the grid search.
# Despite its name, this variable holds the estimator object itself, not the
# selected neighbour count.
knn_best_nneighbors = grid_search_knn.best_estimator_

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Baseline decision tree limited to depth 5, fit on the training split and
# scored on the held-out test split.
# random_state pins the tree's internal randomness (features are permuted at
# every split, so ties can be broken differently between runs). It uses the
# same seed as the train/test split so the printed metrics are reproducible.
dtc = DecisionTreeClassifier(max_depth=5, random_state=42)
dtc.fit(X_train, y_train)
y_pred_dtc = dtc.predict(X_test)
print("accuracy score:", accuracy_score(y_test, y_pred_dtc))
print("confusion matrix:\n",confusion_matrix(y_test, y_pred_dtc))
print("precision score:", precision_score(y_test, y_pred_dtc))
print("\n",classification_report(y_test, y_pred_dtc))

In [None]:
# Search max_depth over 1..29 with 5-fold cross-validation on the training
# split, selecting by accuracy. Other tree settings come from `dtc`.
param_grid_dtc = {'max_depth': range(1, 30)}
grid_search_dtc = GridSearchCV(
    estimator=dtc,
    param_grid=param_grid_dtc,
    scoring='accuracy',
    cv=5,
)
grid_search_dtc.fit(X_train, y_train)

In [None]:
# Show the decision-tree configuration that scored best in the grid search.
grid_search_dtc.best_estimator_

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Baseline random forest with 50 trees, fit on the training split and scored
# on the held-out test split.
# random_state fixes the bootstrap samples and per-split feature subsets. It
# uses the same seed as the train/test split; without it every run of this
# cell builds a different forest and prints different metrics.
rfc = RandomForestClassifier(n_estimators=50, random_state=42)
rfc.fit(X_train, y_train)
y_pred_rfc = rfc.predict(X_test)
print("accuracy score:", accuracy_score(y_test, y_pred_rfc))
print("confusion matrix:\n",confusion_matrix(y_test, y_pred_rfc))
print("precision score:", precision_score(y_test, y_pred_rfc))
print("\n",classification_report(y_test, y_pred_rfc))

In [None]:
# Search n_estimators over 10, 20, ..., 90 with 5-fold cross-validation on the
# training split, selecting by accuracy. Other forest settings come from `rfc`.
param_grid_rfc = {'n_estimators': range(10, 100, 10)}
grid_search_rfc = GridSearchCV(
    estimator=rfc,
    param_grid=param_grid_rfc,
    scoring='accuracy',
    cv=5,
)
grid_search_rfc.fit(X_train, y_train)

In [None]:
# Show the random-forest configuration that scored best in the grid search.
grid_search_rfc.best_estimator_