In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('spam.csv', encoding = "ISO-8859-1")

In [3]:
df.sample(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
645,ham,"Watching cartoon, listening music &amp; at eve...",,,
3023,ham,I love ya too but try and budget your money be...,,,
1124,ham,Aiyar sorry lor forgot 2 tell u...,,,
3320,ham,Yo im right by yo work,,,
2395,ham,"Babe, I'm back ... Come back to me ...",,,


In [4]:
df.shape

(5572, 5)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [6]:
df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace=True)

In [7]:
df.rename(columns={'v1':'target','v2':'text'},inplace=True)
df.sample(5)

Unnamed: 0,target,text
428,ham,7 at esplanade.. Do Ì_ mind giving me a lift c...
2652,ham,No need for the drug anymore.
841,ham,Pls send me a comprehensive mail about who i'm...
3552,ham,Din i tell u jus now 420
1802,ham,Ok lor thanx... ÌÏ in school?


In [8]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df['target'] = encoder.fit_transform(df['target'])

In [9]:
df = df.drop_duplicates(keep='first')

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5169 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   target  5169 non-null   int32 
 1   text    5169 non-null   object
dtypes: int32(1), object(1)
memory usage: 101.0+ KB


In [11]:
import nltk
print(nltk.__version__)
nltk.download('punkt')
nltk.download('stopwords')

3.8.1


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
from nltk.corpus import stopwords
import string
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [13]:
def transform_text(text): 
    text = text.lower() # chuyển thành chữ thường
    text = nltk.word_tokenize(text) # chia text thành mảng các từ riêng lẻ
    
    y = []
    for i in text:
        if i.isalnum(): # kiểm tra từ chỉ bao gồm chữ và số
            y.append(i)
    
    text = y[:] # copy y to text
    y.clear()
    
    for i in text:
        if i not in stopwords.words('english') and i not in string.punctuation: # kiểm tra xem từ không phải là từ dừng và dấu câu
            y.append(i)
            
    text = y[:]
    y.clear()
    for i in text:
        y.append(ps.stem(i)) # cắt giảm từ trở về dạng gốc
            
    return " ".join(y)

In [14]:
transform_text('Whoever has money, please donate to me')

'whoever money pleas donat'

In [15]:
df['transformed_text'] = df['text'].apply(transform_text)

In [16]:
df.head()

Unnamed: 0,target,text,transformed_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though


In [17]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
cv = CountVectorizer()   #Vector hóa văn bản
tfidf = TfidfVectorizer(max_features = 3000) # chuyển đổi văn bản thành vectơ TF-IDF giữ lại 3000 đặc trưng

In [18]:
X = tfidf.fit_transform(df['transformed_text']).toarray() # chuyển đổi cột 'transformed_text' thành ma trận các vectơ TF-IDF
X.shape

(5169, 3000)

In [19]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(X) # chuẩn hóa ma trận về khoảng giá trị [0, 1]
X.shape

(5169, 3000)

In [20]:
y = df['target'].values

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, classification_report, roc_curve, auc, precision_recall_curve
import matplotlib.pyplot as plt
import seaborn as sns
from plotly.subplots import make_subplots

# Bootstrap Sampling

In [22]:
from sklearn.utils import resample
from sklearn.model_selection import cross_val_score

In [23]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=2)

n_samples = len(X)  # Number of samples in the dataset
n_bootstraps = 100  # Number of bootstrap samples to create
bootstrap_scores = []

for _ in range(n_bootstraps):
    # Create a bootstrap sample with replacement
    X_boot, y_boot = resample(X, y, replace=True, random_state=42)

    # Create and fit a classifier on the bootstrap sample
    clf = KNeighborsClassifier(n_neighbors=2)
    clf.fit(X_boot, y_boot)

    # Evaluate performance on the original dataset (or a held-out test set)
    score = clf.score(X, y)  # Using accuracy as metric
    bootstrap_scores.append(score)

In [24]:
print("Bootstrap scores:", bootstrap_scores)
print("Mean accuracy:", np.mean(bootstrap_scores))
print("Standard deviation:", np.std(bootstrap_scores))

Bootstrap scores: [0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0.9495066744051074, 0

In [25]:
from sklearn.tree import DecisionTreeClassifier
n_samples = len(X)  # Number of samples in the dataset
n_bootstraps = 100  # Number of bootstrap samples to create
bootstrap_scores = []

for _ in range(n_bootstraps):
    # Create a bootstrap sample with replacement
    X_boot, y_boot = resample(X, y, replace=True, random_state=42)

    # Create and fit a classifier on the bootstrap sample
    clf = DecisionTreeClassifier(max_depth=20)
    clf.fit(X_boot, y_boot)

    # Evaluate performance on the original dataset (or a held-out test set)
    score = clf.score(X, y)  # Using accuracy as metric
    bootstrap_scores.append(score)

In [26]:
print("Bootstrap scores:", bootstrap_scores)
print("Mean accuracy:", np.mean(bootstrap_scores))
print("Standard deviation:", np.std(bootstrap_scores))

Bootstrap scores: [0.9740762236409364, 0.9727219965177016, 0.9736893016057264, 0.9738827626233314, 0.9733023795705166, 0.9733023795705166, 0.9727219965177016, 0.9744631456761462, 0.9746566066937512, 0.9738827626233314, 0.9742696846585414, 0.9723350744824918, 0.975236989746566, 0.9744631456761462, 0.9733023795705166, 0.9729154575353066, 0.9734958405881214, 0.9740762236409364, 0.9736893016057264, 0.9731089185529116, 0.9742696846585414, 0.9748500677113562, 0.9734958405881214, 0.9725285355000968, 0.9742696846585414, 0.9733023795705166, 0.9748500677113562, 0.975236989746566, 0.9736893016057264, 0.9738827626233314, 0.9736893016057264, 0.9742696846585414, 0.9748500677113562, 0.9746566066937512, 0.9736893016057264, 0.9734958405881214, 0.9725285355000968, 0.9736893016057264, 0.9744631456761462, 0.9744631456761462, 0.9746566066937512, 0.9729154575353066, 0.9748500677113562, 0.9731089185529116, 0.975430450764171, 0.9734958405881214, 0.9736893016057264, 0.9727219965177016, 0.9740762236409364, 0.97

In [27]:
from sklearn.ensemble import RandomForestClassifier
n_samples = len(X)  # Number of samples in the dataset
n_bootstraps = 100  # Number of bootstrap samples to create
bootstrap_scores = []

for _ in range(n_bootstraps):
    # Create a bootstrap sample with replacement
    X_boot, y_boot = resample(X, y, replace=True, random_state=42)

    # Create and fit a classifier on the bootstrap sample
    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(X_boot, y_boot)

    # Evaluate performance on the original dataset (or a held-out test set)
    score = clf.score(X, y)  # Using accuracy as metric
    bootstrap_scores.append(score)

In [28]:
print("Bootstrap scores:", bootstrap_scores)
print("Mean accuracy:", np.mean(bootstrap_scores))
print("Standard deviation:", np.std(bootstrap_scores))

Bootstrap scores: [0.9885857999613078, 0.9903269491197524, 0.9889727219965178, 0.9885857999613078, 0.9889727219965178, 0.9901334881021474, 0.9895531050493326, 0.9897465660669376, 0.9895531050493326, 0.9895531050493326, 0.9895531050493326, 0.9895531050493326, 0.9893596440317276, 0.9893596440317276, 0.9895531050493326, 0.9889727219965178, 0.9899400270845424, 0.9893596440317276, 0.9899400270845424, 0.9897465660669376, 0.9899400270845424, 0.9899400270845424, 0.9893596440317276, 0.9899400270845424, 0.9899400270845424, 0.9895531050493326, 0.9899400270845424, 0.9899400270845424, 0.9889727219965178, 0.9889727219965178, 0.9887792609789128, 0.9891661830141226, 0.9891661830141226, 0.9889727219965178, 0.9899400270845424, 0.9897465660669376, 0.9889727219965178, 0.9889727219965178, 0.9903269491197524, 0.9883923389437028, 0.9891661830141226, 0.9891661830141226, 0.9897465660669376, 0.9891661830141226, 0.9895531050493326, 0.9897465660669376, 0.9893596440317276, 0.9899400270845424, 0.9887792609789128, 0

In [32]:
from sklearn.svm import SVC
svc = SVC(kernel='sigmoid')
scores = cross_val_score(svc, X, y, cv=kfold, scoring='accuracy')

In [33]:
print(scores)
print("Mean accuracy:", scores.mean())

[0.98065764 0.9787234  0.97678917 0.96905222 0.98065764 0.98646035
 0.98839458 0.97485493 0.97485493 0.98449612]
Mean accuracy: 0.979494099830567
