In [97]:
import pandas as pd
import re

#import nltk
#nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

df = pd.read_csv('data/spam-classification.csv')[:1000]
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [98]:
df["Spam"] = df["Category"].apply(lambda x: int(x == "spam"))
df.drop(columns=["Category"], inplace=True)
df

Unnamed: 0,Message,Spam
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...
995,"I can't, I don't have her number!",0
996,Change again... It's e one next to escalator...,0
997,Yetunde i'm in class can you not run water on ...,0
998,Not a lot has happened here. Feels very quiet....,0


In [99]:
my_stopwords = stopwords.words("english")
my_stopwords.remove("not")
my_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [100]:
ps = PorterStemmer()  # stemming (alternatywnie można użyć: 'lemmatization')

def preprocess_document(text):
    
    text = text.lower()
    text = re.sub(r"[^A-Za-z1-9]", " ", text)
    text = text.split()
    text = [ps.stem(word) for word in text if word not in my_stopwords]
    text = ' '.join(text)
    
    return text

In [101]:
df["Message"].apply(preprocess_document)

0      go jurong point crazi avail bugi n great world...
1                                  ok lar joke wif u oni
2      free entri 2 wkli comp win fa cup final tkt 21...
3                    u dun say earli hor u c alreadi say
4                   nah think goe usf live around though
                             ...                        
995                                               number
996                               chang e one next escal
997                yetund class not run water make ok pl
998    not lot happen feel quiet beth aunt charli wor...
999    wait 4 bu stop aft ur lect lar dun c go get ca...
Name: Message, Length: 1000, dtype: object

In [102]:
df["Message"] = df["Message"].apply(preprocess_document)
df

Unnamed: 0,Message,Spam
0,go jurong point crazi avail bugi n great world...,0
1,ok lar joke wif u oni,0
2,free entri 2 wkli comp win fa cup final tkt 21...,1
3,u dun say earli hor u c alreadi say,0
4,nah think goe usf live around though,0
...,...,...
995,number,0
996,chang e one next escal,0
997,yetund class not run water make ok pl,0
998,not lot happen feel quiet beth aunt charli wor...,0


In [103]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=2000)
data = cv.fit_transform(df["Message"]).toarray()

In [104]:
cv.get_feature_names_out()[:20]
# len(cv.get_feature_names_out())

array(['11', '114', '11mth', '11pm', '12', '121', '1216', '123',
       '125698789', '1259231', '125gift', '12hr', '1327bt', '14', '1417',
       '145', '1461', '14thmarch', '15', '153'], dtype=object)

In [105]:
print(data[0].sum())
print(data[1].sum())

10
4


In [106]:
X = pd.DataFrame(data, columns=cv.get_feature_names_out())
X

Unnamed: 0,11,114,11mth,11pm,12,121,1216,123,125698789,1259231,...,youi,yourinclus,yourjob,yowif,yoyyooo,yr,yummi,yun,yup,zaher
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [107]:
y = df["Spam"]
y

0      0
1      0
2      1
3      0
4      0
      ..
995    0
996    0
997    0
998    0
999    0
Name: Spam, Length: 1000, dtype: int64

In [108]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [110]:
y.value_counts()  # słabo zbalansowane dane

0    848
1    152
Name: Spam, dtype: int64

## Regresja logistyczna

In [111]:
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression

model_lr = LogisticRegression()
model_lr.fit(X_train, y_train)
y_pred = model_lr.predict(X_test)

accuracy_score(y_test, y_pred)

0.9666666666666667

## Drzewo decyzyjne

In [112]:
from sklearn.tree import DecisionTreeClassifier

model_dt = DecisionTreeClassifier()
model_dt.fit(X_train, y_train)
y_pred = model_dt.predict(X_test)

accuracy_score(y_test, y_pred)

0.9515151515151515

## Naive Bayes

In [113]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

models = [MultinomialNB(), GaussianNB(), BernoulliNB()]
scores = []

for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    score = accuracy_score(y_test, y_pred)
    scores.append(score)
    
scores

[0.9666666666666667, 0.8878787878787879, 0.9515151515151515]

## SVM

In [114]:
from sklearn.svm import SVC

model_svm = SVC()
model_svm.fit(X_train, y_train)
y_pred = model_svm.predict(X_test)

accuracy_score(y_test, y_pred)

0.9424242424242424

## KNN

In [115]:
from sklearn.neighbors import KNeighborsClassifier

model_knn = KNeighborsClassifier()
model_knn.fit(X_train, y_train)
y_pred = model_knn.predict(X_test)

accuracy_score(y_test, y_pred)

0.8727272727272727

## RandomForest

In [118]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier(n_estimators=200)
model_rf.fit(X_train, y_train)
y_pred = model_rf.predict(X_test)

accuracy_score(y_test, y_pred)

0.9575757575757575

In [117]:
n_estimators_list = [150, 200, 250, 300, 350]
scores = []

for n in n_estimators_list:
    
    model_rf = RandomForestClassifier(n_estimators=n)
    model_rf.fit(X_train, y_train)
    y_pred = model_rf.predict(X_test)

    score = accuracy_score(y_test, y_pred)
    scores.append(score)
    
scores

[0.9515151515151515,
 0.9545454545454546,
 0.9515151515151515,
 0.9484848484848485,
 0.9515151515151515]