# Zadanie 1

    a)	Dokonaj klasyfikacji sms-ów ze względu na to, czy są spamem czy nie.
    b)	Jako dane wejścia wykorzystaj wagi TF-IDF
    c)	Dodaj informacje o sentymencie smsa do modelu klasyfikacyjnego
    d)	Dodaj informacę o proporcji czasowników („VERB”), rzeczowników („NOUN”) i przymiotników („ADJ”)
    e)	Wykorzystaj 3 znane Ci algorytmy klasyfikacyjne.

In [7]:
import pandas as pd
import plotly.express as px
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import *

In [8]:
df = pd.read_csv('datasets/SMSSpamCollection.txt', sep='\t', header=None, names=['is_spam', 'text'])
df['is_spam'] = df['is_spam'].map({'ham': 0, 'spam': 1})

print('Liczba wierszy:', len(df))
df.head(10)

Liczba wierszy: 5572


Unnamed: 0,is_spam,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
5,1,FreeMsg Hey there darling it's been 3 week's n...
6,0,Even my brother is not like to speak with me. ...
7,0,As per your request 'Melle Melle (Oru Minnamin...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...


# Wizualizacja liczności obu klas

In [9]:
fig = px.bar(df['is_spam'].value_counts(),hover_data=[df['is_spam'].value_counts().index],width=500,height = 300)
fig.update_traces(marker_color='rgb(158,202,225)', marker_line_color='rgb(8,48,107)', marker_line_width=1.5, opacity=0.6)
fig.update_layout(title_text='Ham vs Spam', xaxis_title='Czy spam?', yaxis_title='Count')
fig.show()

# Oczyszczanie danych i wyznaczenie liczby czasowników (verbs), rzeczowników (nouns) i przymiotników (adjectives) dla każdego wiersza

In [10]:
def clean_text(x):
    x = x.lower()
    x = x.replace('.','')
    x = x.replace(',','')
    x = x.replace(':','')
    x = x.replace(';','')
    x = x.replace('!','')
    x = x.replace('?','')
    x = x.replace('<br>','')
    x = x.replace('<br />','')
    x = ' '.join([word for word in x.split() if word.isalpha()])
    return x

df['text'] = df['text'].apply(clean_text)
df['text'].head(10)

0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in a wkly comp to win fa cup final ...
3          u dun say so early hor u c already then say
4    nah i think he goes to usf he lives around her...
5    freemsg hey there darling been now and no word...
6    even my brother is not like to speak with me t...
7    as per your request melle minnaminunginte nuru...
8    winner as a valued network customer you have b...
9    had your mobile months or more u r entitled to...
Name: text, dtype: object

In [11]:
nlp = spacy.load('en_core_web_trf')

verbs = []
nouns = []
adjs = []

for doc in df['text']:
        transformed_doc = nlp(doc)
        pos = [token.pos_ for token in transformed_doc]
        all_pos = pos.count('VERB') + pos.count('NOUN') + pos.count('ADJ')
        if all_pos > 0: # unikamy dzielenia przez 0
            verbs.append(pos.count('VERB') / all_pos)
            nouns.append(pos.count('NOUN') / all_pos)
            adjs.append(pos.count('ADJ') / all_pos)
        else:
            verbs.append(0)
            nouns.append(0)
            adjs.append(0)


User provided device_type of 'cuda', but CUDA is not available. Disabling



In [12]:
pd.DataFrame({'verbs': verbs, 'nouns': nouns, 'adjs': adjs}).head()

Unnamed: 0,verbs,nouns,adjs
0,0.285714,0.285714,0.428571
1,1.0,0.0,0.0
2,0.363636,0.545455,0.090909
3,1.0,0.0,0.0
4,1.0,0.0,0.0


# Tokenizacja, określenie wag TF-IDF, dodanie 3 kolumn (z proporcjami czasowników, rzeczowników i przymiotników) i podział na zbiory treningowe i testowe

In [13]:
tfidf = TfidfVectorizer(min_df=5)

X = tfidf.fit_transform(df['text'])
X = pd.DataFrame(X.todense(), columns=tfidf.get_feature_names_out())
pos_df = pd.DataFrame({'VERBS': verbs, 'NOUNS' : nouns, 'ADJS' : adjs})
X = pd.concat([X, pos_df], axis=1)

y = df['is_spam']

X.head()

Unnamed: 0,abiola,able,about,abt,accept,access,account,across,activate,actually,...,yours,yourself,yr,yrs,yun,yup,zed,VERBS,NOUNS,ADJS
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285714,0.285714,0.428571
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.363636,0.545455,0.090909
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

# Regresja logistyczna
- Problem w tym zadaniu jest binarnym problemem klasyfikacyjnym - musimy tekst zaklasyfikować jako spam lub jako ham, mając daną wiadomość SMS.
- Najbardziej będzie nas interesowała wartość parametru `positive sentiment recall`, czyli True Positive Rate - stosunek wiadomości poprawnie zaklasyfikowanych jako spam (TP) do wszystkich, które powinien rozpoznać (TP + FN)

In [15]:
lr = LogisticRegression()
lr.fit(X, y)
y_pred = lr.predict(X_test)

print("Classification accuracy:", accuracy_score(y_test, y_pred))
print("Positive sentiment precision:", precision_score(y_test, y_pred))
print("Positive sentiment recall:", recall_score(y_test, y_pred))
print("Negative sentiment precision:", precision_score(y_test, y_pred,pos_label = 0))
print("Negative sentiment recall:", recall_score(y_test, y_pred, pos_label = 0))

Classification accuracy: 0.9829596412556054
Positive sentiment precision: 0.9927007299270073
Positive sentiment recall: 0.8831168831168831
Negative sentiment precision: 0.9815950920245399
Negative sentiment recall: 0.9989594172736732


In [16]:
weights = list(zip(tfidf.get_feature_names_out(),lr.coef_[0]))
weights.sort(key = lambda x:x[1])
print('10 słów o najbardziej negatywnym sentymencie (nie wskazującym na spam):\n', pd.Series(weights[:10]))
weights.sort(key = lambda x:x[1],reverse = True)
print('10 słów o najbardziej pozytywnym sentymencie (wskazującym na spam):\n', pd.Series(weights[:10]))

10 słów o najbardziej negatywnym sentymencie (nie wskazującym na spam):
 0      (my, -2.4996310352531363)
1      (me, -1.9528249401885591)
2    (later, -1.589708224212615)
3      (ok, -1.4379814930698624)
4    (that, -1.4115907576371531)
5       (am, -1.342572069328012)
6    (home, -1.2874974090977975)
7    (come, -1.2369095516991508)
8       (but, -1.20048860084272)
9     (way, -1.1826803113689837)
dtype: object
10 słów o najbardziej pozytywnym sentymencie (wskazującym na spam):
 0       (call, 5.007161820390904)
1       (txt, 4.8473818813711915)
2         (to, 3.893759675938226)
3       (text, 3.842222430912415)
4      (stop, 3.8108452792688516)
5      (free, 3.3315291424540394)
6     (claim, 3.2331154042773536)
7     (reply, 3.1440241585448883)
8    (mobile, 2.9901685986186104)
9       (who, 2.8334387988121446)
dtype: object


# Naiwny klasyfikator Bayesa

In [17]:
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
y_pred = mnb.predict(X_test)

print("Classification accuracy:", accuracy_score(y_test, y_pred))
print("Positive sentiment precision:", precision_score(y_test, y_pred))
print("Positive sentiment recall:", recall_score(y_test, y_pred))
print("Negative sentiment precision:", precision_score(y_test, y_pred,pos_label = 0))
print("Negative sentiment recall:", recall_score(y_test, y_pred, pos_label = 0))

Classification accuracy: 0.967713004484305
Positive sentiment precision: 0.9916666666666667
Positive sentiment recall: 0.7727272727272727
Negative sentiment precision: 0.964824120603015
Negative sentiment recall: 0.9989594172736732


# K-Najbliższych Sąsiadów - KNN
Algorytm K-Najbliższych Sąsiadów nie działa najlepiej dla bardzo dużych zbiorów danych zawierających dużą ilość cech.
Możemy zauważyć, że dokładność modelu dla K-Najbliższych Sąsiadów jest niższa, niż w przypadku modelu logistycznego, czy modelu Bayesa.

In [19]:
knn = KNeighborsClassifier(algorithm = 'brute', n_jobs=-1)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print("Classification accuracy:", accuracy_score(y_test, y_pred))
print("Positive sentiment precision:", precision_score(y_test, y_pred))
print("Positive sentiment recall:", recall_score(y_test, y_pred))
print("Negative sentiment precision:", precision_score(y_test, y_pred,pos_label = 0))
print("Negative sentiment recall:", recall_score(y_test, y_pred, pos_label = 0))

Classification accuracy: 0.9381165919282511
Positive sentiment precision: 1.0
Positive sentiment recall: 0.551948051948052
Negative sentiment precision: 0.933009708737864
Negative sentiment recall: 1.0


# Drzewo Decyzyjne

In [20]:
dtc = DecisionTreeClassifier(max_depth=5) # Przy większej głębokości drzewa pojawia się overfitting
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)

print("Classification accuracy:", accuracy_score(y_test, y_pred))
print("Positive sentiment precision:", precision_score(y_test, y_pred))
print("Positive sentiment recall:", recall_score(y_test, y_pred))
print("Negative sentiment precision:", precision_score(y_test, y_pred,pos_label = 0))
print("Negative sentiment recall:", recall_score(y_test, y_pred, pos_label = 0))

Classification accuracy: 0.947085201793722
Positive sentiment precision: 0.8625954198473282
Positive sentiment recall: 0.7337662337662337
Negative sentiment precision: 0.9583333333333334
Negative sentiment recall: 0.9812695109261186


# Maszyna Wektorów Nośnych (Support Vector Machine - SVM)

In [21]:
svm = LinearSVC(C=1.5)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

print("Classification accuracy:", accuracy_score(y_test, y_pred))
print("Positive sentiment precision:", precision_score(y_test, y_pred))
print("Positive sentiment recall:", recall_score(y_test, y_pred))
print("Negative sentiment precision:", precision_score(y_test, y_pred,pos_label = 0))
print("Negative sentiment recall:", recall_score(y_test, y_pred, pos_label = 0))

Classification accuracy: 0.9820627802690582
Positive sentiment precision: 0.9785714285714285
Positive sentiment recall: 0.8896103896103896
Negative sentiment precision: 0.9825641025641025
Negative sentiment recall: 0.9968782518210197


Dodanie kolumn zawierających stosunek czasowników, rzeczowników i przymiotników pozwoliło nieznacznie zwiększyć dokładność klasyfikacji (wraz ze zwiększeniem True Positive Rate)