In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from utils import prepropcess_data, get_embeddings_tfidf, get_embeddings_gzip
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ataka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ataka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ataka\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Загрузка и препроцессинг данных

In [2]:
df = pd.read_csv('train_spam.csv')

In [3]:
df_preproc = prepropcess_data(df)
# df_emb = get_embeddings_tfidf(df_preproc)
# df_emb = get_embeddings_gzip(df_emb)

In [4]:
df_preproc

Unnamed: 0,text_type,text,text_tokenize,text_no_stop_words,text_lemmatized,final_text
0,0,make sure alex knows his birthday is over in f...,"[make, sure, alex, knows, his, birthday, is, o...","[make, sure, alex, knows, birthday, fifteen, m...","[make, sure, alex, know, birthday, fifteen, mi...",make sure alex know birthday fifteen minute fa...
1,0,a resume for john lavorato thanks vince i will...,"[a, resume, for, john, lavorato, thanks, vince...","[resume, john, lavorato, thanks, vince, get, m...","[resume, john, lavorato, thanks, vince, get, m...",resume john lavorato thanks vince get move rig...
2,1,plzz visit my website moviesgodml to get all m...,"[plzz, visit, my, website, moviesgodml, to, ge...","[plzz, visit, website, moviesgodml, get, movie...","[plzz, visit, website, moviesgodml, get, movie...",plzz visit website moviesgodml get movie free ...
3,1,urgent your mobile number has been awarded wit...,"[urgent, your, mobile, number, has, been, awar...","[urgent, mobile, number, awarded, prize, guara...","[urgent, mobile, number, award, prize, guarant...",urgent mobile number award prize guaranteed ca...
4,0,overview of hr associates analyst project per ...,"[overview, of, hr, associates, analyst, projec...","[overview, hr, associates, analyst, project, p...","[overview, hr, associate, analyst, project, pe...",overview hr associate analyst project per davi...
...,...,...,...,...,...,...
16273,1,if you are interested in binary options tradin...,"[if, you, are, interested, in, binary, options...","[interested, binary, options, trading, may, co...","[interested, binary, option, trading, may, con...",interested binary option trading may continue ...
16274,1,dirty pictureblyk on aircel thanks you for bei...,"[dirty, pictureblyk, on, aircel, thanks, you, ...","[dirty, pictureblyk, aircel, thanks, valued, m...","[dirty, pictureblyk, aircel, thanks, value, me...",dirty pictureblyk aircel thanks value member h...
16275,0,or you could do this g on mon 1635465 sep 1635...,"[or, you, could, do, this, g, on, mon, sep, da...","[could, g, mon, sep, david, rees, wrote, mon, ...","[could, g, mon, sep, david, rees, write, mon, ...",could g mon sep david rees write mon sep rob w...
16276,0,insta reels par 80 गंद bhara pada hai 👀 kuch b...,"[insta, reels, par, bhara, pada, hai, kuch, bh...","[insta, reels, par, bhara, pada, hai, kuch, bh...","[insta, reel, par, bhara, pada, hai, kuch, bhi...",insta reel par bhara pada hai kuch bhi dalte c...


In [5]:
# X_tfidf = df_preproc['text_tfidf'].values
# y_tfidf = df_preproc['text_type'].values


# X_gzip = df_preproc['text_gzip'].values
# y_gzip = df_preproc['text_type'].values

# X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf, y_tfidf, test_size=0.2)
# X_train_gzip, X_test_gzip, y_train_gzip, y_test_gzip = train_test_split(X_gzip, y_gzip, test_size=0.2)

X_train = df_preproc['final_text'].values
y_train = df_preproc['text_type'].values

X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_train, y_train, test_size=0.2)


# Обучение данных и поиск лучшей модели

## TF-IDF

### Подбор параметров

In [34]:
tfidf = TfidfVectorizer()


svm = SVC()
param_grid_svm = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tfidf__max_df': [0.5, 0.75, 1.0],
    'svm__C': [0.1, 1, 10, 100, 1000],
    'svm__kernel': ['linear', 'rbf', 'poly'],
    'svm__gamma': ['scale', 'auto']
}

lr = LogisticRegression()
param_grid_lr = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tfidf__max_df': [0.5, 0.75, 1.0],
    'lr__C': [0.1, 1, 10, 100, 1000, 5000, 10000, 100000],
    'lr__penalty': ['l1', 'l2', 'elasticnet']
}

nb = MultinomialNB()
param_grid_nb = {
    'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tfidf__max_df': [0.5, 0.75, 1.0],
    'nb__alpha': [0.1, 1, 10],
    'nb__fit_prior': [True, False]
}

pipeline_svm = Pipeline([
    ('tfidf', tfidf),
    ('svm', svm)
])
pipeline_lr = Pipeline([
    ('tfidf', tfidf),
    ('lr', lr)
])
pipeline_nb = Pipeline([
('tfidf', tfidf),
    ('nb', nb)
])

grid_search_svm = GridSearchCV(pipeline_svm, param_grid_svm, cv=5, scoring='f1_macro')
grid_search_lr = GridSearchCV(pipeline_lr, param_grid_lr, cv=5, scoring='f1_macro')
grid_search_nb = GridSearchCV(pipeline_nb, param_grid_nb, cv=5, scoring='f1_macro')

random_search_svm = RandomizedSearchCV(pipeline_svm, param_grid_svm, cv=5, scoring='f1_macro', n_iter=10)
random_search_lr = RandomizedSearchCV(pipeline_lr, param_grid_lr, cv=5, scoring='f1_macro', n_iter=10)
random_search_nb = RandomizedSearchCV(pipeline_nb, param_grid_nb, cv=5, scoring='f1_macro', n_iter=10)

In [29]:
grid_search_svm.fit(X_train_tfidf[:1000], y_train_tfidf[:1000])

# Print the best hyperparameters for the SVM model
print("Best hyperparameters for SVM model:")
print(grid_search_svm.best_params_)

Best hyperparameters for SVM model:
{'svm__C': 100, 'svm__gamma': 'scale', 'svm__kernel': 'linear', 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2)}


Best hyperparameters for SVM model:    
{'svm__C': 10, 'svm__gamma': 'scale', 'svm__kernel': 'linear', 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2)}

In [35]:
grid_search_lr.fit(X_train_tfidf[:1000], y_train_tfidf[:1000])

# Print the best hyperparameters for the SVM model
print("Best hyperparameters for LR model:")
print(grid_search_lr.best_params_)

720 fits failed out of a total of 1080.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
360 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\ataka\anaconda3\envs\ml\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\ataka\anaconda3\envs\ml\lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\ataka\anaconda3\envs\ml\lib\site-packages\sklearn\pipeline.py", line 427, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "c:\Users\ataka\anaconda3\envs\ml\lib\site-packages\sklearn\base.py", line 1152, in wrapp

Best hyperparameters for LR model:
{'lr__C': 100000, 'lr__penalty': 'l2', 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2)}


Best hyperparameters for LR model:
{'lr__C': 10, 'lr__penalty': 'l2', 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 1)}

In [26]:
grid_search_nb.fit(X_train_tfidf[:5000], y_train_tfidf[:5000])

# Print the best hyperparameters for the SVM model
print("Best hyperparameters for NB model:")
print(grid_search_nb.best_params_)

Best hyperparameters for NB model:
{'nb__alpha': 0.1, 'nb__fit_prior': False, 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2)}


Best hyperparameters for NB model:
{'nb__alpha': 0.1, 'nb__fit_prior': False, 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2)}

### Обучение

SVC

In [6]:
# {'svm__C': 10, 'svm__gamma': 'scale', 'svm__kernel': 'linear', 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2)}
tfidf = TfidfVectorizer(max_df=0.5, ngram_range=(1, 2))


svm = SVC(C=100, gamma='scale', kernel='linear')

pipeline_svm = Pipeline([
    ('tfidf', tfidf),
    ('svm', svm)
])

pipeline_svm.fit(X_train_tfidf, y_train_tfidf)


In [7]:
y_pred_svm = pipeline_svm.predict(X_test_tfidf)
print("Grid search SVM model evaluation:") 
print("Accuracy:", accuracy_score(y_test_tfidf, y_pred_svm))
print("Precision:", precision_score(y_test_tfidf, y_pred_svm))
print("Recall:", recall_score(y_test_tfidf, y_pred_svm))
print("F1 score:", f1_score(y_test_tfidf, y_pred_svm))

Grid search SVM model evaluation:
Accuracy: 0.9404176904176904
Precision: 0.8910191725529768
Recall: 0.9112487100103199
F1 score: 0.9010204081632653


In [8]:
from sklearn.metrics import roc_auc_score
roc_auc = roc_auc_score(y_test_tfidf, y_pred_svm)

print(f"ROC AUC score: {roc_auc:.4f}")

ROC AUC score: 0.9320


LR

In [9]:
# {'lr__C': 100000, 'lr__penalty': 'l2', 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2)}
tfidf = TfidfVectorizer(max_df=0.5, ngram_range=(1, 1))


lr = LogisticRegression(C=100000, penalty='l2')

pipeline_lr = Pipeline([
    ('tfidf', tfidf),
    ('lr', lr)
])

pipeline_lr.fit(X_train_tfidf, y_train_tfidf)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
y_pred_lr = pipeline_lr.predict(X_test_tfidf)
print("Grid search LR model evaluation:") 
print("Accuracy:", accuracy_score(y_test_tfidf, y_pred_lr))
print("Precision:", precision_score(y_test_tfidf, y_pred_lr))
print("Recall:", recall_score(y_test_tfidf, y_pred_lr))
print("F1 score:", f1_score(y_test_tfidf, y_pred_lr))

Grid search LR model evaluation:
Accuracy: 0.9161547911547911
Precision: 0.8486973947895792
Recall: 0.8740970072239422
F1 score: 0.8612099644128114


In [11]:
from sklearn.metrics import roc_auc_score
roc_auc = roc_auc_score(y_test_tfidf, y_pred_lr)

print(f"ROC AUC score: {roc_auc:.4f}")

ROC AUC score: 0.9040


NB

In [12]:
# {'nb__alpha': 0.1, 'nb__fit_prior': False, 'tfidf__max_df': 0.5, 'tfidf__ngram_range': (1, 2)}
tfidf = TfidfVectorizer(max_df=0.5, ngram_range=(1, 2))


nb = MultinomialNB(alpha=0.1, fit_prior=False)

pipeline_nb = Pipeline([
    ('tfidf', tfidf),
    ('svm', nb)
])

pipeline_nb.fit(X_train_tfidf, y_train_tfidf)

In [13]:
y_pred_nb = pipeline_nb.predict(X_test_tfidf)
print("Grid search LR model evaluation:") 
print("Accuracy:", accuracy_score(y_test_tfidf, y_pred_nb))
print("Precision:", precision_score(y_test_tfidf, y_pred_nb))
print("Recall:", recall_score(y_test_tfidf, y_pred_nb))
print("F1 score:", f1_score(y_test_tfidf, y_pred_nb))

Grid search LR model evaluation:
Accuracy: 0.9299754299754299
Precision: 0.8614634146341463
Recall: 0.9112487100103199
F1 score: 0.8856569709127382


In [14]:
from sklearn.metrics import roc_auc_score
roc_auc = roc_auc_score(y_test_tfidf, y_pred_nb)

print(f"ROC AUC score: {roc_auc:.4f}")

ROC AUC score: 0.9246


### Предсказания на тестовых данных

In [15]:
test_df = pd.read_csv('test_spam.csv')

In [16]:
test_df

Unnamed: 0,text
0,j jim whitehead ejw cse ucsc edu writes j you ...
1,original message from bitbitch magnesium net p...
2,java for managers vince durasoft who just taug...
3,there is a youtuber name saiman says
4,underpriced issue with high return on equity t...
...,...
4065,husband to wifetum meri zindagi hoorwifeor kya...
4066,baylor enron case study cindy yes i shall co a...
4067,boring as compared to tp
4068,hellogorgeous hows u my fone was on charge lst...


In [17]:
test_df_preproc = prepropcess_data(test_df)


In [18]:
test_df_X_train = test_df_preproc['text'].values

In [19]:
predict_test_svm = pipeline_svm.predict(test_df_X_train)
predict_test_lr = pipeline_lr.predict(test_df_X_train)
predict_test_nb = pipeline_nb.predict(test_df_X_train)

In [45]:
result_df = pd.DataFrame()

In [46]:
result_df['score'] = predict_test_svm
# result_df['LR'] = predict_test_lr
# result_df['NB'] = predict_test_nb

result_df['text'] = test_df['text']


In [47]:
result_df

Unnamed: 0,score,text
0,0,j jim whitehead ejw cse ucsc edu writes j you ...
1,0,original message from bitbitch magnesium net p...
2,0,java for managers vince durasoft who just taug...
3,0,there is a youtuber name saiman says
4,1,underpriced issue with high return on equity t...
...,...,...
4065,0,husband to wifetum meri zindagi hoorwifeor kya...
4066,0,baylor enron case study cindy yes i shall co a...
4067,0,boring as compared to tp
4068,0,hellogorgeous hows u my fone was on charge lst...


In [48]:

result_df['score'] = result_df['score'].map({0: 'ham', 1: 'spam'})

In [49]:
result_df

Unnamed: 0,score,text
0,ham,j jim whitehead ejw cse ucsc edu writes j you ...
1,ham,original message from bitbitch magnesium net p...
2,ham,java for managers vince durasoft who just taug...
3,ham,there is a youtuber name saiman says
4,spam,underpriced issue with high return on equity t...
...,...,...
4065,ham,husband to wifetum meri zindagi hoorwifeor kya...
4066,ham,baylor enron case study cindy yes i shall co a...
4067,ham,boring as compared to tp
4068,ham,hellogorgeous hows u my fone was on charge lst...


In [50]:
result_df.to_csv('result.csv', index=False)
