## Libraries

In [17]:
import pandas as pd
import numpy as np

import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import *
nltk.download('punkt')
nltk.download('stopwords')
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data analysis

In [2]:
df_train = pd.read_csv("/content/train_spam.csv")
df_test = pd.read_csv("/content/test_spam.csv")

In [3]:
df_train.head()

Unnamed: 0,text_type,text
0,ham,make sure alex knows his birthday is over in f...
1,ham,a resume for john lavorato thanks vince i will...
2,spam,plzz visit my website moviesgodml to get all m...
3,spam,urgent your mobile number has been awarded wit...
4,ham,overview of hr associates analyst project per ...


In [5]:
df_train.describe()

Unnamed: 0,text_type,text
count,16278,16278
unique,2,16267
top,ham,SPAM ALERT 🚔 User: Username: @DillyBubbl...
freq,11469,7


## Label encoding

In [8]:
df_train["text_type"] = df_train["text_type"].replace("spam", 1)
df_train["text_type"] = df_train["text_type"].replace("ham", 0)

In [9]:
df_train.head()

Unnamed: 0,text_type,text
0,0,make sure alex knows his birthday is over in f...
1,0,a resume for john lavorato thanks vince i will...
2,1,plzz visit my website moviesgodml to get all m...
3,1,urgent your mobile number has been awarded wit...
4,0,overview of hr associates analyst project per ...


## Preprocessing

Transform the text converting to lowercase, tokenizing, removing stop words and punctuation.

In [10]:
def transform_text(text):
    text = text.lower()
    text = nltk.word_tokenize(text)
    filtered_words = []

    for word in text:
        if word.isalnum():
            filtered_words.append(word)
    text = filtered_words[:]
    filtered_words.clear()

    for word in text:
        if word not in stopwords.words('english') and word not in string.punctuation:
            filtered_words.append(word)

    text = filtered_words[:]
    filtered_words.clear()
    stemmer = PorterStemmer()
    for word in text:
        stemmed_word = stemmer.stem(word)
        filtered_words.append(stemmed_word)
    transformed_text = " ".join(filtered_words)

    return transformed_text

In [15]:
df_train['text'] = df_train['text'].apply(transform_text)
df_test['text'] = df_test['text'].apply(transform_text)

In [19]:
df_train.head()

Unnamed: 0,text_type,text
0,0,make sure alex know birthday fifteen minut far...
1,0,resum john lavorato thank vinc get move right ...
2,1,plzz visit websit moviesgodml get movi free al...
3,1,urgent mobil number award prize guarante call ...
4,0,overview hr associ analyst project per david r...


### Train / test split

In [20]:
X = df_train['text']
y = df_train['text_type']

In [21]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
X_test = df_test['text']

## Model

In [35]:
import xgboost as xgb

classifier = Pipeline([
    ('countvec', CountVectorizer()),
    ('tfid', TfidfTransformer()),
    ('model', xgb.XGBClassifier())
])

In [36]:
classifier.fit(X_train, y_train)
y_pred_val = classifier.predict(X_val)
y_pred_train = classifier.predict(X_train)

## Report

In [37]:
print("Train report \n", classification_report(y_train, y_pred_train))

Train report 
               precision    recall  f1-score   support

           0       0.95      0.99      0.97      9148
           1       0.99      0.87      0.92      3874

    accuracy                           0.96     13022
   macro avg       0.97      0.93      0.95     13022
weighted avg       0.96      0.96      0.96     13022



In [38]:
print("Validation report \n", classification_report(y_val, y_pred_val))

Validation report 
               precision    recall  f1-score   support

           0       0.92      0.98      0.95      2321
           1       0.94      0.80      0.86       935

    accuracy                           0.93      3256
   macro avg       0.93      0.89      0.91      3256
weighted avg       0.93      0.93      0.92      3256



In [39]:
print("Train roc_auc_score", format(roc_auc_score(y_train, y_pred_train), ".2f"))
print("Validation roc_auc_score", format(roc_auc_score(y_val, y_pred_val), ".2f"))

Train roc_auc_score 0.93
Validation roc_auc_score 0.89


## Test predict

In [40]:
y_pred_test = classifier.predict(X_test)

In [42]:
df_test = pd.read_csv("/content/test_spam.csv")

In [43]:
df_test['score'] = y_pred_test

In [47]:
df_test["score"] = df_test["score"].replace(1, "spam")
df_test["score"] = df_test["score"].replace(0, "ham")

In [49]:
df_test.to_csv('test_spam_scored', index=False)