# Big Data Cup Event #3

In [1]:
import pandas as pd
import numpy as np
import re

# Echo the value of every top-level expression in a cell, not only the last
# one; later cells rely on this to show several results at once.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import sklearn
# Record the library version this notebook was run with.
print(f'The scikit-learn version is {sklearn.__version__}.')

The scikit-learn version is 0.23.1.


In [3]:
# Show the working directory that the relative data paths below resolve against.
# NOTE(review): the saved output exposes a user-specific absolute path;
# consider clearing it before sharing the notebook.
import os
os.getcwd()

'C:\\Users\\rifai\\OneDrive\\Queens University\\MMA\\MMA 865\\Big Data Cup'

# Load Data

In [4]:
# Read the labelled training messages from the working directory.
train_csv = "spamraw_train.csv"
df = pd.read_csv(train_csv)

# Column dtypes / null counts first, then a peek at the first rows.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        5000 non-null   int64 
 1   sms_text  5000 non-null   object
 2   spam      5000 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 117.3+ KB


Unnamed: 0,id,sms_text,spam
0,1,Hope you are having a good week. Just checking in,0
1,2,K..give back my thanks.,0
2,3,Am also doing in cbe only. But have to pay.,0
3,4,"complimentary 4 STAR Ibiza Holiday or £10,000 ...",1
4,5,okmail: Dear Dave this is your final notice to...,1


In [5]:
# Class balance: element 0 counts rows with spam == 0, element 1 counts rows with spam == 1.
np.bincount(df['spam'])

array([4327,  673], dtype=int64)

# Build Pipeline

In [6]:
from sklearn.model_selection import train_test_split

# Features are the raw message text; the target is the 0/1 spam flag.
X, y = df['sms_text'], df['spam']

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
# NOTE(review): the classes are imbalanced (see the bincount above); consider
# passing stratify=y so both splits keep the same spam ratio.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Sanity-check the training features: container type, row count, first rows.
type(X_train)
X_train.shape
X_train.head()

# Same three checks for the training labels.
type(y_train)
y_train.shape
y_train.head()

pandas.core.series.Series

(4000,)

4227    Cashbin.co.uk (Get lots of cash this weekend!)...
4676         Once free call me sir. I am waiting for you.
800     Keep yourself safe for me because I need you a...
3671    You call him and tell now infront of them. Cal...
4193                    I reach home safe n sound liao...
Name: sms_text, dtype: object

pandas.core.series.Series

(4000,)

4227    1
4676    0
800     0
3671    0
4193    0
Name: spam, dtype: int64

In [8]:
def my_preprocess(doc):
    """Simple preprocessor applied to one document before tokenization.

    Parameters
    ----------
    doc : str
        A single document, as a single string.

    Returns
    -------
    str
        The document lowercased, with URL-like tokens removed.
    """
    # Lowercase first so the URL pattern only has to match lowercase text.
    doc = doc.lower()

    # Remove URLs: anything starting with "http", plus bare "www." addresses
    # (common in SMS text), which an "http"-only pattern leaves behind.
    doc = re.sub(r'http\S+|\bwww\.\S+', '', doc)

    # TODO: What else?

    return doc

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.tree import DecisionTreeClassifier

# Bag-of-words counts limited to a 20-term vocabulary; each message is run
# through my_preprocess before it is tokenized.
vectorizer = CountVectorizer(preprocessor=my_preprocess, max_features=20)

# Deliberately shallow tree as a baseline; the seed makes the fit repeatable.
dt = DecisionTreeClassifier(random_state=42, max_depth=2)

# Chain both steps so raw text can be passed straight to fit/predict.
pipe = Pipeline(steps=[('cv', vectorizer), ('clf', dt)])


# Other things to try above:
# - More/Different preprocessing steps
# - Different hyperparameter values for CountVectorizer
# - Different hyperparameter values for DecisionTreeClassifier
# - Different ML algorithms
# - Additional feature engineering (can be added to Pipeline via sklearn.pipeline.FeatureUnion)

In [10]:
# Fit the vectorizer and the tree on the training split only; the validation
# split is kept aside for the performance estimate below.
pipe.fit(X_train, y_train)

Pipeline(steps=[('cv',
                 CountVectorizer(max_features=20,
                                 preprocessor=<function my_preprocess at 0x000001C945574400>)),
                ('clf', DecisionTreeClassifier(max_depth=2, random_state=42))])

# Estimate Model Performance

In [14]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, f1_score

# Predict the held-out validation messages with the fitted pipeline.
pred_val = pipe.predict(X_val)

# Rows are the true labels, columns the predicted labels.
val_confusion = confusion_matrix(y_val, pred_val)
print("Confusion matrix:")
print(val_confusion)

# NOTE(review): with a binary target, average="micro" gives the same number as
# plain accuracy, so it does not reflect the weaker spam-class recall shown in
# the report below. Confirm this matches the competition's metric.
micro_f1 = f1_score(y_val, pred_val, average="micro")
print("\nF1 Score = {:.5f}".format(micro_f1))

# Per-class precision / recall / F1.
val_report = classification_report(y_val, pred_val)
print("\nClassification Report:")
print(val_report)

Confusion matrix:
[[843  27]
 [ 75  55]]

F1 Score = 0.89800

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.97      0.94       870
           1       0.67      0.42      0.52       130

    accuracy                           0.90      1000
   macro avg       0.79      0.70      0.73      1000
weighted avg       0.89      0.90      0.89      1000



# Create Submission File on Kaggle Hold-Out Data

In [12]:
# Load the unlabelled hold-out messages.
# NOTE(review): this path has an 'sms-spam/' prefix while the training CSV is
# read from the working directory -- confirm both locations are intended.
test_df = pd.read_csv('sms-spam/spamraw_test.csv')

# Score the hold-out text with the already-fitted pipeline.
test_text = test_df['sms_text']
pred_test = pipe.predict(test_text)

# Output the predictions to a file to upload to Kaggle: one row per message id.
my_submission = pd.DataFrame({'id': test_df['id'], 'predicted': pred_test})
my_submission.head()
my_submission.to_csv('my_submission.csv', index=False)

Unnamed: 0,id,predicted
0,12000,1
1,12001,0
2,12002,0
3,12003,0
4,12004,0
