# Import
---

In [3]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from scipy.sparse import csr_matrix

# import matplotlib.pyplot as plt
import joblib

# Data
---

In [4]:
# this URL doesnt work directly with pd.read_csv
# !wget https://lazyprogrammer.me/course_files/spam.csv

In [5]:
df = pd.read_csv('spam.csv', encoding='ISO-8859-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
df.columns=['labels', 'data']
df.head()

Unnamed: 0,labels,data
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# create binary labels
df['b_labels'] = df['labels'].map({'ham': 0, 'spam': 1})
y = df['b_labels'].values
X = df['data']

In [9]:
df

Unnamed: 0,labels,data,b_labels
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will Ì_ b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


In [10]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

In [11]:
X_train[0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [12]:
vectorizer = TfidfVectorizer(stop_words='english')
vectorizer.fit(X)
X_train_vectorized = vectorizer.transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [13]:
print(X_train_vectorized[0])

  (0, 7818)	0.3312161306527483
  (0, 6860)	0.49377274125014164
  (0, 6726)	0.48254237729454463
  (0, 5708)	0.3028180054950722
  (0, 3709)	0.35843146768063666
  (0, 2675)	0.43984695910391713


In [14]:
vectorizer.inverse_transform(csr_matrix.toarray(X_train_vectorized[0]))

[array(['download', 'hello', 'pls', 'site', 'songs', 'urgent'],
       dtype='<U34')]

In [15]:
#  vectorizer.inverse_transform(X_train_vectorized[0].toarray())

In [16]:
# gnb = GaussianNB()
# gnb.fit(X_train_vectorized.toarray(), y_train)
# print(gnb.score(X_train_vectorized.toarray(), y_train))
# print(gnb.score(X_test_vectorized.toarray(), y_test))

In [17]:
# gbc = GradientBoostingClassifier()
# gbc.fit(X_train_vectorized.toarray(), y_train)
# print(gbc.score(X_train_vectorized.toarray(), y_train))
# print(gbc.score(X_test_vectorized.toarray(), y_test))

### Sklearn Pipeline Model

In [18]:
# kfold = KFold(n_splits=10, shuffle=True, random_state=42)
# for k, (train_idx, test_idx) in enumerate(kfold.split(X)):
#     X_train, X_test = X[train_idx], X[test_idx]
#     y_train, y_test = y[train_idx], y[test_idx]

In [19]:
# gboost_pipeline = Pipeline([
#     ('vectorized', TfidfVectorizer(stop_words='english')),
#     ('transformer', FunctionTransformer(csr_matrix.toarray)),
#     ('gboost', GradientBoostingClassifier()),
# ])

bayes_pipeline = Pipeline([
    ('vectorized', TfidfVectorizer(stop_words='english')),
    ('transformer', FunctionTransformer(csr_matrix.toarray)),
    ('bayes', GaussianNB()),
])

In [20]:
# gboost_pipeline.fit(X_train, y_train)
bayes_pipeline.fit(X_train, y_train)

Pipeline(steps=[('vectorized', TfidfVectorizer(stop_words='english')),
                ('transformer',
                 FunctionTransformer(func=<function _cs_matrix.toarray at 0x7f709be96290>)),
                ('bayes', GaussianNB())])

In [21]:
# print(f'Train Accuracy: {pipeline.score(X_train, y_train)}')
# print(f'Test Accuracy: {pipeline.score(X_test, y_test)}')

# y_pred = gboost_pipeline.predict(X_test)
# print('GBoostClassifier Report:\n', classification_report(y_pred=y_pred, y_true=y_test))

y_pred = bayes_pipeline.predict(X_test)
print('GaussianNB Report:\n', classification_report(y_pred=y_pred, y_true=y_test))

GaussianNB Report:
               precision    recall  f1-score   support

           0       0.98      0.89      0.93      1578
           1       0.56      0.87      0.68       261

    accuracy                           0.89      1839
   macro avg       0.77      0.88      0.81      1839
weighted avg       0.92      0.89      0.90      1839



In [25]:
joblib.dump(bayes_pipeline, 'model.joblib')

['model.joblib']

---

In [27]:
model = joblib.load('model.joblib')

In [28]:
df

Unnamed: 0,labels,data,b_labels
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will Ì_ b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


In [29]:
df['data'][5567]

'This is the 2nd time we have tried 2 contact u. U have won the å£750 Pound prize. 2 claim is easy, call 087187272008 NOW1! Only 10p per minute. BT-national-rate.'