In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.metrics import classification_report
from nltk.tokenize import TweetTokenizer
import joblib

In [13]:
# Load the spam dataset: column v1 holds the class label (ham/spam) and
# column v2 holds the message text that is later fed to the TF-IDF step.
dataframe = pd.read_csv("../data/spam.csv", encoding="ISO-8859-1")

# Fix the seed so the stratified 80/20 split -- and every score computed from
# it -- is reproducible under Restart Kernel -> Run All.
RANDOM_STATE = 42
train, test = train_test_split(
    dataframe,
    test_size=0.2,
    stratify=dataframe.v1,  # keep the ham/spam ratio identical in both splits
    random_state=RANDOM_STATE,
)

In [14]:
# Class balance: number of non-null messages (v2) for each label in v1.
dataframe.groupby("v1")["v2"].count()

v1
ham     4825
spam     747
Name: v2, dtype: int64

In [15]:
# Two-step text classifier: TF-IDF features (tokenized with NLTK's
# TweetTokenizer) feeding a support-vector classifier.
tokenizer = TweetTokenizer()
tfidf_step = TfidfVectorizer(tokenizer=tokenizer.tokenize)
svm_step = svm.SVC()
pipe = Pipeline(steps=[("tfidf", tfidf_step), ("svm", svm_step)])

# Search space: 2 * 3 * 2 * 1 * 2 * 4 = 96 candidate configurations.
parameters = {
    "tfidf__ngram_range": [(1, 2), (2, 3)],
    "tfidf__max_df": [0.5, 0.8, 0.95],
    "tfidf__min_df": [1, 2],
    "tfidf__analyzer": ["word"],
    "svm__kernel": ["rbf", "linear"],
    "svm__C": [10, 100, 1000, 10000],
}

# 10-fold cross-validated grid search on the training split, ranked by
# macro-averaged F1 and run on all available cores.
clf = GridSearchCV(
    estimator=pipe,
    param_grid=parameters,
    scoring="f1_macro",
    cv=10,
    n_jobs=-1,
    verbose=1,
)
clf.fit(train["v2"], train["v1"])

# Mean cross-validated macro-F1 of the best candidate.
clf.best_score_

Fitting 10 folds for each of 96 candidates, totalling 960 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   29.5s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:  7.5min
[Parallel(n_jobs=-1)]: Done 960 out of 960 | elapsed:  8.9min finished


0.9742613172479493

In [19]:
# Score the best pipeline found by the grid search on the held-out test split.
test_predictions = clf.best_estimator_.predict(test["v2"])
print(classification_report(test["v1"], test_predictions))

              precision    recall  f1-score   support

         ham       0.99      1.00      1.00       966
        spam       0.98      0.97      0.97       149

    accuracy                           0.99      1115
   macro avg       0.99      0.98      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [20]:
# Persist the best pipeline found by the grid search to disk.
# NOTE(review): the pipeline holds a reference to TweetTokenizer.tokenize, so
# nltk presumably has to be importable wherever the file is loaded -- confirm.
joblib.dump(value=clf.best_estimator_, filename="model.sav")


['model.sav']