In [14]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, PorterStemmer, WordNetLemmatizer
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from math import log, sqrt
import pandas as pd
import numpy as np
import re
from scipy.sparse import csr_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score,confusion_matrix,recall_score,precision_score, f1_score
from sklearn.model_selection import cross_val_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, KFold

from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
%matplotlib inline  
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from sklearn.metrics import fbeta_score, make_scorer
import xgboost
from xgboost import XGBClassifier

In [5]:
data1 = pd.read_csv("spam.csv",encoding='latin-1')


#spajanje zadnja cetri stupca kako bi ostavili poruke u potpunosti spojenima
data1 = data1.rename(columns={"Unnamed: 2":"two", "Unnamed: 3":"tri","Unnamed: 4":"cetr"})
a =  data1[data1["two"].notnull()]["v2"].map(str) + data1[data1["two"].notnull()]["two"].map(str)
data1.loc[data1["two"].notnull(),"v2"] = a
a =  data1[data1["tri"].notnull()]["v2"].map(str) + data1[data1["tri"].notnull()]["tri"].map(str)
data1.loc[data1["tri"].notnull(),"v2"] = a
a =  data1[data1["cetr"].notnull()]["v2"].map(str) + data1[data1["cetr"].notnull()]["cetr"].map(str)
data1.loc[data1["cetr"].notnull(),"v2"] = a
data1 = data1[["v1","v2"]]
#data1.to_csv("proba.csv")



data1 = data1.rename(columns={"v1":"category_class", "v2":"sms"})
codes = {'ham':0, 'spam':1}
data1["class"] = data1["category_class"].map(codes)

In [6]:
train,test = train_test_split(data1,test_size=0.25,random_state=28)
features_train = train["sms"]
features_test = test["sms"]
labels_train = train["class"]
labels_test = test["class"]

In [7]:
count_vect = CountVectorizer()
count_vect.fit(features_train)
count_vectorizer_train = count_vect.transform(features_train)
count_vectorizer_test = count_vect.transform(features_test)

In [30]:
ftwo_scorer = make_scorer(fbeta_score, beta=0.5)

In [50]:
def crosValBestModel(clf, parameters):
    grid_search = GridSearchCV(clf, parameters, cv = 10, scoring=ftwo_scorer, n_jobs=2, verbose=10)         
    grid_search.fit(count_vectorizer_train, labels_train)
    best = grid_search.best_estimator_
    preds = best.predict(count_vectorizer_test)
   
    print("Best: %f using %s" % (grid_search.best_score_, grid_search.best_params_))
    means = grid_search.cv_results_['mean_test_score']
    stds = grid_search.cv_results_['std_test_score']
    params = grid_search.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))
    
    print("Accuracy :",accuracy_score(labels_test,preds))
    print("Recall :",recall_score(labels_test,preds))
    print("Presicion :",precision_score(labels_test,preds))
    print("F0.5 score :",fbeta_score(labels_test,preds, beta=0.5))
    #print("F1 score :",f1_score(labels_test,preds))
    print("CONFUSION MATRIX",confusion_matrix(labels_test,preds))

In [32]:
clfs = [('MNB',MultinomialNB()), 
        ('Logistic',LogisticRegression()), 
        ('SVC',SVC(kernel='linear')), 
        ('KNN',KNeighborsClassifier()), 
        ('RandomForrest',RandomForestClassifier(random_state=7))]
for name, clf in clfs:
    clf.fit(count_vectorizer_train, labels_train)
    preds = clf.predict(count_vectorizer_test)
    
    print('============================================================')
    print('___________ '+name+'______________________')
    print("Accuracy :",accuracy_score(labels_test,preds))
    print("Recall :",recall_score(labels_test,preds))
    print("Presicion :",precision_score(labels_test,preds))
    print("F0.5 score :",fbeta_score(labels_test,preds, beta=0.5))
    print("CONFUSION MATRIX",confusion_matrix(labels_test,preds))
    print('============================================================')

___________ MNB______________________
Accuracy : 0.992103374013
Recall : 0.967213114754
Presicion : 0.972527472527
F0.5 score : 0.971459934138
CONFUSION MATRIX [[1205    5]
 [   6  177]]
___________ Logistic______________________
Accuracy : 0.985642498205
Recall : 0.907103825137
Presicion : 0.98224852071
F0.5 score : 0.966239813737
CONFUSION MATRIX [[1207    3]
 [  17  166]]
___________ SVC______________________
Accuracy : 0.984924623116
Recall : 0.912568306011
Presicion : 0.970930232558
F0.5 score : 0.958668197474
CONFUSION MATRIX [[1205    5]
 [  16  167]]
___________ KNN______________________
Accuracy : 0.916726489591
Recall : 0.366120218579
Presicion : 1.0
F0.5 score : 0.742793791574
CONFUSION MATRIX [[1210    0]
 [ 116   67]]
___________ RandomForrest______________________
Accuracy : 0.969849246231
Recall : 0.770491803279
Presicion : 1.0
F0.5 score : 0.943775100402
CONFUSION MATRIX [[1210    0]
 [  42  141]]


In [51]:
parameters = {
    'C': np.linspace(0.1, 10, num=10)
}
crosValBestModel(LogisticRegression(), parameters)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    1.2s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:    2.2s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:    2.3s
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:    2.4s
[Parallel(n_jobs=2)]: Batch computation too fast (0.1795s.) Setting batch_size=2.
[Parallel(n_jobs=2)]: Done  22 tasks      | elapsed:    2.6s
[Parallel(n_jobs=2)]: Batch computation too fast (0.1257s.) Setting batch_size=6.
[Parallel(n_jobs=2)]: Done  52 tasks      | elapsed:    3.2s


Best: 0.964588 using {'C': 8.9000000000000004}
0.943039 (0.019904) with: {'C': 0.10000000000000001}
0.957030 (0.013104) with: {'C': 1.2000000000000002}
0.960315 (0.011266) with: {'C': 2.3000000000000003}
0.962275 (0.011110) with: {'C': 3.4000000000000004}
0.962275 (0.011110) with: {'C': 4.5}
0.962275 (0.011110) with: {'C': 5.5999999999999996}
0.962726 (0.010557) with: {'C': 6.7000000000000002}
0.964165 (0.009889) with: {'C': 7.8000000000000007}
0.964588 (0.010044) with: {'C': 8.9000000000000004}
0.964588 (0.010044) with: {'C': 10.0}
Accuracy : 0.987796123475
Recall : 0.918032786885
Presicion : 0.988235294118
F0.5 score : 0.973348783314
CONFUSION MATRIX [[1208    2]
 [  15  168]]


[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    4.1s finished


In [36]:
parameters = {
    'C': np.linspace(0.1, 10, num=10)
}
crosValBestModel(SVC(kernel='linear'), parameters)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    2.1s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:    3.9s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:    5.6s
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:    8.1s
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:   10.7s
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:   14.1s
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   17.5s
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   21.2s
[Parallel(n_jobs=2)]: Done  57 tasks      | elapsed:   25.6s
[Parallel(n_jobs=2)]: Done  68 tasks      | elapsed:   30.0s
[Parallel(n_jobs=2)]: Done  81 tasks      | elapsed:   35.2s
[Parallel(n_jobs=2)]: Done  94 tasks      | elapsed:   40.3s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   42.7s finished


Best: 0.968620 using {'C': 2.3000000000000003}
0.967356 (0.008416) with: {'C': 0.10000000000000001}
0.967196 (0.010602) with: {'C': 1.2000000000000002}
0.968620 (0.009923) with: {'C': 2.3000000000000003}
0.968620 (0.009923) with: {'C': 3.4000000000000004}
0.968620 (0.009923) with: {'C': 4.5}
0.968620 (0.009923) with: {'C': 5.5999999999999996}
0.968620 (0.009923) with: {'C': 6.7000000000000002}
0.968620 (0.009923) with: {'C': 7.8000000000000007}
0.968620 (0.009923) with: {'C': 8.9000000000000004}
0.968620 (0.009923) with: {'C': 10.0}
Accuracy : 0.984924623116
Recall : 0.912568306011
Presicion : 0.970930232558
F0.5 score : 0.958668197474
CONFUSION MATRIX [[1205    5]
 [  16  167]]


In [None]:
parameters = {
    'learning_rate': [0.01,0.01,0.1,1,10],
    'max_depth' : [1,3,5,7],
    'n_estimators' : [100,300,500,700]
    'reg_lambda' : [0.01,0.01,0.1,1,10]
}

crosValBestModel(XGBClassifier(), parameters)

In [37]:
parameters = {
    'n_neighbors': [2,3,4,5]
}

crosValBestModel(KNeighborsClassifier(), parameters)

Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    1.6s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:    3.1s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:    4.0s
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:    5.4s
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:    6.9s
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:    8.9s
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   11.3s
[Parallel(n_jobs=2)]: Done  40 out of  40 | elapsed:   12.3s finished


Best: 0.793919 using {'n_neighbors': 3}
0.788413 (0.035278) with: {'n_neighbors': 2}
0.793919 (0.037728) with: {'n_neighbors': 3}
0.720075 (0.053815) with: {'n_neighbors': 4}
0.727183 (0.048220) with: {'n_neighbors': 5}
Accuracy : 0.928930366116
Recall : 0.459016393443
Presicion : 1.0
F0.5 score : 0.809248554913
CONFUSION MATRIX [[1210    0]
 [  99   84]]


In [52]:
n_estimators = [200, 250, 300]
max_depth =  [200, 250, 300]
max_features = ['auto', 'sqrt']
parameters = dict(n_estimators=n_estimators,max_depth=max_depth, max_features = max_features )

crosValBestModel(RandomForestClassifier(), parameters)

Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    8.5s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:   16.6s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:   36.9s
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:   57.5s
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:  1.6min
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:  2.1min
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:  2.8min
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  3.4min
[Parallel(n_jobs=2)]: Done  57 tasks      | elapsed:  4.4min
[Parallel(n_jobs=2)]: Done  68 tasks      | elapsed:  5.0min
[Parallel(n_jobs=2)]: Done  81 tasks      | elapsed:  6.1min
[Parallel(n_jobs=2)]: Done  94 tasks      | elapsed:  7.1min
[Parallel(n_jobs=2)]: Done 109 tasks      | elapsed:  8.2min
[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed:  9.3min
[Parallel(n_jobs=2)]: Done 141 tasks      | elapsed: 10.6min
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed: 11.8min
[Parallel(n_jobs=2)]: Do

Best: 0.955723 using {'max_depth': 200, 'max_features': 'sqrt', 'n_estimators': 300}
0.950871 (0.016830) with: {'max_depth': 200, 'max_features': 'auto', 'n_estimators': 200}
0.954113 (0.014126) with: {'max_depth': 200, 'max_features': 'auto', 'n_estimators': 250}
0.953608 (0.014635) with: {'max_depth': 200, 'max_features': 'auto', 'n_estimators': 300}
0.952980 (0.015716) with: {'max_depth': 200, 'max_features': 'sqrt', 'n_estimators': 200}
0.953073 (0.014395) with: {'max_depth': 200, 'max_features': 'sqrt', 'n_estimators': 250}
0.955723 (0.012392) with: {'max_depth': 200, 'max_features': 'sqrt', 'n_estimators': 300}
0.955035 (0.014728) with: {'max_depth': 250, 'max_features': 'auto', 'n_estimators': 200}
0.952010 (0.015651) with: {'max_depth': 250, 'max_features': 'auto', 'n_estimators': 250}
0.954200 (0.012966) with: {'max_depth': 250, 'max_features': 'auto', 'n_estimators': 300}
0.953304 (0.017687) with: {'max_depth': 250, 'max_features': 'sqrt', 'n_estimators': 200}
0.951586 (0.014