In [19]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, PorterStemmer, WordNetLemmatizer
import matplotlib.pyplot as plt
from math import log, sqrt
import pandas as pd
import numpy as np
import re
from scipy.sparse import csr_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score,confusion_matrix,recall_score,precision_score, f1_score
from sklearn.model_selection import cross_val_score

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, KFold

from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
%matplotlib inline  
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from sklearn.metrics import fbeta_score, make_scorer
import xgboost
from xgboost import XGBClassifier

In [20]:
data1 = pd.read_csv("spam.csv",encoding='latin-1')


#spajanje zadnja cetri stupca kako bi ostavili poruke u potpunosti spojenima
data1 = data1.rename(columns={"Unnamed: 2":"two", "Unnamed: 3":"tri","Unnamed: 4":"cetr"})
a =  data1[data1["two"].notnull()]["v2"].map(str) + data1[data1["two"].notnull()]["two"].map(str)
data1.loc[data1["two"].notnull(),"v2"] = a
a =  data1[data1["tri"].notnull()]["v2"].map(str) + data1[data1["tri"].notnull()]["tri"].map(str)
data1.loc[data1["tri"].notnull(),"v2"] = a
a =  data1[data1["cetr"].notnull()]["v2"].map(str) + data1[data1["cetr"].notnull()]["cetr"].map(str)
data1.loc[data1["cetr"].notnull(),"v2"] = a
data1 = data1[["v1","v2"]]
#data1.to_csv("proba.csv")



data1 = data1.rename(columns={"v1":"category_class", "v2":"sms"})
codes = {'ham':0, 'spam':1}
data1["class"] = data1["category_class"].map(codes)
maxLen = max(data1['sms'].apply(len).tolist())

In [21]:
# **NOVO - dodani featuri u dataframe i imas statistike prosjecne za svaki
from nltk.corpus import words
swords = set(words.words())
def countUpperLetters(message):
    return sum(1 for c in message if c.isupper())
def countDigits(message):
    return sum(c.isdigit() for c in message)
def countNonAlphaNumerical(message):
    return sum(not c.isalnum() for c in message)

data1["upper_letters"] = data1["sms"].apply(countUpperLetters)
data1['length'] = data1['sms'].apply(len)
data1["numeric_chars"] = data1["sms"].apply(countDigits)
data1["non_alpha_num"] = data1["sms"].apply(countNonAlphaNumerical)
print(data1.groupby("class").mean())

       upper_letters      length  numeric_chars  non_alpha_num
class                                                         
0           4.173472   71.627979       0.305285      17.500518
1          15.483266  139.148594      15.812584      29.132530


In [22]:
train,test = train_test_split(data1,test_size=0.25,random_state=28)
features_train = train["sms"]
features_test = test["sms"]
labels_train = train["class"]
labels_test = test["class"]

In [5]:
def dodajFeatureVratiArray(current_array,pandas_column):
    return np.hstack((current_array,pandas_column.values.reshape(-1,1)))

In [6]:
def outSystem(count_vect = CountVectorizer(analyzer = 'char', ngram_range=(2,5)) ):
    count_vect.fit(train['sms'])
    count_vectorizer_train = count_vect.transform(train['sms']).todense()
    count_vectorizer_test = count_vect.transform(test['sms']).todense()
        
    train_X = dodajFeatureVratiArray(count_vectorizer_train,train["numeric_chars"])
    train_X = dodajFeatureVratiArray(train_X,train["length"])
    train_X = dodajFeatureVratiArray(train_X,train["non_alpha_num"])
    train_X = dodajFeatureVratiArray(train_X,train["upper_letters"])
    
    test_X = dodajFeatureVratiArray(count_vectorizer_test,test["numeric_chars"])
    test_X = dodajFeatureVratiArray(test_X,test["length"])
    test_X = dodajFeatureVratiArray(test_X,test["non_alpha_num"])
    test_X = dodajFeatureVratiArray(test_X,test["upper_letters"])
    
    return(csr_matrix(train_X), csr_matrix(test_X))

In [67]:
train_data, test_data = outSystem()

In [69]:
ftwo_scorer = make_scorer(fbeta_score, beta=0.5)

def crosValBestModel(clf, d_train, d_test, parameters):
    grid_search = GridSearchCV(clf, parameters, cv = 10, scoring=ftwo_scorer, n_jobs=2, verbose=10)
    grid_search.fit(d_train, labels_train)                
    best = grid_search.best_estimator_
    preds = best.predict(d_test)
   
    print("Best: %f using %s" % (grid_search.best_score_, grid_search.best_params_))
    means = grid_search.cv_results_['mean_test_score']
    stds = grid_search.cv_results_['std_test_score']
    params = grid_search.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))
    
    print("Accuracy :",accuracy_score(labels_test,preds))
    print("Recall :",recall_score(labels_test,preds))
    print("Presicion :",precision_score(labels_test,preds))
    print("F0.5 score :",fbeta_score(labels_test,preds, beta=0.5))
    print("CONFUSION MATRIX",confusion_matrix(labels_test,preds))

In [77]:
clfs = [('MNB',MultinomialNB()), 
        ('Logistic',LogisticRegression()),
        ('SVC',SVC(kernel='linear')), 
        ('KNN',KNeighborsClassifier()),
        ('RandomForrest',RandomForestClassifier(n_estimators=300, max_depth=300,random_state=7))]
        #('RandomForrest',RandomForestClassifier(random_state=7))]
for name, clf in clfs:
    clf.fit(train_data, labels_train)
    preds = clf.predict(test_data)
    
    print('============================================================')
    print('___________ '+name+'______________________')
    print("Accuracy :",accuracy_score(labels_test,preds))
    print("Recall :",recall_score(labels_test,preds))
    print("Presicion :",precision_score(labels_test,preds))
    print("F0.5 score :",fbeta_score(labels_test,preds, beta=0.5))
    print("CONFUSION MATRIX",confusion_matrix(labels_test,preds))
    print('============================================================')

___________ MNB______________________
Accuracy : 0.991385498923
Recall : 0.956284153005
Presicion : 0.977653631285
F0.5 score : 0.973303670745
CONFUSION MATRIX [[1206    4]
 [   8  175]]
___________ Logistic______________________
Accuracy : 0.992103374013
Recall : 0.950819672131
Presicion : 0.988636363636
F0.5 score : 0.98083427283
CONFUSION MATRIX [[1208    2]
 [   9  174]]
___________ SVC______________________
Accuracy : 0.991385498923
Recall : 0.96174863388
Presicion : 0.972375690608
F0.5 score : 0.970231532525
CONFUSION MATRIX [[1205    5]
 [   7  176]]
___________ KNN______________________
Accuracy : 0.987078248385
Recall : 0.934426229508
Presicion : 0.966101694915
F0.5 score : 0.959595959596
CONFUSION MATRIX [[1204    6]
 [  12  171]]
___________ RandomForrest______________________
Accuracy : 0.990667623833
Recall : 0.928961748634
Presicion : 1.0
F0.5 score : 0.98493626883
CONFUSION MATRIX [[1210    0]
 [  13  170]]


In [75]:
clf = LogisticRegression()
parameters = {
    'C': np.linspace(0.1, 10, num=10)
}
crosValBestModel(clf, train_data, test_data, parameters)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    2.2s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:    4.3s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:    6.1s
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:    8.5s
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:   11.3s
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:   15.3s
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   18.9s
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   23.2s
[Parallel(n_jobs=2)]: Done  57 tasks      | elapsed:   27.6s
[Parallel(n_jobs=2)]: Done  68 tasks      | elapsed:   33.8s
[Parallel(n_jobs=2)]: Done  81 tasks      | elapsed:   40.0s
[Parallel(n_jobs=2)]: Done  94 tasks      | elapsed:   48.1s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:   51.2s finished


Best: 0.971218 using {'C': 3.4000000000000004}
0.969520 (0.008374) with: {'C': 0.10000000000000001}
0.969943 (0.009136) with: {'C': 1.2000000000000002}
0.970792 (0.009238) with: {'C': 2.3000000000000003}
0.971218 (0.009415) with: {'C': 3.4000000000000004}
0.971218 (0.009415) with: {'C': 4.5}
0.971218 (0.009415) with: {'C': 5.5999999999999996}
0.971218 (0.009415) with: {'C': 6.7000000000000002}
0.971218 (0.009415) with: {'C': 7.8000000000000007}
0.971218 (0.009415) with: {'C': 8.9000000000000004}
0.970795 (0.009548) with: {'C': 10.0}
Accuracy : 0.992821249103
Recall : 0.956284153005
Presicion : 0.988700564972
F0.5 score : 0.982042648709
CONFUSION MATRIX [[1208    2]
 [   8  175]]


In [None]:
clf = SVC(kernel='linear')
parameters = {
    'C': np.linspace(0.1, 10, num=10)
}
crosValBestModel(clf, train_data, test_data, parameters)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    8.6s


In [10]:
clf = KNeighborsClassifier()
parameters = {
    'n_neighbors': [2,3,4,5]
}

crosValBestModel(clf, train_data, test_data, parameters)

Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:    7.3s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:   10.5s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:   15.5s
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:   20.7s
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:   27.7s
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:   35.0s
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:   45.5s
[Parallel(n_jobs=2)]: Done  40 out of  40 | elapsed:   48.9s finished


Best: 0.956579 using {'n_neighbors': 2}
0.956579 (0.010647) with: {'n_neighbors': 2}
0.950772 (0.017047) with: {'n_neighbors': 3}
0.951649 (0.013465) with: {'n_neighbors': 4}
0.943809 (0.008413) with: {'n_neighbors': 5}
Accuracy : 0.987796123475
Recall : 0.907103825137
Presicion : 1.0
F0.5 score : 0.979929161747
CONFUSION MATRIX [[1210    0]
 [  17  166]]


In [None]:
clf = XGBClassifier()

parameters = {
    'learning_rate': [0.01,0.01,0.1,1,10],
    'max_depth' : [1,3,5,7],
    'n_estimators' : [100,300,500,700]
    'reg_lambda' : [0.01,0.01,0.1,1,10]
}

crosValBestModel(clf, train_data, test_data, parameters)

In [11]:
clf = RandomForestClassifier()

parameters = {
    'n_estimators': [200, 250, 300],
    'max_depth': [200, 250, 300],
    'max_features': ['auto', 'sqrt']
}

crosValBestModel(clf,train_data, test_data, parameters)

Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:   13.6s
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:   26.4s
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:  1.1min
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed:  1.8min
[Parallel(n_jobs=2)]: Done  21 tasks      | elapsed:  3.1min
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:  4.3min
[Parallel(n_jobs=2)]: Done  37 tasks      | elapsed:  5.6min
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  7.0min
[Parallel(n_jobs=2)]: Done  57 tasks      | elapsed:  9.0min
[Parallel(n_jobs=2)]: Done  68 tasks      | elapsed: 10.5min
[Parallel(n_jobs=2)]: Done  81 tasks      | elapsed: 12.5min
[Parallel(n_jobs=2)]: Done  94 tasks      | elapsed: 14.6min
[Parallel(n_jobs=2)]: Done 109 tasks      | elapsed: 16.6min
[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed: 19.1min
[Parallel(n_jobs=2)]: Done 141 tasks      | elapsed: 21.6min
[Parallel(n_jobs=2)]: Done 158 tasks      | elapsed: 24.4min
[Parallel(n_jobs=2)]: Do

Best: 0.976977 using {'max_depth': 250, 'max_features': 'auto', 'n_estimators': 250}
0.975376 (0.005038) with: {'max_depth': 200, 'max_features': 'auto', 'n_estimators': 200}
0.974220 (0.009182) with: {'max_depth': 200, 'max_features': 'auto', 'n_estimators': 250}
0.973670 (0.010738) with: {'max_depth': 200, 'max_features': 'auto', 'n_estimators': 300}
0.976163 (0.006424) with: {'max_depth': 200, 'max_features': 'sqrt', 'n_estimators': 200}
0.973249 (0.010710) with: {'max_depth': 200, 'max_features': 'sqrt', 'n_estimators': 250}
0.974538 (0.010701) with: {'max_depth': 200, 'max_features': 'sqrt', 'n_estimators': 300}
0.976005 (0.009370) with: {'max_depth': 250, 'max_features': 'auto', 'n_estimators': 200}
0.976977 (0.007162) with: {'max_depth': 250, 'max_features': 'auto', 'n_estimators': 250}
0.973975 (0.007668) with: {'max_depth': 250, 'max_features': 'auto', 'n_estimators': 300}
0.973715 (0.010254) with: {'max_depth': 250, 'max_features': 'sqrt', 'n_estimators': 200}
0.975034 (0.009