# Imports

In [1]:
from sklearn.naive_bayes import *
from sklearn.dummy import *
from sklearn.ensemble import *
from sklearn.neighbors import *
from sklearn.tree import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.calibration import *
from sklearn.linear_model import *
from sklearn.multiclass import *
from sklearn.svm import *
import pandas
import csv

In [2]:
def perform(classifiers, vectorizers, train_data, test_data):
    """Fit and score every classifier/vectorizer combination, printing one line each.

    Args:
        classifiers: iterable of estimators exposing ``fit(X, y)`` and
            ``score(X, y)``.
        vectorizers: iterable of text vectorizers exposing
            ``fit_transform(texts)`` and ``transform(texts)``.
        train_data: object whose ``v2`` attribute holds the training texts
            and ``v1`` the matching labels.
        test_data: object with the same ``v1``/``v2`` attributes, used only
            for scoring.

    Returns:
        None. Each result is reported through ``print``.

    Note:
        The same estimator and vectorizer instances are refit on every pass,
        so after the call they hold the state of the last combination they
        took part in.
    """
    for clf in classifiers:
        for vec in vectorizers:
            label = f'{clf.__class__.__name__} with {vec.__class__.__name__}'

            # Fit the vectorizer on the training texts only, then the model.
            train_matrix = vec.fit_transform(train_data.v2)
            clf.fit(train_matrix, train_data.v1)

            # Project the held-out texts with the already-fitted vectorizer.
            test_matrix = vec.transform(test_data.v2)
            score_value = clf.score(test_matrix, test_data.v1)

            print(f'{label}. Has score: {score_value!s}')

In [3]:
# Load the raw SMS spam dataset; the file is Latin-1 encoded.
spam_csv_path = 'data/spam.csv'
data = pandas.read_csv(
    spam_csv_path,
    encoding='latin-1',
)

# Let's explore the data here

In [4]:
# Size of the raw frame as (rows, columns).
data.shape

(5572, 5)

In [5]:
# Preview the first rows: v1 holds the ham/spam label, v2 the message text.
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
# Column dtypes and non-null counts; shows how sparsely the "Unnamed" columns are filled.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [7]:
# Dividing the data into learning and testing sets by position:
# the first 4400 rows are used for fitting, the remaining 1172 for testing.
split_at = 4400
learn, test = data.iloc[:split_at], data.iloc[split_at:]

In [8]:
# Candidate models, all with library defaults unless an argument is shown.
# Order matters: perform() reports results in this order.
classifiers = [
    # sklearn.naive_bayes
    BernoulliNB(),
    # sklearn.ensemble
    RandomForestClassifier(n_estimators=100, n_jobs=-1),
    AdaBoostClassifier(),
    BaggingClassifier(),
    ExtraTreesClassifier(),
    GradientBoostingClassifier(),
    # sklearn.tree
    DecisionTreeClassifier(),
    # sklearn.calibration
    CalibratedClassifierCV(),
    # sklearn.dummy
    DummyClassifier(),
    # sklearn.linear_model
    PassiveAggressiveClassifier(),
    RidgeClassifier(),
    RidgeClassifierCV(),
    SGDClassifier(),
    # sklearn.multiclass wrappers
    OneVsRestClassifier(SVC(kernel='linear')),
    OneVsRestClassifier(LogisticRegression()),
    # sklearn.neighbors
    KNeighborsClassifier(),
]

In [9]:
# Text vectorizers to pair with every classifier, all with default settings.
vectorizers = [
    CountVectorizer(),
    TfidfVectorizer(),
    HashingVectorizer(),
]

In [10]:

# Fit and score every classifier/vectorizer pair; one result line is printed per pair.
perform(classifiers, vectorizers, learn, test)

BernoulliNB with CountVectorizer. Has score: 0.9778156996587031
BernoulliNB with TfidfVectorizer. Has score: 0.9778156996587031
BernoulliNB with HashingVectorizer. Has score: 0.8728668941979523
RandomForestClassifier with CountVectorizer. Has score: 0.9761092150170648
RandomForestClassifier with TfidfVectorizer. Has score: 0.9752559726962458
RandomForestClassifier with HashingVectorizer. Has score: 0.9675767918088737
AdaBoostClassifier with CountVectorizer. Has score: 0.9718430034129693
AdaBoostClassifier with TfidfVectorizer. Has score: 0.9692832764505119
AdaBoostClassifier with HashingVectorizer. Has score: 0.9735494880546075
BaggingClassifier with CountVectorizer. Has score: 0.9641638225255973
BaggingClassifier with TfidfVectorizer. Has score: 0.9667235494880546
BaggingClassifier with HashingVectorizer. Has score: 0.9692832764505119
ExtraTreesClassifier with CountVectorizer. Has score: 0.9803754266211604
ExtraTreesClassifier with TfidfVectorizer. Has score: 0.9786689419795221
ExtraT



DummyClassifier with HashingVectorizer. Has score: 0.7636518771331058
PassiveAggressiveClassifier with CountVectorizer. Has score: 0.9837883959044369
PassiveAggressiveClassifier with TfidfVectorizer. Has score: 0.984641638225256
PassiveAggressiveClassifier with HashingVectorizer. Has score: 0.9803754266211604
RidgeClassifier with CountVectorizer. Has score: 0.9812286689419796
RidgeClassifier with TfidfVectorizer. Has score: 0.9829351535836177
RidgeClassifier with HashingVectorizer. Has score: 0.9820819112627986
RidgeClassifierCV with CountVectorizer. Has score: 0.9829351535836177
RidgeClassifierCV with TfidfVectorizer. Has score: 0.984641638225256
RidgeClassifierCV with HashingVectorizer. Has score: 0.9803754266211604
SGDClassifier with CountVectorizer. Has score: 0.9837883959044369
SGDClassifier with TfidfVectorizer. Has score: 0.9863481228668942
SGDClassifier with HashingVectorizer. Has score: 0.984641638225256
OneVsRestClassifier with CountVectorizer. Has score: 0.9863481228668942
O

# Highest-scoring classifier and vectorizer
> *OneVsRestClassifier with TfidfVectorizer. Has score: 0.9880546075085325*

In [18]:
# Fresh, unfitted instances of the chosen pair: a one-vs-rest wrapper around a
# linear-kernel SVC, fed by TF-IDF features.
classifier = OneVsRestClassifier(SVC(kernel='linear'))
vectorizer = TfidfVectorizer()

In [21]:
# train
# Fit the vectorizer on the training messages (learn.v2), then fit the
# classifier on the resulting matrix against the labels (learn.v1).
vectorize_text = vectorizer.fit_transform(learn.v2)
classifier.fit(vectorize_text, learn.v1)

OneVsRestClassifier(estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                                  class_weight=None, coef0=0.0,
                                  decision_function_shape='ovr', degree=3,
                                  gamma='scale', kernel='linear', max_iter=-1,
                                  probability=False, random_state=None,
                                  shrinking=True, tol=0.001, verbose=False),
                    n_jobs=None)

In [23]:
# score
# Overall accuracy for this pair on the same split was noted as ~98.8%; it can
# be recomputed with:
#     classifier.score(vectorizer.transform(test.v2), test.v1)

# Vectorize and predict the whole test set in one call instead of once per row;
# the fitted vectorizer and classifier do not change between rows.
vectorize_text = vectorizer.transform(test.v2)
predictions = classifier.predict(vectorize_text)

# Build one record per test message: running number, message text, true label,
# predicted label, and whether the two agree.
# Columns are read by name (v1 = label, v2 = text) rather than by integer
# position on a label-indexed row, which newer pandas versions deprecate.
csv_arr = []
for text, answer, predict in zip(test.v2, test.v1, predictions):
    if predict == answer:
        result = 'right'
    else:
        result = 'wrong'
    csv_arr.append([len(csv_arr), text, answer, predict, result])

In [24]:

# write csv
# Dump the per-message results to a semicolon-separated file.
# newline='' lets the csv module control line endings itself.
with open('test_score.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=';',
            quotechar='"', quoting=csv.QUOTE_MINIMAL)
    # Header row: every entry must be a literal column title. The last one is
    # the string 'result', not the loop variable of the same name (which would
    # put 'right' or 'wrong' into the header).
    spamwriter.writerow(['#', 'text', 'answer', 'predict', 'result'])

    # One row per record: [#, text, answer, predict, result].
    spamwriter.writerows(csv_arr)