# Part 2 -- Classification

The libraries that we are going to use

In [1]:
import pandas as pd 
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import classification_report, accuracy_score, auc
from sklearn.model_selection import KFold
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
import matplotlib.pyplot as plt1
import matplotlib.pyplot as plt2

from wordcloud import STOPWORDS
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import csv
import random
import math
import operator
from operator import itemgetter
from collections import Counter

We read our training and testing data:

In [2]:
Train_data = pd.read_csv(sep='\t',filepath_or_buffer='train.tsv')
Test_data = pd.read_csv(sep='\t',filepath_or_buffer='test.tsv')
target = Train_data['Label']

In [3]:
EvaluationMetric = {
    'Statistic Measure':['Accuracy'], 
    'Naive Bayes':[] ,
    'Random Forest':[],
    'SVM':[]}

In [4]:
cross_val_instance = 0

def cross_validate(clf,train_data,target_data):
    global cross_val_instance    # Needed to modify global copy of a global variable
    
    kf = KFold(n_splits=10)
    average_accuracy =0
    fold = 0
    for train_index, test_index in kf.split(train_data):
        cross_val_instance += 1
      
        test = train_data.loc[test_index, train_data.columns]
        train = train_data.loc[train_index, train_data.columns]
        target = target_data[train_index]
        clf_cv = clf.fit(train, target)
        yPred = clf_cv.predict(test)
        fold += 1
        print ("Fold " + str(fold)+"\n\n")
        target = target_data[test_index]
        accuracy = accuracy_score(target, yPred)
        
        print("Accuracy: ", accuracy)
        average_accuracy+= accuracy
    average_accuracy = average_accuracy/10
    print("Average accuracy = ",average_accuracy)
    return average_accuracy

## Data preprocessing

In [5]:
categories = ["Attribute1","Attribute3","Attribute4","Attribute6","Attribute7","Attribute9","Attribute10","Attribute10","Attribute12","Attribute14","Attribute15", "Attribute17", "Attribute19","Attribute20"]

proccessedData_train = Train_data.copy()

for x in categories:
    converted = pd.Categorical(Train_data[x])
    proccessedData_train[x] = converted.codes
    
print(proccessedData_train)

     Attribute1  Attribute2  Attribute3  Attribute4  Attribute5  Attribute6  \
0             0           6           4           4        1169           4   
1             1          48           2           4        5951           0   
2             3          12           4           7        2096           0   
3             0          42           2           3        7882           0   
4             0          24           3           0        4870           0   
5             3          36           2           7        9055           4   
6             3          24           2           3        2835           2   
7             1          36           2           1        6948           0   
8             3          12           2           4        3059           3   
9             1          30           4           0        5234           0   
10            1          12           2           0        1295           0   
11            0          48           2           9 

In [6]:
proccessedData_test = Test_data.copy()

for x in categories:
    converted = pd.Categorical(Test_data[x])
    proccessedData_test[x] = converted.codes
    
print(proccessedData_test)

     Attribute1  Attribute2  Attribute3  Attribute4  Attribute5  Attribute6  \
0             1          18           4           4        1795           0   
1             0          20           4           3        4272           0   
2             3          12           4           4         976           4   
3             1          12           2           0        7472           4   
4             0          36           2           0        9271           0   
5             1           6           2           4         590           0   
6             3          12           4           4         930           4   
7             1          42           1           1        9283           0   
8             1          15           0           0        1778           0   
9             1           8           2           9         907           0   
10            1           6           2           4         484           0   
11            0          36           4           1 

In [7]:
proccessedData_train= proccessedData_train.drop(['Label'],axis=1) #afairoume to column label
print(proccessedData_train.columns)

print(proccessedData_test.shape, proccessedData_train.shape)
print(proccessedData_test.head(), proccessedData_train.head())

Index([u'Attribute1', u'Attribute2', u'Attribute3', u'Attribute4',
       u'Attribute5', u'Attribute6', u'Attribute7', u'Attribute8',
       u'Attribute9', u'Attribute10', u'Attribute11', u'Attribute12',
       u'Attribute13', u'Attribute14', u'Attribute15', u'Attribute16',
       u'Attribute17', u'Attribute18', u'Attribute19', u'Attribute20', u'Id'],
      dtype='object')
((199, 21), (800, 21))
(   Attribute1  Attribute2  Attribute3  Attribute4  Attribute5  Attribute6  \
0           1          18           4           4        1795           0   
1           0          20           4           3        4272           0   
2           3          12           4           4         976           4   
3           1          12           2           0        7472           4   
4           0          36           2           0        9271           0   

   Attribute7  Attribute8  Attribute9  Attribute10  ...    Attribute12  \
0           4           3           1            2  ...        

## Support Vector Machines (SVM) Classification

In [8]:
RANDOM_STATE = 123
classifier = svm.LinearSVC(multi_class = "ovr",random_state=RANDOM_STATE)
classifier.fit(proccessedData_train,target)

predicted = classifier.predict(proccessedData_test)

print("LinearSVC with linear kernel and c=0.2:")
 
print(predicted)

LinearSVC with linear kernel and c=0.2:
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [9]:
average_acc= cross_validate(classifier,proccessedData_train,target)
EvaluationMetric['SVM'].append(average_acc)

Fold 1


('Accuracy: ', 0.75)
Fold 2


('Accuracy: ', 0.77500000000000002)
Fold 3


('Accuracy: ', 0.63749999999999996)
Fold 4


('Accuracy: ', 0.75)
Fold 5


('Accuracy: ', 0.73750000000000004)
Fold 6


('Accuracy: ', 0.69999999999999996)
Fold 7


('Accuracy: ', 0.65000000000000002)
Fold 8


('Accuracy: ', 0.58750000000000002)
Fold 9


('Accuracy: ', 0.75)
Fold 10


('Accuracy: ', 0.67500000000000004)
('Average accuracy = ', 0.70125000000000004)


## Random Forest (RF) Classification

In [10]:
RANDOM_STATE = 123

rndf = RandomForestClassifier(warm_start=True, oob_score=True, max_features="sqrt", random_state=RANDOM_STATE)
rndf.set_params(n_estimators=30)
rndf.fit(proccessedData_train,target)

predicted = rndf.predict(proccessedData_test)
print(predicted)
# for x in range(10):
#     print(test_data['Title'][x] + "---->" + categories[predicted[x]])

[1 2 1 1 2 1 1 2 2 1 1 2 2 2 2 1 1 2 2 1 1 1 1 1 1 1 1 2 1 1 2 2 1 1 2 1 1
 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1 2 1 1 1 2 1 1 2 1 1 1 1 1
 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1
 1 1 2 2 1 1 2 1 1 1 2 1 2 2 2 2 1 2 2 1 1 1 2 2 1 1 2 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 2 2 1 1 1 1 1 2 1 1 1 1 1 2
 1 1 1 1 1 1 1 2 1 1 1 1 2 2]


In [11]:
average_acc = cross_validate(rndf,proccessedData_train,target)
EvaluationMetric['Random Forest'].append(average_acc)

Fold 1


('Accuracy: ', 1.0)
Fold 2


('Accuracy: ', 1.0)
Fold 3


('Accuracy: ', 1.0)


  warn("Warm-start fitting without increasing n_estimators does not "


Fold 4


('Accuracy: ', 1.0)
Fold 5


('Accuracy: ', 1.0)
Fold 6


('Accuracy: ', 1.0)
Fold 7


('Accuracy: ', 1.0)
Fold 8


('Accuracy: ', 0.98750000000000004)
Fold 9


('Accuracy: ', 1.0)
Fold 10


('Accuracy: ', 1.0)
('Average accuracy = ', 0.99875000000000003)


## Naive Bayes (NB) Classification

We use Multinomial Naive Bayes for our implementation:

In [12]:
mnb = MultinomialNB().fit(proccessedData_train, target)

predicted = mnb.predict(proccessedData_test)

print(predicted)

[1 2 1 2 2 1 1 2 1 1 1 2 1 2 2 1 1 2 1 1 1 1 1 2 1 2 2 2 2 1 1 2 2 1 1 1 1
 1 1 2 1 1 1 1 2 2 1 1 1 1 2 1 1 2 1 1 1 2 2 2 1 1 2 1 1 1 1 2 1 2 1 1 1 2
 1 1 2 1 2 2 2 1 1 2 1 1 2 2 2 1 1 1 2 1 2 1 1 1 2 1 1 2 1 1 1 2 2 2 1 2 2
 1 1 1 2 1 2 1 1 1 2 1 1 2 1 1 2 1 1 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1 2 1 1 1
 2 2 1 2 2 1 1 2 1 2 1 1 1 2 1 1 1 1 2 2 2 1 2 1 2 1 1 1 1 1 1 2 2 1 2 1 1
 2 1 2 1 2 1 1 2 1 1 2 1 1 2]


In [13]:
average_acc = cross_validate(mnb,proccessedData_train,target)
EvaluationMetric['Naive Bayes'].append(average_acc)

Fold 1


('Accuracy: ', 0.63749999999999996)
Fold 2


('Accuracy: ', 0.67500000000000004)
Fold 3


('Accuracy: ', 0.6875)
Fold 4


('Accuracy: ', 0.57499999999999996)
Fold 5


('Accuracy: ', 0.66249999999999998)
Fold 6


('Accuracy: ', 0.625)
Fold 7


('Accuracy: ', 0.625)
Fold 8


('Accuracy: ', 0.46250000000000002)
Fold 9


('Accuracy: ', 0.57499999999999996)
Fold 10


('Accuracy: ', 0.59999999999999998)
('Average accuracy = ', 0.61250000000000004)


## 10-fold Cross Validation

We evaluate and store the performance of each of the above methods using 10-fold Cross Validation with accuracy as a meter.

In [14]:
EvaluationMetric_10fold = pd.DataFrame(data=EvaluationMetric)
EvaluationMetric_10fold = EvaluationMetric_10fold.ix[::, ['Statistic Measure','Naive Bayes','Random Forest','SVM']]

EvaluationMetric_10fold.to_csv(path_or_buf='EvaluationMetric_10fold.csv', sep='\t', index=False)
EvaluationMetric_10fold

Unnamed: 0,Statistic Measure,Naive Bayes,Random Forest,SVM
0,Accuracy,0.6125,0.99875,0.70125


From the above, we observe that the Random Forest (RF) classification method is much better, in terms of accuracy,  than Naive Bayes and Support Vector Machines methods. It is almost always right (~99% accuracy).