# Part 4 -- Beat the Benchmark (bonus)

The libraries that we are going to use:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt1
import matplotlib.pyplot as plt2
import csv
import random
import math
import operator
from operator import itemgetter
from collections import Counter
from wordcloud import STOPWORDS
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import classification_report, accuracy_score, auc
from sklearn.model_selection import KFold
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn import tree

## Setting up

We load our data, we create our assistant-structures and we define our stopwords, our vectorizer *(count vectorizer)* and our category criterion, the *title*: 

In [2]:
# load our data
test_data = pd.read_csv('test_set.csv', sep='\t')
train_data = pd.read_csv('train_set.csv', sep='\t')

# a list of our categories (taken as facts)
categories = ['Politics','Football','Business','Technology','Film']

# we will use a number to represent each of our categories
category_dict = {'Politics':0, 'Football':1, 'Business':2, 'Technology':3, 'Film':4}

# for our text data, we use a count vectorizer
stopwords = set(STOPWORDS) | set(ENGLISH_STOP_WORDS)
# some additional stopwords based on our own observations
stopwords.add('said')
stopwords.add('say')
stopwords.add('says')
stopwords.add('set')

# our count vectorizer
count_vect = CountVectorizer(stop_words=stopwords)

# we will classify using the 'Title' as a criterion
category_criterion = 'Title'

## Data Preprocessing

We preprocess our training and testing data. We then create a *'target'* array where we will note the category of each of our training data and we print a small part of it:

In [3]:
# DATA PREPROCESSING
# for training
X_train_counts = count_vect.fit_transform(train_data[category_criterion])
tfidf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_counts.shape)
print(X_train_counts.shape)

# for testing
X_test_counts = count_vect.transform(test_data[category_criterion])
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
print(X_test_counts.shape)
print(X_test_tfidf.shape)

# we create a 'target' array where we will note the category of each of our training data
target = []
for x in train_data['Category']:
    target.append(category_dict[x])

target = np.array(target)
print("target[] sample:")
print(target[:40])

(12266, 13712)
(12266, 13712)
(3067, 13712)
(3067, 13712)
target[] sample:
[2 2 2 1 1 2 0 1 2 4 2 4 4 4 2 4 0 2 0 0 1 3 0 2 4 1 0 4 2 3 1 0 0 2 1 3 2
 0 3 3]


## We will use the Decision Tree Classifier to Beat the Benchmark

By experimenting with various classifiers in Part 3 we ended up choosing the *Random Forest Classifier* as we saw that it produces better evaluation metrics than all the others. For preproccesing we used the *Count Vectorizer* excluding the stopwords that we have setted since Part 1 of this project, we also used the *TfidfTransformer (term-frequency times inverse document-frequency transformer)* because we observed that increases the accuracy of our classifier more than anything else that we tried. We also observed that *Decision Tree Classifier* under the conditions described above can Beat the Random Forest Classifier, which can be proved by the following results.

In [4]:
# RANDOM FOREST (RF) CLASSIFIER
RANDOM_STATE = 123

rndf = RandomForestClassifier(warm_start=True, oob_score=True, max_features="sqrt", random_state=RANDOM_STATE)
rndf.set_params(n_estimators=30)
rndf.fit(X_train_tfidf, target)

predicted = rndf.predict(X_test_tfidf)

We experiment with the Latent Semantic Indexing (LSI) for various number of components:

In [5]:
print("Latent Semantic Indexing (LSI) for various number of components: ")

accuracy = []
components = []
for i in range(6):
    components.append(i*100+100)
    print("For ", components[i], " components:")
    rndf = RandomForestClassifier(warm_start=True, oob_score=True, max_features="sqrt", random_state=RANDOM_STATE)

    svd = TruncatedSVD(n_components=components[i])
    X_lsi = svd.fit_transform(X_train_tfidf)
    clfSVD = tree.DecisionTreeClassifier().fit(X_lsi, target)
    X_test_lsi = svd.transform(X_train_counts)
    predictedSVD = clfSVD.predict(X_test_lsi)
    print("Accuracy:")
    acc = accuracy_score(target, predictedSVD)
    accuracy.append(acc)
    print("acc == ",acc)
    print(accuracy[i])

Latent Semantic Indexing (LSI) for various number of components: 
('For ', 100, ' components:')
Accuracy:
('acc == ', 0.5131257133539866)
0.513125713354
('For ', 200, ' components:')
Accuracy:
('acc == ', 0.47578672753954021)
0.47578672754
('For ', 300, ' components:')
Accuracy:
('acc == ', 0.43665416598728191)
0.436654165987
('For ', 400, ' components:')
Accuracy:
('acc == ', 0.44040436980270664)
0.440404369803
('For ', 500, ' components:')
Accuracy:
('acc == ', 0.38529267894994296)
0.38529267895
('For ', 600, ' components:')
Accuracy:
('acc == ', 0.40200554377955322)
0.40200554378


In [6]:
X_lsi

array([[ 0.00115585,  0.00079303,  0.00109195, ..., -0.02479855,
        -0.0298099 , -0.02092812],
       [ 0.0029537 ,  0.0036768 ,  0.01050912, ...,  0.02084935,
        -0.00964101,  0.01041619],
       [ 0.00362246,  0.0031201 ,  0.00949418, ...,  0.07918788,
        -0.00520475, -0.03225572],
       ..., 
       [ 0.00215124,  0.00457309,  0.02670653, ..., -0.02355548,
        -0.0115988 ,  0.00904519],
       [ 0.01173553,  0.02352226,  0.11453489, ..., -0.00206927,
        -0.00839041, -0.01279061],
       [ 0.00057599,  0.00324241,  0.00214254, ..., -0.00492281,
        -0.00052916, -0.03024586]])

In [7]:
# DECISION TREE (DT) CLASSIFIER
dt_clf = tree.DecisionTreeClassifier()
dt_clf.fit(X_lsi, target)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

#### Our Cross Validation function:

In [8]:
cross_val_instance = 0

def cross_validate(clf):
    global cross_val_instance    # Needed to modify global copy of a global variable
    
    kf = KFold(n_splits=10)

    fold = 0
    for train_index, test_index in kf.split(train_data[category_criterion]):
        cross_val_instance += 1
        
        X_train_counts = count_vect.transform(train_data[category_criterion][train_index])
        X_test_counts = count_vect.transform(train_data[category_criterion][test_index].values.astype('U'))

        tfidf_transformer = TfidfTransformer(use_idf=False)
        X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
        
        clf_cv = clf.fit(X_train_tfidf, target[train_index])
        X_test_tfidf = tfidf_transformer.fit_transform(X_test_counts)
        
        yPred = clf_cv.predict(X_test_tfidf)
        fold += 1
        print ("Fold " + str(fold))
        
        accuracy = accuracy_score(target[test_index], yPred)
        print("Accuracy: ", accuracy)
        
        A = auc(target[test_index], yPred, reorder=True)
        print("AUC: ", A)
        
        p = metrics.precision_score(target[test_index], yPred, average='macro')
        print("PRESICION: ", p)
        
        recall = metrics.recall_score(target[test_index], yPred, average='macro') 
        print("Recall: ", recall)
        f_1 = metrics.f1_score(target[test_index], yPred, average='micro') 
        print("F-1: ", f_1)
        
        fpr, tpr, thresholds = metrics.roc_curve(target[test_index], yPred, pos_label=2)
        roc_auc = metrics.auc(fpr, tpr)
        print("Roc: ",roc_auc)

In [9]:
cross_validate(dt_clf)

Fold 1
('Accuracy: ', 0.72697636511817443)
('AUC: ', 8.0)
('PRESICION: ', 0.7965741169240802)
('Recall: ', 0.73688816792454226)
('F-1: ', 0.72697636511817443)
('Roc: ', 0.50337887130339953)
Fold 2
('Accuracy: ', 0.74001629991850038)
('AUC: ', 8.5)
('PRESICION: ', 0.78739513224066682)
('Recall: ', 0.72265702888536099)
('F-1: ', 0.74001629991850038)
('Roc: ', 0.60895853335162065)
Fold 3
('Accuracy: ', 0.74246128769356157)
('AUC: ', 8.5)
('PRESICION: ', 0.77861927041124768)
('Recall: ', 0.72269671322491214)
('F-1: ', 0.74246128769356157)
('Roc: ', 0.60498039995690478)
Fold 4
('Accuracy: ', 0.73594132029339854)
('AUC: ', 8.0)
('PRESICION: ', 0.78411216864538336)
('Recall: ', 0.72391124527797435)
('F-1: ', 0.73594132029339865)
('Roc: ', 0.59531980690903608)
Fold 5
('Accuracy: ', 0.7351263243683781)
('AUC: ', 8.0)
('PRESICION: ', 0.78027521344946427)
('Recall: ', 0.71432202938595601)
('F-1: ', 0.73512632436837799)
('Roc: ', 0.61477221300346963)
Fold 6
('Accuracy: ', 0.73105134474327627)
('AU