In [2]:
import pandas as pd 
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import classification_report, accuracy_score, auc
from sklearn.model_selection import KFold
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import SGDClassifier


import csv
import random
import math
import operator

In [3]:
# load our data
test_data = pd.read_csv('test_set.csv', sep='\t')
train_data = pd.read_csv('train_set.csv', sep='\t')

# a list of our categories (taken as facts)
categories = ['Politics','Football','Business','Technology','Film']

# we will use a number to represent each of our categories
category_dict = {'Politics':0, 'Football':1, 'Business':2, 'Technology':3, 'Film':4}

# for our text data, we use a count vectorizer
count_vect = CountVectorizer()

# we will classify using the 'Title' as a criterion
category_criteria = 'Title'

In [4]:
# DATA PREPROCESSING
# for training
X_train_counts = count_vect.fit_transform(train_data[category_criteria])
tfidf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_counts.shape)
print(X_train_counts.shape)

# for testing
X_test_counts = count_vect.transform(test_data[category_criteria])
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
print(X_test_counts.shape)
print(X_test_tfidf.shape)

# we create a 'target' array where we will the category of each of our training data
target =[]
for x in train_data['Category']:
    target.append(category_dict[x])
target = np.array(target)
print("target[] sample:")
print(target[:40])

(12266, 13975)
(12266, 13975)
(3067, 13975)
(3067, 13975)
target[] sample:
[2 2 2 1 1 2 0 1 2 4 2 4 4 4 2 4 0 2 0 0 1 3 0 2 4 1 0 4 2 3 1 0 0 2 1 3 2
 0 3 3]


In [5]:
# experimenting with Latent Semantic Indexing (LSI) for various number of components
# 100 components
svd1 = TruncatedSVD(n_components=100)
X_lsi1 = svd1.fit_transform(X_train_tfidf)
clfSVD1 = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42).fit(X_lsi1, target)
X_test_lsi1 = svd1.transform(count_vect.transform(train_data[category_criteria]))
predictedSVD1 = clfSVD1.predict(X_test_lsi1)
print("Mean1 = ")
print(np.mean(predictedSVD1 == target))
# 200 components
svd2 = TruncatedSVD(n_components=200)
X_lsi2 = svd2.fit_transform(X_train_tfidf)
clfSVD2 = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42).fit(X_lsi2, target)
X_test_lsi2 = svd2.transform(count_vect.transform(train_data[category_criteria]))
predictedSVD2 = clfSVD2.predict(X_test_lsi2)
print("Mean2 = ")
print(np.mean(predictedSVD2 == target))
# 350 components
svd3 = TruncatedSVD(n_components=350)
X_lsi3 = svd3.fit_transform(X_train_tfidf)
clfSVD3 = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42).fit(X_lsi3, target)
X_test_lsi3 = svd3.transform(count_vect.transform(train_data[category_criteria]))
predictedSVD3 = clfSVD3.predict(X_test_lsi3)
print("Mean3 = ")
print(np.mean(predictedSVD3 == target))
# 500 components
svd4 = TruncatedSVD(n_components=500)
X_lsi4 = svd4.fit_transform(X_train_tfidf)
clfSVD4 = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42).fit(X_lsi4, target)
X_test_lsi4 = svd4.transform(count_vect.transform(train_data[category_criteria]))
predictedSVD4 = clfSVD4.predict(X_test_lsi4)
print("Mean4 = ")
print(np.mean(predictedSVD4 == target))
# 1000 components
svd5 = TruncatedSVD(n_components=1000)
X_lsi5 = svd5.fit_transform(X_train_tfidf)
clfSVD5 = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42).fit(X_lsi5, target)
X_test_lsi5 = svd5.transform(count_vect.transform(train_data[category_criteria]))
predictedSVD5 = clfSVD5.predict(X_test_lsi5)
print("Mean5 = ")
print(np.mean(predictedSVD5 == target))

Mean1 = 
0.792434371433
Mean2 = 
0.826349258112
Mean3 = 
0.847872166966
Mean4 = 
0.864992662645
Mean5 = 
0.886189466819


In [11]:
# 10-FOLD CROSS VALIDATION FOR RF CLASSIFICATION
def Cross_valid(clf):

    kf = KFold(n_splits=10)

    fold = 0

    for train_index, test_index in kf.split(train_data[category_criteria]):
        X_train_counts = count_vect.transform(train_data[category_criteria][train_index])
        X_test_counts = count_vect.transform(train_data[category_criteria][test_index].values.astype('U'))

        clf_cv = clf.fit(X_train_counts, target[train_index])
        yPred = clf_cv.predict(X_test_counts)
        fold += 1
        print ("Fold " + str(fold))
        print("(I) Precision / Recall / F-Measure / Support:")
        print(classification_report(yPred, target[test_index], target_names=categories))
        print("(II) Accuracy:")
        print(accuracy_score(yPred, target[test_index]))
        print("(III) AUC")
        print(auc(yPred, target[test_index], reorder=True))

In [14]:
# RANDOM FOREST CLASSIFICATION
RANDOM_STATE = 123

clf = RandomForestClassifier(warm_start=True, oob_score=True, max_features="sqrt", random_state=RANDOM_STATE)
clf.set_params(n_estimators=30)
clf.fit(X_train_tfidf, target)

predicted = clf.predict(X_test_tfidf)

for x in range(10):
    print(test_data['Title'][x] + "---->" + categories[predicted[x]])

Syria airstrikes: Jeremy Corbyn gives Labour MPs free vote ---->Politics
Apple faces damages bill after jury finds iPhone and iPad chip violates processor patent ---->Technology
'I'm sitting next to a weirdo on the bus' and other true meanings of emoji ---->Film
Black Friday 2015: UK retailers serve up alternative options ---->Business
A third of boardroom positions should be held by women, UK firms told ---->Business
Marks and Spencer customers hit by delays to online shopping orders ---->Business
Argos owner sees distorting effect of Black Friday on sales ---->Business
TalkTalk says hackers accessed fraction of data originally thought ---->Technology
Gameover Zeus returns: thieving malware rises a month after police action ---->Business
TalkTalk boss says cybersecurity 'head and shoulders' above competitors ---->Business


In [15]:
Cross_valid(clf)

  warn("Warm-start fitting without increasing n_estimators does not "


Fold 1
(I) Precision / Recall / F-Measure / Support:
             precision    recall  f1-score   support

   Politics       0.99      0.99      0.99       247
   Football       0.99      1.00      1.00       339
   Business       0.99      0.96      0.98       280
 Technology       0.97      1.00      0.98       146
       Film       0.98      0.98      0.98       215

avg / total       0.99      0.99      0.99      1227

(II) Accuracy:
0.986145069275
(III) AUC
8.0
Fold 2
(I) Precision / Recall / F-Measure / Support:
             precision    recall  f1-score   support

   Politics       0.99      0.98      0.98       269
   Football       1.00      0.99      1.00       311
   Business       0.97      0.97      0.97       278
 Technology       0.97      1.00      0.99       138
       Film       0.98      0.99      0.99       231

avg / total       0.98      0.98      0.98      1227

(II) Accuracy:
0.984515077425
(III) AUC
8.5
Fold 3
(I) Precision / Recall / F-Measure / Support:
     

In [18]:
# SUPPORT VECTOR MACHINES (SVM) CLASSIFICATION (I)
# with linear kernel and c=0.2
lnr = svm.SVC(C=0.2, kernel='linear')
lnr.fit(X_train_tfidf, target)

predicted = lnr.predict(X_test_tfidf)

print("SVM with linear kernel and c=0.2:")
for x in range(10):
    print(test_data['Title'][x] + "---->" + categories[predicted[x]])

SVM with linear kernel and c=0.2:
Syria airstrikes: Jeremy Corbyn gives Labour MPs free vote ---->Politics
Apple faces damages bill after jury finds iPhone and iPad chip violates processor patent ---->Technology
'I'm sitting next to a weirdo on the bus' and other true meanings of emoji ---->Film
Black Friday 2015: UK retailers serve up alternative options ---->Business
A third of boardroom positions should be held by women, UK firms told ---->Business
Marks and Spencer customers hit by delays to online shopping orders ---->Business
Argos owner sees distorting effect of Black Friday on sales ---->Business
TalkTalk says hackers accessed fraction of data originally thought ---->Business
Gameover Zeus returns: thieving malware rises a month after police action ---->Business
TalkTalk boss says cybersecurity 'head and shoulders' above competitors ---->Business


In [19]:
Cross_valid(lnr)

Fold 1
(I) Precision / Recall / F-Measure / Support:
             precision    recall  f1-score   support

   Politics       0.90      0.88      0.89       252
   Football       0.95      0.99      0.97       328
   Business       0.87      0.88      0.87       270
 Technology       0.85      0.91      0.88       141
       Film       0.95      0.86      0.90       236

avg / total       0.91      0.91      0.91      1227

(II) Accuracy:
0.908720456398
(III) AUC
8.0
Fold 2
(I) Precision / Recall / F-Measure / Support:
             precision    recall  f1-score   support

   Politics       0.90      0.92      0.91       262
   Football       0.97      0.96      0.97       311
   Business       0.89      0.89      0.89       277
 Technology       0.81      0.91      0.86       126
       Film       0.95      0.88      0.92       251

avg / total       0.92      0.92      0.92      1227

(II) Accuracy:
0.916055419723
(III) AUC
7.5
Fold 3
(I) Precision / Recall / F-Measure / Support:
     

In [20]:
# SUPPORT VECTOR MACHINES (SVM) CLASSIFICATION (II)
# with rbf kernel, c=5000000.0 and gamma=100000
rbf = svm.SVC(C=5000000.0, kernel='rbf', gamma=100000)
rbf.fit(X_train_tfidf, target)

predicted = rbf.predict(X_test_tfidf)

print("SVM with rbf kernel, c=5000000.0 and gamma=100000:")
for x in range(10):
    print(test_data['Title'][x] + "---->" + categories[predicted[x]])

SVM with rbf kernel, c=5000000.0 and gamma=100000:
Syria airstrikes: Jeremy Corbyn gives Labour MPs free vote ---->Football
Apple faces damages bill after jury finds iPhone and iPad chip violates processor patent ---->Football
'I'm sitting next to a weirdo on the bus' and other true meanings of emoji ---->Football
Black Friday 2015: UK retailers serve up alternative options ---->Football
A third of boardroom positions should be held by women, UK firms told ---->Football
Marks and Spencer customers hit by delays to online shopping orders ---->Football
Argos owner sees distorting effect of Black Friday on sales ---->Football
TalkTalk says hackers accessed fraction of data originally thought ---->Football
Gameover Zeus returns: thieving malware rises a month after police action ---->Football
TalkTalk boss says cybersecurity 'head and shoulders' above competitors ---->Football


In [21]:
Cross_valid(rbf)

Fold 1
(I) Precision / Recall / F-Measure / Support:
             precision    recall  f1-score   support

   Politics       0.01      1.00      0.02         2
   Football       1.00      0.28      0.44      1216
   Business       0.01      1.00      0.02         3
 Technology       0.02      1.00      0.04         3
       Film       0.01      1.00      0.03         3

avg / total       0.99      0.29      0.44      1227

(II) Accuracy:
0.287693561532
(III) AUC
9.0
Fold 2
(I) Precision / Recall / F-Measure / Support:
             precision    recall  f1-score   support

   Politics       0.01      1.00      0.01         2
   Football       1.00      0.26      0.41      1213
   Business       0.00      1.00      0.01         1
 Technology       0.02      1.00      0.04         3
       Film       0.03      1.00      0.07         8

avg / total       0.99      0.26      0.40      1227

(II) Accuracy:
0.264058679707
(III) AUC
9.0
Fold 3
(I) Precision / Recall / F-Measure / Support:
     

  'recall', 'true', average, warn_for)


Fold 4
(I) Precision / Recall / F-Measure / Support:
             precision    recall  f1-score   support

   Politics       0.00      0.00      0.00         0
   Football       1.00      0.24      0.38      1226
   Business       0.00      0.00      0.00         0
 Technology       0.00      0.00      0.00         0
       Film       0.00      1.00      0.01         1

avg / total       1.00      0.24      0.38      1227

(II) Accuracy:
0.237163814181
(III) AUC
12.0
Fold 5
(I) Precision / Recall / F-Measure / Support:
             precision    recall  f1-score   support

   Politics       0.00      0.00      0.00         0
   Football       1.00      0.26      0.41      1222
   Business       0.00      0.00      0.00         0
 Technology       0.00      0.00      0.00         0
       Film       0.02      1.00      0.05         5

avg / total       1.00      0.26      0.41      1227

(II) Accuracy:
0.259983700081
(III) AUC
12.0
Fold 6
(I) Precision / Recall / F-Measure / Support:
   

In [22]:
# NAIVE BAYES CLASSIFICATION (I)
# Multinomial Naive Bayes
mnb = MultinomialNB().fit(X_train_tfidf, target)

predicted = mnb.predict(X_test_tfidf)

print("Multinomial NB:")
for x in range(10):
    print(test_data['Title'][x] + "---->" + categories[predicted[x]]) 

Multinomial NB:
Syria airstrikes: Jeremy Corbyn gives Labour MPs free vote ---->Politics
Apple faces damages bill after jury finds iPhone and iPad chip violates processor patent ---->Technology
'I'm sitting next to a weirdo on the bus' and other true meanings of emoji ---->Politics
Black Friday 2015: UK retailers serve up alternative options ---->Business
A third of boardroom positions should be held by women, UK firms told ---->Business
Marks and Spencer customers hit by delays to online shopping orders ---->Business
Argos owner sees distorting effect of Black Friday on sales ---->Business
TalkTalk says hackers accessed fraction of data originally thought ---->Business
Gameover Zeus returns: thieving malware rises a month after police action ---->Business
TalkTalk boss says cybersecurity 'head and shoulders' above competitors ---->Business
Black Friday scuffles: 'I got a Dyson but I don’t even know if I want it' ---->Film
Shoppers flock to laptops rather than stores for holiday shoppi

Cable's red tape exemptions get muted welcome ---->Politics
Tory MP cautioned for domestic abuse faces disciplinary investigation ---->Politics
Elysium – review ---->Film
Zach Braff Kickstarter controversy deepens after financier bolsters budget ---->Business
8 Minutes Idle turns to Kickstarter to escape distribution limbo ---->Football
Oldboy – first look review ---->Film
Nationwide says London housing market is starting to slow down ---->Business
Tesco's empire: expansion checked in UK and beyond ---->Business
Spotify app Guilty Pledgers raises money for charity from cheesy tunes ---->Technology
Mary Portas Kinky Knickers makers in administration ---->Business
China's corruption crackdown hurts James Packer's gambling empire ---->Business
Eric Cantona: Fifa’s corruption divides Brazilian football from its roots ---->Football
England 3-0 Peru ---->Football
Gérard Depardieu ducks out of Edinburgh film festival appearance after drinking session in Skye pub ---->Film
Cannes 2014 review: 

In [23]:
Cross_validoss_valid(mnb)

Fold 1
(I) Precision / Recall / F-Measure / Support:
             precision    recall  f1-score   support

   Politics       0.92      0.89      0.90       254
   Football       0.96      0.97      0.96       337
   Business       0.92      0.87      0.89       288
 Technology       0.81      0.91      0.86       134
       Film       0.92      0.93      0.92       214

avg / total       0.92      0.92      0.92      1227

(II) Accuracy:
0.916055419723
(III) AUC
8.0
Fold 2
(I) Precision / Recall / F-Measure / Support:
             precision    recall  f1-score   support

   Politics       0.93      0.90      0.91       277
   Football       0.98      0.98      0.98       310
   Business       0.93      0.89      0.91       287
 Technology       0.82      0.94      0.87       124
       Film       0.92      0.94      0.93       229

avg / total       0.93      0.93      0.93      1227

(II) Accuracy:
0.927465362673
(III) AUC
8.5
Fold 3
(I) Precision / Recall / F-Measure / Support:
     

In [24]:
# NAIVE BAYES CLASSIFICATION (II)
# Bernoulli Naive Bayes
bnb = BernoulliNB().fit(X_train_tfidf, target)

predicted = bnb.predict(X_test_tfidf)

print("Bernoulli NB:")
for x in range(10):
    print(test_data['Title'][x] + "---->" + categories[predicted[x]])

Bernoulli NB:
Syria airstrikes: Jeremy Corbyn gives Labour MPs free vote ---->Politics
Apple faces damages bill after jury finds iPhone and iPad chip violates processor patent ---->Technology
'I'm sitting next to a weirdo on the bus' and other true meanings of emoji ---->Film
Black Friday 2015: UK retailers serve up alternative options ---->Business
A third of boardroom positions should be held by women, UK firms told ---->Business
Marks and Spencer customers hit by delays to online shopping orders ---->Business
Argos owner sees distorting effect of Black Friday on sales ---->Business
TalkTalk says hackers accessed fraction of data originally thought ---->Technology
Gameover Zeus returns: thieving malware rises a month after police action ---->Business
TalkTalk boss says cybersecurity 'head and shoulders' above competitors ---->Business
Black Friday scuffles: 'I got a Dyson but I don’t even know if I want it' ---->Film
Shoppers flock to laptops rather than stores for holiday shopping -

Ryanair profits endure bumpy landing with first drop in five years ---->Business
Sports Direct forced to advertise zero-hours contract terms ---->Business
AstraZeneca at risk from Pfizer tax avoidance plans, says company chief ---->Business
Burberry set for tough reception over chief executive's pay packet ---->Business
Morrisons shocks markets with huge fall in sales ---->Business
Tesco to be investigated by FCA over accounting scandal ---->Business
Benedict Cumberbatch in line to play Julian Assange in biopic ---->Film
AbbVie-Shire takeover at risk following US calls to halt tax inversion ---->Business
Sainsbury's sales fall again amid sluggish retail environment ---->Business
Samsung plans to sue Dyson over 'copycat' allegations on vacuum cleaner ---->Business
Corporate inversion - moving the head office for tax purposes ---->Business
Sainsbury’s - what the analysts say ---->Business
Dyson accuses Samsung of vacuum cleaner 'rip-off' ---->Business
Morrisons' sales slump by more than 

Miliband launches Labour election campaign with promise of 4m chats ---->Politics
Cameron hints at early referendum on Britain’s EU membership ---->Politics
Invictus ---->Football
How Bollywood is starting to deal with India's caste system ---->Business
David Cameron arrives in Libya on surprise visit ---->Politics
David Cameron pledges UK support in Libya visit ---->Politics
How Candy Crush gets you hooked - six addictive tricks ---->Technology
Sick of freemium? Here are 20 paid iOS and Android games worth supporting ---->Technology
Youth unemployment pushes graduates to start up own businesses ---->Business
London retains crown as favourite city of world's ultra-rich ---->Business
Tories face calls to hand back £160k given by Russian for tennis match ---->Politics
Less Gulf, more golf: job cuts likely as oil price drop shreds profits ---->Business
Michael Heseltine: 'People are deeply frustrated after six years of being beaten up by austerity' ---->Politics
Mark Carney: the Bank of E

How Michel Gondry became cinema's most versatile director ---->Film
Wake In Fright: the almost-forgotten film that kickstarted the Aussie new wave ---->Film
3D films set for popularity slide ---->Film
3D films lose lustre as home-grown hits win cinema box-office battle ---->Film
The life aquatic: Wes Anderson to tour on Queen Mary 2 ---->Film
The Grand Budapest Hotel review – Wes Anderson's new film is a 'deeply pleasurable immersion' ---->Film
The Wolverine claws back Hugh Jackman for another sequel ---->Film
Deezer Elite high-resolution streaming service goes global with Sonos ---->Business
X-Men supervillain Apocalypse to hit cinemas in 2016 ---->Film
The Master – review ---->Film
SNP avalanche sweeps aside Douglas Alexander and Jim Murphy ---->Politics
Ed Miliband to summon up ghosts of Labour's past to try to avoid SNP rout ---->Politics
Election 2015: Top Labour figures question Miliband's SNP stance ---->Politics
General election 2010: Nick Clegg voices regret at Lib Dem losses 

In [25]:
Cross_valid(bnb)

Fold 1
(I) Precision / Recall / F-Measure / Support:
             precision    recall  f1-score   support

   Politics       0.92      0.89      0.91       255
   Football       0.96      0.95      0.96       348
   Business       0.92      0.84      0.88       301
 Technology       0.66      0.97      0.79       103
       Film       0.92      0.90      0.91       220

avg / total       0.91      0.90      0.90      1227

(II) Accuracy:
0.902200488998
(III) AUC
7.5
Fold 2
(I) Precision / Recall / F-Measure / Support:
             precision    recall  f1-score   support

   Politics       0.91      0.89      0.90       273
   Football       0.98      0.94      0.96       324
   Business       0.93      0.85      0.89       303
 Technology       0.63      0.96      0.76        94
       Film       0.90      0.90      0.90       233

avg / total       0.91      0.90      0.90      1227

(II) Accuracy:
0.900570497148
(III) AUC
9.0
Fold 3
(I) Precision / Recall / F-Measure / Support:
     