In [1]:
from sklearn.feature_extraction.text import *
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import sklearn.naive_bayes as naive_bayes
import json
import numpy as np
import nltk.corpus
import nltk
import string

In [2]:
stop_words = nltk.corpus.stopwords.words('english') + list(string.punctuation)
def clean_article(s):
    return [i for i in nltk.word_tokenize(s) if i not in stop_words]

def load_data():
    data = json.load(open('stories.json'))
    for dataset in ['train', 'test', 'dev']:
        for idx, article in enumerate(data[dataset]):
            data[dataset][idx]['Content'] = clean_article(article['Content'])
    return data

In [3]:
data = load_data()

In [4]:
# vectorize input data
corpus = [' '.join(article['Content']) for article in data['train']]
vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', ngram_range=(1,2))

['contract',
 'Bovine',
 'Spongiform',
 'Encephalopathy',
 '(',
 'BSE',
 ')',
 '--',
 'mad',
 'cow',
 'disease',
 '.']

In [12]:
inv_map = {v: k for k, v in vectorizer.vocabulary_.items()}

AttributeError: 'TfidfVectorizer' object has no attribute 'vocabulary_'

In [6]:
vectorizer.fit_transform(['world trump donald'])[0, 5130]


ValueError: max_df corresponds to < documents than min_df

# K-NN classifier

In [7]:
def get_all_classes(data, t):
    all_classes = set()
    for typename in t:
        articles = data[typename]
        for article in articles:
            all_classes = all_classes.union(set(article['Topics']))
    return all_classes

def get_classifier(classifier, vectorizer, data, t, topic_name):
    articles = []
    for type_name in t:
        articles = articles + data[type_name]
    
    corpus = [' '.join(article['Content']) for article in articles]
    mat_corpus = vectorizer.fit_transform(corpus)
    print(mat_corpus)
    
    y = np.asarray([topic_name in article['Topics'] for article in articles]).astype(int)
    
    classifier.fit(mat_corpus, y)
    return classifier

def get_result(classifier, data, t, topic_name):
    articles = []
    for type_name in t:
        articles = articles + data[type_name]
    
    corpus = [' '.join(article['Content']) for article in articles]
    mat_corpus = vectorizer.transform(corpus)
    y = np.asarray([topic_name in article['Topics'] for article in articles]).astype(int)
    
    return classifier.predict(mat_corpus), y
    
all_classes = get_all_classes(data, ['train'])

In [8]:
for class_name in sorted(all_classes):
    classifier = get_classifier(KNeighborsClassifier(n_neighbors=9), vectorizer, data, ['train', 'dev'], class_name)
    predict_result = get_result(classifier, data, ['test'], class_name)
    precision = 1 - np.sum(np.abs(predict_result[0] - predict_result[1])) / float(len(predict_result[0]))
    result = [predict_result[0][i] == 1 and predict_result[1][i] == 1 for i in range(len(predict_result[0]))]
    recall = (np.sum(result) + 1) / float(np.sum(predict_result[1]) + 1)
    f1 = 2. / (1. / precision + 1. / recall)
    print(class_name, precision, recall, f1)
    

  (0, 6437)	0.0676953209106179
  (0, 2797)	0.05086281087377959
  (0, 4790)	0.08774284100021455
  (0, 5083)	0.13019531656207167
  (0, 6602)	0.05834429295270955
  (0, 2077)	0.10030432083204595
  (0, 4557)	0.13749100836836897
  (0, 2813)	0.15916138250367806
  (0, 4860)	0.1566593942130152
  (0, 6667)	0.11854145776717946
  (0, 5265)	0.11724064654737718
  (0, 2314)	0.2065557539261549
  (0, 2384)	0.30191561307632186
  (0, 4234)	0.1744765233230123
  (0, 4417)	0.06597367718905846
  (0, 1803)	0.1069579464758946
  (0, 1804)	0.14386440730695113
  (0, 5775)	0.06514753707648614
  (0, 6144)	0.0496872586641892
  (0, 2321)	0.096675246404412
  (0, 6867)	0.08814222340842616
  (0, 5273)	0.06862699302419215
  (0, 6160)	0.10345006566403972
  (0, 6264)	0.07266091484417453
  (0, 5587)	0.02845329722268384
  :	:
  (747, 4473)	0.13546285948505934
  (747, 7146)	0.08000654700335033
  (747, 1588)	0.07127594874813836
  (747, 4534)	0.08000654700335033
  (747, 7218)	0.07240482833196343
  (747, 5772)	0.0764621425791009

  (0, 6437)	0.0676953209106179
  (0, 2797)	0.05086281087377959
  (0, 4790)	0.08774284100021455
  (0, 5083)	0.13019531656207167
  (0, 6602)	0.05834429295270955
  (0, 2077)	0.10030432083204595
  (0, 4557)	0.13749100836836897
  (0, 2813)	0.15916138250367806
  (0, 4860)	0.1566593942130152
  (0, 6667)	0.11854145776717946
  (0, 5265)	0.11724064654737718
  (0, 2314)	0.2065557539261549
  (0, 2384)	0.30191561307632186
  (0, 4234)	0.1744765233230123
  (0, 4417)	0.06597367718905846
  (0, 1803)	0.1069579464758946
  (0, 1804)	0.14386440730695113
  (0, 5775)	0.06514753707648614
  (0, 6144)	0.0496872586641892
  (0, 2321)	0.096675246404412
  (0, 6867)	0.08814222340842616
  (0, 5273)	0.06862699302419215
  (0, 6160)	0.10345006566403972
  (0, 6264)	0.07266091484417453
  (0, 5587)	0.02845329722268384
  :	:
  (747, 4473)	0.13546285948505934
  (747, 7146)	0.08000654700335033
  (747, 1588)	0.07127594874813836
  (747, 4534)	0.08000654700335033
  (747, 7218)	0.07240482833196343
  (747, 5772)	0.0764621425791009

  (0, 6437)	0.0676953209106179
  (0, 2797)	0.05086281087377959
  (0, 4790)	0.08774284100021455
  (0, 5083)	0.13019531656207167
  (0, 6602)	0.05834429295270955
  (0, 2077)	0.10030432083204595
  (0, 4557)	0.13749100836836897
  (0, 2813)	0.15916138250367806
  (0, 4860)	0.1566593942130152
  (0, 6667)	0.11854145776717946
  (0, 5265)	0.11724064654737718
  (0, 2314)	0.2065557539261549
  (0, 2384)	0.30191561307632186
  (0, 4234)	0.1744765233230123
  (0, 4417)	0.06597367718905846
  (0, 1803)	0.1069579464758946
  (0, 1804)	0.14386440730695113
  (0, 5775)	0.06514753707648614
  (0, 6144)	0.0496872586641892
  (0, 2321)	0.096675246404412
  (0, 6867)	0.08814222340842616
  (0, 5273)	0.06862699302419215
  (0, 6160)	0.10345006566403972
  (0, 6264)	0.07266091484417453
  (0, 5587)	0.02845329722268384
  :	:
  (747, 4473)	0.13546285948505934
  (747, 7146)	0.08000654700335033
  (747, 1588)	0.07127594874813836
  (747, 4534)	0.08000654700335033
  (747, 7218)	0.07240482833196343
  (747, 5772)	0.0764621425791009

  (0, 6437)	0.0676953209106179
  (0, 2797)	0.05086281087377959
  (0, 4790)	0.08774284100021455
  (0, 5083)	0.13019531656207167
  (0, 6602)	0.05834429295270955
  (0, 2077)	0.10030432083204595
  (0, 4557)	0.13749100836836897
  (0, 2813)	0.15916138250367806
  (0, 4860)	0.1566593942130152
  (0, 6667)	0.11854145776717946
  (0, 5265)	0.11724064654737718
  (0, 2314)	0.2065557539261549
  (0, 2384)	0.30191561307632186
  (0, 4234)	0.1744765233230123
  (0, 4417)	0.06597367718905846
  (0, 1803)	0.1069579464758946
  (0, 1804)	0.14386440730695113
  (0, 5775)	0.06514753707648614
  (0, 6144)	0.0496872586641892
  (0, 2321)	0.096675246404412
  (0, 6867)	0.08814222340842616
  (0, 5273)	0.06862699302419215
  (0, 6160)	0.10345006566403972
  (0, 6264)	0.07266091484417453
  (0, 5587)	0.02845329722268384
  :	:
  (747, 4473)	0.13546285948505934
  (747, 7146)	0.08000654700335033
  (747, 1588)	0.07127594874813836
  (747, 4534)	0.08000654700335033
  (747, 7218)	0.07240482833196343
  (747, 5772)	0.0764621425791009

KeyboardInterrupt: 

In [173]:
for class_name in sorted(all_classes):
    classifier = get_classifier(KNeighborsClassifier(n_neighbors=9, metric='l2'), vectorizer, data, ['train', 'dev'], class_name)
    predict_result = get_result(classifier, data, ['test'], class_name)
    precision = 1 - np.sum(np.abs(predict_result[0] - predict_result[1])) / float(len(predict_result[0]))
    result = [predict_result[0][i] == 1 and predict_result[1][i] == 1 for i in range(len(predict_result[0]))]
    recall = (np.sum(result) + 1) / float(np.sum(predict_result[1]) + 1)
    f1 = 2. / (1. / precision + 1. / recall)
    print(class_name, precision, recall, f1)
        

Accidents and Natural Disasters 0.9840425531914894 0.9444444444444444 0.9638369598528961
Arts and Entertainment 0.9946808510638298 0.5 0.6654804270462633
Business 0.9361702127659575 0.2 0.3295880149812734
Climate and Environment 0.9893617021276596 0.5 0.6642857142857143
Context 0.9627659574468085 0.25 0.3969298245614035
Economy 0.9308510638297872 0.5 0.6505576208178439
Elections 0.9308510638297872 0.45 0.6067026194144838
France 0.9946808510638298 0.5 0.6654804270462633
Health and Medicine 0.9680851063829787 0.4 0.5660964230171073
Immigration 0.9627659574468085 0.6666666666666666 0.7878128400435256
Islamic State 0.973404255319149 0.16666666666666666 0.2846034214618973
Law &amp; Justice 0.8085106382978724 0.631578947368421 0.7091757387247278
North Korea 0.9840425531914894 0.8333333333333334 0.9024390243902439
Politics 0.8138297872340425 0.625 0.7070240295748613
Religion 0.973404255319149 0.16666666666666666 0.2846034214618973
Russia 0.9468085106382979 0.7391304347826086 0.830178326474622

# Naive Bayes

In [174]:
for class_name in sorted(all_classes):
    classifier = get_classifier(naive_bayes.BernoulliNB(alpha=1), vectorizer, data, ['train', 'dev'], class_name)
    predict_result = get_result(classifier, data, ['test'], class_name)
    precision = 1 - np.sum(np.abs(predict_result[0] - predict_result[1])) / float(len(predict_result[0]))
    result = [predict_result[0][i] == 1 and predict_result[1][i] == 1 for i in range(len(predict_result[0]))]
    recall = (np.sum(result) + 1) / float(np.sum(predict_result[1]) + 1)
    f1 = 2. / (1. / precision + 1. / recall)
    print(class_name, precision, recall, f1)
        

Accidents and Natural Disasters 0.9574468085106383 0.6111111111111112 0.746043707611153
Arts and Entertainment 0.9946808510638298 0.5 0.6654804270462633
Business 0.9202127659574468 0.5333333333333333 0.675286655281776
Climate and Environment 0.9680851063829787 0.25 0.39737991266375544
Context 0.9521276595744681 0.5 0.6556776556776557
Economy 0.9095744680851063 0.35 0.5054898648648648
Elections 0.9202127659574468 0.45 0.6044254658385093
France 0.9946808510638298 0.5 0.6654804270462633
Health and Medicine 0.9361702127659575 0.1 0.18069815195071867
Immigration 0.9414893617021276 0.5 0.6531365313653137
Islamic State 0.973404255319149 0.16666666666666666 0.2846034214618973
Law &amp; Justice 0.8297872340425532 0.7192982456140351 0.7706024096385543
North Korea 0.9787234042553191 0.6666666666666666 0.793103448275862
Politics 0.7925531914893618 0.6875 0.7362982929020665
Religion 0.973404255319149 0.16666666666666666 0.2846034214618973
Russia 0.9414893617021276 0.6956521739130435 0.8001130103121

# SVM

In [23]:
svm_classifier = SVC(kernel='poly', degree=4)
for class_name in sorted(all_classes):
    classifier = get_classifier(svm_classifier, vectorizer, data, ['train', 'dev'], class_name)
    predict_result = get_result(classifier, data, ['test'], class_name)
    precision = 1 - np.sum(np.abs(predict_result[0] - predict_result[1])) / float(len(predict_result[0]))
    result = [predict_result[0][i] > 0.5 and predict_result[1][i] == 1 for i in range(len(predict_result[0]))]
    recall = (np.sum(result) + 1) / float(np.sum(predict_result[1]) + 1)
    f1 = 2. / (1. / precision + 1. / recall)
    print(class_name, precision, recall, f1)

<class 'sklearn.svm.classes.SVC'>
(748, 7286)




Accidents and Natural Disasters 0.9680851063829787 0.14285714285714285 0.24897400820793433
(748, 7286)
Arts and Entertainment 0.9946808510638298 0.5 0.6654804270462633
(748, 7286)
Business 0.8882978723404256 0.045454545454545456 0.08648368720870016
(748, 7286)
Climate and Environment 0.9893617021276596 0.3333333333333333 0.49865951742627346
(748, 7286)
Context 0.925531914893617 0.06666666666666667 0.12437455325232309
(748, 7286)
Economy 0.9202127659574468 0.0625 0.11705006765899864
(748, 7286)
Elections 0.8563829787234043 0.03571428571428571 0.06856899488926746
(748, 7286)
France 0.9840425531914894 0.25 0.3987068965517241
(748, 7286)
Health and Medicine 0.9574468085106383 0.1111111111111111 0.19911504424778761
(748, 7286)
Immigration 0.9202127659574468 0.0625 0.11705006765899864
(748, 7286)
Islamic State 0.9574468085106383 0.1111111111111111 0.19911504424778761
(748, 7286)
Law &amp; Justice 0.6702127659574468 0.015873015873015872 0.031011567807039134
(748, 7286)
North Korea 0.968085106

In [24]:
SVC

sklearn.svm.classes.SVC

In [176]:
svm_classifier = SVC(kernel='rbf')
for class_name in sorted(all_classes):
    classifier = get_classifier(svm_classifier, vectorizer, data, ['train', 'dev'], class_name)
    predict_result = get_result(classifier, data, ['test'], class_name)
    precision = 1 - np.sum(np.abs(predict_result[0] - predict_result[1])) / float(len(predict_result[0]))
    result = [predict_result[0][i] == 1 and predict_result[1][i] == 1 for i in range(len(predict_result[0]))]
    recall = (np.sum(result) + 1) / float(np.sum(predict_result[1]) + 1)
    f1 = 2. / (1. / precision + 1. / recall)
    print(class_name, precision, recall, f1)
    

Accidents and Natural Disasters 0.9095744680851063 0.05555555555555555 0.10471524800979792
Arts and Entertainment 0.9946808510638298 0.5 0.6654804270462633
Business 0.925531914893617 0.06666666666666667 0.12437455325232309
Climate and Environment 0.9840425531914894 0.25 0.3987068965517241
Context 0.9627659574468085 0.125 0.2212713936430318
Economy 0.898936170212766 0.05 0.09473094170403587
Elections 0.898936170212766 0.05 0.09473094170403587
France 0.9946808510638298 0.5 0.6654804270462633
Health and Medicine 0.9521276595744681 0.1 0.18099089989888775
Immigration 0.9095744680851063 0.05555555555555555 0.10471524800979792
Islamic State 0.973404255319149 0.16666666666666666 0.2846034214618973
Law &amp; Justice 0.7021276595744681 0.017543859649122806 0.03423236514522822
North Korea 0.973404255319149 0.16666666666666666 0.2846034214618973
Politics 0.6648936170212766 0.015625 0.03053248656570591
Religion 0.973404255319149 0.16666666666666666 0.2846034214618973
Russia 0.8829787234042553 0.04