In [1]:
import os
import json
import numpy as np
from datetime import datetime
from collections import defaultdict
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk import pos_tag
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [None]:
def _load_split(path):
    """Read one JSON-lines feature file and return its parallel columns.

    Each non-empty line of ``path`` is parsed as a JSON object and the values
    under the keys ``'nouns'``, ``'categories'`` and ``'top_level_categories'``
    are collected in file order.

    Returns:
        A ``(X, specific_Y, top_Y)`` tuple of three equally long lists.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if a non-empty line is not valid JSON.
        KeyError: if a row lacks one of the three expected keys.
    """
    X, specific_Y, top_Y = [], [], []
    with open(path, 'r') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which json.loads would otherwise reject.
            if not line.strip():
                continue
            row = json.loads(line)

            X.append(row['nouns'])
            specific_Y.append(row['categories'])
            top_Y.append(row['top_level_categories'])
    return X, specific_Y, top_Y


FEATURE_DIR = os.path.join('..', 'features', 'nouns')

# 2016 is the training split, 2017 the held-out test split.
train_X, train_specific_Y, train_top_Y = _load_split(
    os.path.join(FEATURE_DIR, '2016.json'))
test_X, test_specific_Y, test_top_Y = _load_split(
    os.path.join(FEATURE_DIR, '2017.json'))

# Label vocabularies used by later cells. They are built from the training
# split only: a label that never occurs in training has no positive example
# to learn from, so it cannot get a classifier of its own.
# NOTE(review): assumes each row's 'categories' / 'top_level_categories'
# value is a list of hashable labels -- confirm against the feature files.
categories = set()
top_level_categories = set()
for labels in train_specific_Y:
    categories.update(labels)
for labels in train_top_Y:
    top_level_categories.update(labels)

In [None]:
# Turn the per-document label lists into 0/1 indicator matrices with one
# column per category (column i corresponds to mlb.classes_[i]).
#
# * `classes` is passed by keyword (newer scikit-learn releases no longer
#   accept it positionally) and sorted so the column order is reproducible
#   across runs; if `categories` is empty the binarizer falls back to
#   inferring the classes from the training labels.
#   NOTE(review): sorting assumes the labels are mutually comparable
#   (e.g. all strings) -- confirm against the feature files.
# * The binarizer is fitted on the training labels only; the test labels are
#   merely transformed so both matrices share the same column layout.
mlb = MultiLabelBinarizer(classes=sorted(categories) or None)
bin_train_Y = mlb.fit_transform(train_specific_Y)
bin_test_Y = mlb.transform(test_specific_Y)

In [None]:
### One vs rest classifier
### Features: TFIDF (vocabulary capped at 10000 terms)
### One binary Naive Bayes model is fitted and scored per category column.

NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(max_features = 10000)),
                ('clf', OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ])

nb_predictions = []
# Iterate over the binarizer's own classes rather than a separately kept
# collection, so the index always matches a real column of the indicator
# matrices and every column gets evaluated.
for i, category in enumerate(mlb.classes_):
    print('... Processing {}'.format(category))
    y_train = bin_train_Y[:, i]
    y_test = bin_test_Y[:, i]

    # Re-fit the whole pipeline (TF-IDF + classifier) on this category's
    # binary target.
    NB_pipeline.fit(train_X, y_train)

    # Predict on the held-out split and keep the result; nb_predictions[i]
    # lines up with mlb.classes_[i].
    prediction = NB_pipeline.predict(test_X)
    nb_predictions.append(prediction)

    print('Instances: {}'.format(np.sum(y_test)))
    print('Test accuracy is {}'.format(accuracy_score(y_test, prediction)))
    print('Precision is {}'.format(precision_score(y_test, prediction)))
    print('Recall is {}'.format(recall_score(y_test, prediction)))

In [None]:
# Per-category Naive Bayes where the TF-IDF vocabulary (capped at 5000 terms)
# is learned only from the training documents that carry the category.
nb_predictions = []

# Converted once, outside the loop, so a boolean mask can select documents.
train_docs = np.array(train_X)

# Iterate over the binarizer's own classes so the index always matches a real
# column of the indicator matrices.
for i, category in enumerate(mlb.classes_):
    print('... Processing {}'.format(category))
    y_train = bin_train_Y[:, i]
    y_test = bin_test_Y[:, i]

    positive_docs = train_docs[y_train == 1]
    if len(positive_docs) == 0:
        # There is nothing to build a vocabulary from; fitting the vectorizer
        # on an empty corpus would raise. Record an all-negative prediction so
        # nb_predictions stays aligned with mlb.classes_.
        print('No positive training documents; skipping')
        nb_predictions.append(np.zeros(len(test_X), dtype=int))
        continue

    # Only the fitted vocabulary / idf weights are needed here, not the
    # transformed matrix of the positive documents.
    vectorizer = TfidfVectorizer(max_features = 5000)
    vectorizer.fit(positive_docs)

    # The classifier itself is trained on ALL training documents, projected
    # onto the category-specific vocabulary.
    clf = MultinomialNB(fit_prior=True, class_prior=None)
    clf.fit(vectorizer.transform(train_X), y_train)

    # Predict on the held-out split and report the per-category metrics.
    prediction = clf.predict(vectorizer.transform(test_X))
    nb_predictions.append(prediction)

    print('Instances: {}'.format(np.sum(y_test)))
    print('Test accuracy is {}'.format(accuracy_score(y_test, prediction)))
    print('Precision is {}'.format(precision_score(y_test, prediction)))
    print('Recall is {}'.format(recall_score(y_test, prediction)))