In [1]:
import os
import json
import itertools
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

In [2]:
# function code from: https://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : numpy.ndarray
        2-D confusion matrix. Rows are drawn along the 'True label' axis and
        columns along the 'Predicted label' axis.
    classes : sequence
        Tick labels, applied to both axes in order.
    normalize : bool, default False
        When True, every row is divided by its own sum so each cell shows a
        fraction of that row. Rows that sum to zero are left as zeros.
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    None
        The function only draws; the caller creates the figure beforehand and
        calls ``plt.show()`` afterwards.
    """
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A class with no true samples yields an all-zero row. Dividing it by
        # its zero total would put NaN in the matrix (and in `thresh` below),
        # so those rows are kept at 0 instead.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Fractions get two decimals; raw counts are printed as integers.
    fmt = '.2f' if normalize else 'd'
    # Cells darker than half of the maximum get white text so the number stays readable.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

In [3]:
#compute accuracy, precision, and recall
def evaluate_predictions(y_true, y_pred):
    """
    Print example-based multi-label metrics for a set of predictions.

    For every sample the true and predicted label collections are compared as
    sets, and four scores are averaged over all samples:

    * Exact match: 1 when both sets are identical, else 0.
    * Accuracy: |true & predicted| / |true | predicted|. When both sets are
      empty the prediction is perfect, so the sample scores 1.
    * Precision: |true & predicted| / |predicted|. A sample with no predicted
      labels contributes 0.
    * Recall: |true & predicted| / |true|. A sample with no true labels
      contributes 0.

    Parameters
    ----------
    y_true : sequence of iterables
        True labels, one iterable of labels per sample.
    y_pred : sequence of iterables
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    None
        The four averages are printed with two decimals.

    Raises
    ------
    ValueError
        If ``y_true`` is empty or the two sequences differ in length.
    """
    n_samples = len(y_true)
    if n_samples == 0:
        raise ValueError('evaluate_predictions needs at least one sample')
    if len(y_pred) != n_samples:
        raise ValueError('y_true and y_pred must have the same length '
                         '({} != {})'.format(n_samples, len(y_pred)))

    exact_match = 0
    accuracy = 0
    precision = 0
    recall = 0

    for true_labels, predicted_labels in zip(y_true, y_pred):
        y = set(true_labels)
        z = set(predicted_labels)
        n_common = len(y & z)
        n_union = len(y | z)

        if y == z:
            exact_match += 1

        # An empty union means both sets are empty, i.e. a perfect prediction;
        # dividing 0 by 0 here would raise ZeroDivisionError.
        accuracy += n_common / n_union if n_union else 1.0

        if z:
            precision += n_common / len(z)
        # Guarded like precision: a sample without true labels adds nothing.
        if y:
            recall += n_common / len(y)

    exact_match /= n_samples
    accuracy /= n_samples
    precision /= n_samples
    recall /= n_samples

    print('Exact match: {0:.2f}'.format(exact_match))
    print('Accuracy: {0:.2f}'.format(accuracy))
    print('Precision: {0:.2f}'.format(precision))
    print('Recall: {0:.2f}'.format(recall))

In [4]:
# Read training and test data

def _load_split(path, categories, top_level_categories):
    """
    Read one JSON-lines feature file.

    Every non-blank line is parsed as a JSON object with the keys 'nouns',
    'categories' and 'top_level_categories'. The labels found are also added
    to the two sets passed in, so the caller can accumulate the label
    vocabulary across several files.

    Returns three parallel lists: (nouns, specific categories, top-level categories).
    """
    X, specific_Y, top_Y = [], [], []
    # Explicit UTF-8 so that reading does not depend on the platform's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # tolerate blank lines (e.g. a trailing empty line)
                continue
            row = json.loads(line)

            X.append(row['nouns'])
            specific_Y.append(row['categories'])
            top_Y.append(row['top_level_categories'])

            categories.update(row['categories'])
            top_level_categories.update(row['top_level_categories'])
    return X, specific_Y, top_Y


# Label vocabularies, collected over BOTH the training and the test file.
categories = set()
top_level_categories = set()

train_X, train_specific_Y, train_top_Y = _load_split(
    os.path.join('..', 'features', 'nouns', '2016.json'),
    categories, top_level_categories)

test_X, test_specific_Y, test_top_Y = _load_split(
    os.path.join('..', 'features', 'nouns', '2017.json'),
    categories, top_level_categories)

In [5]:
# get sorted list of categories
top_level_categories = sorted(top_level_categories)
specific_categories = sorted(categories)

# binarize training and test labels
# `classes` is passed by keyword: current scikit-learn releases accept it as a
# keyword-only argument. Fixing the classes keeps the column order identical
# (the sorted category list) for the training and the test matrix.
mlb_top = MultiLabelBinarizer(classes=top_level_categories)
bin_train_top_Y = mlb_top.fit_transform(train_top_Y)
# The test labels are only encoded with the already fitted binarizer, not used to re-fit it.
bin_test_top_Y = mlb_top.transform(test_top_Y)

mlb_specific = MultiLabelBinarizer(classes=specific_categories)
bin_train_specific_Y = mlb_specific.fit_transform(train_specific_Y)
bin_test_specific_Y = mlb_specific.transform(test_specific_Y)

In [6]:
### Predict top-level categories
### One vs rest classifier
### Features: TFIDF (top 10000 nouns from the WHOLE training set)

# The 'sag' solver shuffles the training data, so without a seed the reported
# metrics can differ slightly between runs. A fixed random_state makes
# "Restart & Run All" reproduce the same numbers.
pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(max_features = 10000)),
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag', random_state=0), n_jobs=1)),
            ])

# One binary prediction vector per top-level category, in the column order of mlb_top.
predictions_all_tfidf = []
for i, category in enumerate(mlb_top.classes_):
    print('{}'.format(category.upper()))

    # Binary targets of the current category only.
    train_labels = bin_train_top_Y[:, i]
    test_labels = bin_test_top_Y[:, i]

    # Re-fitting the pipeline replaces the previous category's vectorizer and model.
    pipeline.fit(train_X, train_labels)

    prediction = pipeline.predict(test_X)
    predictions_all_tfidf.append(prediction)

    print('Instances: {}'.format(np.sum(test_labels)))
    print('Test accuracy is {0:.2f}'.format(accuracy_score(test_labels, prediction)))
    print('Precision is {0:.2f}'.format(precision_score(test_labels, prediction)))
    print('Recall is {0:.2f}\n'.format(recall_score(test_labels, prediction)))

ASTRO-PH
Instances: 15127
Test accuracy is 0.98
Precision is 0.94
Recall is 0.86

COND-MAT
Instances: 17576
Test accuracy is 0.95
Precision is 0.88
Recall is 0.78

CS
Instances: 30686
Test accuracy is 0.94
Precision is 0.90
Recall is 0.85

ECON
Instances: 109
Test accuracy is 1.00
Precision is 0.00
Recall is 0.00

EESS


  'precision', 'predicted', average, warn_for)


Instances: 698
Test accuracy is 0.99
Precision is 0.00
Recall is 0.00

GR-QC


  'precision', 'predicted', average, warn_for)


Instances: 4589
Test accuracy is 0.98
Precision is 0.82
Recall is 0.57

HEP-EX
Instances: 2514
Test accuracy is 0.98
Precision is 0.70
Recall is 0.46

HEP-LAT
Instances: 1044
Test accuracy is 0.99
Precision is 0.86
Recall is 0.40

HEP-PH
Instances: 6539
Test accuracy is 0.98
Precision is 0.83
Recall is 0.67

HEP-TH
Instances: 6209
Test accuracy is 0.97
Precision is 0.81
Recall is 0.55

MATH
Instances: 38456
Test accuracy is 0.92
Precision is 0.89
Recall is 0.86

MATH-PH
Instances: 3765
Test accuracy is 0.97
Precision is 0.60
Recall is 0.10

NLIN
Instances: 1834
Test accuracy is 0.99
Precision is 0.80
Recall is 0.18

NUCL-EX
Instances: 1216
Test accuracy is 0.99
Precision is 0.67
Recall is 0.30

NUCL-TH
Instances: 2301
Test accuracy is 0.99
Precision is 0.76
Recall is 0.42

PHYSICS
Instances: 14548
Test accuracy is 0.92
Precision is 0.77
Recall is 0.46

Q-BIO
Instances: 2482
Test accuracy is 0.99
Precision is 0.76
Recall is 0.37

Q-FIN
Instances: 895
Test accuracy is 0.99
Precision is 0

In [7]:
# predictions_all_tfidf holds one binary prediction vector per category; stacking
# and transposing gives one row per test sample, the layout inverse_transform expects.
# The results get their own names instead of overwriting predictions_all_tfidf,
# so this cell can be re-run without first re-running the training cell.
binary_predictions_all_tfidf = np.transpose(np.asarray(predictions_all_tfidf))
predicted_labels_all_tfidf = mlb_top.inverse_transform(binary_predictions_all_tfidf)
evaluate_predictions(test_top_Y, predicted_labels_all_tfidf)

Exact match: 0.63
Accuracy: 0.74
Precision: 0.82
Recall: 0.77


In [8]:
### Predict top-level categories
### One vs rest classifier
### Features: TFIDF (top 10000 nouns from the POSITIVE training set)

# Converted once, outside the loop, so the boolean mask below can index it.
train_X_array = np.array(train_X)

# One binary prediction vector per top-level category, in the column order of mlb_top.
predictions_pos_tfidf = []
for i, category in enumerate(mlb_top.classes_):
    print('{}'.format(category.upper()))

    # Binary targets of the current category only.
    train_labels = bin_train_top_Y[:, i]
    test_labels = bin_test_top_Y[:, i]

    # Vocabulary and IDF weights are learned from the positive training
    # examples of this category only; the fitted vectorizer is then applied
    # to the complete training and test sets.
    vectorizer = TfidfVectorizer(max_features = 10000)
    vectorizer.fit(train_X_array[train_labels == 1])

    # Fixed seed: 'sag' shuffles the data, so an unseeded run is not exactly reproducible.
    clf = LogisticRegression(solver='sag', random_state=0)
    clf.fit(vectorizer.transform(train_X), train_labels)

    # compute the testing accuracy
    prediction = clf.predict(vectorizer.transform(test_X))
    predictions_pos_tfidf.append(prediction)

    print('Instances: {}'.format(np.sum(test_labels)))
    print('Test accuracy is {0:.2f}'.format(accuracy_score(test_labels, prediction)))
    print('Precision is {0:.2f}'.format(precision_score(test_labels, prediction)))
    print('Recall is {0:.2f}\n'.format(recall_score(test_labels, prediction)))

ASTRO-PH
Instances: 15127
Test accuracy is 0.98
Precision is 0.94
Recall is 0.85

COND-MAT
Instances: 17576
Test accuracy is 0.95
Precision is 0.88
Recall is 0.78

CS
Instances: 30686
Test accuracy is 0.94
Precision is 0.90
Recall is 0.85

ECON
Instances: 109
Test accuracy is 1.00
Precision is 0.00
Recall is 0.00

EESS


  'precision', 'predicted', average, warn_for)


Instances: 698
Test accuracy is 0.99
Precision is 0.00
Recall is 0.00

GR-QC
Instances: 4589
Test accuracy is 0.98
Precision is 0.83
Recall is 0.52

HEP-EX
Instances: 2514
Test accuracy is 0.98
Precision is 0.71
Recall is 0.43

HEP-LAT
Instances: 1044
Test accuracy is 0.99
Precision is 0.91
Recall is 0.30

HEP-PH
Instances: 6539
Test accuracy is 0.98
Precision is 0.85
Recall is 0.65

HEP-TH
Instances: 6209
Test accuracy is 0.97
Precision is 0.81
Recall is 0.52

MATH
Instances: 38456
Test accuracy is 0.92
Precision is 0.89
Recall is 0.86

MATH-PH
Instances: 3765
Test accuracy is 0.97
Precision is 0.59
Recall is 0.09

NLIN
Instances: 1834
Test accuracy is 0.99
Precision is 0.81
Recall is 0.14

NUCL-EX
Instances: 1216
Test accuracy is 0.99
Precision is 0.70
Recall is 0.25

NUCL-TH
Instances: 2301
Test accuracy is 0.99
Precision is 0.77
Recall is 0.37

PHYSICS
Instances: 14548
Test accuracy is 0.92
Precision is 0.78
Recall is 0.45

Q-BIO
Instances: 2482
Test accuracy is 0.98
Precision is 0

In [9]:
# predictions_pos_tfidf holds one binary prediction vector per category; stacking
# and transposing gives one row per test sample, the layout inverse_transform expects.
# The results get their own names instead of overwriting predictions_pos_tfidf,
# so this cell can be re-run without first re-running the training cell.
binary_predictions_pos_tfidf = np.transpose(np.asarray(predictions_pos_tfidf))
predicted_labels_pos_tfidf = mlb_top.inverse_transform(binary_predictions_pos_tfidf)
evaluate_predictions(test_top_Y, predicted_labels_pos_tfidf)

Exact match: 0.63
Accuracy: 0.73
Precision: 0.81
Recall: 0.76


In [10]:
# ### Predict specific categories
# ### One vs rest classifier
# ### Features: TFIDF (top 10000 nouns from the WHOLE training set)

# pipeline = Pipeline([
#                 ('tfidf', TfidfVectorizer(max_features = 10000)),
#                 ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1)),
#             ])

# predictions_all_tfidf = []
# for i in range(len(specific_categories)):
#     print('{}'.format(mlb_specific.classes_[i].upper()))
#     pipeline.fit(train_X, bin_train_specific_Y[: ,i])
    
#     prediction = pipeline.predict(test_X)
#     predictions_all_tfidf.append(prediction)
    
#     print('Instances: {}'.format(np.sum(bin_test_specific_Y[:,i])))
#     print('Test accuracy is {0:.2f}'.format(accuracy_score(bin_test_specific_Y[:,i], prediction)))
#     print('Precision is {0:.2f}'.format(precision_score(bin_test_specific_Y[:,i], prediction)))
#     print('Recall is {0:.2f}\n'.format(recall_score(bin_test_specific_Y[:,i], prediction)))

In [11]:
# predictions_all_tfidf = mlb_specific.inverse_transform(np.transpose(np.asarray(predictions_all_tfidf)))
# evaluate_predictions(test_specific_Y, predictions_all_tfidf)

In [12]:
# ### Predict specific categories
# ### One vs rest classifier
# ### Features: TFIDF (top 10000 nouns from the POSITIVE training set)

# predictions_pos_tfidf = []
# for i in range(len(specific_categories)):
#     print('{}'.format(mlb_specific.classes_[i].upper()))

#     vectorizer = TfidfVectorizer(max_features = 10000)
#     tfidf_matrix =  vectorizer.fit_transform(np.array(train_X)[bin_train_specific_Y[:,i] == 1])
    
#     clf = LogisticRegression(solver='sag')
#     clf.fit(vectorizer.transform(train_X), bin_train_specific_Y[:,i])
    
#     # compute the testing accuracy
#     prediction = clf.predict(vectorizer.transform(test_X))
#     predictions_pos_tfidf.append(prediction)
    
#     print('Instances: {}'.format(np.sum(bin_test_specific_Y[:,i])))
#     print('Test accuracy is {0:.2f}'.format(accuracy_score(bin_test_specific_Y[:,i], prediction)))
#     print('Precision is {0:.2f}'.format(precision_score(bin_test_specific_Y[:,i], prediction)))
#     print('Recall is {0:.2f}\n'.format(recall_score(bin_test_specific_Y[:,i], prediction)))

In [13]:
# predictions_pos_tfidf = mlb_specific.inverse_transform(np.transpose(np.asarray(predictions_pos_tfidf)))
# evaluate_predictions(test_specific_Y, predictions_pos_tfidf)

In [14]:
# true_predictions = np.sum(bin_test_top_Y + transpose_predictions_all_tfidf == 2)
# all_predictions = np.count_nonzero(transpose_predictions_all_tfidf)
# total_labels = np.count_nonzero(bin_test_top_Y)

# print('Exact match: {0:.2f}'.format((len(bin_test_top_Y) - np.count_nonzero(np.sum(bin_test_top_Y != transpose_predictions_all_tfidf, axis = 1))) / len(bin_test_top_Y)))
# print('Precision: {0:.2f}'.format(true_predictions/all_predictions))
# print('Recall: {0:.2f}'.format(true_predictions/total_labels))

In [15]:
# cnf_matrix = confusion_matrix(list(bin_test_top_Y.argmax(axis=1)), list(transpose_predictions_all_tfidf.argmax(axis=1)))

# plt.figure(figsize=(16,12))
# plot_confusion_matrix(cnf_matrix, classes=top_level_categories, normalize=True, title='Normalized confusion matrix')
# plt.show()