In [2]:
import os
import json
import numpy as np
from datetime import datetime
from collections import defaultdict
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk import pos_tag
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [3]:
noun_tags = ['NN', 'NNS', 'NNP', 'NNPS']
def is_noun(pos):
    if pos in noun_tags:
        return True
    else:
        return False

In [4]:
def extract_nouns(text):
    nouns = []
    
    sentences = sent_tokenize(text)
    for sentence in sentences:
        tokens = word_tokenize(sentence)
        sentence_nouns = [word.lower() for (word, pos) in pos_tag(tokens) if is_noun(pos)] 
        nouns += sentence_nouns
    return ' '.join(nouns)

In [5]:
def get_top_level_categories(categories):
    specific_categories = categories.split()
    top_level_categories = set([category.split('.')[0] for category in specific_categories])
    
    return top_level_categories

In [6]:
# Training set: 2016
# Test set: 2017

categories = set()

train_X = []
train_Y = []
with open(os.path.join('data-by-year', '2016.json'), 'r') as f:
    for line in f:
        row = json.loads(line)['arXiv']
        year = datetime.strptime(row['created'],"%Y-%m-%d").year
        
        row_categories = get_top_level_categories(row['categories'])
        categories = categories.union(row_categories)

        train_X.append(extract_nouns(row['title'] + '\n ' + row['abstract']))
        train_Y.append(row_categories)

test_X = []
test_Y = []    
with open(os.path.join('data-by-year', '2017.json'), 'r') as f:
    for line in f:
        row = json.loads(line)['arXiv']
        year = datetime.strptime(row['created'],"%Y-%m-%d").year
        
        row_categories = get_top_level_categories(row['categories'])
        categories = categories.union(row_categories)

        test_X.append(extract_nouns(row['title'] + '\n ' + row['abstract']))
        test_Y.append(row_categories)

print(len(train_X))
print(len(test_X))
print(len(train_Y))
print(len(test_Y))

113436
123781
113436
123781


In [7]:
mlb = MultiLabelBinarizer(list(categories))
bin_train_Y = mlb.fit_transform(train_Y)
bin_test_Y = mlb.fit_transform(test_Y)

In [9]:
NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(max_features = 2000)),
                ('clf', OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ])

nb_predictions = []
for i in range(len(categories)):
    print('... Processing {}'.format(mlb.classes_[i]))
    # train the model using X_dtm & y
    NB_pipeline.fit(train_X, bin_train_Y[:,i])
    
    # compute the testing accuracy
    prediction = NB_pipeline.predict(test_X)
    nb_predictions.append(prediction)
    
    print('Instances: {}'.format(np.sum(bin_test_Y[:,i])))
    print('Test accuracy is {}'.format(accuracy_score(bin_test_Y[:,i], prediction)))
    print('Precision is {}'.format(precision_score(bin_test_Y[:,i], prediction)))
    print('Recall is {}'.format(recall_score(bin_test_Y[:,i], prediction)))

... Processing gr-qc
Instances: 4589
Test accuracy is 0.9756263077532092
Precision is 0.7771509167842031
Recall is 0.48027892787099585
... Processing stat
Instances: 7538
Test accuracy is 0.9529006875045443
Precision is 0.6800927878532265
Recall is 0.4278323162642611
... Processing math
Instances: 38456
Test accuracy is 0.9004370622308755
Precision is 0.8508404489554804
Recall is 0.8239806532140628
... Processing math-ph
Instances: 3765
Test accuracy is 0.9695106680346741
Precision is 0.48632218844984804
Recall is 0.04249667994687915
... Processing q-fin
Instances: 895
Test accuracy is 0.9931815060469701
Precision is 0.7802197802197802
Recall is 0.07932960893854749
... Processing nlin
Instances: 1834
Test accuracy is 0.9854985821733545
Precision is 0.819672131147541
Recall is 0.027262813522355506
... Processing eess
Instances: 698
Test accuracy is 0.9943610085554325
Precision is 0.0
Recall is 0.0
... Processing hep-lat


  'precision', 'predicted', average, warn_for)


Instances: 1044
Test accuracy is 0.9927856456160477
Precision is 0.8132780082987552
Recall is 0.18773946360153257
... Processing econ
Instances: 109
Test accuracy is 0.9991194125108054
Precision is 0.0
Recall is 0.0
... Processing nucl-ex


  'precision', 'predicted', average, warn_for)


Instances: 1216
Test accuracy is 0.9894167925610554
Precision is 0.44835164835164837
Recall is 0.3355263157894737
... Processing astro-ph
Instances: 15127
Test accuracy is 0.9712152915229316
Precision is 0.9388947927736451
Recall is 0.8176770013882462
... Processing cond-mat
Instances: 17576
Test accuracy is 0.940653250498865
Precision is 0.8198874296435272
Recall is 0.7459035047792444
... Processing hep-ph
Instances: 6539
Test accuracy is 0.9701973647005598
Precision is 0.7372627372627373
Recall is 0.6771677626548402
... Processing quant-ph
Instances: 7020
Test accuracy is 0.9605755325938553
Precision is 0.8465025906735751
Recall is 0.37236467236467236
... Processing cs
Instances: 30686
Test accuracy is 0.9233000218127176
Precision is 0.8336902437488191
Recall is 0.8627061200547481
... Processing q-bio
Instances: 2482
Test accuracy is 0.9819843110008806
Precision is 0.7635983263598326
Recall is 0.14705882352941177
... Processing physics
Instances: 14548
Test accuracy is 0.903151533757

In [36]:
nb_predictions = []

for i in range(len(categories)):
    print('... Processing {}'.format(mlb.classes_[i]))

    vectorizer = TfidfVectorizer(max_features = 5000)
    tfidf_matrix =  vectorizer.fit_transform(np.array(train_X)[bin_train_Y[:,i] == 1])
    
    clf = MultinomialNB(fit_prior=True, class_prior=None)
    clf.fit(vectorizer.transform(train_X), bin_train_Y[:,i])
    
    # compute the testing accuracy
    prediction = clf.predict(vectorizer.transform(test_X))
    nb_predictions.append(prediction)
    
    print('Instances: {}'.format(np.sum(bin_test_Y[:,i])))
    print('Test accuracy is {}'.format(accuracy_score(bin_test_Y[:,i], prediction)))
    print('Precision is {}'.format(precision_score(bin_test_Y[:,i], prediction)))
    print('Recall is {}'.format(recall_score(bin_test_Y[:,i], prediction)))

... Processing gr-qc
Instances: 4589
Test accuracy is 0.9751738958321552
Precision is 0.7809488510007413
Recall is 0.4591414251470909
... Processing stat
Instances: 7538
Test accuracy is 0.9545164443654519
Precision is 0.7200184501845018
Recall is 0.4141682143804723
... Processing math
Instances: 38456
Test accuracy is 0.9083138769277999
Precision is 0.8871433060070267
Recall is 0.8076242978988974
... Processing math-ph
Instances: 3765
Test accuracy is 0.9702539161906917
Precision is 0.562406015037594
Recall is 0.09933598937583002
... Processing q-fin
Instances: 895
Test accuracy is 0.9943933236926508
Precision is 0.8316831683168316
Recall is 0.28156424581005585
... Processing nlin
Instances: 1834
Test accuracy is 0.9861772000549357
Precision is 0.8682634730538922
Recall is 0.07906215921483097
... Processing eess
Instances: 698
Test accuracy is 0.9943610085554325
Precision is 0.0
Recall is 0.0
... Processing hep-lat


  'precision', 'predicted', average, warn_for)


Instances: 1044
Test accuracy is 0.9925271245183025
Precision is 0.7989949748743719
Recall is 0.15229885057471265
... Processing econ
Instances: 109
Test accuracy is 0.9991194125108054
Precision is 0.0
Recall is 0.0
... Processing nucl-ex
Instances: 1216
Test accuracy is 0.9911618099708356
Precision is 0.5938461538461538
Recall is 0.31743421052631576
... Processing astro-ph
Instances: 15127
Test accuracy is 0.9728229695995346
Precision is 0.9562485454968582
Recall is 0.8149005090236002
... Processing cond-mat
Instances: 17576
Test accuracy is 0.9455570725717194
Precision is 0.8493327316098253
Recall is 0.7495448338643604
... Processing hep-ph
Instances: 6539
Test accuracy is 0.9712556854444543
Precision is 0.7593527057595267
Recall is 0.6673803333843096
... Processing quant-ph
Instances: 7020
Test accuracy is 0.9619489259256268
Precision is 0.8704297626683771
Recall is 0.3866096866096866
... Processing cs
Instances: 30686
Test accuracy is 0.9277514319645179
Precision is 0.8628792683333

In [43]:
SVC_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(max_features = 2000)),
                ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
            ])
svc_predictions = []
for i in range(len(categories)):
    print('... Processing {}'.format(mlb.classes_[i]))
    # train the model using X_dtm & y
    SVC_pipeline.fit(train_X, bin_train_Y[:,i])
    
    # compute the testing accuracy
    prediction = SVC_pipeline.predict(test_X)
    svc_predictions.append(prediction)
    
    print('Instances: {}'.format(np.sum(bin_test_Y[:,i])))
    print('Test accuracy is {}'.format(accuracy_score(bin_test_Y[:,i], prediction)))
    print('Precision is {}'.format(precision_score(bin_test_Y[:,i], prediction)))
    print('Recall is {}'.format(recall_score(bin_test_Y[:,i], prediction)))

... Processing cs
Instances: 30686
Test accuracy is 0.9346103198390706
Precision is 0.8839303922235062
Recall is 0.8475200417128332
... Processing eess
Instances: 698
Test accuracy is 0.9943610085554325
Precision is 0.0
Recall is 0.0
... Processing physics


  'precision', 'predicted', average, warn_for)


Instances: 14548
Test accuracy is 0.9132015414320453
Precision is 0.7213171980451478
Recall is 0.4261066813307671
... Processing q-bio
Instances: 2482
Test accuracy is 0.9843756311550238
Precision is 0.6900138696255201
Recall is 0.4008863819500403
... Processing q-fin
Instances: 895
Test accuracy is 0.9944014024769553
Precision is 0.7295454545454545
Recall is 0.358659217877095
... Processing hep-ex
Instances: 2514
Test accuracy is 0.9845856795469418
Precision is 0.65650826446281
Recall is 0.505568814638027
... Processing cond-mat
Instances: 17576
Test accuracy is 0.9502912401741785
Precision is 0.8487938931297709
Recall is 0.7907942649066909
... Processing math-ph
Instances: 3765
Test accuracy is 0.970229679837778
Precision is 0.6234567901234568
Recall is 0.053652058432934926
... Processing nucl-ex
Instances: 1216
Test accuracy is 0.991783876362285
Precision is 0.6335570469798658
Recall is 0.3881578947368421
... Processing hep-lat
Instances: 1044
Test accuracy is 0.9949749961625775
Pre

  'precision', 'predicted', average, warn_for)


Instances: 6209
Test accuracy is 0.969704558857983
Precision is 0.7792414263002498
Recall is 0.5525849573200193
... Processing nlin
Instances: 1834
Test accuracy is 0.9867427149562534
Precision is 0.7480719794344473
Recall is 0.15866957470010906
... Processing gr-qc
Instances: 4589
Test accuracy is 0.9794394939449511
Precision is 0.7923340961098398
Recall is 0.6036173458269776
... Processing astro-ph
Instances: 15127
Test accuracy is 0.9755697562630775
Precision is 0.9306810903138567
Recall is 0.8644807298208501
... Processing math
Instances: 38456
Test accuracy is 0.9168531519376965
Precision is 0.8738650241597197
Recall is 0.8559132515082172


In [44]:
svc_predictions = []

for i in range(len(categories)):
    print('... Processing {}'.format(mlb.classes_[i]))

    vectorizer = TfidfVectorizer(max_features = 10000)
    tfidf_matrix =  vectorizer.fit_transform(np.array(train_X)[bin_train_Y[:,i] == 1])
    
    clf = LinearSVC()
    clf.fit(vectorizer.transform(train_X), bin_train_Y[:,i])
    
    # compute the testing accuracy
    prediction = clf.predict(vectorizer.transform(test_X))
    svc_predictions.append(prediction)
    
    print('Instances: {}'.format(np.sum(bin_test_Y[:,i])))
    print('Test accuracy is {}'.format(accuracy_score(bin_test_Y[:,i], prediction)))
    print('Precision is {}'.format(precision_score(bin_test_Y[:,i], prediction)))
    print('Recall is {}'.format(recall_score(bin_test_Y[:,i], prediction)))

... Processing gr-qc
Instances: 4589
Test accuracy is 0.9718050427771628
Precision is 0.6222469410456062
Recall is 0.6095009806057965
... Processing stat
Instances: 7538
Test accuracy is 0.9469304659034908
Precision is 0.5698026221005619
Recall is 0.5246749801008225
... Processing math
Instances: 38456
Test accuracy is 0.91023662759228
Precision is 0.8570309439874657
Recall is 0.8534428957769918
... Processing math-ph
Instances: 3765
Test accuracy is 0.9587658849096388
Precision is 0.2940633651184251
Recall is 0.2539176626826029
... Processing q-fin
Instances: 895
Test accuracy is 0.9936258391837196
Precision is 0.5619158878504673
Recall is 0.5374301675977654
... Processing nlin
Instances: 1834
Test accuracy is 0.9802716087283185
Precision is 0.3401682439537329
Recall is 0.35278080697928027
... Processing eess
Instances: 698
Test accuracy is 0.9943367722025189
Precision is 0.2
Recall is 0.0014326647564469914
... Processing hep-lat
Instances: 1044
Test accuracy is 0.9918404278524168
Pre

In [48]:
LogReg_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(max_features = 2000)),
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1)),
            ])

logreg_predictions = []
for i in range(len(categories)):
    print('... Processing {}'.format(mlb.classes_[i]))
    # train the model using X_dtm & y
    LogReg_pipeline.fit(train_X, bin_train_Y[:,i])
    
    # compute the testing accuracy
    prediction = LogReg_pipeline.predict(test_X)
    logreg_predictions.append(prediction)
    
    print('Instances: {}'.format(np.sum(bin_test_Y[:,i])))
    print('Test accuracy is {}'.format(accuracy_score(bin_test_Y[:,i], prediction)))
    print('Precision is {}'.format(precision_score(bin_test_Y[:,i], prediction)))
    print('Recall is {}'.format(recall_score(bin_test_Y[:,i], prediction)))

... Processing cs
Instances: 30686
Test accuracy is 0.9334388961149126
Precision is 0.8860454717435421
Recall is 0.8394707684285994
... Processing eess
Instances: 698
Test accuracy is 0.9943610085554325
Precision is 0.0
Recall is 0.0
... Processing physics


  'precision', 'predicted', average, warn_for)


Instances: 14548
Test accuracy is 0.9124663720603323
Precision is 0.7234862164439629
Recall is 0.41311520483915315
... Processing q-bio
Instances: 2482
Test accuracy is 0.9841817403317149
Precision is 0.7258620689655172
Recall is 0.33924254633360196
... Processing q-fin
Instances: 895
Test accuracy is 0.9940944086733828
Precision is 0.7546583850931677
Recall is 0.27150837988826815
... Processing hep-ex
Instances: 2514
Test accuracy is 0.9845533644097236
Precision is 0.6764361078546307
Recall is 0.4590294351630867
... Processing cond-mat
Instances: 17576
Test accuracy is 0.9492329194302841
Precision is 0.8556311413454271
Recall is 0.7728720983158853
... Processing math-ph
Instances: 3765
Test accuracy is 0.9699953950929464
Precision is 0.5452930728241563
Recall is 0.08154050464807437
... Processing nucl-ex
Instances: 1216
Test accuracy is 0.9915091976959307
Precision is 0.6513761467889908
Recall is 0.29194078947368424
... Processing hep-lat
Instances: 1044
Test accuracy is 0.99431253584

  'precision', 'predicted', average, warn_for)


Instances: 6209
Test accuracy is 0.9689693894862701
Precision is 0.7857142857142857
Recall is 0.5244000644226123
... Processing nlin
Instances: 1834
Test accuracy is 0.9866861634661216
Precision is 0.7657142857142857
Recall is 0.14612868047982552
... Processing gr-qc
Instances: 4589
Test accuracy is 0.978566985240061
Precision is 0.8044025157232705
Recall is 0.5574199171932883
... Processing astro-ph
Instances: 15127
Test accuracy is 0.974269071990047
Precision is 0.9361577794010226
Recall is 0.8472268129834072
... Processing math
Instances: 38456
Test accuracy is 0.9161906916247243
Precision is 0.8774056553058811
Recall is 0.848840232993551


In [45]:
print('Naive Bayes')
np_nb_predictions = np.transpose(np.asarray(nb_predictions))
print(accuracy_score(bin_test_Y, np_nb_predictions))

print('\nSVM')
np_svc_predictions = np.transpose(np.asarray(svc_predictions))
print(accuracy_score(bin_test_Y, np_svc_predictions))

print('\nLogistic Regression')
np_logreg_predictions = np.transpose(np.asarray(logreg_predictions))
print(accuracy_score(bin_test_Y, np_logreg_predictions))

Naive Bayes
0.5769140659713526

SVM
0.5521687496465532

Logistic Regression


NameError: name 'logreg_predictions' is not defined

In [41]:
true_predictions = np.sum(bin_test_Y + np_nb_predictions == 2)
all_predictions = np.count_nonzero(np_nb_predictions)
total_labels = np.count_nonzero(bin_test_Y)

print('Naive Bayes')
print('Precision: {}'.format(true_predictions/total_labels))
print('Recall: {}'.format(true_predictions/all_predictions))

true_predictions = np.sum(bin_test_Y + np_svc_predictions == 2)
all_predictions = np.count_nonzero(np_svc_predictions)
total_labels = np.count_nonzero(bin_test_Y)

print('\nSVM')
print('Precision: {}'.format(true_predictions/total_labels))
print('Recall: {}'.format(true_predictions/all_predictions))

true_predictions = np.sum(bin_test_Y + np_logreg_predictions == 2)
all_predictions = np.count_nonzero(np_logreg_predictions)
total_labels = np.count_nonzero(bin_test_Y)

print('\nLogistic Regression')
print('Precision: {}'.format(true_predictions/total_labels))
print('Recall: {}'.format(true_predictions/all_predictions))


Naive Bayes
Precision: 0.6544269918738571
Recall: 0.8413792029645546

SVM
Precision: 0.7263996705945043
Recall: 0.8542476678772342


NameError: name 'np_logreg_predictions' is not defined

In [18]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=10000, lower=True)
tokenizer.fit_on_texts(train_X)
sequences = tokenizer.texts_to_sequences(train_X)
X_nn = pad_sequences(sequences, maxlen=180)

sequences = tokenizer.texts_to_sequences(test_X)
test_X_nn = pad_sequences(sequences, maxlen=180)

In [33]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, GlobalMaxPool1D, Dropout, Conv1D
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras.optimizers import Adam
from keras import models

max_words = 10000
maxlen = 180
num_classes = len(categories)
filter_length = 300

model = Sequential()
model.add(Embedding(max_words, 20, input_length=maxlen))
model.add(Dropout(0.15))
#model.add(Conv1D(filter_length, 3, padding='valid', activation='relu', strides=1))
model.add(GlobalMaxPool1D())
model.add(Dense(num_classes, activation='sigmoid'))

model.compile(optimizer=Adam(0.015), loss='binary_crossentropy', metrics=['accuracy', 'categorical_accuracy'])
callbacks = [
    ReduceLROnPlateau(),
    EarlyStopping(patience=4),
    ModelCheckpoint(filepath='model-simple.h5', save_best_only=True)
]

history = model.fit(X_nn, bin_train_Y,
                    #class_weight=class_weight,
                    epochs=20,
                    batch_size=32,
                    validation_split=0.1,
                    callbacks=callbacks)

simple_model = models.load_model('model-simple.h5')
metrics = simple_model.evaluate(test_X_nn, bin_test_Y)
print("{}: {}".format(simple_model.metrics_names[0], metrics[0]))
print("{}: {}".format(simple_model.metrics_names[1], metrics[1]))

Train on 102092 samples, validate on 11344 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
loss: 0.09639169582160143
acc: 0.9669266766610648


In [31]:
simple_model = models.load_model('model-simple.h5')
metrics = simple_model.evaluate(test_X_nn, bin_test_Y)
print("{}: {}".format(simple_model.metrics_names[0], metrics[0]))
print("{}: {}".format(simple_model.metrics_names[1], metrics[1]))

loss: 0.09661265691515634
acc: 0.966862449445216


In [28]:
print("{}: {}".format(simple_model.metrics_names[2], metrics[2]))

categorical_accuracy: 0.6885790226296379


In [None]:
terms_pos = defaultdict(int)
terms_neg = defaultdict(int)
for i in range(len(train_y)):
    if 'eess' in train_y[i]:
        for word in train_x[i].split():
            terms_pos[word] += 1
    else:
        for word in train_x[i].split():
            terms_neg[word] += 1

In [None]:
terms_pos