Use fancy machine learning to predict whether an article makes it into Nature/Science or PRL.

In [1]:
# Need to add the parent directory to sys.path to find 'metadataDB'
import sys
sys.path.append('../')

%matplotlib inline
import matplotlib.pyplot as plt 
import time
import numpy as np
import scipy as sp

# Natural language processing toolkit
# To use this, run nltk.download() and download 'stopwords'
from nltk.corpus import stopwords
# English stop words plus the empty string.
# NOTE(review): `s` is not referenced by any of the visible cells (the
# CountVectorizer instances are built without a stop_words argument) --
# confirm whether it is still needed.
s=stopwords.words('english') + ['']

# Machine learning
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import Pipeline
from sklearn import metrics

# SQL
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from metadataDB.declareDatabase import *
from sqlalchemy import or_, and_

# Connect to the SQLite file holding the arXiv metadata (one directory up)
# and open the single ORM session used by the query cells.
engine = create_engine("sqlite:///../arXiv_metadata.db", echo=False)
DBsession = sessionmaker(bind=engine)
Base.metadata.bind = engine
session = DBsession()



In [2]:
# Select every article whose journal reference starts with one of the usual
# spellings of Physical Review Letters. 'Physics Review Letters%' is kept so
# that rows matched before are still matched; 'Physical Review Letters%' adds
# the journal's actual full name.
query = session.query(Article).filter(or_(Article.journal_ref.like('Physics Review Letters%'),
                                          Article.journal_ref.like('Physical Review Letters%'),
                                          Article.journal_ref.like('Phys. Rev. Lett.%'),
                                          Article.journal_ref.like('PRL%')))
# Execute the query once and build both lists from the same rows, so that
# abstractPRL[i] and titlePRL[i] always describe the same article.
articlesPRL = query.all()
abstractPRL = [x.abstract for x in articlesPRL]
titlePRL = [x.title for x in articlesPRL]

In [None]:
# Select every article whose journal reference starts with 'Nature', 'Nat.'
# or 'Science'.
query = session.query(Article).filter(or_(Article.journal_ref.like('Nature%'),
                                          Article.journal_ref.like('Nat.%'),
                                          Article.journal_ref.like('Science%')))
# Execute the query once and build both lists from the same rows, so that
# abstractNatureScience[i] and titleNatureScience[i] always describe the
# same article.
articlesNatureScience = query.all()
abstractNatureScience = [x.abstract for x in articlesNatureScience]
titleNatureScience = [x.title for x in articlesNatureScience]

In [None]:
# The abstracts and titles have already been read into lists, so the session
# is no longer needed. Session.close_all() is deprecated in newer SQLAlchemy
# releases; closing this one session directly works on old and new versions.
session.close()

In [None]:
# # Train with 80% of the data, test with 20%
# # First start with abstracts.

# indPRL = len(abstractPRL)*4/5
# indNatureScience = len(abstractNatureScience)*4/5

# train_abstract = abstractPRL[:indPRL] + abstractNatureScience[:indNatureScience]
# train_title = titlePRL[:indPRL] + titleNatureScience[:indNatureScience]
# train_target = [0]*indPRL + [1]*indNatureScience
# train_target_names = ['PRL']*indPRL + ['Nature']*indNatureScience

# test_abstract = abstractPRL[indPRL:] + abstractNatureScience[indNatureScience:]
# test_title = titlePRL[indPRL:] + titleNatureScience[indNatureScience:]
# test_target = [0]*len(abstractPRL[indPRL:]) + [1]*len(abstractNatureScience[indNatureScience:])
# test_target_names = ['PRL', 'Nature/Science']

In [None]:
# Train with 80% of the Nature data, test with 20% of the Nature data
# Choose the same number of PRL and Nature articles in the test sets.
# NOTE(review): the lists are split in query order without shuffling --
# confirm that this order is not correlated with the label-relevant features.

# Floor division keeps the split point an integer (it is used as a slice
# bound and as a list multiplier) whatever the semantics of `/` are.
indNatureScience = len(abstractNatureScience) * 4 // 5
numTestNatureScience = len(abstractNatureScience) - indNatureScience
# Clamp at zero: with fewer PRL articles than Nature/Science test articles a
# negative split point would make the slices and the target lists below
# disagree in length.
indPRL = max(len(abstractPRL) - numTestNatureScience, 0)

train_abstract = abstractPRL[:indPRL] + abstractNatureScience[:indNatureScience]
train_title = titlePRL[:indPRL] + titleNatureScience[:indNatureScience]
# Label 0 = PRL, label 1 = Nature/Science.
train_target = [0]*indPRL + [1]*indNatureScience
train_target_names = ['PRL']*indPRL + ['Nature']*indNatureScience

test_abstract = abstractPRL[indPRL:] + abstractNatureScience[indNatureScience:]
test_title = titlePRL[indPRL:] + titleNatureScience[indNatureScience:]
test_target = [0]*(len(abstractPRL) - indPRL) + [1]*numTestNatureScience
# Display names for labels 0 and 1, in that order.
test_target_names = ['PRL', 'Nature/Science']

In [None]:
# Report the corpus sizes: Nature/Science first, then PRL.
for corpus in (abstractNatureScience, abstractPRL):
    print(len(corpus))

In [None]:
# SVC(kernel='linear') is good
# Abstract classifier: 1- to 3-gram counts -> tf-idf -> L1-penalised linear SVM.
abstract_steps = [
    ('vect', CountVectorizer(ngram_range=(1, 3))),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(C=3, penalty='l1', dual=False)),
]
text_abstract_clf = Pipeline(abstract_steps)
text_abstract_clf.fit(train_abstract, train_target)
predict_abstract = text_abstract_clf.predict(test_abstract)
# Predictions on the training abstracts themselves, as a quick sanity check.
print(text_abstract_clf.predict(train_abstract))

In [None]:
# Title classifier: 1- and 2-gram counts -> tf-idf -> L1-penalised linear SVM.
text_title_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),
                              ('tfidf', TfidfTransformer()),
                              ('clf', LinearSVC(C=3,penalty='l1',dual=False))])
text_title_clf.fit(train_title, train_target)
predict_title = text_title_clf.predict(test_title)
# Predictions on the training titles themselves. This must use the title
# classifier: the abstract classifier was fitted on a different vocabulary.
print(text_title_clf.predict(train_title))

In [None]:
# Show the raw test-set predictions: abstract classifier, then title classifier.
for predictions in (predict_abstract, predict_title):
    print(predictions)

In [None]:
# SVC(kernel='linear')
# Per-class precision/recall and overall accuracy of the abstract classifier
# on the test set.
abstract_report = metrics.classification_report(test_target, predict_abstract,
                                                target_names=test_target_names)
print(abstract_report)
abstract_accuracy = metrics.accuracy_score(test_target, predict_abstract)
print('Accuracy score: %0.2f' % abstract_accuracy)

In [None]:
# Am I overfitting? Score the abstract classifier on the data it was trained on.
train_predictions = text_abstract_clf.predict(train_abstract)
train_report = metrics.classification_report(train_target, train_predictions,
                                             target_names=test_target_names)
print(train_report)
# print('Accuracy score: %0.2f' % metrics.accuracy_score(test_target, predict_abstract))

In [None]:
#SVC(kernel='linear')
# Per-class precision/recall and overall accuracy of the title classifier on
# the test set. Both numbers are computed from predict_title so that the
# accuracy line describes the same model as the report above it.
print(metrics.classification_report(test_target, predict_title,
                                    target_names=test_target_names))
print('Accuracy score: %0.2f' % metrics.accuracy_score(test_target, predict_title))

In [None]:
# Confusion matrix of the abstract classifier on the test set.
abstract_confusion = metrics.confusion_matrix(test_target, predict_abstract)
print(abstract_confusion)

In [None]:
def inverseVectorizer(val):
    """Return the term whose column index in the abstract vectorizer is `val`.

    Scans the fitted vocabulary (term -> column index) of the 'vect' step of
    `text_abstract_clf` and returns the first term mapped to `val`.

    Raises StopIteration if no term is mapped to `val`.
    """
    vocabulary = text_abstract_clf.named_steps['vect'].vocabulary_
    # Iterating the keys and using the next() builtin works on both Python 2
    # and Python 3 (dict.iteritems() and generator.next() are Python-2-only)
    # and never materialises a list of the whole vocabulary.
    return next(term for term in vocabulary if vocabulary[term] == val)

# Rank all features of the abstract classifier by their weight: the most
# negative weights point towards label 0 (PRL), the most positive towards
# label 1 (Nature/Science).
abstract_vocabulary = text_abstract_clf.named_steps['vect'].vocabulary_
# Invert term -> column once instead of scanning the vocabulary per lookup.
index_to_term = dict((abstract_vocabulary[term], term) for term in abstract_vocabulary)

coefs = text_abstract_clf.named_steps['clf'].coef_
# LinearSVC exposes a dense ndarray (which has no .todense()), while
# SVC(kernel='linear') fitted on sparse input exposes a sparse matrix.
# Accept both.
if hasattr(coefs, 'toarray'):
    coefs = coefs.toarray()
coefs = np.atleast_2d(np.asarray(coefs))

# One stable argsort over the flattened weights; a stable sort keeps ties in
# row-major order, matching a stable sort over np.ndenumerate(coefs).
flat_order = np.argsort(coefs, axis=None, kind='mergesort')
rows, cols = np.unravel_index(flat_order, coefs.shape)
# Same layout as before: a list of ((row, column), weight), ascending by weight.
sorted_coefs = [((int(r), int(c)), coefs[r, c]) for r, c in zip(rows, cols)]
# print sorted_coefs
# print sorted_coefs[-50:]
# print list(reversed(sorted_coefs[-50:]))
print("Top 50 indicators of PRL:")
bottom = sorted_coefs[:50]
print(", ".join([index_to_term[item[0][1]] for item in bottom]))
print("")
print("Top 50 indicators of Nature/Science:")
top = list(reversed(sorted_coefs[-50:]))
print(", ".join([index_to_term[item[0][1]] for item in top]))

In [None]:
# Try the abstract classifier on two made-up sentences
# (label 0 = PRL, label 1 = Nature/Science).
for sample in ('Here we report quantum information star radio',
               'We consider a condensate model of tensor squeezing'):
    print(text_abstract_clf.predict([sample]))