Use fancy machine learning to predict whether an article makes it into Nature/Science or PRL. This time we'll only look at articles in the cond-mat section.

In [85]:
#Need to add parent directoy to sys.path to find 'metadataDB'
import sys
sys.path.append('../')

%matplotlib inline
import matplotlib.pyplot as plt 
import time
import numpy as np
import scipy as sp

# Natural language processing toolkit
# To use this, run nltk.download() and download 'stopwords'
from nltk.corpus import stopwords
s=stopwords.words('english') + ['']

# Machine learning
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import Pipeline
from sklearn import metrics

# SQL
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from metadataDB.declareDatabase import *
from sqlalchemy import or_, and_

engine = create_engine("sqlite:///../arXiv_metadata.db", echo=False)
Base.metadata.bind = engine
DBsession = sessionmaker(bind=engine)
session = DBsession()

In [16]:
query = session.query(Article_Category)\
                    .join(Category)\
                    .join(Article)\
                    .filter(Category.name.like('%cond-mat%'),
                            or_(Article.journal_ref.like('Physics Review Letters%'),
                                Article.journal_ref.like('Phys. Rev. Lett.%'),
                                Article.journal_ref.like('PRL%')))
abstractPRL = [x.article.abstract for x in query.all()]
titlePRL = [x.article.title for x in query.all()]

In [17]:
query = session.query(Article_Category)\
                    .join(Category)\
                    .join(Article)\
                    .filter(Category.name.like('%cond-mat%'),
                            or_(Article.journal_ref.like('Nature%'),
                                Article.journal_ref.like('Nat.%'),
                                Article.journal_ref.like('Science%')))
abstractNatureScience = [x.article.abstract for x in query.all()]
titleNatureScience = [x.article.title for x in query.all()]

In [18]:
session.close_all()

In [19]:
# # Train with 80% of the data, test with 20%
# # First start with abstracts.

# indPRL = len(abstractPRL)*4/5
# indNatureScience = len(abstractNatureScience)*4/5

# train_abstract = abstractPRL[:indPRL] + abstractNatureScience[:indNatureScience]
# train_title = titlePRL[:indPRL] + titleNatureScience[:indNatureScience]
# train_target = [0]*indPRL + [1]*indNatureScience
# train_target_names = ['PRL']*indPRL + ['Nature']*indNatureScience

# test_abstract = abstractPRL[indPRL:] + abstractNatureScience[indNatureScience:]
# test_title = titlePRL[indPRL:] + titleNatureScience[indNatureScience:]
# test_target = [0]*len(abstractPRL[indPRL:]) + [1]*len(abstractNatureScience[indNatureScience:])
# test_target_names = ['PRL', 'Nature/Science']

In [20]:
# Train with 80% of the Nature data, test with 20% of the Nature data
# Choose the same number of PRL and Nature articles in the test sets.

indNatureScience = len(abstractNatureScience)*4/5
indPRL = len(abstractPRL) - (len(abstractNatureScience) - indNatureScience)

train_abstract = abstractPRL[:indPRL] + abstractNatureScience[:indNatureScience]
train_title = titlePRL[:indPRL] + titleNatureScience[:indNatureScience]
train_target = [0]*indPRL + [1]*indNatureScience
train_target_names = ['PRL']*indPRL + ['Nature']*indNatureScience

test_abstract = abstractPRL[indPRL:] + abstractNatureScience[indNatureScience:]
test_title = titlePRL[indPRL:] + titleNatureScience[indNatureScience:]
test_target = [0]*len(abstractPRL[indPRL:]) + [1]*len(abstractNatureScience[indNatureScience:])
test_target_names = ['PRL', 'Nature/Science']

In [21]:
print len(abstractNatureScience)
print len(abstractPRL)

2690
14180


In [22]:
#SVC(kernel='linear') is good
text_abstract_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,3))),
                              ('tfidf', TfidfTransformer()),
                              ('clf', SVC(kernel='linear'))])
text_abstract_clf.fit(train_abstract, train_target)
predict_abstract = text_abstract_clf.predict(test_abstract)
print text_abstract_clf.predict(train_abstract)

[0 0 0 ..., 1 1 1]


In [22]:
#SVC(kernel='linear') is good
text_abstract_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,3))),
                              ('tfidf', TfidfTransformer()),
                              ('clf', SVC(kernel='linear'))])
text_abstract_clf.fit(train_abstract, train_target)
predict_abstract = text_abstract_clf.predict(test_abstract)
print text_abstract_clf.predict(train_abstract)

[0 0 0 ..., 1 1 1]


In [23]:
text_title_clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,2))),
                              ('tfidf', TfidfTransformer()),
                              ('clf', LinearSVC())])
text_title_clf.fit(train_title, train_target)
predict_title = text_title_clf.predict(test_title)
print text_abstract_clf.predict(train_title)

[0 0 0 ..., 0 0 0]


In [24]:
print predict_abstract
print predict_title

[0 1 0 ..., 0 0 0]
[0 0 0 ..., 0 0 0]


In [43]:
#SVC(kernel='linear')
print(metrics.classification_report(test_target, predict_abstract,
                                    target_names=test_target_names))
print('Accuracy score: %0.2f' % metrics.accuracy_score(test_target, predict_abstract))

                precision    recall  f1-score   support

           PRL       0.79      0.93      0.86       538
Nature/Science       0.91      0.76      0.83       538

   avg / total       0.85      0.84      0.84      1076

Accuracy score: 0.84


In [37]:
#SVC(kernel='linear')
print(metrics.classification_report(test_target, predict_title,
                                    target_names=test_target_names))

                precision    recall  f1-score   support

           PRL       0.53      0.96      0.68       538
Nature/Science       0.78      0.14      0.24       538

   avg / total       0.65      0.55      0.46      1076



In [41]:
print metrics.confusion_matrix(test_target, predict_abstract)

[[499  39]
 [129 409]]


In [200]:
def inverseVectorizer(val):
    return (key for key, value in text_abstract_clf.named_steps['vect'].vocabulary_.iteritems() if value == val).next()

# This is super inefficient!!!
sorted_coefs = sorted( ((i,v) for i, v in np.ndenumerate(text_abstract_clf.named_steps['clf'].coef_.todense()) ),
                      key=lambda x: x[1] )

print "Top 50 indicators of PRL:"
bottom = sorted_coefs[:50]
print ", ".join([ inverseVectorizer(item[0][1]) for item in bottom])
print ""
print "Top 50 indicators of Nature/Science:"
top = list(reversed(sorted_coefs[-50:]))
print ", ".join([ inverseVectorizer(item[0][1]) for item in top])

Top 50 indicators of PRL:
model, we, of the, propose, we propose, behavior, to the, we study, study the, parameters, consider, we consider, investigated, present, we discuss, scheme, method, numerical, study, investigate, condensate, mesoscopic, edge, calculations, dynamical, which is, discuss, simulations, the spin, dipolar, letter, show that, qc, conditions, distribution, scattering, gapless, values, show, we have, we investigate, show how, optomechanical, crossover, we show that, we study the, entanglement, instability, we derive, discussed

Top 50 indicators of Nature/Science:
here, here we, here we report, and, of, devices, materials, to, high, in, have, quantum, superconductivity, however, here we demonstrate, been, fundamental, has, physics, because, electronic, matter, new, their, we report, electrons, information, control, as, understanding, many, or, phenomena, report, by, graphene, here we show, superconductors, material, applications, challenge, these, such, superconducting

In [173]:
print text_abstract_clf.predict(['Here we report material physics devices'])
print text_abstract_clf.predict(['We consider mesocopic dipolar model'])

[1]
[0]
