Use fancy machine learning to predict whether an article makes it into Nature/Science or PRL. This time we'll only look at articles in the physics.atom-ph section.

In [1]:
#Need to add parent directoy to sys.path to find 'metadataDB'
import sys
sys.path.append('../')

%matplotlib inline
import matplotlib.pyplot as plt 
import time
import numpy as np

# Natural language processing toolkit
# To use this, run nltk.download() and download 'stopwords'
from nltk.corpus import stopwords
s=stopwords.words('english') + ['']

# Machine learning
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn import metrics

# SQL
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from metadataDB.declareDatabase import *
from sqlalchemy import or_, and_

engine = create_engine("sqlite:///../arXiv_metadata.db", echo=False)
Base.metadata.bind = engine
DBsession = sessionmaker(bind=engine)
session = DBsession()

In [4]:
query = session.query(Article_Category)\
                    .join(Category)\
                    .join(Article)\
                    .filter(Category.name.like('%quant-ph'),
                            or_(Article.journal_ref.like('Physics Review Letters%'),
                                Article.journal_ref.like('Phys. Rev. Lett.%'),
                                Article.journal_ref.like('PRL%')))
abstractPRL = [x.article.abstract for x in query.all()]
titlePRL = [x.article.title for x in query.all()]

In [5]:
query = session.query(Article_Category)\
                    .join(Category)\
                    .join(Article)\
                    .filter(Category.name.like('%quant-ph'),
                            or_(Article.journal_ref.like('Nature%'),
                                Article.journal_ref.like('Nat.%'),
                                Article.journal_ref.like('Science%')))
abstractNatureScience = [x.article.abstract for x in query.all()]
titleNatureScience = [x.article.title for x in query.all()]

In [6]:
session.close_all()

In [13]:
# Train with 80% of the data, test with 20%
# First start with abstracts.

indPRL = len(abstractPRL)*4/5
indNatureScience = len(abstractNatureScience)*4/5

train_abstract = abstractPRL[:indPRL] + abstractNatureScience[:indNatureScience]
train_title = titlePRL[:indPRL] + titleNatureScience[:indNatureScience]
train_target = [0]*indPRL + [1]*indNatureScience
train_target_names = ['PRL']*indPRL + ['Nature']*indNatureScience

test_abstract = abstractPRL[indPRL:] + abstractNatureScience[indNatureScience:]
test_title = titlePRL[indPRL:] + titleNatureScience[indNatureScience:]
test_target = [0]*len(abstractPRL[indPRL:]) + [1]*len(abstractNatureScience[indNatureScience:])
test_target_names = ['PRL', 'Nature/Science']

In [25]:
text_abstract_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                              ('tfidf', TfidfTransformer()),
                              ('clf', LinearSVC())])
text_abstract_clf.fit(train_abstract, train_target)
predict_abstract = text_abstract_clf.predict(test_abstract)
print text_abstract_clf.predict(train_abstract)

[0 0 0 ..., 1 1 1]


In [30]:
text_title_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                              ('tfidf', TfidfTransformer()),
                              ('clf', LinearSVC())])
text_title_clf.fit(train_title, train_target)
predict_title = text_title_clf.predict(test_title)
print text_abstract_clf.predict(train_title)

[0 0 0 ..., 0 0 0]


In [31]:
print predict_abstract
print predict_title

[0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 

In [32]:
print(metrics.classification_report(test_target, predict_abstract,
                                    target_names=test_target_names))

                precision    recall  f1-score   support

           PRL       0.87      0.94      0.91       529
Nature/Science       0.66      0.44      0.53       129

   avg / total       0.83      0.84      0.83       658



In [33]:
print(metrics.classification_report(test_target, predict_title,
                                    target_names=test_target_names))

                precision    recall  f1-score   support

           PRL       0.82      0.90      0.86       529
Nature/Science       0.30      0.19      0.23       129

   avg / total       0.72      0.76      0.73       658



In [42]:
a = text_abstract_clf.transform('quantum')
print text_abstract_clf.inverse_transform(a)

AttributeError: 'LinearSVC' object has no attribute 'inverse_transform'

In [69]:
text_abstract_clf.predict(['strong, long-range ' +\
'dipole-dipole interactions, such a gas brings fundamentally new abilities to ' +\
'quantum-gas-based studies of strongly correlated many-body physics, where' +\
'quantum phase transitions and new states of matter can emerge.'])

array([1])

In [56]:
print text_abstract_clf.predict([test_abstract[-4]])
print test_abstract[-4]

[1]
  Chemical reaction rates often depend strongly on stereodynamics, namely the
orientation and movement of molecules in three-dimensional space. An ultracold
molecular gas, with a temperature below 1 uK, provides a highly unusual regime
for chemistry, where polar molecules can easily be oriented using an external
electric field and where, moreover, the motion of two colliding molecules is
strictly quantized. Recently, atom-exchange reactions were observed in a
trapped ultracold gas of KRb molecules. In an external electric field, these
exothermic and barrierless bimolecular reactions, KRb+KRb -> K2+Rb2, occur at a
rate that rises steeply with increasing dipole moment. Here we show that the
quantum stereodynamics of the ultracold collisions can be exploited to suppress
the bimolecular chemical reaction rate by nearly two orders of magnitude. We
use an optical lattice trap to con?fine the fermionic polar molecules in a
quasi-two-dimensional, pancake-like geometry, with the dipoles ori