Use fancy machine learning to predict whether an article makes it into Nature/Science or PRL. This time we'll only look at articles in the physics.atom-ph section.

In [5]:
#Need to add parent directoy to sys.path to find 'metadataDB'
import sys
sys.path.append('../../')

%matplotlib inline
import matplotlib.pyplot as plt 
import time
import numpy as np
import re
from itertools import combinations
import json

# Natural language processing toolkit
# To use this, run nltk.download() and download 'stopwords'
# from nltk.corpus import stopwords
# s=stopwords.words('english') + ['']

# Machine learning
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from sklearn.externals import joblib

# SQL
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from metadataDB.declareDatabase import *
from sqlalchemy import or_, and_

engine = create_engine("sqlite:///../../arXiv_metadata.db", echo=False)
Base.metadata.bind = engine
DBsession = sessionmaker(bind=engine)
session = DBsession()

In [2]:
categories = ['atom-ph', 'quant-ph', 'cond-mat', 'quant-gas', 'hep-th', 'hep-ex', '']
categories = ['quant-ph']
journals_dict = {'PRL': ['Physics Review Letters%',
                    'Phys. Rev. Lett.%',
                    'Phys.Rev.Lett.%',
                    'PRL%'],
                 'PR':  ['Physics Review%',
                         'Phys. Rev.%',
                         'Phys.Rev.%',
                         'PR%'],
                 'Nature': ['Nature%',
                            'Nat.%',
                            'Science%'],
                 'APL': ['APL%',
                         'Appl.Phys.Lett.%',
                         'Appl. Phys. Lett.%',
                         'Applied Physics Letters%'],
                 'AP': ['AP%',
                        'Appl.Phys.%',
                        'Appl. Phys.%',
                        'Applied Physics%'],
                 'PL': ['Physics Letters%',
                        'Phys. Lett.%',
                        'Phys.Lett.%'],
                 'All': ['%'],
                 }


In [3]:
def get_abstracts(journal, category):
    query = session.query(Article_Category)\
                    .join(Category)\
                    .join(Article)\
                    .filter(Category.name.like('%' + category + '%'),
                            or_(*[Article.journal_ref.like(x) for x in journals_dict[journal]]))

    # Don't need to clean up text: CountVectorizer will do everything
    return [ result.article.abstract for result in query ]


def learn(journals, category):
    abstracts1 = get_abstracts(journals[0], category)
    abstracts2 = get_abstracts(journals[1], category)
    
    half_test_size = int(round(0.2*min(len(abstracts1),len(abstracts2))))
#     print half_test_size

    X1_train, X1_test, y1_train, y1_test = train_test_split(abstracts1, [0]*len(abstracts1), test_size=half_test_size, random_state=42)
    X2_train, X2_test, y2_train, y2_test = train_test_split(abstracts2, [1]*len(abstracts2), test_size=half_test_size, random_state=42)

    X_train = X1_train + X2_train
    X_test = X1_test + X2_test
    y_train = np.array(y1_train + y2_train)
    y_test = np.array(y1_test + y2_test)
    target_names = journals
    
    
    clf = Pipeline([('vect', CountVectorizer(ngram_range=(1,3))),
                    ('tfidf', TfidfTransformer()),
                    ('clf', LinearSVC(C=1,penalty='l1',dual=False,fit_intercept=True))])
    transform = clf.fit_transform(X_train, y_train)
    y_predict_train = clf.predict(X_train)
    y_predict_test = clf.predict(X_test)
    
    X_train_tfidf = clf.named_steps['tfidf'].transform(
                        clf.named_steps['vect'].transform(X_train))
    
    print (metrics.classification_report(y_test, y_predict_test,
                                    target_names=target_names))
#                                     target_names=test_target_names))
    print metrics.confusion_matrix(y_test, y_predict_test)
    print 'Accuracy: %f' % (metrics.accuracy_score(y_test, y_predict_test))

#     most_important_words = clf.named_steps['clf'].coef_.argsort()[:, ::-1]

#     print np.squeeze(X_train_tfidf[y_train==0, 3].toarray()).shape
#     print np.squeeze(X_train_tfidf[y_train==0, 3].toarray()).shape
    
    
#     terms =  clf.named_steps['vect'].get_feature_names()
#     result = [{'name': terms[word],
#                'value': clf.named_steps['clf'].coef_[0,word],
#                'vector1': np.squeeze(X_train_tfidf[y_train==0, word].toarray()).tolist(),
#                'vector2': np.squeeze(X_train_tfidf[y_train==1, word].toarray()).tolist(),
#               }
#                 for word in (np.concatenate((most_important_words[0, :15],
#                              most_important_words[0, -15:]))) ]
    return clf



In [9]:
categories = ['']
# categories = ['atom-ph', 'quant-ph', 'cond-mat', 'str-el']
# journals = ['PRL', 'PL', 'Nature']
journals = ['PRL', 'Nature']

start = time.time()

clf_journal = learn(journals, categories[0])

print (time.time() - start)

# final_dict = dict()
# for category in categories:
#     current_dict = dict()
#     for journal_pair in combinations(journals, 2):
#         print (category, journal_pair)
#         current_dict['/'.join(journal_pair)] = learn(journal_pair, category)
# #         current_dict[journal] = keywords(journal, category)
#     if category == '':
#         category = 'All'
#     final_dict[category.replace('-', '_')] = current_dict

# print (time.time() - start)

# # print json.dumps(final_dict)
# with open('svm_journal.json', 'wb') as f:
#     json.dump(final_dict, f)
# # learn(['Nature', 'PRL'], 'atom-ph')

             precision    recall  f1-score   support

        PRL       0.79      0.97      0.87      1159
     Nature       0.96      0.74      0.84      1159

avg / total       0.87      0.85      0.85      2318

[[1122   37]
 [ 301  858]]
Accuracy: 0.854185
138.313899994


In [10]:
joblib.dump(clf_journal, 'svm_journal.pkl', compress=1)

['svm_journal.pkl']