Use fancy machine learning to predict whether an article makes it into Nature/Science or PRL. This time we'll only look at articles in the physics.atom-ph section.

In [82]:
# Need to add the directory two levels up ('../../') to sys.path to find 'metadataDB'
import sys
sys.path.append('../../')

%matplotlib inline
import matplotlib.pyplot as plt 
import time
import numpy as np
import re
from itertools import combinations
import json

# Natural language processing toolkit
# To use this, run nltk.download() and download 'stopwords'
# from nltk.corpus import stopwords
# s=stopwords.words('english') + ['']

# Machine learning
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.cross_validation import train_test_split

# SQL
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from metadataDB.declareDatabase import *
from sqlalchemy import or_, and_

# Open the SQLite metadata database (path is relative to this notebook's
# working directory) and create the session used by the queries below.
DATABASE_URL = "sqlite:///../../arXiv_metadata.db"

engine = create_engine(DATABASE_URL, echo=False)
# `Base` is presumably the declarative base exported by the star import from
# metadataDB.declareDatabase -- TODO confirm.
Base.metadata.bind = engine
DBsession = sessionmaker(bind=engine)
session = DBsession()

In [65]:
# Category-name substrings to analyse.  Other values tried during exploration:
# 'atom-ph', 'cond-mat', 'quant-gas', 'hep-th', 'hep-ex' and '' (the empty
# string, which turns the '%<category>%' filter into a match-everything pattern).
categories = ['quant-ph']

# Maps a short journal label to the SQL LIKE patterns ('%' is the wildcard)
# that are tried against Article.journal_ref; an article counts as belonging
# to the journal when ANY of its patterns matches.
#
# The journal is called "Physical Review (Letters)"; the original patterns
# only contained the misspelling "Physics Review", so fully spelled-out
# references could never match.  The correct spelling is listed first and the
# old patterns are kept so nothing that matched before is lost.
#
# NOTE: these are prefix patterns, so the broad entries overlap the narrow
# ones: everything matched by 'PRL%' is also matched by 'PR%', and everything
# matched by 'APL%' is also matched by 'AP%'.
journals_dict = {'PRL': ['Physical Review Letters%',
                         'Physics Review Letters%',
                         'Phys. Rev. Lett.%',
                         'Phys.Rev.Lett.%',
                         'PRL%'],
                 'PR':  ['Physical Review%',
                         'Physics Review%',
                         'Phys. Rev.%',
                         'Phys.Rev.%',
                         'PR%'],
                 # Nature and Science are deliberately pooled under one label.
                 'Nature': ['Nature%',
                            'Nat.%',
                            'Science%'],
                 'APL': ['APL%',
                         'Appl.Phys.Lett.%',
                         'Appl. Phys. Lett.%',
                         'Applied Physics Letters%'],
                 'AP': ['AP%',
                        'Appl.Phys.%',
                        'Appl. Phys.%',
                        'Applied Physics%'],
                 # Single wildcard: matches any journal_ref value
                 'All': ['%'],
                 }


In [73]:
def get_abstracts(journal, category):
    """Fetch abstracts for one journal label within one arXiv category.

    Parameters
    ----------
    journal : str
        Key of ``journals_dict``; its LIKE patterns are OR-ed together and
        matched against ``Article.journal_ref``.
    category : str
        Substring that ``Category.name`` must contain.

    Returns
    -------
    list
        One abstract per matching ``Article_Category`` row, in query order.
        The text is returned raw; cleaning is left to ``CountVectorizer``.
    """
    in_category = Category.name.like('%' + category + '%')
    in_journal = or_(*(Article.journal_ref.like(pattern)
                       for pattern in journals_dict[journal]))

    matches = (session.query(Article_Category)
                      .join(Category)
                      .join(Article)
                      .filter(in_category, in_journal))

    abstracts = []
    for match in matches:
        abstracts.append(match.article.abstract)
    return abstracts


def learn(journals, category, test_fraction=0.2, n_terms=15):
    """Train a linear SVM that tells two journals apart by abstract text.

    Abstracts of ``journals[0]`` are labelled 0 and those of ``journals[1]``
    are labelled 1.  The same number of abstracts is held out from each
    journal so the test set is balanced.  A classification report, the
    confusion matrix and the accuracy on the test set are printed.

    Parameters
    ----------
    journals : sequence of two str
        Keys of ``journals_dict``; also used as class names in the report.
    category : str
        Category substring passed on to ``get_abstracts``.
    test_fraction : float, optional
        Fraction of the SMALLER journal's abstracts held out per journal.
    n_terms : int, optional
        Number of terms reported from each end of the weight ranking.

    Returns
    -------
    list of dict
        ``{'name': term, 'value': weight}`` entries ordered by decreasing
        weight: the ``n_terms`` largest weights (pointing towards
        ``journals[1]``) followed by the ``n_terms`` smallest (pointing
        towards ``journals[0]``).  If the vocabulary has at most
        ``2 * n_terms`` features, every feature is returned exactly once.
    """
    abstracts1 = get_abstracts(journals[0], category)
    abstracts2 = get_abstracts(journals[1], category)

    # Size the hold-out set from the smaller class so both journals
    # contribute the same number of test abstracts.
    half_test_size = int(round(test_fraction * min(len(abstracts1), len(abstracts2))))

    X1_train, X1_test, y1_train, y1_test = train_test_split(
        abstracts1, [0] * len(abstracts1),
        test_size=half_test_size, random_state=42)
    X2_train, X2_test, y2_train, y2_test = train_test_split(
        abstracts2, [1] * len(abstracts2),
        test_size=half_test_size, random_state=42)

    X_train = X1_train + X2_train
    X_test = X1_test + X2_test
    y_train = np.array(y1_train + y2_train)
    y_test = np.array(y1_test + y2_test)

    clf = Pipeline([('vect', CountVectorizer(ngram_range=(1, 3))),
                    ('tfidf', TfidfTransformer()),
                    ('clf', LinearSVC(C=1, penalty='l1', dual=False, fit_intercept=True))])
    # Only fitting is needed here.  The previous fit_transform() call threw
    # its result away and additionally required the final LinearSVC step to
    # expose transform().
    clf.fit(X_train, y_train)
    y_predict_test = clf.predict(X_test)

    print(metrics.classification_report(y_test, y_predict_test,
                                        target_names=journals))
    print(metrics.confusion_matrix(y_test, y_predict_test))
    print('Accuracy: %f' % (metrics.accuracy_score(y_test, y_predict_test)))

    # Binary problem: coef_ has a single row holding one weight per n-gram.
    weights = clf.named_steps['clf'].coef_[0]
    ranked = weights.argsort()[::-1]          # feature indices, largest weight first
    n_features = len(ranked)
    if 2 * n_terms >= n_features:
        # The two ends would overlap: report each feature once instead of
        # emitting duplicates.
        selected = ranked
    else:
        selected = np.concatenate((ranked[:n_terms],
                                   ranked[n_features - n_terms:]))

    terms = clf.named_steps['vect'].get_feature_names()
    return [{'name': terms[index], 'value': weights[index]}
            for index in selected]



In [91]:
# Train one classifier per (category, journal pair) and dump the most
# discriminating terms of each to 'svm_journal.json'.
categories = ['atom-ph', 'quant-ph', 'cond-mat', 'str-el']
journals = ['PRL', 'APL', 'Nature']

start = time.time()

# final_dict: category (with '-' replaced by '_') -> {'J1/J2': learn(...) result}
final_dict = dict()
for category in categories:
    current_dict = dict()
    for journal_pair in combinations(journals, 2):
        # Print the pair as a single tuple so the output is the same under
        # Python 2 and Python 3.
        print((category, journal_pair))
        current_dict['/'.join(journal_pair)] = learn(journal_pair, category)
    # An empty category string matches every category; store it as 'All'.
    if category == '':
        category = 'All'
    final_dict[category.replace('-', '_')] = current_dict

# Elapsed wall-clock time in seconds
print(time.time() - start)

# json.dump writes text, so the file is opened in text mode ('wb' raises a
# TypeError on Python 3).
with open('svm_journal.json', 'w') as f:
    json.dump(final_dict, f)
# Single-pair example: learn(['Nature', 'PRL'], 'atom-ph')

('atom-ph', ('PRL', 'APL'))
6
             precision    recall  f1-score   support

        PRL       0.50      1.00      0.67         6
        APL       0.00      0.00      0.00         6

avg / total       0.25      0.50      0.33        12

[[6 0]
 [6 0]]
0.5
('atom-ph', ('PRL', 'Nature'))
28
             precision    recall  f1-score   support

        PRL       0.76      1.00      0.86        28
     Nature       1.00      0.68      0.81        28

avg / total       0.88      0.84      0.84        56

[[28  0]
 [ 9 19]]
0.839285714286
('atom-ph', ('APL', 'Nature'))
6
             precision    recall  f1-score   support

        APL       0.00      0.00      0.00         6
     Nature       0.50      1.00      0.67         6

avg / total       0.25      0.50      0.33        12

[[0 6]
 [0 6]]
0.5
('quant-ph', ('PRL', 'APL'))
45
             precision    recall  f1-score   support

        PRL       0.57      1.00      0.73        45
        APL       1.00      0.24      0.39     

In [10]:
# Load the two classes for the manual experiment below:
# abstracts1 -> 'Nature' label, abstracts2 -> 'PRL' label, both in quant-ph.
abstracts1 = get_abstracts(journal='Nature', category='quant-ph')
abstracts2 = get_abstracts(journal='PRL', category='quant-ph')

In [83]:
# Class sizes before splitting
print(len(abstracts1))
print(len(abstracts2))

# Hold out the same fixed number of abstracts from each class so the test
# set is balanced.
half_test_size = 500

# Class 0 = abstracts1, class 1 = abstracts2
X1_train, X1_test, y1_train, y1_test = train_test_split(
    abstracts1, [0] * len(abstracts1),
    test_size=half_test_size, random_state=42)
X2_train, X2_test, y2_train, y2_test = train_test_split(
    abstracts2, [1] * len(abstracts2),
    test_size=half_test_size, random_state=42)

# Merge the per-class splits; labels become numpy arrays
X_train, X_test = X1_train + X2_train, X1_test + X2_test
y_train, y_test = np.array(y1_train + y2_train), np.array(y1_test + y2_test)
target_names = ['Nature', 'PRL']

# print len(y_test)
# print y_test

770
3353


In [6]:
# Close the SQLAlchemy session(s) now that the abstracts have been fetched;
# the later cells shown here only work on the data already held in memory.
# NOTE(review): close_all() looks like a session-wide (class-level) close,
# not just this one session -- confirm against the installed SQLAlchemy.
session.close_all()

In [7]:
# # Train with 80% of the Nature data, test with 20% of the Nature data
# # Choose the same number of PRL and Nature articles in the test sets.

# indNatureScience = len(abstractNatureScience)*4/5
# indPRL = len(abstractPRL) - (len(abstractNatureScience) - indNatureScience)

# train_abstract = abstractPRL[:indPRL] + abstractNatureScience[:indNatureScience]
# train_title = titlePRL[:indPRL] + titleNatureScience[:indNatureScience]
# train_target = [0]*indPRL + [1]*indNatureScience
# train_target_names = ['PRL']*indPRL + ['Nature']*indNatureScience

# test_abstract = abstractPRL[indPRL:] + abstractNatureScience[indNatureScience:]
# test_title = titlePRL[indPRL:] + titleNatureScience[indNatureScience:]
# test_target = [0]*len(abstractPRL[indPRL:]) + [1]*len(abstractNatureScience[indNatureScience:])
# test_target_names = ['PRL', 'Nature/Science']

In [14]:
# Earlier note: SVC(kernel='linear') is good
# Text pipeline: 1- and 2-gram counts -> tf-idf weighting -> L1-penalised
# linear SVM.
pipeline_steps = [
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(C=1, penalty='l1', dual=False, fit_intercept=True)),
]
clf = Pipeline(pipeline_steps)

# fit_transform() needs the final LinearSVC step to provide transform().
# NOTE(review): confirm the installed scikit-learn still offers it.
transform = clf.fit_transform(X_train, y_train)
y_predict_train = clf.predict(X_train)
y_predict_test = clf.predict(X_test)

# Shape of the transformed training data
print(transform.shape)

(3123, 204)


In [15]:
# vect = CountVectorizer(ngram_range=(1,1))
# x_train_counts = vect.fit_transform(X_train)
# # tfidf = TfidfTransformer.fit_transform(vect)
# print x_train_counts.shape
# x_train_tfidf = TfidfTransformer().fit_transform(x_train_counts)
# print x_train_tfidf.shape
# clf = LinearSVC(C=1,penalty='l1',dual=False,)
# print clf.fit_transform(x_train_tfidf, y_train).shape

In [16]:
# Evaluate the fitted pipeline on the held-out abstracts
# (earlier note: SVC(kernel='linear'))
test_report = metrics.classification_report(y_test, y_predict_test,
                                            target_names=target_names)
print(test_report)

test_confusion = metrics.confusion_matrix(y_test, y_predict_test)
print(test_confusion)

test_accuracy = metrics.accuracy_score(y_test, y_predict_test)
print(test_accuracy)

             precision    recall  f1-score   support

     Nature       0.97      0.35      0.51       500
        PRL       0.60      0.99      0.75       500

avg / total       0.78      0.67      0.63      1000

[[174 326]
 [  6 494]]
0.668
