Can I predict the existence of subfields with some cool unsupervised learning algorithm? 

For starters, let's just use regular n-grams. A more advanced version would be to look for noun phrases or J&K POS tags.

In [1]:
#Need to add parent directoy to sys.path to find 'metadataDB'
import sys
sys.path.append('../../')

%matplotlib inline
# import matplotlib.pyplot as plt 
import time
import numpy as np
# import scipy as sp
import re
from collections import Counter
import itertools
import random

# Natural language processing toolkit
# To use this, run nltk.download() and download 'stopwords'
# from nltk.corpus import stopwords
# s=stopwords.words('english') + ['']

# Machine learning
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.cluster import KMeans
from sklearn.decomposition import SparsePCA
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
# from sklearn import metrics

# SQL
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from metadataDB.declareDatabase import *
from sqlalchemy import or_, and_

engine = create_engine("sqlite:///../../arXiv_metadata.db", echo=False)
Base.metadata.bind = engine
DBsession = sessionmaker(bind=engine)
session = DBsession()

In [2]:
# What are the available categories?
categories = sorted([x.name for x in session.query(Category)])
print categories 

[u'acc-phys', u'adap-org', u'alg-geom', u'ao-sci', u'astro-ph', u'astro-ph.CO', u'astro-ph.EP', u'astro-ph.GA', u'astro-ph.HE', u'astro-ph.IM', u'astro-ph.SR', u'atom-ph', u'bayes-an', u'chao-dyn', u'chem-ph', u'comp-gas', u'cond-mat', u'cond-mat.dis-nn', u'cond-mat.mes-hall', u'cond-mat.mtrl-sci', u'cond-mat.other', u'cond-mat.quant-gas', u'cond-mat.soft', u'cond-mat.stat-mech', u'cond-mat.str-el', u'cond-mat.supr-con', u'cs.AI', u'cs.AR', u'cs.CC', u'cs.CE', u'cs.CG', u'cs.CL', u'cs.CR', u'cs.CV', u'cs.CY', u'cs.DB', u'cs.DC', u'cs.DL', u'cs.DM', u'cs.DS', u'cs.ET', u'cs.FL', u'cs.GL', u'cs.GR', u'cs.GT', u'cs.HC', u'cs.IR', u'cs.IT', u'cs.LG', u'cs.LO', u'cs.MA', u'cs.MM', u'cs.MS', u'cs.NA', u'cs.NE', u'cs.NI', u'cs.OH', u'cs.PF', u'cs.PL', u'cs.RO', u'cs.SC', u'cs.SD', u'cs.SE', u'cs.SI', u'cs.SY', u'dg-ga', u'funct-an', u'gr-qc', u'hep-ex', u'hep-lat', u'hep-ph', u'hep-th', u'math-ph', u'math.AC', u'math.AG', u'math.AP', u'math.AT', u'math.CA', u'math.CO', u'math.CT', u'math.CV',

In [3]:
abstract_all_tmp = {'category': [], 'abstract': []}
category_list = sorted(['atom-ph', 'quant-ph', 'optics', 'nlin', 'str-el', 'stat'])
# category_list = sorted(['quant-ph', 'str-el', 'hep-', 'mtrl-sci', 'supr-con'])
category_len = len(category_list)

start = time.time()
for item in category_list:
    query = session.query(Article_Category)\
                        .join(Category)\
                        .join(Article)\
                        .filter(Category.name.like('%' + item + '%'),
                                or_(Article.journal_ref.like('Physics Review Letters%'),
                                              Article.journal_ref.like('Phys. Rev. Lett.%'),
                                              Article.journal_ref.like('PRL%')))
#     query = session.query(Article_Category)\
#                         .join(Category)\
#                         .join(Article)\
#                         .filter(Category.name.like('%' + item + '%'))
    result = [' '.join(x.article.abstract.split()) for x in query]
    abstract_all_tmp['abstract'].extend(result)
    abstract_all_tmp['category'].extend([item]*len(result))
print time.time() - start
# for item in query:
#     abstract_all['category'].append(item.category.name)
#     abstract_all['abstract'].append(' '.join(item.article.abstract.split()))
# print time.time() - start
# abstract_all['atom-ph'] = [x.article.abstract for x in query.all()]
# session.close_all()

24.8727078438


In [4]:
# Breakdown of categories?
count = Counter(abstract_all_tmp['category'])
for key, val in count.iteritems():
    print '{:<15}{}'.format(key, val)
print '{:<15}{}'.format('Total', len(abstract_all_tmp['abstract']))

stat           1850
atom-ph        575
str-el         3269
nlin           384
optics         504
quant-ph       3092
Total          9674


In [5]:
##Oops! How many overlapping articles do we have? I forgot that arXiv categories aren't unique.
# Let's remove all duplicates.
# This is slow but I am tired.

counter_duplicate = Counter(abstract_all_tmp['abstract'])

abstract_all = {'category': [], 'abstract': []}
for cat, abstract in itertools.izip(abstract_all_tmp['category'], abstract_all_tmp['abstract']):
    if counter_duplicate[abstract] == 1:
        abstract_all['category'].append(cat)
        abstract_all['abstract'].append(abstract)
print len(abstract_all['category'])
print len(abstract_all['abstract'])

7070
7070


In [6]:
# Breakdown of categories? That's a lot of repetition!!!
count = Counter(abstract_all['category'])
for key, val in count.iteritems():
    print '{:<15}{}'.format(key, val)
print '{:<15}{}'.format('Total', len(abstract_all['abstract']))

stat           1301
atom-ph        268
str-el         2846
nlin           191
optics         242
quant-ph       2222
Total          7070


In [7]:
# Train on 80% of the data. Random_state ensures that we always get the same result.
x_train, x_test, y_train, y_test = train_test_split(abstract_all['abstract'],
                                                    abstract_all['category'],
                                                    random_state=42,
                                                    train_size=0.8)

counter_train = Counter(y_train)

Okay, I lied, I'm starting with supervised learning (as a comparison). We're looking at ~60-70% accuracy for these cateogories.

In [9]:
#SVC(kernel='linear') is good
clf_supervised = Pipeline([('vect', CountVectorizer(ngram_range=(1,3))),
                           ('tfidf', TfidfTransformer()),
#                            ('clf', LinearSVC())])
                           ('clf', LinearSVC(C=1,penalty='l1',dual=False,))])
start = time.time()
clf_supervised.fit(x_train, y_train)
print time.time() - start

start = time.time()
predict = clf_supervised.predict(x_test)
print time.time() - start
#print text_abstract_clf.predict(train_abstract)

17.4500420094
1.16515302658


In [10]:
print(classification_report(y_test, predict))
print('Accuracy score: %0.2f' % accuracy_score(y_test, predict))

             precision    recall  f1-score   support

    atom-ph       0.76      0.58      0.66        60
       nlin       0.71      0.23      0.35        43
     optics       0.78      0.62      0.69        50
   quant-ph       0.88      0.89      0.88       448
       stat       0.78      0.80      0.79       266
     str-el       0.87      0.94      0.90       547

avg / total       0.84      0.85      0.84      1414

Accuracy score: 0.85


Find the most important words.

In [11]:
# Most important chunks. See http://scikit-learn.org/stable/auto_examples/text/document_clustering.html
most_important_words = clf_supervised.named_steps['clf'].coef_.argsort()[:, ::-1]

terms =  clf_supervised.named_steps['vect'].get_feature_names()
for i in range(len(category_list)):
    print "Category %s:" % (category_list[i])
    print ', '.join([terms[x] for x in most_important_words[i, :20]])
    print ''

Category atom-ph:
ionization, the fine structure, collisions, 2s, molecules, limits on, ultracold, clock, positron, laser, atoms, the factor, feshbach, attosecond, atom, dipole, edm, helium, precision, 10

Category nlin:
chaotic, turbulence, patterns, random matrix, oscillators, periodic, nonlinear, billiards, orbits, flow, numerical, turbulent, flows, solution, kicked, semiclassical, solitons, synchronization, structure functions, frequencies

Category optics:
photonic, plasmonic, metamaterial, metamaterials, radiation, optical, media, polariton, electromagnetic, lasing, optically, light, resonators, transmission, beam, plasmon, pulses, material, generation, wavelength

Category quant-ph:
quantum, qubit, qubits, entanglement, bell, detuning, entangled, scheme, casimir, photon, photons, operators, measurement, cavity, optimal, inequality, variable, nitrogen, detection, vacuum

Category stat:
simulations, granular, stochastic, hard, dna, growth, random, equilibrium, thermodynamics, perc

Now, try KMeans clustering. 
See: http://scikit-learn.org/stable/auto_examples/text/document_clustering.html

In [12]:
n_clusters = 10
# Reduce n_init to 10 for testing purposes.
clf_unsupervised = Pipeline([('vect', CountVectorizer(ngram_range=(1,3), stop_words='english')),
                             ('tfidf', TfidfTransformer()),
                             ('clf', KMeans(n_clusters=n_clusters, n_init=50))])
start = time.time()
clf_unsupervised.fit(x_train)
print time.time() - start

start = time.time()
predict_train = clf_unsupervised.predict(x_train)
predict = clf_unsupervised.predict(x_test)
print time.time() - start


446.83970499
3.38381505013


In [13]:
# Which clusters most closely align with which the original categories?
# Find the strongest correlation, and assign that cluster to the category. Iterate.
# matrix_train = [[sum((a==cat and b==x for a,b in zip(y_train, predict_train)))
#                    for x in range(0,n_clusters)] 
#                    for cat in category_list]


counter_category = Counter(y_train)
counter_cluster = Counter(predict_train)
accuracy_train_initial = np.array(
#     [[sum((a==cat and b==x for a,b in zip(y_train, predict_train)))*1./counter_cluster[x]
    [[sum((a==cat and b==x for a,b in zip(y_train, predict_train)))*1./counter_category[cat]
           for x in range(0,n_clusters)] 
           for cat in category_list])
clusterToCategory = dict()
# categoryToCluster = dict()

for cluster, item in enumerate(np.argmax(accuracy_train_initial, axis=0).tolist()):
    clusterToCategory[cluster] = category_list[item]


# category_list_remaining = list(category_list)
# cluster_list_remaining = range(0, n_clusters)
# for x in range(0, len(category_list)):
#     accuracy_train = np.array(
#                         [[sum((a==cat and b==x for a,b in zip(y_train, predict_train)))*1./counter_cluster[x]
#                            for x in cluster_list_remaining] 
#                            for cat in category_list_remaining])

    
#     # Find largest value in the category axis
#     cat_ind, cluster_ind = np.unravel_index(np.argmax(accuracy_train), accuracy_train.shape)
#     clusterToCategory[cluster_list_remaining[cluster_ind]] = category_list_remaining[cat_ind]
#     categoryToCluster[category_list_remaining[cat_ind]] = cluster_list_remaining[cluster_ind]
    
#     # Remove those entries from the lists.
#     category_list_remaining.remove(category_list_remaining[cat_ind])
#     cluster_list_remaining.remove(cluster_list_remaining[cluster_ind])
# #     break

# # The remaining clusters predict empty strings
# for item in cluster_list_remaining:
#     clusterToCategory[item] = ''
# clusterToCategory_list = sorted(clusterToCategory.values())

In [14]:
# The table is normalized by number of elements in each cluster.
print 'Training data'
print ('{:<10}' + '{:<10}'  *n_clusters).format('', *range(0, n_clusters))
print ('{:<10}' + '{:<10}'  *n_clusters).format('', *[clusterToCategory[x] for x in range(0, n_clusters)])
for cat, item in zip(category_list, accuracy_train_initial):
    print ('{:<10}' + '{:<10.2}'*n_clusters).format(cat, *item)

Training data
          0         1         2         3         4         5         6         7         8         9         
          str-el    atom-ph   nlin      atom-ph   str-el    quant-ph  str-el    str-el    quant-ph  stat      
atom-ph   0.0       0.72      0.067     0.053     0.048     0.0       0.0048    0.072     0.0       0.034     
nlin      0.0068    0.11      0.76      0.0068    0.014     0.0       0.0068    0.02      0.034     0.047     
optics    0.083     0.69      0.13      0.016     0.026     0.0       0.0052    0.026     0.0052    0.016     
quant-ph  0.025     0.24      0.067     0.028     0.055     0.16      0.0051    0.0051    0.39      0.029     
stat      0.011     0.032     0.61      0.052     0.021     0.0048    0.014     0.016     0.0077    0.23      
str-el    0.12      0.014     0.036     0.025     0.099     0.0052    0.15      0.33      0.0026    0.22      


In [15]:
# Is there overlap between the clusters and existing categories ('ground truth')?
matrix = [[sum((a==cat and b==x for a,b in zip(y_test, predict)))
           for x in range(0,n_clusters)] 
           for cat in category_list]

print 'Test data'
print ('{:<10}' + '{:<10}'  *n_clusters).format('', *range(0, n_clusters))
print ('{:<10}' + '{:<10}'  *n_clusters).format('', *[clusterToCategory[x] for x in range(0, n_clusters)])
for cat, item in zip(category_list, matrix):
    print ('{:<10}' + '{:<10}'*n_clusters).format(cat, *item)
    
    
# # Oops, this is the same as the confusion matrix?
# tmp_reverse_category = dict([(y,x) for x,y in enumerate(category_list)])
# y_test_num = [tmp_reverse_category[x] for x in y_test]
# print ''
# print 'Confusion matrix:'
# print confusion_matrix(y_test_num, predict)

Test data
          0         1         2         3         4         5         6         7         8         9         
          str-el    atom-ph   nlin      atom-ph   str-el    quant-ph  str-el    str-el    quant-ph  stat      
atom-ph   0         47        3         1         6         0         0         3         0         0         
nlin      1         5         26        0         2         0         0         4         2         3         
optics    1         38        7         1         1         0         0         0         1         1         
quant-ph  12        113       15        9         22        68        1         4         189       15        
stat      2         12        146       10        5         0         3         5         5         78        
str-el    50        5         27        20        69        1         69        191       2         113       


In [16]:
# We can now make a prediction based on these categories.
predict_category = [clusterToCategory[y] for y in predict]

print(classification_report(y_test, predict_category))
print('Accuracy score: %0.2f' % accuracy_score(y_test, predict_category))
print confusion_matrix(y_test, predict_category)

             precision    recall  f1-score   support

    atom-ph       0.18      0.80      0.30        60
       nlin       0.12      0.60      0.19        43
     optics       0.00      0.00      0.00        50
   quant-ph       0.96      0.57      0.72       448
       stat       0.37      0.29      0.33       266
     str-el       0.84      0.69      0.76       547

avg / total       0.71      0.56      0.60      1414

Accuracy score: 0.56
[[ 48   3   0   0   0   9]
 [  5  26   0   2   3   7]
 [ 39   7   0   1   1   2]
 [122  15   0 257  15  39]
 [ 22 146   0   5  78  15]
 [ 25  27   0   3 113 379]]


  'precision', 'predicted', average, warn_for)


In [17]:
# Most important chunks. See http://scikit-learn.org/stable/auto_examples/text/document_clustering.html
order_centroids = clf_unsupervised.named_steps['clf'].cluster_centers_.argsort()[:, ::-1]

terms =  clf_unsupervised.named_steps['vect'].get_feature_names()
for i in range(n_clusters):
    print "Cluster %d (%s):" % (i, clusterToCategory[i])
    print ', '.join([terms[x] for x in order_centroids[i, :20]])
    print ''

Cluster 0 (str-el):
topological, hall, states, quantum hall, quantum, edge, fractional, nu, state, spin, phase, fractional quantum, fractional quantum hall, symmetry, majorana, surface, phases, abelian, chiral, insulator

Cluster 1 (atom-ph):
photon, optical, atoms, cavity, laser, quantum, light, single, state, field, frequency, atom, atomic, states, pulse, coupling, photons, pulses, time, using

Cluster 2 (nlin):
time, dynamics, model, distribution, systems, networks, non, random, particles, equilibrium, energy, study, particle, results, network, density, scaling, simulations, theory, law

Cluster 3 (atom-ph):
phys rev, rev, phys, lett, phys rev lett, rev lett, et al, et, al, comment, al phys, al phys rev, et al phys, reply, cond, mat, cond mat, arxiv, reply comment, recent

Cluster 4 (str-el):
spin, spin orbit, orbit, coupling, magnetic, quantum, state, electron, excitations, interaction, spins, nuclear, liquid, polarization, exchange, field, model, lattice, interactions, spin liquid

PCA is another interesting approach.

In [18]:
# clf_pca = Pipeline([('vect', CountVectorizer(ngram_range=(1,3))),
#                              ('tfidf', TfidfTransformer()),
#                              ('clf', KMeans(n_components=6))])
# X = clf_pca.fit(x_train)
# # predict = clf_pca.predict(x_test)