### In this notebook, we'll compare the clustering of atom-ph articles (clustering-atom-ph.ipynb) with DAMOP2016 sessions

In [1]:
from collections import Counter
import json
from sklearn.externals import joblib

In [2]:
# First, load cluster predictor for atom-ph articles
# clf = joblib.load('cluster-AMO-winner.pkl') 
# clf = joblib.load('cluster-AMO-optics.pkl')
clf = joblib.load('cluster-AMO.pkl')

In [3]:
# Second, load articles from DAMOP
with open('../../damop data/damop2016.json') as f:
    damop = json.load(f)

In [4]:
exclude_list = ['Graduate Student Symposium',
                'DAMOP Prize Session',
                'DAMOP Thesis Prize Session',
               ]

In [5]:
sessions_all = 0
sessions_one_majority = 0
sessions_two_majority = 0

n_clusters = clf.get_params()['clf__n_clusters']
cluster_to_session = dict((x, []) for x in range(n_clusters))
sessions_unclassified = []

for session in damop:
    abstracts = map(lambda x: x['abstract'], session['abstracts'])
    if (len(abstracts) > 4) and (len(abstracts) < 40):
        y = clf.predict(abstracts)
        count = Counter(y)
        session_number_name = "{}: {}".format(session['number'], session['name'])
        print session_number_name
        sessions_all += 1

        if 1.*count.most_common(1)[0][1] >= 0.5*len(abstracts):
            print 'Majority cluster: {}'.format(count.most_common(1)[0][0])
            sessions_one_majority += 1
            
            cluster_to_session[count.most_common(1)[0][0]].append(session_number_name + ' (*)')
            
        elif 1.*(count.most_common(2)[0][1] + count.most_common(2)[1][1]) >= 0.5*len(abstracts):
            print 'Majority clusters: {}, {}'.format(count.most_common(2)[0][0], count.most_common(2)[1][0])
            sessions_two_majority += 1
            
            cluster_to_session[count.most_common(2)[0][0]].append(session_number_name)
            cluster_to_session[count.most_common(2)[1][0]].append(session_number_name)
            
        else:
            print y
            sessions_unclassified.append(session_number_name)
        print ''
        
        if session['number'] == 'A1':
            break

1A: Graduate Student Symposium
Majority cluster: 9

B3: Quantum Gases with Dipolar Interactions
[ 2  5  5  9 11 14 10]

B4: Quantum Optics I
Majority cluster: 9

B5: Many-Body Localization and Disorder
Majority cluster: 14

B6: Progress in Spin-Orbit Coupling
Majority cluster: 12

B7: Nonlinear Optics and Lasers
Majority cluster: 9

B9: Photoionization, Photodetachment and Photodissociation
Majority cluster: 18

C4: Hybrid Quantum Systems
Majority clusters: 3, 9

C5: BEC with Strong Interactions
Majority clusters: 5, 8

C6: Quantum Gas Microscope
Majority clusters: 11, 10

C7: Atomic Clocks
Majority cluster: 0

C9: Strong-Field Physics in Atoms, Molecules, and Clusters
Majority cluster: 18

G4: Quantum Measurement
Majority clusters: 9, 3

G5: Atomic Magnetometers I
Majority cluster: 3

G6: One-Dimensional Gases and Nanofibers
Majority clusters: 2, 9

G7: Interaction Effects in Spin-Orbit Coupled Gases
Majority clusters: 12, 1

G8: Time-Resolved Electron Dynamics and Attosecond Spectros

#### Print DAMOP sessions that fall into each cluster.

In [6]:
order_centroids = clf.named_steps['clf'].cluster_centers_.argsort()[:, ::-1]

terms =  clf.named_steps['vect'].get_feature_names()

for cluster, val in cluster_to_session.iteritems():
    print "Cluster {}: {}".format(cluster, ', '.join([terms[x] for x in order_centroids[cluster, :10]]))
    for session in val:
        print '    {}'.format(session)
    print ''

Cluster 0: frequency, laser, clock, optical, nm, transition, spectroscopy, clocks, 10, comb
    C7: Atomic Clocks (*)
    P9: Quantum Control II
    T7: Spectroscopy, Lifetimes, Oscillator Strengths

Cluster 1: spin, spin orbit, orbit, orbit coupling, spin orbit coupling, coupling, rashba, soc, phase, orbit coupled
    G7: Interaction Effects in Spin-Orbit Coupled Gases
    H7: Few-body Systems

Cluster 2: fermi, gas, density, fermi gas, temperature, bose, interacting, interaction, bose gas, dimensional
    G6: One-Dimensional Gases and Nanofibers

Cluster 3: spin, magnetic, magnetic field, field, fields, atoms, magnetic fields, quantum, state, states
    C4: Hybrid Quantum Systems
    G4: Quantum Measurement
    G5: Atomic Magnetometers I (*)
    H6: Two-Dimensional Gases
    J6: Spinor Gases
    M5: Quantum Control I
    T6: Atomic Magnetometers II (*)

Cluster 4: vortex, vortices, condensate, bose einstein, einstein, bose, condensates, turbulence, rotating, dynamics
    J6: Spinor G

In [7]:
print 'Sessions without clusters'
for session in sessions_unclassified:
    print session
    print ''
print ''
print 'Clusters without sessions'
for cluster, session in cluster_to_session.iteritems():
    if len(session) == 0:
        print "Cluster {}: {}".format(cluster, ', '.join([terms[x] for x in order_centroids[cluster, :10]]))

Sessions without clusters
B3: Quantum Gases with Dipolar Interactions

J8: Impurities in Quantum Gases

N7: Long-range or Anisotropic Interactions in Cold Gases


Clusters without sessions
Cluster 15: solitons, soliton, dark, bright, dark solitons, dark soliton, nonlinear, einstein, bose einstein, bright solitons


#### What fraction of the DAMOP sessions are covered by one or two clusters?

In [8]:
print (sessions_one_majority)*1./sessions_all
print (sessions_one_majority + sessions_two_majority)*1./sessions_all
print sessions_all

0.542372881356
0.949152542373
59
