In [177]:
import corex_topic as ct
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import pandas as pd
import os
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
import re
from collections import defaultdict

## Read arXiv papers

Note that these have already been collected via arXiv's OAI API, and text fields have been cleaned (lemmatized + nltk stopwords removed)

In [None]:
data = []
top_dir = "data/"
for file_name in os.listdir(top_dir):
    if not (file_name.startswith("arxiv") and file_name.endswith(".json")):
        continue
    _df = pd.read_json(top_dir+file_name,orient="records")
    condition = _df["categories"].str.contains("stat.ML") | _df["categories"].str.contains("cs.")
    condition = condition & (_df.summary.apply(lambda s : len(s) > 20))
    new_df = _df.loc[condition].copy()
    data.append(new_df)
    del (_df)
df = pd.concat(data)

In [5]:
len(df)

253047

## Topic modelling
### Preprocessing: Generate a one-hot bag of words

In [6]:
# Apply basic settings, including ngram generation
count_vectoriser = CountVectorizer(binary=True, min_df=0.001, ngram_range=(1,3))
count_vectoriser.fit(map(lambda x : " ".join(x), df.summary))

# Generate vocab mapping
X = count_vectoriser.transform(map(lambda x : " ".join(x), df.summary))
vocab = {v:k for k,v in count_vectoriser.vocabulary_.items()}
print("Got vocab of size",len(vocab))

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0.001,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

### Topic modelling using CorEx 
#### Finding the optimal number of topics

In [55]:
# Commented out since process is slow, see results below
# for i in range(10,50):
#     topic_model = ct.Corex(n_hidden=i)
#     topic_model.fit(X)
#     print(i, topic_model.tc)

In [10]:
topic_model = ct.Corex(n_hidden=28)
topic_model.fit(X)

21.138360113264966

#### Get the topics, and generate topic weights for each arXiv paper

In [11]:
topics = topic_model.get_topics()
# Build topic names
topic_names = ["_".join(vocab[i] for i,_ in topic) for topic in topics]

NOTE: 'words' not provided to CorEx. Returning topics as lists of column indices


In [18]:
for t in topic_names:
    print(t,"\n")

energy_electron_wave_field_magnetic_particle_temperature_surface_atom_ion  

learning_neural_neural network_state art_art_training_image_task_deep_classification  

optical_laser_beam_light_photon_pulse_detector_frequency_resonance_wavelength  

channel_wireless_transmission_receiver_interference_communication_transmitter_transmit_mimo_antenna  

polynomial_prove_np_polynomial time_log_known_given_np hard_number_bounded  

algorithm_problem_bound_optimal_optimization_lower bound_upper bound_optimization problem_complexity_upper  

network_world_social_real world_user_service_real_node_social network_resource  

research_software_development_year_science_web_challenge_tool_scientific_application  

estimation_sparse_matrix_bayesian_likelihood_error_estimator_markov_convergence_estimate  

dynamic_interaction_evolution_observed_dynamical_law_force_behavior_phenomenon_formation  

system_design_technology_device_high_control_sensor_mobile_environment_hardware  

quantum_equation_matter_me

In [52]:
# top_dir = "../data/"
# ifile = 0
# nfiles = len(os.listdir(top_dir))
# all_dfs = []
# # Iterate over files in the data dir
# for file_name in os.listdir(top_dir):    
#     if not (file_name.startswith("arxiv") and file_name.endswith(".json")):
#         continue
#     ifile += 1    
#     print("\r",ifile,"   (",ifile/nfiles,")               ",end="")
#     x_df = pd.read_json(top_dir+file_name,orient="records")
#     # Filter out relevant arXiv categories
#     condition = x_df.categories.apply(lambda x : any(y.startswith("cs.") or y == "stat.ML"
#                                                     for y in x.split()))
#     # Require at least 20 words per summary
#     condition = condition & (x_df.summary.apply(lambda s : len(s) > 20))
#     # Make a copy of the subset
#     _df = x_df.loc[condition].copy()
#     _df.reset_index(inplace=True)
#     del x_df
#     # Bag up the words and generate the topic weights
#     X_all = count_vectoriser.transform(map(lambda x : " ".join(x), _df.summary))    
#     topic_weights = defaultdict(list)
#     for t,_t in zip(topic_names,topics):
#         # Weight is given by the the sum of weights for the topic in this summary
#         for bow,(irow,row) in zip(X_all,_df.iterrows()):
#             total_weight = 0
#             for idx, weight in _t:                
#                 total_weight += bow[0,idx]*weight
#             topic_weights[t].append(total_weight)
#     # Assign the weights to the DF
#     for t,w in topic_weights.items():
#         _df[t] = w
#     all_dfs.append(_df)

# # Bring it all together
# _df = pd.concat(all_dfs)

In [95]:
# Bag up the words and generate the topic weights
topic_weights = defaultdict(list)
df_corex = df.copy()
for t, _t in zip(topic_names, topics):
    # Weight is given by the the sum of weights for the topic in this summary
    for bow, (irow, row) in zip(X, df_corex.iterrows()):
        total_weight = 0
        for idx, weight in _t:
            total_weight += bow[0, idx]*weight
        topic_weights[t].append(total_weight)
# Assign the weights to the DF
for t, w in topic_weights.items():
    df_corex[t] = w
df_corex.to_json("arxiv_corex_with_weights.json",orient="records")
del df_corex

In [97]:
df_corex = pd.read_json("arxiv_corex_with_weights.json",orient="records")

253047

### Topic modelling using LDA 
#### Finding the optimal number of topics

In [132]:
# #corpus = [[(idx, 1) for idx in row.indices] for row in X]
# gendict = Dictionary(df.summary.values)
# corpus = [gendict.doc2bow(text) for text in df.summary.values]

In [180]:
## need to preprocess
texts = []
for row in X:
    sentence = [vocab[w] for w in row.indices]
    texts.append(sentence)

In [None]:
gendict = Dictionary(texts)
corpus = [gendict.doc2bow(text) for text in texts]

In [None]:
for n in np.arange(20,30,1):
    lda_model = LdaMulticore(corpus, num_topics=n, id2word=gendict, 
                             iterations=1000, chunksize=20000)
    coherence_model_lda = CoherenceModel(model=lda_model, corpus=corpus, 
                                         texts=texts, coherence='c_uci')
    print(n, coherence_model_lda.get_coherence())
    del lda_model

20 -4.992124503469249
21 -5.059349407257665


In [142]:
lda_model = LdaMulticore(corpus, num_topics=26, id2word=gendict, 
                         iterations=1000, chunksize=20000)

In [146]:
for i in range(0,26):
    print("_".join(gendict[idx] for idx,_  in lda_model.get_topic_terms(i)))

model_network_algorithm_graph_time_based_method_problem_approach_system
model_energy_state_scale_time_two_flow_system_number_show
algorithm_problem_method_image_data_learning_based_system_model_paper
time_algorithm_n_method_problem_two_result_space_set_show
model_result_system_bound_show_problem_two_one_field_also
system_time_quantum_model_state_based_show_paper_equation_approach
g_k_method_graph_n_two_network_result_algorithm_state
model_data_method_based_learning_problem_show_system_approach_using
field_model_time_state_energy_two_quantum_effect_wave_system
method_model_equation_two_system_field_result_problem_using_based
time_two_system_frequency_result_field_energy_electron_model_using
method_energy_result_x_using_based_model_high_detector_ha
system_model_based_user_paper_two_approach_algorithm_using_time
mode_model_optical_laser_wave_field_high_frequency_beam_energy
network_data_model_learning_neural_based_feature_deep_image_result
algorithm_problem_result_time_number_network_mode

In [174]:
lda_model = LdaMulticore(corpus, num_topics=28, id2word=gendict, 
                         iterations=100, chunksize=20000)

In [175]:
for i in range(0,28):
    print("_".join(gendict[idx] for idx,_  in lda_model.get_topic_terms(i)))

system_time_show_also_model_one_state_two_using_study
problem_show_algorithm_time_paper_graph_result_two_also_given
network_data_method_neural_neural network_based_model_state_paper_using
paper_problem_system_show_result_based_algorithm_performance_time_also
model_result_study_show_also_time_structure_based_system_two
result_show_simulation_paper_model_using_study_also_network_numerical
model_two_system_dynamic_different_state_convolutional neural network_convolutional neural_based_result
network_paper_show_model_ha_method_structure_system_result_new
present_paper_data_time_based_model_system_new_one_ha
network_paper_based_method_show_approach_system_result_propose_data
based_proposed_result_performance_show_using_algorithm_paper_method_model
high_using_optical_system_result_state_based_used_field_ha
paper_show_model_learning_result_time_based_problem_ha_used
algorithm_result_method_data_based_show_problem_paper_two_new
network_model_show_paper_information_problem_learning_also_result_

In [179]:
texts[0]

['algorithm',
 'algorithmic',
 'also',
 'also present',
 'appear',
 'attention',
 'attention recent',
 'based',
 'characterization',
 'color',
 'colored',
 'concerning',
 'connection',
 'decomposition',
 'describe',
 'describe new',
 'ell',
 'expose',
 'family',
 'game',
 'generalize',
 'give',
 'give new',
 'graph',
 'graph algorithm',
 'increased',
 'instance',
 'lee',
 'nash',
 'new',
 'new algorithm',
 'obtain',
 'particular',
 'present',
 'present new',
 'previous',
 'previous result',
 'problem',
 'proof',
 'received',
 'recent',
 'recent year',
 'result',
 'rigidity',
 'solution',
 'sparse',
 'sparsity',
 'special',
 'strengthen',
 'theory',
 'tree',
 'use',
 'work',
 'work also',
 'year']