In [1]:
import corex_topic as ct
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
import re
from collections import defaultdict
from gensim.corpora.dictionary import Dictionary

## Read arXiv papers

Note that these have already been collected via arXiv's OAI API, and text fields have been cleaned (lemmatized + nltk stopwords removed)

In [3]:
# data = []
# top_dir = "data/"
# for file_name in os.listdir(top_dir):
#     if not (file_name.startswith("arxiv") and file_name.endswith(".json")):
#         continue
#     _df = pd.read_json(top_dir+file_name,orient="records")
#     _df.categories = _df.categories.apply(lambda x: x.split())
#     condition = _df.categories.apply(lambda x : "stat.ML" in x) 
#     condition = condition | _df.categories.apply(lambda x : any(y.startswith("cs.") for y in x))
#     condition = condition & (_df.summary.apply(lambda s : len(s) > 20))
#     new_df = _df.loc[condition].copy()
#     data.append(new_df)
#     del (_df)
# df = pd.concat(data)

In [2]:
#del data
df = pd.read_json("data/cs_arxiv.json", orient="records")
len(df)

168527

## Topic modelling
### Preprocessing: Generate a one-hot bag of words

In [5]:
# Apply basic settings, including ngram generation
count_vectoriser = CountVectorizer(binary=True, min_df=0.001, ngram_range=(1,3))
count_vectoriser.fit(map(lambda x : " ".join(x), df.summary))

# Generate vocab mapping
X = count_vectoriser.transform(map(lambda x : " ".join(x), df.summary))
vocab = {v:k for k,v in count_vectoriser.vocabulary_.items()}
print("Got vocab of size",len(vocab))

Got vocab of size 10830


### Topic modelling using CorEx 
#### Finding the optimal number of topics

In [211]:
# Commented out since process is slow, see results below
for i in np.arange(36,40,1):
    topic_model = ct.Corex(n_hidden=i)
    topic_model.fit(X)
    print(i, topic_model.tc)

36 23.286560655789977
37 23.183222340984532
38 23.580441233237128
39 24.284446097361407


In [6]:
topic_model = ct.Corex(n_hidden=39)
topic_model.fit(X)

<corex_topic.Corex at 0x11a504550>

#### Get the topics, and generate topic weights for each arXiv paper

In [7]:
topics = topic_model.get_topics()
# Build topic names
topic_names = ["_".join(vocab[i] for i,_ in topic) for topic in topics]

NOTE: 'words' not provided to CorEx. Returning topics as lists of column indices


In [8]:
for t in topic_names:
    print(t,"\n")

channel_interference_transmission_wireless_receiver_rate_antenna_transmitter_mimo_transmit 

learning_neural_neural network_training_machine learning_classification_trained_machine_learn_learning algorithm 

image_recognition_feature_task_dataset_visual_text_segmentation_vision_semantic 

state art_art_state_deep_convolutional_convolutional neural_convolutional neural network_deep learning_datasets_deep neural 

service_security_mobile_device_traffic_technology_resource_attack_internet_management 

bound_upper_upper bound_np_polynomial time_constant_np hard_case_time algorithm_known 

algorithm_problem_optimization_optimization problem_solve_solution_solving_approximation_efficient_convergence 

graph_vertex_edge_undirected_directed_subgraph_shortest path_path_graph vertex_connected 

lower bound_code_lower_log_decoding_coding_length_omega_frac_log log 

research_year_ha_recent_human_community_attention_recent year_challenge_become 

dimensional_sparse_high dimensional_space_sparsity_l

In [13]:
# Bag up the words and generate the topic weights
topic_weights = defaultdict(list)
df_corex = df.copy().add_prefix("arxiv_")
for t, _t in zip(topic_names, topics):
    # Weight is given by the the sum of weights for the topic in this summary
    for bow, (irow, row) in zip(X, df_corex.iterrows()):
        total_weight = 0
        for idx, weight in _t:
            total_weight += bow[0, idx]*weight
        topic_weights[t].append(total_weight)
# Assign the weights to the DF
for t, w in topic_weights.items():
    df_corex["TOPIC_"+t] = w
df_corex.to_json("data/topics_corex.json",orient="records")
del df_corex

### Topic modelling using LDA 
#### Finding the optimal number of topics

In [132]:
# #corpus = [[(idx, 1) for idx in row.indices] for row in X]
# gendict = Dictionary(df.summary.values)
# corpus = [gendict.doc2bow(text) for text in df.summary.values]

In [5]:
## need to preprocess
texts = []
for row in X:
    sentence = [vocab[w] for w in row.indices]
    texts.append(sentence)

In [8]:
gendict = Dictionary(texts)
corpus = [gendict.doc2bow(text) for text in texts]

In [9]:
for n in np.arange(20,50,5):
    lda_model = LdaMulticore(corpus, num_topics=n, id2word=gendict, 
                             iterations=100, chunksize=20000)
    uci = CoherenceModel(model=lda_model, corpus=corpus,                                          
                         texts=texts, coherence='c_uci')
    umass = CoherenceModel(model=lda_model, corpus=corpus, 
                           texts=texts, coherence='c_uci')    
    print(n, uci.get_coherence(), umass.get_coherence(), lda_model.log_perplexity(corpus))
    del lda_model

NameError: name 'np' is not defined

In [183]:
for nit in [10, 100, 1000, 2500, 5000, 10000, 25000]:
    lda_model = LdaMulticore(corpus, num_topics=28, id2word=gendict, 
                             iterations=nit, chunksize=20000)
    coherence_model_lda = CoherenceModel(model=lda_model, corpus=corpus, 
                                         texts=texts, coherence='c_uci')    
    print(nit, coherence_model_lda.get_coherence(), lda_model.log_perplexity(corpus))
    del lda_model

10 -4.969543411635572 -8.556184136224147
100 -5.266592622573094 -8.01440828604216
1000 -5.001288498955499 -8.038297273689187
2500 -5.076360498203264 -8.039477225850383
5000 -5.127604011808473 -8.018711878659518
10000 -5.161609156027235 -8.043763480447423
25000 -4.963276170377199 -8.01910197369833


In [None]:
ldamodel.log_perplexity(c_test)

In [189]:
lda_model = LdaMulticore(corpus, num_topics=26, id2word=gendict, 
                         iterations=100)

In [190]:
for i in range(0,26):
    print("_".join(gendict[idx] for idx,_  in lda_model.get_topic_terms(i)))

model_result_surface_simulation_numerical_using_study_two_dynamic_flow
bound_algorithm_lower_random_log_number_probability_time_case_lower bound
game_optimal_show_problem_strategy_result_player_study_algorithm_equilibrium
ray_energy_wa_cosmic_matter_low_radiation_experiment_dark_section
language_system_paper_model_theory_based_present_program_also_approach
equation_field_energy_flow_velocity_particle_time_magnetic_scale_model
paper_two_show_result_finite_property_class_set_function_one
material_field_optical_device_application_wave_electromagnetic_design_light_based
high_detector_resolution_using_time_performance_energy_based_measurement_result
method_data_analysis_result_using_used_one_ground_accuracy_time
frequency_method_time_using_analysis_result_function_two_signal_data
role_time_ha_play_year_model_dynamic_important_system_recent
network_model_structure_study_dynamic_show_node_complex_different_distribution
optical_order magnitude_magnitude_measurement_using_order_frequency_high_l

In [191]:
lda_model = LdaMulticore(corpus, num_topics=100, id2word=gendict,
                         iterations=100, chunksize=20000)

  diff = np.log(self.expElogbeta)


In [192]:
for i in range(0,100):
    print("_".join(gendict[idx] for idx,_  in lda_model.get_topic_terms(i)))

result_two_method_based_paper_system_network_using_show_also
two_based_result_non_show_case_also_time_function_method
method_based_result_show_using_paper_problem_technique_ha_two
based_ha_using_paper_high_energy_result_time_used_order
system_model_field_show_using_based_state_study_paper_two
using_paper_result_used_time_model_high_show_based_two
model_result_paper_time_using_show_based_system_ha_study
two_show_result_one_time_present_problem_study_based_paper
result_using_time_also_based_show_model_ha_used_data
paper_result_show_algorithm_approach_model_method_using_problem_based
paper_system_based_show_result_time_two_model_proposed_method
network_based_method_using_state art_paper_art_state_data_convolutional neural
show_also_result_based_model_paper_problem_two_one_time
model_result_based_show_paper_data_method_network_proposed_using
field_using_two_optical_quantum_time_demonstrate_high_based_result
show_result_system_two_time_optical_field_frequency_using_state
based_method_result

In [194]:
lda_model.log_perplexity(corpus)

-8.220221997591302

In [196]:
lda_model = LdaMulticore(corpus, num_topics=10, id2word=gendict,
                         iterations=100, chunksize=20000)
lda_model.log_perplexity(corpus)

-8.054961983726416

In [199]:
coherence_model_lda = CoherenceModel(model=lda_model, corpus=corpus, 
                                     texts=texts, coherence='c_v')

In [200]:
coherence_model_lda.get_coherence()

0.3270612354163167