In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. the local `preprocessing` module) are reloaded before each cell runs
# and edits to them are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import preprocessing
from gensim import corpora

import os
from gensim.models.doc2vec import TaggedDocument

def texts_corpus(textdir='data/texts/'):
    """Create a preprocessed corpus for doc2vec training.

    Every file in ``textdir`` whose name ends with ``.txt`` is read, run
    through ``preprocessing.doc_preprocessor`` (with ``lemmatize=False``) and
    wrapped in a ``TaggedDocument`` whose single tag is the file name with
    the ``.txt`` extension removed. Other directory entries are ignored.

    Parameters:
        textdir: str
            Path of text files (with or without a trailing separator)
    Returns:
        corpus: list of gensim.models.doc2vec.TaggedDocument objects
            Tagged by file name without extension (expected to be the
            arxiv ID)
    """
    suffix = '.txt'
    corpus = []
    for file in os.listdir(textdir):
        # Match on the extension only; a substring test would also accept
        # names such as "paper.txt.bak".
        if not file.endswith(suffix):
            continue
        # os.path.join works whether or not textdir ends with a separator.
        with open(os.path.join(textdir, file)) as f:
            text = f.read()
        corpus.append(
            TaggedDocument(
                words=preprocessing.doc_preprocessor(text, lemmatize=False),
                # Slice the suffix off: str.strip('.txt') would remove any
                # leading/trailing '.', 't' or 'x' characters and corrupt
                # names that start or end with those characters.
                tags=[file[:-len(suffix)]])
        )

    return corpus
            
# Build the tagged corpus from the default text directory ('data/texts/').
train_corpus = texts_corpus()

In [3]:
from gensim.models import Doc2Vec

# Training is disabled: the block below is a bare string literal, so it is
# evaluated and discarded instead of being run. It records how a model can be
# built and trained on train_corpus.
# NOTE(review): the snippet refers to `gensim.models.doc2vec.Doc2Vec`, but no
# import visible in this notebook binds the name `gensim`; re-enabling it
# would presumably need `import gensim` or the imported `Doc2Vec` -- confirm.
"""model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=10)
model.build_vocab(train_corpus)

%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)"""
# NOTE(review): the disabled save below targets 'data/doc2vec.model_1' while
# the load underneath reads 'data/doc2vec_1.model' -- confirm which file name
# is the intended one.
#model.save('data/doc2vec.model_1')
# Load a previously saved model from disk instead of retraining.
model = Doc2Vec.load('data/doc2vec_1.model')

In [4]:
def arxiv_id_to_tag():
    """Map each arxiv ID to the subcategory of its first ``q-fin`` tag.

    Papers are taken from ``preprocessing.flat_unique()``. The arxiv ID is
    the last ``/``-separated piece of ``paper['id']``; when several papers
    share an ID, the last one wins. Each tag's ``term`` is split on ``.``:
    the first tag whose leading piece is ``q-fin`` supplies the value, which
    is its second piece (``q-fin.XX`` -> ``XX``).

    Returns:
        id_term: dict
            arxiv ID -> q-fin subcategory string
    Raises:
        IndexError: if a paper carries no ``q-fin`` tag
    """
    # Stage 1: index the papers by arxiv ID.
    papers_by_id = {}
    for paper in preprocessing.flat_unique():
        arxiv_id = paper['id'].split('/')[-1]
        papers_by_id[arxiv_id] = paper

    # Stage 2: pick the first q-fin subcategory of every indexed paper.
    id_term = {}
    for arxiv_id, paper in papers_by_id.items():
        subcategories = []
        for tag in paper['tags']:
            pieces = tag['term'].split('.')
            if pieces[0] == 'q-fin':
                subcategories.append(pieces[1])
        id_term[arxiv_id] = subcategories[0]

    return id_term


def arxiv_id_to_title():
    """Map each arxiv ID to its paper title.

    Papers are taken from ``preprocessing.flat_unique()``. The arxiv ID is
    the last ``/``-separated piece of ``paper['id']``; when several papers
    share an ID, the title of the last one is kept.

    Returns:
        id_to_title: dict
            arxiv ID -> value of ``paper['title']``
    """
    id_to_title = {}
    for paper in preprocessing.flat_unique():
        arxiv_id = paper['id'].split('/')[-1]
        id_to_title[arxiv_id] = paper['title']
    return id_to_title


# Lookup tables keyed by arxiv ID, built once here and read by the
# graph-building cell (node labels and colors).
id_term = arxiv_id_to_tag()
id_to_title = arxiv_id_to_title()

In [5]:
# Hex color per two-letter category code. The graph cell looks colors up via
# colors[id_term[...]], so the keys are expected to cover
# set(id_term.values()).
#
# Earlier palette with named colors, kept for reference:
#   CP red,   EC blue,   GN green,  MF grey,  PM black,
#   PR yellow, RM violet, ST orange, TR pink
colors = dict(
    CP='#e6194b',
    EC='#4363d8',
    GN='#f58231',
    MF='#ffe119',
    PM='#806080',
    PR='#3cb44b',
    RM='#008080',
    ST='#bcf60c',
    TR='#911eb4',
)

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
import json
from pprint import pprint
from tqdm import tqdm


# Build a document-similarity graph in two forms: a networkx graph (G) and a
# vis.js-style dict of nodes/edges (json_graph) that is written to
# data/graph.json at the end of the cell.
NODE_STRIDE = 5    # only every 5th document index seeds a node
N_NEIGHBORS = 9    # edges go to ranks 1..9 of the similarity ranking

G = nx.Graph()

json_graph = {"nodes": [], "edges": []}
created_nodes = []   # node ids, in creation order
edge_pairs = []      # unordered {from, to} pairs, in creation order
# Hashed companions of the two lists above, so the duplicate checks do not
# rescan an ever-growing list on every iteration.
seen_nodes = set()
seen_edges = set()

for i in range(0, len(model.docvecs), NODE_STRIDE):
    # Re-infer a vector from the document's words and rank every stored
    # document vector against it.
    ivec = model.infer_vector(doc_words=train_corpus[i].words) # > Not robust enough
    sims = model.docvecs.most_similar([ivec], topn=len(model.docvecs))

    # The top-ranked document is taken to be the document itself; nothing
    # here verifies that it is.
    this_node = sims[0][0] # > Not robust enough
    #this_node = model.docvecs.doctags[i]

    if this_node not in seen_nodes:
        # To vis.js
        json_graph["nodes"].append({
            "id": this_node,
            "label": id_to_title[this_node],
            "shape": "box",
            "color": colors[id_term[this_node]]
        })
        created_nodes.append(this_node)
        seen_nodes.add(this_node)

    # Add most similar distance. Slicing (instead of indexing sims[1]..sims[9])
    # keeps this working when fewer than 10 results are available.
    for neighbor, similarity in sims[1:1 + N_NEIGHBORS]:
        # frozenset: unordered, so (a, b) and (b, a) count as the same edge.
        edge_pair = frozenset((this_node, neighbor))
        if edge_pair in seen_edges:
            continue

        distance = 1 - similarity
        edge_color = colors[id_term[neighbor]]

        # To vis.js
        json_graph["edges"].append({
            "from": this_node,
            "to": neighbor,
            "length": distance,
            "color": {"color": edge_color}
        })
        # Networkx edge append. Attributes are passed as keywords: on
        # networkx >= 2.0 an `attr_dict=` argument is not unpacked but stored
        # as a single edge attribute literally named "attr_dict".
        G.add_edge(this_node, neighbor, distance=distance, color=edge_color)
        edge_pairs.append(edge_pair)
        seen_edges.add(edge_pair)


with open('data/graph.json', 'w') as fp:
    json.dump(json_graph, fp)

In [23]:
# Gather graph elements under to_delete (presumably candidates for removal
# from json_graph -- confirm).
# NOTE(review): connected_edges is computed for every node but never
# consulted, so each node is appended unconditionally and the final count
# equals the number of nodes. Confirm which filter was intended.
to_delete = {'nodes': [], 'edges': []}
for node in json_graph['nodes']:
    node_id = node['id']
    # Edges that touch this node at either end.
    connected_edges = [
        edge
        for edge in json_graph['edges']
        if node_id in (edge['to'], edge['from'])
    ]
    # An earlier draft (disabled) printed len(connected_edges) and scanned
    # json_graph['edges'] with an explicit loop that broke on the first edge
    # touching the node.
    to_delete['nodes'].append(node)

len(to_delete['nodes'])

760