In [3]:
# Enable IPython's autoreload extension in mode 1: before each cell runs, only
# the modules registered with %aimport are re-imported.
%load_ext autoreload
%autoreload 1

In [4]:
import pandas as pd
import numpy as np
import networkx as nx
import pickle
import datetime
from gensim.models import word2vec, KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
from multiprocessing import cpu_count
from pandarallel import pandarallel
import sys

%aimport preprocessing
%aimport utils
%aimport p2v

from utils import prinT
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Show every column and never truncate cell contents when rendering DataFrames.
# Fully qualified option names are used on purpose: pandas' shorthand matching
# (e.g. 'max_colwidth') can break if a similarly named option is added later.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [5]:
# Module-level P2V instance shared by PID2VID and make_edgelist, which read
# from it and load data into it. Constructing it logs the loading of
# Mag_venue_info_df and labeled_journal_info_df (see the cell output).
model = p2v.P2V()

2023-09-13 07:25:51 -- start loading Mag_venue_info_df
2023-09-13 07:25:51 -- finish.
2023-09-13 07:25:51 -- start loading labeled_journal_info_df
2023-09-13 07:25:51 -- finish.


In [14]:
def PID2VID(PID: int):
    """Return the 'VenueID' value stored for paper ``PID``.

    The lookup is label-based on the module-level ``model.target_paper_df``,
    so that frame must be indexed by paper ID when this is called
    (``make_edgelist`` sets that index before using this function).

    Args:
        PID: Paper ID used as the row label.

    Returns:
        The 'VenueID' cell of the matching row.
    """
    papers = model.target_paper_df
    return papers.at[PID, 'VenueID']

def make_edgelist(start_year: int, end_year: int,
                  output_root: str = '/media/sdb/p2v/pickles/decades'):
    """Build and save a venue-to-venue citation edge list for one year window.

    Loads the reference and paper tables for the window into the module-level
    ``model``, translates every (PaperID, PaperReferenceID) pair into venue IDs
    and then venue names, assigns each distinct venue name an integer node ID
    (NID), and writes two files into ``<output_root>/<start_year>_to_<end_year>/``:

    * ``edgelist.txt``       -- one "citing_NID cited_NID" row per reference.
    * ``venue_name2NID.pkl`` -- DataFrame indexed by venue name with column 'NID'.

    Side effects: reloads ``model.target_ref_df`` / ``model.target_paper_df``
    and leaves ``model.target_paper_df`` indexed by 'PaperID'.

    Args:
        start_year: Passed to the model loaders as ``start_year``.
        end_year: Passed to the model loaders as ``end_year``.
        output_root: Directory that holds the per-window output folders.
            Defaults to the location this notebook has always written to.

    Returns:
        None. Results are written to disk.
    """
    import os  # local import: the notebook's import cell does not provide `os`

    # Create the target folder up front so a missing or unwritable directory
    # fails immediately instead of after the expensive lookups below.
    out_dir = os.path.join(output_root, '%s_to_%s' % (start_year, end_year))
    os.makedirs(out_dir, exist_ok=True)

    model.load_ref_df(full_load=False, start_year=start_year, end_year=end_year)
    target_ref_df = model.target_ref_df.reset_index()
    model.load_paper_df(full_load=False, start_year=start_year, end_year=end_year)
    # PID2VID looks papers up by label, so the frame must be indexed by PaperID.
    # The guard keeps this step from raising KeyError if it is already indexed.
    if model.target_paper_df.index.name != 'PaperID':
        model.target_paper_df.set_index('PaperID', inplace=True)

    pandarallel.initialize()

    # Paper ID -> venue ID for both ends of every reference.
    p_ref_df = pd.DataFrame()
    p_ref_df['citing_VID'] = target_ref_df.PaperID.parallel_apply(PID2VID)
    p_ref_df['cited_VID'] = target_ref_df.PaperReferenceID.parallel_apply(PID2VID)

    # Venue ID -> venue name.
    p_ref_df['citing_venue_name'] = p_ref_df['citing_VID'].parallel_apply(
        lambda x: model.MAG_venue_info_df.at[x, 'OriginalVenue'])
    p_ref_df['cited_venue_name'] = p_ref_df['cited_VID'].parallel_apply(
        lambda x: model.MAG_venue_info_df.at[x, 'OriginalVenue'])

    # Number the distinct venue names in order of first appearance. Iterating a
    # set of strings would give a different numbering on every kernel start
    # because of hash randomisation, making the outputs non-reproducible.
    all_venue_names = pd.concat(
        [p_ref_df['citing_venue_name'], p_ref_df['cited_venue_name']],
        ignore_index=True)
    venue_name2NID_df = (
        pd.DataFrame({'venue_name': pd.unique(all_venue_names)})
        .reset_index(names='NID')
        .set_index('venue_name')
    )

    # Every name is present in the mapping by construction, so a vectorised
    # map is equivalent to a per-row .at lookup and needs no worker processes.
    name_to_nid = venue_name2NID_df['NID']
    p_ref_df['citing_NID'] = p_ref_df['citing_venue_name'].map(name_to_nid)
    p_ref_df['cited_NID'] = p_ref_df['cited_venue_name'].map(name_to_nid)

    np.savetxt(os.path.join(out_dir, 'edgelist.txt'),
               p_ref_df[['citing_NID', 'cited_NID']].to_numpy(), fmt='%u')
    venue_name2NID_df.to_pickle(os.path.join(out_dir, 'venue_name2NID.pkl'))

In [75]:
# Write edgelist.txt and venue_name2NID.pkl for the 1950-1959 window.
make_edgelist(1950, 1959)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [76]:
# Write edgelist.txt and venue_name2NID.pkl for the 1960-1969 window.
make_edgelist(1960, 1969)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [None]:
# Write edgelist.txt and venue_name2NID.pkl for the 1970-1979 window.
# NOTE(review): the saved notebook shows the worker start-up log for this cell
# but no execution count -- confirm the run finished and the files exist.
make_edgelist(1970, 1979)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [None]:
# Write edgelist.txt and venue_name2NID.pkl for the 1980-1989 window.
# NOTE(review): this cell has neither an execution count nor any output in the
# saved notebook -- confirm it was actually run.
make_edgelist(1980, 1989)

In [15]:
# Write edgelist.txt and venue_name2NID.pkl for the 1990-1999 window.
make_edgelist(1990, 1999)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [16]:
# Write edgelist.txt and venue_name2NID.pkl for the 2000-2009 window.
make_edgelist(2000, 2009)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [17]:
# Write edgelist.txt and venue_name2NID.pkl for the 2010-2021 window.
# Unlike the earlier cells, the end year here is 2021 rather than 2019; the
# output folder is named 2010_to_2021 accordingly.
make_edgelist(2010, 2021)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
