In [2]:
# Load IPython's autoreload extension; mode 1 reloads only the modules that
# have been registered with %aimport before each cell is executed.
%load_ext autoreload
%autoreload 1

In [3]:
import pandas as pd
from pandarallel import pandarallel
# Start pandarallel; the notebook later calls Series.parallel_apply.
pandarallel.initialize()
import multiprocessing
import numpy as np
import os
from collections import Counter, defaultdict
import itertools
import sys 
# Add the parent directory to the import path.
# NOTE(review): presumably the local modules preprocessing/utils/p2v live there — confirm.
sys.path.append("..") 

# %aimport imports each module and marks it for reloading by autoreload.
%aimport preprocessing
%aimport utils
%aimport p2v

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from utils import prinT
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Display every column and never truncate cell contents when rendering DataFrames.
pd.options.display.max_columns = None
pd.set_option('max_colwidth', None)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [1]:
# Per-period settings; the three lists are parallel (entry j describes period j).
# k_list: number of nearest neighbours kept when building each venue's local neighbourhood.
k_list = [10, 10, 10, 10, 10, 10, 10]
# First and last year of every period, stored as strings.
start_year_list = ['1950', '1960', '1970', '1980', '1990', '2000', '2010']
end_year_list = ['1959', '1969', '1979', '1989', '1999', '2009', '2021']
# Reference period as integers; the values match the last period listed above.
ref_start_year = 2010
ref_end_year = 2021

# Functions for processing time series

In [4]:
def filter_nan_in_middle(x):
    """Describe where the non-NaN values of a time series sit.

    Parameters
    ----------
    x : object exposing ``to_list()`` (e.g. a ``pandas.Series``)
        The numeric values of one time series, in chronological order.

    Returns
    -------
    tuple ``(nan_in_middle, len_of_1, start_idx, end_idx)``
        nan_in_middle : bool
            True when at least one NaN lies between the first and the last
            non-NaN value.
        len_of_1 : bool
            True when the series contains exactly one non-NaN span of length 1
            (first and last non-NaN position coincide).
        start_idx, end_idx : int
            Positions of the first and the last non-NaN value, or ``-1`` for
            both when the series holds no non-NaN value at all.
    """
    is_valid = ~np.isnan(np.asarray(x.to_list(), dtype=float))
    valid_positions = np.flatnonzero(is_valid)

    # A series without any observation has neither a gap nor a length of one.
    # (Comparing the two -1 sentinels used to report len_of_1 == True here.)
    if valid_positions.size == 0:
        return False, False, -1, -1

    start_idx = int(valid_positions[0])
    end_idx = int(valid_positions[-1])

    # Both endpoints are valid by construction, so any invalid entry inside
    # the span is an interruption of the series.
    nan_in_middle = bool((~is_valid[start_idx:end_idx]).any())
    len_of_1 = start_idx == end_idx
    return nan_in_middle, len_of_1, start_idx, end_idx

# Generate a DataFrame containing each periodical's local semantic change

In [5]:
def cal_local_distance(x, tar_neighbor_idxs, ref_neighbor_idxs, tar_vectors, ref_vectors):
    """Local-neighbourhood distance of item ``x`` between two embedding spaces.

    The neighbours of ``x`` in the target space and in the reference space are
    pooled. In each space the cosine similarity between ``x`` and every pooled
    neighbour is computed, giving one similarity profile per space. The result
    is the cosine distance between those two profiles.

    Parameters
    ----------
    x : index of the focal item (valid for all four containers)
    tar_neighbor_idxs, ref_neighbor_idxs : per-item collections of neighbour indices
    tar_vectors, ref_vectors : per-item vectors, addressed by the same indices

    Returns
    -------
    The cosine distance between the two similarity profiles.
    """
    # Pool the neighbours from both spaces so both profiles cover the same items.
    pooled_neighbors = set(tar_neighbor_idxs[x]) | set(ref_neighbor_idxs[x])
    neighbor_positions = list(pooled_neighbors)

    tar_profile = cosine_similarity(
        tar_vectors[x].reshape(1, -1),
        [tar_vectors[pos] for pos in neighbor_positions],
    )
    ref_profile = cosine_similarity(
        ref_vectors[x].reshape(1, -1),
        [ref_vectors[pos] for pos in neighbor_positions],
    )

    # Each profile is a single row, so the paired distance has one entry.
    profile_distances = paired_distances(tar_profile, ref_profile, metric='cosine')
    return profile_distances[0]

In [6]:
from sklearn.metrics.pairwise import cosine_similarity, paired_distances

model = p2v.P2V()

# One output column per pair of consecutive periods, named after the later period.
col_name_list = ['semantic_change_1960s',
                 'semantic_change_1970s',
                 'semantic_change_1980s',
                 'semantic_change_1990s',
                 'semantic_change_2000s',
                 'semantic_change_2010s']

# Result table keyed by VID; every iteration outer-merges one more column into it.
local_semantic_change_df = pd.DataFrame(columns=['VID'])

for i, col_name in enumerate(col_name_list):
    old_start_year, old_end_year = start_year_list[i], end_year_list[i]
    new_start_year, new_end_year = start_year_list[i+1], end_year_list[i+1]
    # Each period uses its own neighbourhood size. The previous indexing
    # (k_list[i] for the newer period, k_list[-1] for the older one) only
    # worked because every entry of k_list is identical.
    old_k, new_k = k_list[i], k_list[i+1]
    print("-----------------------")
    prinT("start comparing {}-{} with the {}-{}".format(old_start_year, old_end_year, new_start_year, new_end_year))

    # NOTE(review): the trailing arguments 100 and 10 are passed through unchanged;
    # presumably embedding dimension and window size — confirm against p2v.P2V.load_wv.
    old_wv = model.load_wv(old_start_year, old_end_year, 100, 10)
    new_wv = model.load_wv(new_start_year, new_end_year, 100, 10)

    # Only venues present in both periods can be compared.
    old_VIDs = old_wv.index_to_key
    new_VIDs = new_wv.index_to_key
    shared_VIDs = list(set(old_VIDs) & set(new_VIDs))
    prinT("the number of shared VIDs between this two decades: %d" %len(shared_VIDs))

    # Re-index both vector sets so that row j of either matrix belongs to shared_VIDs[j].
    old_idx = [old_wv.get_index(VID) for VID in shared_VIDs]
    new_idx = [new_wv.get_index(VID) for VID in shared_VIDs]

    old_vectors = old_wv.get_normed_vectors()[old_idx]
    new_vectors = new_wv.get_normed_vectors()[new_idx]

    prinT("finish, start generate neighbor union...")
    # argsort is ascending: the last column is the most similar entry (expected to
    # be the item itself), so start at the second-to-last column and walk backwards
    # to collect the k most similar other items.
    new_neighbor_idxs = np.argsort(cosine_similarity(new_vectors, new_vectors))[:, -2:-new_k-2:-1]
    old_neighbor_idxs = np.argsort(cosine_similarity(old_vectors, old_vectors))[:, -2:-old_k-2:-1]

    prinT("start cal local neighbor distance...")
    dist_df = pd.DataFrame({'VID': shared_VIDs})
    # The row positions 0..n-1 are the inputs; the older period is passed as the
    # "tar" arguments of cal_local_distance and the newer period as the "ref" ones.
    dist = dist_df.index.to_series().parallel_apply(cal_local_distance, args=(old_neighbor_idxs,
                                                                              new_neighbor_idxs,
                                                                              old_vectors,
                                                                              new_vectors))
    prinT("finish")

    dist_df[col_name] = dist
    local_semantic_change_df = local_semantic_change_df.merge(dist_df, on='VID', how='outer')
    # Running total over all comparisons computed so far; DataFrame.sum skips NaN
    # by default, so missing comparisons contribute nothing to the total.
    local_semantic_change_df['total_semantic_change_'+new_start_year+'s'] = local_semantic_change_df[col_name_list[0:i+1]].sum(axis=1)

2024-01-17 21:13:41 -- start loading Mag_venue_info_df
2024-01-17 21:13:42 -- finish.
2024-01-17 21:13:42 -- start loading labeled_journal_info_df
2024-01-17 21:13:42 -- finish.
-----------------------
2024-01-17 21:13:42 -- start comparing 1950-1959 with the 1960-1969
2024-01-17 21:13:42 -- start loading word vectors...
2024-01-17 21:13:42 -- word vectors loaded, and its shape is: (1645, 100)
2024-01-17 21:13:42 -- start loading word vectors...
2024-01-17 21:13:42 -- word vectors loaded, and its shape is: (3116, 100)
2024-01-17 21:13:42 -- the number of shared VIDs between this two decades: 1500
2024-01-17 21:13:42 -- finish, start generate neighbor union...
2024-01-17 21:13:42 -- start cal local neighbor distance...
2024-01-17 21:13:42 -- finish
-----------------------
2024-01-17 21:13:42 -- start comparing 1960-1969 with the 1970-1979
2024-01-17 21:13:42 -- start loading word vectors...
2024-01-17 21:13:42 -- word vectors loaded, and its shape is: (3116, 100)
2024-01-17 21:13:42 -- 

In [7]:
# Labelling
# Attach each venue's name, looked up by VID in the MAG venue table held by the model.
local_semantic_change_df.loc[:,'venue_name'] = local_semantic_change_df['VID'].apply(lambda x: model.MAG_venue_info_df.at[x, 'OriginalVenue'])
# Using the latest decade's labels
VID_labele_dict = model.load_VID_labeled(ref_start_year, ref_end_year, 100, 10)
# Build the VID -> label lookup once instead of scanning the VID list for every row.
# setdefault keeps the first label of a repeated VID, which is what list.index() returned.
# NOTE(review): assumes VID_labele_dict['VID'] and ['label'] are parallel sequences — confirm.
label_by_VID = {}
for labeled_VID, label in zip(VID_labele_dict['VID'], VID_labele_dict['label']):
    label_by_VID.setdefault(labeled_VID, label)
# Venues without a label get NaN.
local_semantic_change_df['scopus_label'] = local_semantic_change_df.VID.map(lambda x: label_by_VID.get(x, np.nan))

2024-01-17 21:17:55 -- start loading VID_labeled...
2024-01-17 21:17:55 -- finish.


In [8]:
# Venues considered incorrect in MAG, identified by venue name.
# Rows whose venue_name appears in this list are removed in the filtering step.
corrupted_venue_name_list = ['Japanese Journal of Pharmacology', 
                             'Journal of Computers',
                             'Journal of Algorithms',
                             'Journal of Agricultural Engineering Research',
                             'Sozial-und Praventivmedizin',
                             'Scientia Forestalis',
                             'Interpretation',
                             'Genes',
                             'Protein Science',
                             'Hospital Medicine',
                             'Immunotechnology',
                             'Journal of Ayurveda and Integrative Medicine',
                             'Versus',
                             'Tradition',
                             'ACM Transactions on Cyber-Physical Systems',
                             'Journal of Biomedical Engineering',
                             'Antibiotics and Chemotherapy',
                             'Social Work',
                             'Production Journal',
                             'Insight',
                             'Sats',
                             'English',
                             'Leonardo',
                             'The Forum',
                             'Chemical Industry',
                             'The American review of respiratory disease',
                             'Chemistry & Industry',
                            ]

In [9]:
# Filtering
# Drop the venues listed as incorrect. The explicit .copy() makes the result an
# independent frame, so the column assignment below does not write into a slice
# of the unfiltered frame (which raises SettingWithCopyWarning).
local_semantic_change_df = local_semantic_change_df[~(local_semantic_change_df.venue_name.isin(corrupted_venue_name_list))].copy()
# Describe every row's time series: gap flag, single-point flag, first/last valid position.
local_semantic_change_df[['nan_in_middle', 
                          'len_of_1', 
                          'start_idx', 
                          'end_idx']] = local_semantic_change_df[col_name_list].apply(filter_nan_in_middle, axis=1, result_type='expand')
# Make sure that each time series is never interrupted and has a value for the latest decade
last_decade_idx = len(col_name_list) - 1  # position of the most recent comparison column
local_semantic_change_df = local_semantic_change_df[(local_semantic_change_df.nan_in_middle==False) & (local_semantic_change_df.end_idx==last_decade_idx)]

In [12]:
# Persist the result as a pickle, leaving out the bookkeeping columns that
# were only needed for filtering.
helper_columns = ['nan_in_middle', 'len_of_1', 'start_idx', 'end_idx']
local_semantic_change_df.drop(columns=helper_columns).to_pickle('local_semantic_change_df_k_10.pkl')

In [13]:
# Read the saved pickle back; as the last expression of the cell, the frame is displayed.
pd.read_pickle('local_semantic_change_df_k_10.pkl')

Unnamed: 0,VID,semantic_change_1960s,total_semantic_change_1960s,semantic_change_1970s,total_semantic_change_1970s,semantic_change_1980s,total_semantic_change_1980s,semantic_change_1990s,total_semantic_change_1990s,semantic_change_2000s,total_semantic_change_2000s,semantic_change_2010s,total_semantic_change_2010s,venue_name,scopus_label
0,2.785285e+06,0.028464,0.028464,0.017985,0.046449,0.002580,0.049029,0.005973,0.055002,0.000816,0.055818,0.005635,0.061453,The Journal of Comparative Neurology,Neuroscience
2,2.756444e+09,0.028157,0.028157,0.029099,0.057256,0.016555,0.073812,0.025299,0.099110,0.058427,0.157537,0.114181,0.271718,Advances in internal medicine,
3,1.709670e+08,0.030398,0.030398,0.044556,0.074954,0.023140,0.098095,0.006869,0.104964,0.007796,0.112760,0.010707,0.123466,American Journal of Epidemiology,Medicine
4,1.241662e+08,0.007862,0.007862,0.008346,0.016208,0.005651,0.021859,0.010619,0.032479,0.010537,0.043016,0.006222,0.049238,Quarterly Journal of Mechanics and Applied Mathematics,Mathematics
5,2.764775e+09,0.004671,0.004671,0.005047,0.009718,0.006526,0.016243,0.002833,0.019076,0.005002,0.024078,0.003519,0.027597,Journal of the Geological Society of Japan,Earth and Planetary Sciences
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27043,2.764439e+09,,,,,,,,,,,0.003137,0.003137,Bulgarian Journal of Veterinary Medicine,Veterinary
27044,2.898526e+09,,,,,,,,,,,0.006932,0.006932,EMC - AKOS - Trattato di Medicina,
27045,2.010644e+08,,,,,,,,,,,0.005643,0.005643,European Journal of Preventive Cardiology,Medicine
27046,1.125908e+08,,,,,,,,,,,0.017209,0.017209,Journal of Transport and Land Use,Social Sciences
