In [1]:
%load_ext autoreload
%autoreload 1

In [2]:
import pandas as pd
import numpy as np

import sys 
sys.path.append("../..") 
%aimport preprocessing
%aimport utils
%aimport p2v

from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import kendalltau
from collections import defaultdict

from utils import prinT

# Show every column and never truncate cell contents when displaying DataFrames.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

**Tracking 3 flagship journals (MAG venue IDs)**  
Nature: 137773608  
Science: 3880285  
PNAS: 125754415  

In [3]:
# Shared P2V helper object; the functions below read it as a module-level global.
model = p2v.P2V()
# Default hyperparameters, also read as globals by the functions below.
# d matches the second dimension of the loaded word vectors (100);
# w is presumably the context window size -- TODO confirm in p2v.
d, w = 100, 10

2024-04-27 09:33:47 -- start loading Mag_venue_info_df
2024-04-27 09:33:47 -- finish.
2024-04-27 09:33:47 -- start loading labeled_journal_info_df
2024-04-27 09:33:47 -- finish.


In [4]:
def default_value():
    """Zero-argument factory (e.g. for ``defaultdict``) that yields the integer 0."""
    return 0


def count_paper_num_for_disc(VID, n_periods=7):
    """Count a venue's papers per period by the discipline they cite most.

    For each of the last ``n_periods`` entries of ``model.start_year_list`` /
    ``model.end_year_list`` the papers of venue ``VID`` are joined to their
    references, and every reference receives the label of the venue it was
    published in. References labelled 'Multidisciplinary' are ignored. Each
    citing paper then adds 1 to the modal (most frequent) label among its
    references; when several labels tie, the unit is split equally.

    Reads the module-level ``model``, ``d`` and ``w``. The ``model.load_*``
    calls are made for every period and ``model.target_paper_df`` /
    ``model.target_ref_df`` are read afterwards (presumably refreshed by those
    loaders -- confirm in p2v). All merges are inner joins, so references that
    are missing from ``model.target_paper_df`` or from the labelled venues do
    not contribute.

    Parameters
    ----------
    VID
        Venue ID of the journal to analyse.
    n_periods : int, default 7
        Number of trailing periods to process. If fewer periods exist, all of
        them are used.

    Returns
    -------
    pandas.DataFrame
        Rows are discipline labels, columns are periods named ``'<start>s'``
        (e.g. ``'1950s'``); cells hold (possibly fractional) paper counts and
        are NaN where a discipline never occurs in that period.

    Raises
    ------
    ValueError
        If ``n_periods`` is smaller than 1.
    """
    if n_periods < 1:
        raise ValueError("n_periods must be a positive integer")

    total_periods = len(model.start_year_list)
    # Clamp at 0: a negative start would silently wrap around via negative indexing.
    first_period = max(total_periods - n_periods, 0)

    dict_list = []
    period_labels = []
    for i in range(first_period, total_periods):
        start_year = model.start_year_list[i]
        end_year = model.end_year_list[i]
        prinT("start searching in %s to %s" %(start_year, end_year))

        model.load_paper_df(full_load=False, start_year=start_year, end_year=end_year)
        model.load_ref_df(full_load=False, start_year=start_year, end_year=end_year)
        VID_labeled = model.load_VID_labeled(start_year, end_year, d, w)
        VID_labeled_df = pd.DataFrame(VID_labeled).set_index('VID')

        # citing paper -> reference -> venue of the reference -> venue label
        journal_df = model.target_paper_df[model.target_paper_df.VenueID == VID]
        journal_ref_df = pd.merge(journal_df, model.target_ref_df, left_on='PaperID', right_on='PaperID')
        journal_ref_df = pd.merge(journal_ref_df, model.target_paper_df, left_on='PaperReferenceID', right_on='PaperID')
        journal_ref_df = journal_ref_df.drop(columns=['PaperReferenceID'])
        journal_ref_df = pd.merge(journal_ref_df, VID_labeled_df, left_on='VenueID_y', right_index=True)
        # Filter with a boolean mask instead of dropping by index label: a merge
        # using right_index=True passes an existing index on, so labels may
        # repeat and a label-based drop could remove rows that do not match.
        journal_ref_df = journal_ref_df[journal_ref_df['label'] != 'Multidisciplinary']

        disc_paper_count = defaultdict(default_value)
        modal_labels = journal_ref_df.groupby(['PaperID_x'])['label'].agg(pd.Series.mode)
        for modes in modal_labels:
            if isinstance(modes, str):
                # Unique most-cited discipline: the paper counts fully for it.
                disc_paper_count[modes] += 1
            elif isinstance(modes, np.ndarray) and len(modes) > 0:
                # Tie between several disciplines: split the paper equally.
                share = 1 / len(modes)
                for disc in modes:
                    disc_paper_count[disc] += share
        dict_list.append(dict(disc_paper_count))
        period_labels.append("{}s".format(start_year))

    count_pd = pd.DataFrame(dict_list).T
    count_pd.columns = period_labels
    return count_pd


def cal_disc_avg_cs(target_VID: int, n_periods: int = 7):
    """Relative mean cosine similarity between a venue and each discipline.

    For each of the last ``n_periods`` entries of ``model.start_year_list`` /
    ``model.end_year_list`` the word vectors of that period are loaded and, for
    every discipline label, the value

        mean(cos(target, venues with that label)) / mean(cos(target, all vectors))

    is computed. The baseline in the denominator covers every vector returned
    by ``model.wv.get_normed_vectors()``.

    Reads the module-level ``model``, ``d`` and ``w``; ``model.load_wv`` is
    called per period and ``model.wv`` is read afterwards.

    Parameters
    ----------
    target_VID : int
        Venue ID whose vector is compared against the others. It must be
        retrievable through ``model.wv.get_vector`` in every processed period
        (presumably a KeyError is raised otherwise -- confirm against the
        word-vector implementation).
    n_periods : int, default 7
        Number of trailing periods to process. If fewer periods exist, all of
        them are used.

    Returns
    -------
    pandas.DataFrame
        Rows are discipline labels (in order of first appearance), columns are
        periods named ``'<start>s'`` (e.g. ``'1950s'``).

    Raises
    ------
    ValueError
        If ``n_periods`` is smaller than 1.
    """
    if n_periods < 1:
        raise ValueError("n_periods must be a positive integer")

    total_periods = len(model.start_year_list)
    # Clamp at 0: a negative start would silently wrap around via negative indexing.
    first_period = max(total_periods - n_periods, 0)

    cs_data = []
    period_labels = []
    for i in range(first_period, total_periods):
        start_year = model.start_year_list[i]
        end_year = model.end_year_list[i]

        model.load_wv(start_year, end_year, d, w)
        target_journal_vector = model.wv.get_vector(target_VID, norm=True).reshape(1, -1)
        full_vector_list = model.wv.get_normed_vectors()

        VID_labeled = model.load_VID_labeled(start_year, end_year, d, w)
        label_list = VID_labeled['label']
        labeled_VID_list = VID_labeled['VID']
        labeled_vector_list = np.stack([model.wv.get_vector(VID, norm=True) for VID in labeled_VID_list])

        # The baseline does not depend on the discipline, so compute it once
        # per period instead of once per discipline.
        general_cs_mean = np.mean(cosine_similarity(target_journal_vector, full_vector_list))

        # Group row positions by label in a single pass; dict insertion order
        # keeps the discipline order reproducible between runs.
        idx_by_discipline = defaultdict(list)
        for idx, label in enumerate(label_list):
            idx_by_discipline[label].append(idx)

        single_decade_data = {}
        for discipline, idx_list in idx_by_discipline.items():
            single_discipline_cs = cosine_similarity(target_journal_vector, labeled_vector_list[idx_list])
            single_decade_data[discipline] = np.mean(single_discipline_cs) / general_cs_mean
        cs_data.append(single_decade_data)
        period_labels.append("{}s".format(start_year))

    cs_pd = pd.DataFrame(cs_data).T
    cs_pd.columns = period_labels
    return cs_pd

# Nature

In [17]:
# 137773608 is Nature's venue ID (see the flagship list at the top of the notebook).
count_df = count_paper_num_for_disc(137773608)

2023-11-26 14:25:32 -- start searching in 1950 to 1959
2023-11-26 14:25:32 -- start loading 'paper_df'...
2023-11-26 14:25:32 -- finish.
2023-11-26 14:25:32 -- start loading 'ref_df'...
2023-11-26 14:25:32 -- finish.
2023-11-26 14:25:32 -- start loading VID_labeled...
2023-11-26 14:25:32 -- finish.
2023-11-26 14:25:34 -- start searching in 1960 to 1969
2023-11-26 14:25:34 -- start loading 'paper_df'...
2023-11-26 14:25:34 -- finish.
2023-11-26 14:25:34 -- start loading 'ref_df'...
2023-11-26 14:25:34 -- finish.
2023-11-26 14:25:34 -- start loading VID_labeled...
2023-11-26 14:25:34 -- finish.
2023-11-26 14:25:36 -- start searching in 1970 to 1979
2023-11-26 14:25:36 -- start loading 'paper_df'...
2023-11-26 14:25:36 -- finish.
2023-11-26 14:25:36 -- start loading 'ref_df'...
2023-11-26 14:25:37 -- finish.
2023-11-26 14:25:37 -- start loading VID_labeled...
2023-11-26 14:25:37 -- finish.
2023-11-26 14:25:40 -- start searching in 1980 to 1989
2023-11-26 14:25:40 -- start loading 'paper_d

In [18]:
# Nature's venue ID; cs_df holds, per discipline and decade, the venue's mean
# cosine similarity to that discipline relative to its mean similarity to all vectors.
target_VID = 137773608
cs_df = cal_disc_avg_cs(target_VID)

2023-11-26 14:26:57 -- start loading word vectors...
2023-11-26 14:26:57 -- word vectors loaded, and its shape is: (1645, 100)
2023-11-26 14:26:57 -- start loading VID_labeled...
2023-11-26 14:26:57 -- finish.
2023-11-26 14:26:57 -- start loading word vectors...
2023-11-26 14:26:57 -- word vectors loaded, and its shape is: (3116, 100)
2023-11-26 14:26:57 -- start loading VID_labeled...
2023-11-26 14:26:57 -- finish.
2023-11-26 14:26:57 -- start loading word vectors...
2023-11-26 14:26:57 -- word vectors loaded, and its shape is: (5686, 100)
2023-11-26 14:26:57 -- start loading VID_labeled...
2023-11-26 14:26:57 -- finish.
2023-11-26 14:26:57 -- start loading word vectors...
2023-11-26 14:26:57 -- word vectors loaded, and its shape is: (9337, 100)
2023-11-26 14:26:57 -- start loading VID_labeled...
2023-11-26 14:26:57 -- finish.
2023-11-26 14:26:58 -- start loading word vectors...
2023-11-26 14:26:58 -- word vectors loaded, and its shape is: (15019, 100)
2023-11-26 14:26:58 -- start loa

In [19]:
count_df

Unnamed: 0,1950s,1960s,1970s,1980s,1990s,2000s,2010s
Medicine,1945.166667,4633.083333,2095.066667,763.333333,653.366667,1312.783333,2667.566667
Physics and Astronomy,836.25,1103.833333,680.166667,1660.783333,1782.25,2489.2,2580.583333
Earth and Planetary Sciences,499.75,2131.083333,3645.083333,2613.116667,1900.0,1957.7,1686.9
"Biochemistry, Genetics and Molecular Biology",2758.366667,5237.583333,4806.516667,4637.25,4301.566667,4598.65,5823.4
Agricultural and Biological Sciences,1246.7,1999.416667,1230.766667,971.333333,944.233333,1637.783333,1423.6
Mathematics,32.0,34.666667,22.25,24.666667,25.5,59.5,30.916667
Materials Science,70.166667,223.5,97.583333,54.116667,68.75,241.533333,538.033333
Engineering,143.333333,311.333333,200.333333,81.666667,53.666667,51.083333,90.416667
Immunology and Microbiology,267.283333,1273.5,1512.333333,1121.666667,603.7,762.916667,829.65
Chemistry,518.7,1744.5,521.033333,271.666667,466.533333,473.116667,845.033333


In [20]:
cs_df

Unnamed: 0,1950s,1960s,1970s,1980s,1990s,2000s,2010s
Energy,1.008003,1.113046,1.044854,1.094235,1.023377,1.031201,1.038692
Multidisciplinary,1.980631,2.121551,2.110416,2.221889,1.859755,1.491091,1.44732
"Biochemistry, Genetics and Molecular Biology",1.628237,1.662705,1.816256,1.843534,1.921341,1.870026,1.693591
"Business, Management and Accounting",0.111513,-0.018887,0.174898,0.627425,0.332297,0.712374,0.920856
Agricultural and Biological Sciences,1.591622,1.513946,1.414707,1.366947,1.506662,1.524348,1.330971
Chemistry,1.570976,1.423128,1.281256,1.256027,1.532841,1.415445,1.306864
Materials Science,1.448888,1.201758,1.069521,0.832933,1.06807,1.261125,1.071689
Environmental Science,1.289883,0.947638,1.145339,1.088695,1.30411,1.363388,1.161678
Immunology and Microbiology,1.468454,1.657227,1.789647,1.796143,1.795675,1.599552,1.585688
Medicine,0.89199,1.091923,1.025191,0.978311,1.023303,0.75768,0.994922


In [21]:
# kendalltau flattens both tables and pairs values by position, so the rows
# must refer to the same disciplines in the same order. Reindex the counts onto
# the similarity table's rows instead of relying on two independent sorts;
# disciplines without any counted paper become 0.
cs_aligned = cs_df.drop('Multidisciplinary', errors='ignore').sort_index()
count_aligned = count_df.reindex(cs_aligned.index).fillna(0)
res = kendalltau(count_aligned, cs_aligned)

In [22]:
res

KendalltauResult(correlation=0.5282643931287749, pvalue=3.835666454507298e-26)

# Science

In [5]:
# 3880285 is Science's venue ID (see the flagship list at the top of the notebook).
count_df = count_paper_num_for_disc(3880285)

2023-11-26 14:13:48 -- start searching in 1950 to 1959
2023-11-26 14:13:48 -- start loading 'paper_df'...
2023-11-26 14:13:48 -- finish.
2023-11-26 14:13:48 -- start loading 'ref_df'...
2023-11-26 14:13:48 -- finish.
2023-11-26 14:13:48 -- start loading VID_labeled...
2023-11-26 14:13:48 -- finish.
2023-11-26 14:13:49 -- start searching in 1960 to 1969
2023-11-26 14:13:49 -- start loading 'paper_df'...
2023-11-26 14:13:49 -- finish.
2023-11-26 14:13:49 -- start loading 'ref_df'...
2023-11-26 14:13:50 -- finish.
2023-11-26 14:13:50 -- start loading VID_labeled...
2023-11-26 14:13:50 -- finish.
2023-11-26 14:13:52 -- start searching in 1970 to 1979
2023-11-26 14:13:52 -- start loading 'paper_df'...
2023-11-26 14:13:52 -- finish.
2023-11-26 14:13:52 -- start loading 'ref_df'...
2023-11-26 14:13:53 -- finish.
2023-11-26 14:13:53 -- start loading VID_labeled...
2023-11-26 14:13:53 -- finish.
2023-11-26 14:13:56 -- start searching in 1980 to 1989
2023-11-26 14:13:56 -- start loading 'paper_d

In [6]:
# Science's venue ID; cs_df holds, per discipline and decade, the venue's mean
# cosine similarity to that discipline relative to its mean similarity to all vectors.
target_VID = 3880285
cs_df = cal_disc_avg_cs(target_VID)

2023-11-26 14:15:40 -- start loading word vectors...
2023-11-26 14:15:40 -- word vectors loaded, and its shape is: (1645, 100)
2023-11-26 14:15:40 -- start loading VID_labeled...
2023-11-26 14:15:40 -- finish.
2023-11-26 14:15:40 -- start loading word vectors...
2023-11-26 14:15:40 -- word vectors loaded, and its shape is: (3116, 100)
2023-11-26 14:15:40 -- start loading VID_labeled...
2023-11-26 14:15:40 -- finish.
2023-11-26 14:15:40 -- start loading word vectors...
2023-11-26 14:15:40 -- word vectors loaded, and its shape is: (5686, 100)
2023-11-26 14:15:40 -- start loading VID_labeled...
2023-11-26 14:15:40 -- finish.
2023-11-26 14:15:40 -- start loading word vectors...
2023-11-26 14:15:40 -- word vectors loaded, and its shape is: (9337, 100)
2023-11-26 14:15:40 -- start loading VID_labeled...
2023-11-26 14:15:40 -- finish.
2023-11-26 14:15:41 -- start loading word vectors...
2023-11-26 14:15:41 -- word vectors loaded, and its shape is: (15019, 100)
2023-11-26 14:15:41 -- start loa

In [7]:
count_df

Unnamed: 0,1950s,1960s,1970s,1980s,1990s,2000s,2010s
Physics and Astronomy,162.7,303.833333,254.033333,472.45,1281.5,1670.666667,1717.75
Medicine,983.533333,1717.566667,1685.716667,1059.616667,731.033333,842.2,1321.55
"Biochemistry, Genetics and Molecular Biology",1215.033333,2108.066667,1720.4,2689.616667,3777.533333,3485.733333,3700.45
Agricultural and Biological Sciences,514.7,1001.733333,982.516667,800.783333,673.366667,1296.15,1622.983333
Earth and Planetary Sciences,178.833333,1234.316667,1263.733333,1014.45,1683.7,1796.866667,1273.45
Social Sciences,54.666667,188.75,241.733333,153.7,142.366667,273.033333,355.483333
"Pharmacology, Toxicology and Pharmaceutics",48.583333,152.366667,270.4,172.2,58.916667,42.583333,35.033333
Mathematics,12.333333,12.0,17.833333,23.75,18.5,30.833333,21.416667
Chemistry,145.083333,446.866667,308.45,277.333333,649.616667,670.45,1173.333333
Psychology,41.95,210.566667,216.333333,100.25,46.833333,152.916667,151.2


In [8]:
cs_df

Unnamed: 0,1950s,1960s,1970s,1980s,1990s,2000s,2010s
Energy,1.150464,0.665888,1.045876,0.896514,0.926402,1.14665,1.248029
Multidisciplinary,1.900845,2.382027,2.097441,2.199271,1.789105,1.505892,1.508389
"Biochemistry, Genetics and Molecular Biology",1.528861,1.414245,1.41369,1.548555,1.880927,1.696026,1.325673
"Business, Management and Accounting",0.525033,0.429622,0.796234,1.044028,0.684456,0.887595,1.075853
Agricultural and Biological Sciences,1.349176,1.355547,1.304523,1.252823,1.332925,1.505988,1.411518
Chemistry,1.204955,1.01896,0.989051,1.03636,1.471776,1.472189,1.500055
Materials Science,1.253112,0.552358,0.654635,0.628215,1.043553,1.33533,1.271321
Environmental Science,1.099815,1.225419,1.405473,1.170002,1.296479,1.465841,1.393603
Immunology and Microbiology,1.418164,1.220743,1.250091,1.487493,1.739188,1.561552,1.484979
Medicine,1.078755,1.009505,0.966122,1.00863,1.048686,0.73514,0.736425


In [9]:
# kendalltau flattens both tables and pairs values by position, so the rows
# must refer to the same disciplines in the same order. Reindex the counts onto
# the similarity table's rows instead of relying on two independent sorts;
# disciplines without any counted paper become 0.
cs_aligned = cs_df.drop('Multidisciplinary', errors='ignore').sort_index()
count_aligned = count_df.reindex(cs_aligned.index).fillna(0)
res = kendalltau(count_aligned, cs_aligned)

In [10]:
res

KendalltauResult(correlation=0.4885123607842555, pvalue=1.3647337231051353e-22)

# PNAS

In [11]:
# 125754415 is PNAS's venue ID (see the flagship list at the top of the notebook).
count_df = count_paper_num_for_disc(125754415)

2023-11-26 14:16:42 -- start searching in 1950 to 1959
2023-11-26 14:16:42 -- start loading 'paper_df'...
2023-11-26 14:16:42 -- finish.
2023-11-26 14:16:42 -- start loading 'ref_df'...
2023-11-26 14:16:42 -- finish.
2023-11-26 14:16:42 -- start loading VID_labeled...
2023-11-26 14:16:42 -- finish.
2023-11-26 14:16:43 -- start searching in 1960 to 1969
2023-11-26 14:16:43 -- start loading 'paper_df'...
2023-11-26 14:16:43 -- finish.
2023-11-26 14:16:43 -- start loading 'ref_df'...
2023-11-26 14:16:43 -- finish.
2023-11-26 14:16:43 -- start loading VID_labeled...
2023-11-26 14:16:43 -- finish.
2023-11-26 14:16:44 -- start searching in 1970 to 1979
2023-11-26 14:16:44 -- start loading 'paper_df'...
2023-11-26 14:16:44 -- finish.
2023-11-26 14:16:44 -- start loading 'ref_df'...
2023-11-26 14:16:44 -- finish.
2023-11-26 14:16:44 -- start loading VID_labeled...
2023-11-26 14:16:44 -- finish.
2023-11-26 14:16:46 -- start searching in 1980 to 1989
2023-11-26 14:16:46 -- start loading 'paper_d

In [12]:
# PNAS's venue ID; cs_df holds, per discipline and decade, the venue's mean
# cosine similarity to that discipline relative to its mean similarity to all vectors.
target_VID = 125754415
cs_df = cal_disc_avg_cs(target_VID)

2023-11-26 14:19:25 -- start loading word vectors...
2023-11-26 14:19:25 -- word vectors loaded, and its shape is: (1645, 100)
2023-11-26 14:19:25 -- start loading VID_labeled...
2023-11-26 14:19:25 -- finish.
2023-11-26 14:19:25 -- start loading word vectors...
2023-11-26 14:19:25 -- word vectors loaded, and its shape is: (3116, 100)
2023-11-26 14:19:25 -- start loading VID_labeled...
2023-11-26 14:19:25 -- finish.
2023-11-26 14:19:25 -- start loading word vectors...
2023-11-26 14:19:25 -- word vectors loaded, and its shape is: (5686, 100)
2023-11-26 14:19:25 -- start loading VID_labeled...
2023-11-26 14:19:25 -- finish.
2023-11-26 14:19:25 -- start loading word vectors...
2023-11-26 14:19:25 -- word vectors loaded, and its shape is: (9337, 100)
2023-11-26 14:19:25 -- start loading VID_labeled...
2023-11-26 14:19:25 -- finish.
2023-11-26 14:19:25 -- start loading word vectors...
2023-11-26 14:19:25 -- word vectors loaded, and its shape is: (15019, 100)
2023-11-26 14:19:25 -- start loa

In [13]:
count_df

Unnamed: 0,1950s,1960s,1970s,1980s,1990s,2000s,2010s
"Biochemistry, Genetics and Molecular Biology",74.333333,374.666667,1595.933333,3731.083333,9574.0,15718.45,15639.909524
Neuroscience,0.5,2.166667,42.65,167.0,1066.416667,2851.583333,3505.216667
Medicine,17.833333,68.833333,317.1,480.25,1217.583333,2970.333333,5272.383333
Agricultural and Biological Sciences,14.0,12.666667,71.233333,90.666667,709.0,2294.616667,4576.62619
Mathematics,16.0,39.833333,40.0,17.333333,35.916667,166.533333,307.15
Physics and Astronomy,8.5,6.5,7.5,16.5,129.5,631.65,2003.892857
Immunology and Microbiology,5.833333,57.166667,291.116667,646.916667,1712.0,3300.2,3846.65
"Pharmacology, Toxicology and Pharmaceutics",0.5,5.0,29.616667,71.75,146.583333,210.7,247.4
Earth and Planetary Sciences,2.0,4.5,3.0,1.333333,74.916667,549.5,1809.233333
Decision Sciences,7.5,3.5,,1.0,5.25,7.75,4.5


In [14]:
cs_df

Unnamed: 0,1950s,1960s,1970s,1980s,1990s,2000s,2010s
Energy,1.25725,0.40489,0.618617,0.616378,0.417651,0.721157,0.997098
Multidisciplinary,2.038058,1.886269,1.802948,1.502439,1.29523,1.296506,1.415607
"Biochemistry, Genetics and Molecular Biology",1.473935,1.721312,2.029711,2.003044,2.150765,2.151032,1.631741
"Business, Management and Accounting",0.242031,0.332882,0.429946,0.655697,0.54714,0.703315,1.009511
Agricultural and Biological Sciences,1.296404,1.243907,1.28939,1.210468,1.405845,1.554611,1.474516
Chemistry,1.163527,1.064144,1.075046,0.956323,1.179155,1.29347,1.17935
Materials Science,1.05679,0.562022,0.670801,0.61404,0.724567,0.797134,0.881753
Environmental Science,1.303272,0.883071,0.829215,0.671328,0.852022,1.129267,1.293647
Immunology and Microbiology,1.37376,1.445065,1.807559,1.860784,1.918386,1.7325,1.506598
Medicine,0.844849,0.976786,1.144325,1.200025,1.239398,0.948922,0.863606


In [15]:
# kendalltau flattens both tables and pairs values by position, so the rows
# must refer to the same disciplines in the same order. Reindex the counts onto
# the similarity table's rows instead of relying on two independent sorts;
# disciplines without any counted paper become 0.
cs_aligned = cs_df.drop('Multidisciplinary', errors='ignore').sort_index()
count_aligned = count_df.reindex(cs_aligned.index).fillna(0)
res = kendalltau(count_aligned, cs_aligned)

In [16]:
res

KendalltauResult(correlation=0.39390240175110824, pvalue=8.728063877617403e-15)

# Comparison across different hyperparameter settings

In [5]:
# Sweep over (d, w) settings for the three flagship journals.
# NOTE: the loop variables deliberately rebind the module-level `d` and `w`,
# because count_paper_num_for_disc / cal_disc_avg_cs read them as globals.
# After this cell they hold the last values of the sweep (300 and 10).
for d in [50, 100, 200, 300]:
    for w in [2 ,5, 10]:
        for VID in [137773608, 3880285, 125754415]:
            print("-------------------------------")
            print("D: {}, W: {}, Peridocial: {}".format(d, w, model.MAG_venue_info_df.at[VID, 'OriginalVenue']))
            count_df = count_paper_num_for_disc(VID)
            cs_df = cal_disc_avg_cs(VID)
            # kendalltau pairs the flattened values by position, so put both
            # tables on the same discipline rows explicitly; disciplines
            # without any counted paper become 0.
            cs_aligned = cs_df.drop('Multidisciplinary', errors='ignore').sort_index()
            count_aligned = count_df.reindex(cs_aligned.index).fillna(0)
            res = kendalltau(count_aligned, cs_aligned)
            print(res)

-------------------------------
D: 50, W: 2, Peridocial: Nature
2024-04-27 09:33:59 -- start searching in 1950 to 1959
2024-04-27 09:33:59 -- start loading 'paper_df'...
2024-04-27 09:33:59 -- finish.
2024-04-27 09:33:59 -- start loading 'ref_df'...
2024-04-27 09:33:59 -- finish.
2024-04-27 09:33:59 -- start loading VID_labeled...
2024-04-27 09:33:59 -- finish.
2024-04-27 09:34:00 -- start searching in 1960 to 1969
2024-04-27 09:34:00 -- start loading 'paper_df'...
2024-04-27 09:34:01 -- finish.
2024-04-27 09:34:01 -- start loading 'ref_df'...
2024-04-27 09:34:01 -- finish.
2024-04-27 09:34:01 -- start loading VID_labeled...
2024-04-27 09:34:01 -- finish.
2024-04-27 09:34:04 -- start searching in 1970 to 1979
2024-04-27 09:34:04 -- start loading 'paper_df'...
2024-04-27 09:34:04 -- finish.
2024-04-27 09:34:04 -- start loading 'ref_df'...
2024-04-27 09:34:05 -- finish.
2024-04-27 09:34:05 -- start loading VID_labeled...
2024-04-27 09:34:05 -- finish.
2024-04-27 09:34:08 -- start searchi