In [1]:
%load_ext autoreload
%autoreload 1

In [36]:
import pandas as pd
import numpy as np

import sys 
sys.path.append("..") 
%aimport preprocessing
%aimport utils
%aimport p2v

from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import kendalltau
from collections import defaultdict

from utils import prinT

pd.options.display.max_columns = None
pd.set_option('max_colwidth', None)

**Tracking three flagship journals (venue IDs)**  
Nature: 137773608  
Science: 3880285  
PNAS: 125754415  

In [3]:
# Shared P2V helper used by every function below (the log output under this cell
# shows the venue-info and labeled-journal tables being loaded around this step).
model = p2v.P2V()
# Settings passed to model.load_wv / model.load_VID_labeled to select the trained embedding.
# d matches the 100-column word-vector shapes logged later in this notebook.
d=100
# presumably the context-window size used when training — TODO confirm against p2v
w=10

2023-08-02 11:00:20 -- start loading Mag_venue_info_df
2023-08-02 11:00:20 -- finish.
2023-08-02 11:00:20 -- start loading labeled_journal_info_df
2023-08-02 11:00:20 -- finish.


In [4]:
def default_value():
    """Return the starting count (zero) used as the ``defaultdict`` factory value."""
    initial_count = 0
    return initial_count


def count_paper_num_for_disc(VID, n_periods=7):
    """Count a venue's papers per period by the dominant discipline of their references.

    For each of the last ``n_periods`` entries of ``model.start_year_list`` /
    ``model.end_year_list`` the papers and references of that period are loaded,
    the papers whose ``VenueID`` equals ``VID`` are joined to the papers they
    reference, and every referenced venue is mapped to its discipline label.
    References to venues labeled ``'Multidisciplinary'`` are discarded. Each
    citing paper then contributes a total weight of 1 to its most frequent
    reference label; when several labels tie, the weight is split equally
    among them (hence the fractional counts).

    Relies on the notebook globals ``model``, ``d``, ``w``, ``prinT`` and
    ``default_value``.

    Args:
        VID: Venue ID matched against ``model.target_paper_df.VenueID``.
        n_periods: Number of most recent periods to process (default 7).

    Returns:
        pandas.DataFrame indexed by discipline label with one column per
        period, named ``"<start_year>s"``. Disciplines with no paper in a
        period are NaN in that column.
    """
    period_count = len(model.start_year_list)
    # Clamp at 0: a negative start index would silently wrap around to the end
    # of the year lists and process the wrong periods.
    first_period = max(period_count - n_periods, 0)

    dict_list = []
    period_labels = []
    for i in range(first_period, period_count):
        start_year = model.start_year_list[i]
        end_year = model.end_year_list[i]
        prinT("start searching in %s to %s" %(start_year, end_year))
        # Derive the column label from the period itself instead of a fixed list.
        period_labels.append('%ss' % start_year)

        model.load_paper_df(full_load=False, start_year=start_year, end_year=end_year)
        model.load_ref_df(full_load=False, start_year=start_year, end_year=end_year)
        VID_labeled = model.load_VID_labeled(start_year, end_year, d, w)
        VID_labeled_df = pd.DataFrame(VID_labeled).set_index('VID')

        # Citing papers of the target venue -> their references -> referenced papers.
        journal_df = model.target_paper_df[model.target_paper_df.VenueID == VID]
        journal_ref_df = pd.merge(journal_df, model.target_ref_df, on='PaperID')
        # Overlapping columns get the _x (citing paper) / _y (referenced paper) suffixes.
        journal_ref_df = pd.merge(journal_ref_df, model.target_paper_df, left_on='PaperReferenceID', right_on='PaperID')
        journal_ref_df = journal_ref_df.drop(columns=['PaperReferenceID'])
        journal_ref_df = pd.merge(journal_ref_df, VID_labeled_df, left_on='VenueID_y', right_index=True)
        # Filter with a boolean mask rather than dropping by index label: the
        # column-on-index merge above passes the left index through, so a label
        # duplicated by the join would take unrelated rows with it.
        journal_ref_df = journal_ref_df[journal_ref_df['label'] != 'Multidisciplinary']

        disc_paper_count = defaultdict(default_value)
        # Series.mode yields a scalar for a unique mode and an array for ties.
        for modal_label in journal_ref_df.groupby(['PaperID_x'])['label'].agg(pd.Series.mode):
            if isinstance(modal_label, str):
                disc_paper_count[modal_label] = disc_paper_count[modal_label] + 1
            elif isinstance(modal_label, np.ndarray):
                disc_num = len(modal_label)
                for disc in modal_label:
                    disc_paper_count[disc] = disc_paper_count[disc] + 1/disc_num
        dict_list.append(dict(disc_paper_count))

    count_pd = pd.DataFrame(dict_list).T
    count_pd.columns = period_labels
    return count_pd


def cal_disc_avg_cs(target_VID: int, n_periods: int = 7) -> pd.DataFrame:
    """Relative cosine similarity between one venue and each discipline, per period.

    For each of the last ``n_periods`` entries of ``model.start_year_list`` /
    ``model.end_year_list`` the period's venue embedding is loaded. For every
    discipline label, the mean cosine similarity between ``target_VID`` and the
    labeled venues of that discipline is divided by the mean cosine similarity
    between ``target_VID`` and every vector of the embedding, so values above 1
    mean "closer than the average venue".

    Relies on the notebook globals ``model``, ``d`` and ``w``.

    Args:
        target_VID: Venue ID looked up in each period's embedding.
        n_periods: Number of most recent periods to process (default 7).

    Returns:
        DataFrame indexed by discipline label (in order of first appearance)
        with one column per period, named ``"<start_year>s"``.
    """
    period_count = len(model.start_year_list)
    # Clamp at 0: a negative start index would silently wrap around to the end
    # of the year lists and process the wrong periods.
    first_period = max(period_count - n_periods, 0)

    cs_data = []
    period_labels = []
    for i in range(first_period, period_count):
        start_year = model.start_year_list[i]
        end_year = model.end_year_list[i]
        # Derive the column label from the period itself instead of a fixed list.
        period_labels.append('%ss' % start_year)

        model.load_wv(start_year, end_year, d, w)
        target_journal_vector = model.wv.get_vector(target_VID, norm=True).reshape(1, -1)
        full_vector_list = model.wv.get_normed_vectors()

        VID_labeled = model.load_VID_labeled(start_year, end_year, d, w)
        label_list = VID_labeled['label']
        labeled_VID_list = VID_labeled['VID']
        labeled_vector_list = np.stack([model.wv.get_vector(VID, norm=True) for VID in labeled_VID_list])

        # The baseline does not depend on the discipline, so compute it once per
        # period instead of once per discipline.
        # NOTE(review): full_vector_list presumably contains the target venue itself,
        # so its self-similarity is part of this mean — confirm that is intended.
        general_mean_cs = np.mean(cosine_similarity(target_journal_vector, full_vector_list))

        # Group row positions by label in a single pass; dict order also makes the
        # discipline order deterministic (set iteration order varies between runs).
        idx_by_discipline = defaultdict(list)
        for idx, label in enumerate(label_list):
            idx_by_discipline[label].append(idx)

        single_decade_data = {}
        for discipline, discipline_idx_list in idx_by_discipline.items():
            single_discipline_cs = cosine_similarity(target_journal_vector, labeled_vector_list[discipline_idx_list])
            single_decade_data[discipline] = np.mean(single_discipline_cs)/general_mean_cs
        cs_data.append(single_decade_data)

    cs_pd = pd.DataFrame(cs_data).T
    cs_pd.columns = period_labels
    return cs_pd

In [24]:
# Nature (venue ID 137773608): per-decade paper counts by dominant reference discipline.
count_df = count_paper_num_for_disc(137773608)

2023-08-02 12:11:02 -- start searching in 1950 to 1959
2023-08-02 12:11:03 -- start loading VID_labeled...
2023-08-02 12:11:03 -- finish.
2023-08-02 12:11:04 -- start searching in 1960 to 1969
2023-08-02 12:11:04 -- start loading VID_labeled...
2023-08-02 12:11:04 -- finish.
2023-08-02 12:11:07 -- start searching in 1970 to 1979
2023-08-02 12:11:07 -- start loading VID_labeled...
2023-08-02 12:11:07 -- finish.
2023-08-02 12:11:10 -- start searching in 1980 to 1989
2023-08-02 12:11:10 -- start loading VID_labeled...
2023-08-02 12:11:10 -- finish.
2023-08-02 12:11:15 -- start searching in 1990 to 1999
2023-08-02 12:11:16 -- start loading VID_labeled...
2023-08-02 12:11:16 -- finish.
2023-08-02 12:11:23 -- start searching in 2000 to 2009
2023-08-02 12:11:24 -- start loading VID_labeled...
2023-08-02 12:11:24 -- finish.
2023-08-02 12:11:39 -- start searching in 2010 to 2021
2023-08-02 12:11:43 -- start loading VID_labeled...
2023-08-02 12:11:43 -- finish.


In [25]:
# Nature: per-decade ratio of discipline-level to overall cosine similarity.
target_VID = 137773608
cs_df = cal_disc_avg_cs(target_VID)

2023-08-02 12:17:07 -- start loading word vectors...
2023-08-02 12:17:07 -- word vectors loaded, and its shape is: (1683, 100)
2023-08-02 12:17:07 -- start loading VID_labeled...
2023-08-02 12:17:07 -- finish.
2023-08-02 12:17:08 -- start loading word vectors...
2023-08-02 12:17:08 -- word vectors loaded, and its shape is: (3186, 100)
2023-08-02 12:17:08 -- start loading VID_labeled...
2023-08-02 12:17:08 -- finish.
2023-08-02 12:17:08 -- start loading word vectors...
2023-08-02 12:17:08 -- word vectors loaded, and its shape is: (5761, 100)
2023-08-02 12:17:08 -- start loading VID_labeled...
2023-08-02 12:17:08 -- finish.
2023-08-02 12:17:08 -- start loading word vectors...
2023-08-02 12:17:08 -- word vectors loaded, and its shape is: (9437, 100)
2023-08-02 12:17:08 -- start loading VID_labeled...
2023-08-02 12:17:08 -- finish.
2023-08-02 12:17:08 -- start loading word vectors...
2023-08-02 12:17:08 -- word vectors loaded, and its shape is: (15153, 100)
2023-08-02 12:17:08 -- start loa

In [26]:
count_df

Unnamed: 0,1950s,1960s,1970s,1980s,1990s,2000s,2010s
Medicine,1901.916667,4660.666667,2071.066667,799.416667,647.2,1303.65,2642.683333
Physics and Astronomy,838.666667,1036.0,629.25,1635.95,1777.0,2511.366667,2581.25
Earth and Planetary Sciences,499.75,2129.166667,3642.916667,2636.866667,1920.25,1935.166667,1680.066667
"Biochemistry, Genetics and Molecular Biology",2772.033333,5324.833333,4756.066667,4591.416667,4274.233333,4585.733333,5983.6
Agricultural and Biological Sciences,1244.366667,1974.666667,1262.783333,945.083333,958.65,1700.983333,1277.883333
Mathematics,34.333333,34.916667,93.75,49.416667,26.166667,56.5,41.616667
Materials Science,52.833333,289.416667,91.5,51.616667,69.25,233.95,534.033333
Engineering,141.75,311.916667,182.833333,78.416667,55.416667,49.333333,93.833333
Immunology and Microbiology,290.533333,1260.916667,1514.75,1124.416667,605.866667,760.916667,834.233333
Chemistry,488.033333,1766.75,519.7,273.416667,461.366667,474.95,852.366667


In [27]:
cs_df

Unnamed: 0,1950s,1960s,1970s,1980s,1990s,2000s,2010s
"Pharmacology, Toxicology and Pharmaceutics",1.284664,1.37991,1.496985,1.200284,1.212383,1.099028,0.986622
Arts and Humanities,0.211186,0.015352,0.357009,0.527851,0.719829,0.808214,1.156737
"Biochemistry, Genetics and Molecular Biology",1.702366,1.682011,1.860618,1.842635,1.914824,1.916099,1.746267
Computer Science,0.9245,0.695259,0.591174,0.831737,0.648588,0.954411,0.817454
Social Sciences,0.402991,0.306805,0.47579,0.593497,0.681864,0.872305,1.063874
Nursing,0.855602,0.624501,0.337165,0.410253,0.634971,0.501471,0.708121
Immunology and Microbiology,1.518738,1.680829,1.880275,1.777,1.808973,1.629787,1.663375
Dentistry,0.843171,1.064612,0.977512,0.970793,0.804003,0.742468,0.912468
Mathematics,0.708511,0.4626,0.715688,0.838852,0.728945,1.036447,1.050462
Agricultural and Biological Sciences,1.568457,1.495039,1.457249,1.37533,1.528661,1.520849,1.402821


In [28]:
# Rank-correlate reference counts with relative embedding similarity across all
# (discipline, decade) cells. kendalltau flattens its inputs and pairs values by
# position, so put the count table on exactly the same rows/columns as the
# similarity table first; disciplines with no counted papers become 0.
cs_ranked = cs_df.drop('Multidisciplinary').sort_index()
count_ranked = count_df.reindex(index=cs_ranked.index, columns=cs_ranked.columns).fillna(0)
res = kendalltau(count_ranked.to_numpy().ravel(), cs_ranked.to_numpy().ravel())

In [29]:
res

KendalltauResult(correlation=0.5289866077036508, pvalue=3.201614912996724e-26)

In [11]:
# Science (venue ID 3880285): per-decade paper counts by dominant reference discipline.
count_df = count_paper_num_for_disc(3880285)

2023-08-02 11:14:10 -- start searching in 1950 to 1959
2023-08-02 11:14:11 -- start loading VID_labeled...
2023-08-02 11:14:11 -- finish.
2023-08-02 11:14:12 -- start searching in 1960 to 1969
2023-08-02 11:14:12 -- start loading VID_labeled...
2023-08-02 11:14:12 -- finish.
2023-08-02 11:14:13 -- start searching in 1970 to 1979
2023-08-02 11:14:13 -- start loading VID_labeled...
2023-08-02 11:14:13 -- finish.
2023-08-02 11:14:16 -- start searching in 1980 to 1989
2023-08-02 11:14:16 -- start loading VID_labeled...
2023-08-02 11:14:16 -- finish.
2023-08-02 11:14:21 -- start searching in 1990 to 1999
2023-08-02 11:14:21 -- start loading VID_labeled...
2023-08-02 11:14:21 -- finish.
2023-08-02 11:14:29 -- start searching in 2000 to 2009
2023-08-02 11:14:30 -- start loading VID_labeled...
2023-08-02 11:14:30 -- finish.
2023-08-02 11:14:48 -- start searching in 2010 to 2021
2023-08-02 11:14:52 -- start loading VID_labeled...
2023-08-02 11:14:52 -- finish.


In [12]:
# Science: per-decade ratio of discipline-level to overall cosine similarity.
target_VID = 3880285
cs_df = cal_disc_avg_cs(target_VID)

2023-08-02 11:36:38 -- start loading word vectors...
2023-08-02 11:36:38 -- word vectors loaded, and its shape is: (1683, 100)
2023-08-02 11:36:38 -- start loading VID_labeled...
2023-08-02 11:36:38 -- finish.
2023-08-02 11:36:38 -- start loading word vectors...
2023-08-02 11:36:38 -- word vectors loaded, and its shape is: (3186, 100)
2023-08-02 11:36:38 -- start loading VID_labeled...
2023-08-02 11:36:38 -- finish.
2023-08-02 11:36:38 -- start loading word vectors...
2023-08-02 11:36:38 -- word vectors loaded, and its shape is: (5761, 100)
2023-08-02 11:36:38 -- start loading VID_labeled...
2023-08-02 11:36:38 -- finish.
2023-08-02 11:36:38 -- start loading word vectors...
2023-08-02 11:36:38 -- word vectors loaded, and its shape is: (9437, 100)
2023-08-02 11:36:38 -- start loading VID_labeled...
2023-08-02 11:36:38 -- finish.
2023-08-02 11:36:38 -- start loading word vectors...
2023-08-02 11:36:38 -- word vectors loaded, and its shape is: (15153, 100)
2023-08-02 11:36:38 -- start loa

In [18]:
count_df

Unnamed: 0,1950s,1960s,1970s,1980s,1990s,2000s,2010s
"Biochemistry, Genetics and Molecular Biology",72.166667,375.833333,1587.683333,3656.25,9557.4,15595.7,15938.67619
Neuroscience,0.5,2.166667,42.9,170.833333,1085.15,2878.25,3473.666667
Medicine,17.666667,69.5,316.35,538.416667,1206.866667,2993.0,5163.283333
Agricultural and Biological Sciences,14.0,12.166667,74.9,88.166667,711.616667,2375.2,4340.959524
Mathematics,16.0,39.5,40.0,17.333333,37.2,160.9,327.266667
Physics and Astronomy,8.5,5.5,7.5,16.5,128.866667,642.016667,1996.72619
Arts and Humanities,3.333333,,6.4,0.5,7.0,20.166667,240.95
Immunology and Microbiology,5.833333,56.166667,292.783333,650.75,1724.25,3304.533333,3866.016667
"Pharmacology, Toxicology and Pharmaceutics",0.5,5.0,32.45,78.416667,134.666667,230.616667,245.133333
Earth and Planetary Sciences,2.0,4.5,2.0,1.333333,74.916667,545.116667,1800.483333


In [19]:
cs_df

Unnamed: 0,1950s,1960s,1970s,1980s,1990s,2000s,2010s
"Pharmacology, Toxicology and Pharmaceutics",1.089731,1.119209,1.483761,1.40487,1.645053,1.519279,1.077628
Arts and Humanities,0.2658,0.230021,0.346395,0.423195,0.655154,0.932021,1.255241
"Biochemistry, Genetics and Molecular Biology",1.513023,1.757472,2.043535,2.002477,2.161356,2.138589,1.66285
Computer Science,1.502852,1.395101,0.918262,1.118799,0.835731,0.885643,0.681428
Social Sciences,0.279148,0.390953,0.461788,0.559505,0.642793,0.855757,1.267772
Nursing,0.753955,0.472428,0.731249,0.872315,0.780161,0.583365,0.583125
Immunology and Microbiology,1.347061,1.47061,1.830544,1.862788,1.9355,1.714872,1.563431
Dentistry,1.048133,0.99234,0.934815,0.943524,1.175859,0.969804,0.674967
Mathematics,2.019327,1.904091,1.447217,1.236867,1.013499,1.176268,1.093283
Agricultural and Biological Sciences,1.287526,1.224442,1.262253,1.201352,1.403692,1.540679,1.496234


In [20]:
# Rank-correlate reference counts with relative embedding similarity across all
# (discipline, decade) cells. kendalltau flattens its inputs and pairs values by
# position, so put the count table on exactly the same rows/columns as the
# similarity table first; disciplines with no counted papers become 0.
cs_ranked = cs_df.drop('Multidisciplinary').sort_index()
count_ranked = count_df.reindex(index=cs_ranked.index, columns=cs_ranked.columns).fillna(0)
res = kendalltau(count_ranked.to_numpy().ravel(), cs_ranked.to_numpy().ravel())

In [22]:
res

KendalltauResult(correlation=0.379534308177933, pvalue=7.696631212625189e-14)

In [30]:
# PNAS (venue ID 125754415): per-decade paper counts by dominant reference discipline.
# NOTE(review): the saved count_df / cs_df / kendalltau outputs shown for this venue are
# identical to the Science outputs earlier in the notebook — re-run on a fresh kernel
# to confirm they are not stale.
count_df = count_paper_num_for_disc(125754415)

2023-08-02 12:17:35 -- start searching in 1950 to 1959
2023-08-02 12:17:35 -- start loading VID_labeled...
2023-08-02 12:17:35 -- finish.
2023-08-02 12:17:36 -- start searching in 1960 to 1969
2023-08-02 12:17:36 -- start loading VID_labeled...
2023-08-02 12:17:36 -- finish.
2023-08-02 12:17:37 -- start searching in 1970 to 1979
2023-08-02 12:17:37 -- start loading VID_labeled...
2023-08-02 12:17:37 -- finish.
2023-08-02 12:17:39 -- start searching in 1980 to 1989
2023-08-02 12:17:39 -- start loading VID_labeled...
2023-08-02 12:17:39 -- finish.
2023-08-02 12:17:43 -- start searching in 1990 to 1999
2023-08-02 12:17:44 -- start loading VID_labeled...
2023-08-02 12:17:44 -- finish.
2023-08-02 12:17:51 -- start searching in 2000 to 2009
2023-08-02 12:17:52 -- start loading VID_labeled...
2023-08-02 12:17:52 -- finish.
2023-08-02 12:18:09 -- start searching in 2010 to 2021
2023-08-02 12:18:13 -- start loading VID_labeled...
2023-08-02 12:18:13 -- finish.


In [31]:
# PNAS: per-decade ratio of discipline-level to overall cosine similarity.
target_VID = 125754415
cs_df = cal_disc_avg_cs(target_VID)

2023-08-02 12:30:42 -- start loading word vectors...
2023-08-02 12:30:42 -- word vectors loaded, and its shape is: (1683, 100)
2023-08-02 12:30:42 -- start loading VID_labeled...
2023-08-02 12:30:42 -- finish.
2023-08-02 12:30:43 -- start loading word vectors...
2023-08-02 12:30:43 -- word vectors loaded, and its shape is: (3186, 100)
2023-08-02 12:30:43 -- start loading VID_labeled...
2023-08-02 12:30:43 -- finish.
2023-08-02 12:30:43 -- start loading word vectors...
2023-08-02 12:30:43 -- word vectors loaded, and its shape is: (5761, 100)
2023-08-02 12:30:43 -- start loading VID_labeled...
2023-08-02 12:30:43 -- finish.
2023-08-02 12:30:43 -- start loading word vectors...
2023-08-02 12:30:43 -- word vectors loaded, and its shape is: (9437, 100)
2023-08-02 12:30:43 -- start loading VID_labeled...
2023-08-02 12:30:43 -- finish.
2023-08-02 12:30:43 -- start loading word vectors...
2023-08-02 12:30:43 -- word vectors loaded, and its shape is: (15153, 100)
2023-08-02 12:30:43 -- start loa

In [32]:
count_df

Unnamed: 0,1950s,1960s,1970s,1980s,1990s,2000s,2010s
"Biochemistry, Genetics and Molecular Biology",72.166667,375.833333,1587.683333,3656.25,9557.4,15595.7,15938.67619
Neuroscience,0.5,2.166667,42.9,170.833333,1085.15,2878.25,3473.666667
Medicine,17.666667,69.5,316.35,538.416667,1206.866667,2993.0,5163.283333
Agricultural and Biological Sciences,14.0,12.166667,74.9,88.166667,711.616667,2375.2,4340.959524
Mathematics,16.0,39.5,40.0,17.333333,37.2,160.9,327.266667
Physics and Astronomy,8.5,5.5,7.5,16.5,128.866667,642.016667,1996.72619
Arts and Humanities,3.333333,,6.4,0.5,7.0,20.166667,240.95
Immunology and Microbiology,5.833333,56.166667,292.783333,650.75,1724.25,3304.533333,3866.016667
"Pharmacology, Toxicology and Pharmaceutics",0.5,5.0,32.45,78.416667,134.666667,230.616667,245.133333
Earth and Planetary Sciences,2.0,4.5,2.0,1.333333,74.916667,545.116667,1800.483333


In [33]:
cs_df

Unnamed: 0,1950s,1960s,1970s,1980s,1990s,2000s,2010s
"Pharmacology, Toxicology and Pharmaceutics",1.089731,1.119209,1.483761,1.40487,1.645053,1.519279,1.077628
Arts and Humanities,0.2658,0.230021,0.346395,0.423195,0.655154,0.932021,1.255241
"Biochemistry, Genetics and Molecular Biology",1.513023,1.757472,2.043535,2.002477,2.161356,2.138589,1.66285
Computer Science,1.502852,1.395101,0.918262,1.118799,0.835731,0.885643,0.681428
Social Sciences,0.279148,0.390953,0.461788,0.559505,0.642793,0.855757,1.267772
Nursing,0.753955,0.472428,0.731249,0.872315,0.780161,0.583365,0.583125
Immunology and Microbiology,1.347061,1.47061,1.830544,1.862788,1.9355,1.714872,1.563431
Dentistry,1.048133,0.99234,0.934815,0.943524,1.175859,0.969804,0.674967
Mathematics,2.019327,1.904091,1.447217,1.236867,1.013499,1.176268,1.093283
Agricultural and Biological Sciences,1.287526,1.224442,1.262253,1.201352,1.403692,1.540679,1.496234


In [34]:
# Rank-correlate reference counts with relative embedding similarity across all
# (discipline, decade) cells. kendalltau flattens its inputs and pairs values by
# position, so put the count table on exactly the same rows/columns as the
# similarity table first; disciplines with no counted papers become 0.
cs_ranked = cs_df.drop('Multidisciplinary').sort_index()
count_ranked = count_df.reindex(index=cs_ranked.index, columns=cs_ranked.columns).fillna(0)
res = kendalltau(count_ranked.to_numpy().ravel(), cs_ranked.to_numpy().ravel())

In [35]:
res

KendalltauResult(correlation=0.379534308177933, pvalue=7.696631212625189e-14)