In [1]:
%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import tqdm
import seaborn as sns; sns.set()
sns.__version__

import pickle

In [2]:
!pip show pandas

Name: pandas
Version: 1.2.5
Summary: Powerful data structures for data analysis, time series, and statistics
Home-page: https://pandas.pydata.org
Author: 
Author-email: 
License: BSD
Location: /home/minsu/proj/tensorflow_hub_env/lib/python3.8/site-packages
Requires: numpy, python-dateutil, pytz
Required-by: seaborn, statsmodels


In [1]:
! pip show statsmodels

Name: statsmodels
Version: 0.13.2
Summary: Statistical computations and models for Python
Home-page: https://www.statsmodels.org/
Author: 
Author-email: 
License: BSD License
Location: /home/minsu/proj/tensorflow_hub_env/lib/python3.8/site-packages
Requires: numpy, packaging, pandas, patsy, scipy
Required-by: 


## Data Preparation

In [2]:
df_paper_grant_stat= pd.read_pickle('/data/interdisciplinarity/df_paper_grant_stat_regression.pkl')

In [3]:
with open('/data/sci_of_sci/linkage/d_paper2fos-dist-in-ref.pkl', 'rb') as filename:
    d_reference = pickle.load(filename)

In [4]:
df_paper_grant_stat['mag_id'] = df_paper_grant_stat['mag_id'].apply(str)

In [5]:
df_paper_grant_stat['paper_fos'] = df_paper_grant_stat['paper_fos'].astype(int, errors='ignore').apply(str)

In [6]:
df_paper_grant_stat['num_ref'] = df_paper_grant_stat['mag_id'].map(d_reference).apply(lambda x: x if pd.isnull(x) else sum(x.values()))

In [7]:
tmp = df_paper_grant_stat[['interdisciplinarity_reference','mean_grant_interdisciplinarity','avg_dist_btw_grants',
                           'num_authors','sum_funding_usd','num_grant','num_funding_country','num_institutes',
                           'norm_c10_year_fos','c_hit_rate_5_lvl-0','c10',
                           'num_ref','pub_year','paper_fos']].dropna().copy()

In [8]:
tmp = tmp[tmp['num_grant']>1]

### OLS

In [9]:
#-https://medium.com/@kylecaron/introduction-to-linear-regression-part-2-standardization-and-regression-diagnostics-a15cb27944b1

def standardize(series):
    return (series - series.mean()) / series.std()

In [10]:
reg_df_standardized = tmp.copy()
reg_df_standardized[['norm_c10_year_fos',
                     'c10',
                     'interdisciplinarity_reference',
                     'mean_grant_interdisciplinarity',
                     'avg_dist_btw_grants',
                     'num_authors',
                     'num_grant',
                     'num_funding_country',
                     'num_institutes',
                     'sum_funding_usd',
                     'num_ref']] = reg_df_standardized[['norm_c10_year_fos',
                                                        'c10',
                                                        'interdisciplinarity_reference',
                                                        'mean_grant_interdisciplinarity',
                                                        'avg_dist_btw_grants',
                                                        'num_authors',
                                                        'num_grant',
                                                        'num_funding_country',
                                                        'num_institutes',
                                                        'sum_funding_usd',
                                                        'num_ref']].apply(standardize)

In [13]:
reg_df_standardized_w_sim = tmp.copy()
reg_df_standardized_w_sim['avg_sim_btw_grants'] = 1-reg_df_standardized_w_sim['avg_dist_btw_grants']
reg_df_standardized_w_sim[['norm_c10_year_fos',
                           'c10',
                           'interdisciplinarity_reference',
                           'mean_grant_interdisciplinarity',
                           'avg_sim_btw_grants',
                           'num_authors',
                           'num_grant',
                           'num_funding_country',
                           'num_institutes',
                           'sum_funding_usd',
                           'num_ref']] = reg_df_standardized_w_sim[['norm_c10_year_fos',
                                                                    'c10',
                                                                    'interdisciplinarity_reference',
                                                                    'mean_grant_interdisciplinarity',
                                                                    'avg_sim_btw_grants',
                                                                    'num_authors',
                                                                    'num_grant',
                                                                    'num_funding_country',
                                                                    'num_institutes',
                                                                    'sum_funding_usd',
                                                                    'num_ref']].apply(standardize)

In [249]:
from statsmodels.formula.api import ols
from statsmodels.iolib.summary2 import summary_col

reg1 = ols('norm_c10_year_fos ~ interdisciplinarity_reference + mean_grant_interdisciplinarity + avg_dist_btw_grants', 
           data=reg_df_standardized).fit()
reg2 = ols('norm_c10_year_fos ~ num_authors + sum_funding_usd + num_grant + num_funding_country + num_institutes', 
           data=reg_df_standardized).fit()
reg3 = ols('norm_c10_year_fos ~ interdisciplinarity_reference + mean_grant_interdisciplinarity + avg_dist_btw_grants +\
            num_authors + sum_funding_usd + num_grant + num_funding_country + num_institutes', 
            data=reg_df_standardized).fit()
reg4 = ols('norm_c10_year_fos ~ interdisciplinarity_reference + mean_grant_interdisciplinarity + avg_dist_btw_grants +\
            num_authors + sum_funding_usd + num_grant + num_funding_country + num_institutes + \
            C(pub_year)', data=reg_df_standardized).fit()
reg5 = ols('norm_c10_year_fos ~ interdisciplinarity_reference + mean_grant_interdisciplinarity + avg_dist_btw_grants +\
            num_authors + sum_funding_usd + num_grant + num_funding_country + num_institutes + \
            num_ref', data=reg_df_standardized).fit()
reg6 = ols('norm_c10_year_fos ~ interdisciplinarity_reference + mean_grant_interdisciplinarity + avg_dist_btw_grants +\
            num_authors + sum_funding_usd + num_grant + num_funding_country + num_institutes + \
            C(pub_year) + num_ref', data=reg_df_standardized).fit()

results = summary_col([reg1, reg2, reg3, reg4, reg5, reg6], stars=True, float_format='%0.3f',
                      model_names=['Model\n(1)', 'Model\n(2)', 'Model\n(3)', 
                                   'Model\n(4)', 'Model\n(5)', 'Model\n(6)'],
                      info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
                                 'R2':lambda x: "{:.3f}".format(x.rsquared)})
print(results)


                                 Model    Model     Model     Model     Model     Model  
                                  (1)      (2)       (3)       (4)       (5)       (6)   
-----------------------------------------------------------------------------------------
C(pub_year)[T.1986]                                         0.019               0.026    
                                                            (0.122)             (0.121)  
C(pub_year)[T.1987]                                         0.009               -0.004   
                                                            (0.112)             (0.111)  
C(pub_year)[T.1988]                                         0.011               0.001    
                                                            (0.109)             (0.108)  
C(pub_year)[T.1989]                                         -0.001              -0.013   
                                                            (0.108)             (0.107)  
C(pub_yea

In [297]:
reg4.summary()

0,1,2,3
Dep. Variable:,c10,R-squared:,0.06
Model:,OLS,Adj. R-squared:,0.06
Method:,Least Squares,F-statistic:,648.0
Date:,"Wed, 08 Nov 2023",Prob (F-statistic):,0.0
Time:,14:42:01,Log-Likelihood:,-715870.0
No. Observations:,515796,AIC:,1432000.0
Df Residuals:,515744,BIC:,1432000.0
Df Model:,51,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.2526,0.104,-2.438,0.015,-0.456,-0.050
paper_fos[T.127313418.0],0.1129,0.012,9.643,0.000,0.090,0.136
paper_fos[T.127413603.0],0.0852,0.019,4.406,0.000,0.047,0.123
paper_fos[T.138885662.0],0.0749,0.292,0.256,0.798,-0.498,0.648
paper_fos[T.142362112.0],0.3761,0.434,0.867,0.386,-0.474,1.226
paper_fos[T.144024400.0],0.1311,0.047,2.811,0.005,0.040,0.222
paper_fos[T.144133560.0],0.1584,0.142,1.119,0.263,-0.119,0.436
paper_fos[T.15744967.0],0.1754,0.009,20.217,0.000,0.158,0.192
paper_fos[T.162324750.0],0.1976,0.033,6.043,0.000,0.134,0.262

0,1,2,3
Omnibus:,1187842.178,Durbin-Watson:,1.992
Prob(Omnibus):,0.0,Jarque-Bera (JB):,36797199597.953
Skew:,21.925,Prob(JB):,0.0
Kurtosis:,1310.766,Cond. No.,536.0


In [292]:
from statsmodels.formula.api import ols
from statsmodels.iolib.summary2 import summary_col

reg1 = ols('c10 ~ interdisciplinarity_reference + mean_grant_interdisciplinarity + avg_dist_btw_grants +\
            paper_fos', data=reg_df_standardized).fit()
reg2 = ols('c10 ~ num_authors + sum_funding_usd + num_grant + num_funding_country + num_institutes +\
            paper_fos', data=reg_df_standardized).fit()
reg3 = ols('c10 ~ interdisciplinarity_reference + mean_grant_interdisciplinarity + avg_dist_btw_grants +\
            num_authors + sum_funding_usd + num_grant + num_funding_country + num_institutes +\
            paper_fos', data=reg_df_standardized).fit()
reg4 = ols('c10 ~ interdisciplinarity_reference + mean_grant_interdisciplinarity + avg_dist_btw_grants +\
            num_authors + sum_funding_usd + num_grant + num_funding_country + num_institutes + \
            paper_fos + C(pub_year)', data=reg_df_standardized).fit()
reg5 = ols('c10 ~ interdisciplinarity_reference + mean_grant_interdisciplinarity + avg_dist_btw_grants +\
            num_authors + sum_funding_usd + num_grant + num_funding_country + num_institutes + \
            paper_fos + num_ref', data=reg_df_standardized).fit()
reg6 = ols('c10 ~ interdisciplinarity_reference + mean_grant_interdisciplinarity + avg_dist_btw_grants +\
            num_authors + sum_funding_usd + num_grant + num_funding_country + num_institutes + \
            paper_fos + C(pub_year) + num_ref', data=reg_df_standardized).fit()

results = summary_col([reg1, reg2, reg3, reg4, reg5, reg6], stars=True, float_format='%0.3f',
                      model_names=['Model\n(1)', 'Model\n(2)', 'Model\n(3)', 
                                   'Model\n(4)', 'Model\n(5)', 'Model\n(6)'],
                      info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
                                 'R2':lambda x: "{:.3f}".format(x.rsquared)})
print(results)


                                 Model     Model     Model     Model     Model     Model  
                                  (1)       (2)       (3)       (4)       (5)       (6)   
------------------------------------------------------------------------------------------
C(pub_year)[T.1986]                                          0.050               0.055    
                                                             (0.121)             (0.119)  
C(pub_year)[T.1987]                                          0.032               0.015    
                                                             (0.112)             (0.110)  
C(pub_year)[T.1988]                                          0.043               0.029    
                                                             (0.108)             (0.106)  
C(pub_year)[T.1989]                                          0.044               0.028    
                                                             (0.107)             (0.105) 

In [293]:
from statsmodels.formula.api import ols
from statsmodels.iolib.summary2 import summary_col

reg1 = ols('c10 ~ interdisciplinarity_reference + mean_grant_interdisciplinarity + avg_dist_btw_grants +\
            paper_fos + C(pub_year)', data=reg_df_standardized).fit()
reg2 = ols('c10 ~ num_authors + sum_funding_usd + num_grant + num_funding_country + num_institutes +\
            paper_fos + C(pub_year)', data=reg_df_standardized).fit()
reg3 = ols('c10 ~ interdisciplinarity_reference + mean_grant_interdisciplinarity + avg_dist_btw_grants +\
            num_authors + sum_funding_usd + num_grant + num_funding_country + num_institutes +\
            paper_fos + C(pub_year)', data=reg_df_standardized).fit()
reg4 = ols('c10 ~ interdisciplinarity_reference + mean_grant_interdisciplinarity + avg_dist_btw_grants +\
            num_authors + sum_funding_usd + num_grant + num_funding_country + num_institutes + \
            paper_fos + C(pub_year) + num_ref', data=reg_df_standardized).fit()

results = summary_col([reg1, reg2, reg3, reg4], stars=True, float_format='%0.3f',
                      model_names=['Model\n(1)', 'Model\n(2)', 'Model\n(3)', 'Model\n(4)'],
                      info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
                                 'R2':lambda x: "{:.3f}".format(x.rsquared)})
print(results)


                                 Model     Model     Model     Model  
                                  (1)       (2)       (3)       (4)   
----------------------------------------------------------------------
C(pub_year)[T.1986]            0.050     0.063     0.050     0.055    
                               (0.122)   (0.121)   (0.121)   (0.119)  
C(pub_year)[T.1987]            0.033     0.044     0.032     0.015    
                               (0.112)   (0.112)   (0.112)   (0.110)  
C(pub_year)[T.1988]            0.049     0.054     0.043     0.029    
                               (0.109)   (0.108)   (0.108)   (0.106)  
C(pub_year)[T.1989]            0.051     0.055     0.044     0.028    
                               (0.108)   (0.107)   (0.107)   (0.105)  
C(pub_year)[T.1990]            0.072     0.074     0.064     0.047    
                               (0.107)   (0.107)   (0.107)   (0.105)  
C(pub_year)[T.1991]            0.052     0.052     0.042     0.018    
     

In [240]:
reg6.summary()

0,1,2,3
Dep. Variable:,c10,R-squared:,0.056
Model:,OLS,Adj. R-squared:,0.056
Method:,Least Squares,F-statistic:,926.7
Date:,"Wed, 08 Nov 2023",Prob (F-statistic):,0.0
Time:,10:42:08,Log-Likelihood:,-717030.0
No. Observations:,515796,AIC:,1434000.0
Df Residuals:,515762,BIC:,1435000.0
Df Model:,33,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0743,0.104,-0.717,0.473,-0.277,0.129
C(pub_year)[T.1986],0.0482,0.120,0.403,0.687,-0.186,0.283
C(pub_year)[T.1987],0.0147,0.110,0.134,0.893,-0.201,0.231
C(pub_year)[T.1988],0.0304,0.107,0.285,0.776,-0.179,0.239
C(pub_year)[T.1989],0.0328,0.106,0.311,0.756,-0.174,0.240
C(pub_year)[T.1990],0.0525,0.105,0.499,0.618,-0.154,0.259
C(pub_year)[T.1991],0.0260,0.105,0.248,0.804,-0.179,0.231
C(pub_year)[T.1992],0.0552,0.105,0.527,0.598,-0.150,0.260
C(pub_year)[T.1993],0.0915,0.105,0.874,0.382,-0.114,0.297

0,1,2,3
Omnibus:,1184617.582,Durbin-Watson:,1.991
Prob(Omnibus):,0.0,Jarque-Bera (JB):,36079993823.726
Skew:,21.781,Prob(JB):,0.0
Kurtosis:,1297.954,Cond. No.,536.0


In [15]:
from statsmodels.formula.api import ols
from statsmodels.iolib.summary2 import summary_col

reg1 = ols('c10 ~ interdisciplinarity_reference + mean_grant_interdisciplinarity + avg_sim_btw_grants +\
            paper_fos + C(pub_year)', data=reg_df_standardized_w_sim).fit()
reg2 = ols('c10 ~ num_authors + sum_funding_usd + num_grant + num_funding_country + num_institutes +\
            paper_fos + C(pub_year)', data=reg_df_standardized_w_sim).fit()
reg3 = ols('c10 ~ interdisciplinarity_reference + mean_grant_interdisciplinarity + avg_sim_btw_grants +\
            num_authors + sum_funding_usd + num_grant + num_funding_country + num_institutes +\
            paper_fos + C(pub_year)', data=reg_df_standardized_w_sim).fit()
reg4 = ols('c10 ~ interdisciplinarity_reference + mean_grant_interdisciplinarity + avg_sim_btw_grants +\
            num_authors + sum_funding_usd + num_grant + num_funding_country + num_institutes + \
            paper_fos + C(pub_year) + num_ref', data=reg_df_standardized_w_sim).fit()

results = summary_col([reg1, reg2, reg3, reg4], stars=True, float_format='%0.3f',
                      model_names=['Model\n(1)', 'Model\n(2)', 'Model\n(3)', 'Model\n(4)'],
                      info_dict={'N':lambda x: "{0:d}".format(int(x.nobs)),
                                 'R2':lambda x: "{:.3f}".format(x.rsquared)})
print(results)


                                 Model     Model     Model     Model  
                                  (1)       (2)       (3)       (4)   
----------------------------------------------------------------------
C(pub_year)[T.1986]            0.050     0.063     0.050     0.055    
                               (0.122)   (0.121)   (0.121)   (0.119)  
C(pub_year)[T.1987]            0.033     0.044     0.032     0.015    
                               (0.112)   (0.112)   (0.112)   (0.110)  
C(pub_year)[T.1988]            0.049     0.054     0.043     0.029    
                               (0.109)   (0.108)   (0.108)   (0.106)  
C(pub_year)[T.1989]            0.051     0.055     0.044     0.028    
                               (0.108)   (0.107)   (0.107)   (0.105)  
C(pub_year)[T.1990]            0.072     0.074     0.064     0.047    
                               (0.107)   (0.107)   (0.107)   (0.105)  
C(pub_year)[T.1991]            0.052     0.052     0.042     0.018    
     

In [219]:
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor

y, X = dmatrices('norm_c10_year_fos ~ interdisciplinarity_reference + mean_grant_interdisciplinarity + avg_dist_btw_grants +\
                  num_authors + sum_funding_usd + num_grant + num_funding_country + num_institutes + \
                  C(pub_year)', data=reg_df_standardized, return_type='dataframe')

vif_df = pd.DataFrame()
vif_df['variable'] = X.columns 

vif_df['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

vif_df

Unnamed: 0,variable,VIF
0,Intercept,5862.163064
1,C(pub_year)[T.1986],3.975406
2,C(pub_year)[T.1987],8.705099
3,C(pub_year)[T.1988],17.485903
4,C(pub_year)[T.1989],25.913539
5,C(pub_year)[T.1990],33.305427
6,C(pub_year)[T.1991],42.493101
7,C(pub_year)[T.1992],47.819394
8,C(pub_year)[T.1993],49.607454
9,C(pub_year)[T.1994],53.537565


In [217]:
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor

y, X = dmatrices('norm_c10_year_fos ~ interdisciplinarity_reference + mean_grant_interdisciplinarity + avg_dist_btw_grants +\
                  num_authors + sum_funding_usd + num_grant + num_funding_country + num_institutes + \
                  C(pub_year) + num_ref', data=reg_df_standardized, return_type='dataframe')

vif_df = pd.DataFrame()
vif_df['variable'] = X.columns 

vif_df['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

vif_df

Unnamed: 0,variable,VIF
0,Intercept,5862.381767
1,C(pub_year)[T.1986],3.975407
2,C(pub_year)[T.1987],8.705109
3,C(pub_year)[T.1988],17.485917
4,C(pub_year)[T.1989],25.913572
5,C(pub_year)[T.1990],33.305474
6,C(pub_year)[T.1991],42.493247
7,C(pub_year)[T.1992],47.81956
8,C(pub_year)[T.1993],49.60775
9,C(pub_year)[T.1994],53.537908


In [218]:
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor

y, X = dmatrices('norm_c10_year_fos ~ interdisciplinarity_reference + mean_grant_interdisciplinarity + avg_dist_btw_grants +\
                  num_authors + sum_funding_usd + num_grant + num_funding_country + num_institutes + \
                  num_ref', data=reg_df_standardized, return_type='dataframe')

vif_df = pd.DataFrame()
vif_df['variable'] = X.columns 

vif_df['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

vif_df

Unnamed: 0,variable,VIF
0,Intercept,1.0
1,interdisciplinarity_reference,1.096247
2,mean_grant_interdisciplinarity,1.248104
3,avg_dist_btw_grants,1.160217
4,num_authors,1.03966
5,sum_funding_usd,1.175126
6,num_grant,1.707468
7,num_funding_country,1.021926
8,num_institutes,1.585343
9,num_ref,1.017684


In [298]:
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor

y, X = dmatrices('norm_c10_year_fos ~ interdisciplinarity_reference + mean_grant_interdisciplinarity + avg_dist_btw_grants +\
                  num_authors + sum_funding_usd + num_grant + num_funding_country + num_institutes + \
                  paper_fos + C(pub_year) + num_ref', data=reg_df_standardized, return_type='dataframe')

vif_df = pd.DataFrame()
vif_df['variable'] = X.columns 

vif_df['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

vif_df

Unnamed: 0,variable,VIF
0,Intercept,5891.322635
1,paper_fos[T.127313418.0],1.505703
2,paper_fos[T.127413603.0],1.146128
3,paper_fos[T.138885662.0],1.000551
4,paper_fos[T.142362112.0],1.000297
5,paper_fos[T.144024400.0],1.024318
6,paper_fos[T.144133560.0],1.00247
7,paper_fos[T.15744967.0],2.782259
8,paper_fos[T.162324750.0],1.049538
9,paper_fos[T.17744445.0],1.001054
