In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

# from pandas_datareader.data import DataReader

In [None]:
# Load the Penn World Table 10.01 release from its Stata file
pwt = pd.read_stata('Dane/RnD/pwt1001.dta')

# Keep the distinct country codes and country names around for later use
countrycodes = pwt['countrycode'].drop_duplicates()
countries = pwt['country'].drop_duplicates()

# Index the panel by (countrycode, year) so single years can be sliced out with .xs
pwt = pwt.set_index(['countrycode', 'year'])

pwt.tail()

In [None]:
# 2019 cross-section of the panel, sliced once and reused for every series below
pwt_2019 = pwt.xs(2019, level='year')

# Real GDP (rgdpe) divided by employment (emp): this is GDP per WORKER, not per capita
gdppw = pwt_2019['rgdpe'] / pwt_2019['emp']
pop = pwt_2019['pop']
tfp = pwt_2019['ctfp']
wtfp = pwt_2019['cwtfp']

In [None]:
pub_d = pd.read_csv('Dane/RnD/scientific-publications-per-million.csv')

# Coerce every column except the country code to numeric; anything that cannot
# be parsed becomes NaN, so the per-country means below are always defined
cols = pub_d.columns.drop("Code")
pub_d[cols] = pub_d[cols].apply(pd.to_numeric, errors="coerce")

# Mean of every remaining column for each country code (aggregated exactly once)
pub = pub_d.groupby("Code").mean()

In [None]:
rnd_d = pd.read_csv('Dane/RnD/research-and-development-expenditure-of-gdp.csv')

# Coerce every column except the country code to numeric; anything that cannot
# be parsed becomes NaN, so the per-country means below are always defined
cols = rnd_d.columns.drop("Code")
rnd_d[cols] = rnd_d[cols].apply(pd.to_numeric, errors="coerce")

# Mean of every remaining column for each country code (aggregated exactly once)
rnd = rnd_d.groupby("Code").mean()

In [None]:
pat_d = pd.read_csv('Dane/RnD/patent-applications-per-million.csv')

# Coerce every column except the country code to numeric; anything that cannot
# be parsed becomes NaN, so the per-country means below are always defined
cols = pat_d.columns.drop("Code")
pat_d[cols] = pat_d[cols].apply(pd.to_numeric, errors="coerce")

# Mean of every remaining column for each country code (aggregated exactly once)
pat = pat_d.groupby("Code").mean()

In [None]:
res_d = pd.read_csv('Dane/RnD/researchers-in-rd-per-million-people.csv')

# Coerce every column except the country code to numeric; anything that cannot
# be parsed becomes NaN, so the per-country means below are always defined
cols = res_d.columns.drop("Code")
res_d[cols] = res_d[cols].apply(pd.to_numeric, errors="coerce")

# Mean of every remaining column for each country code (aggregated exactly once)
res = res_d.groupby("Code").mean()

In [None]:
# Last column of each per-country table, keyed by the short name used downstream.
# The R&D series is multiplied by 100 before being stored.
latest = {
    'pub': pub.iloc[:, -1],
    'rnd': 100 * rnd.iloc[:, -1],
    'pat': pat.iloc[:, -1],
    'res': res.iloc[:, -1],
}

# Columns are added one at a time to an initially empty frame: the first one
# ('pub') fixes the row index and the others are aligned to it.
rnd_all = pd.DataFrame()
for label, series in latest.items():
    rnd_all[label] = series

In [None]:
# Show the combined indicator table (rows are country codes, columns pub/rnd/pat/res)
rnd_all

In [None]:
# Pairwise correlation matrix of the four indicators in levels
rnd_all.corr()

In [None]:
# Scatter-plot matrix of the four indicators in levels
pd.plotting.scatter_matrix(rnd_all) # optional: diagonal='kde' for density curves on the diagonal
plt.show()

In [None]:
# Pairwise correlation matrix of the indicators after taking natural logs
np.log(rnd_all).corr()

In [None]:
# Scatter-plot matrix of the log-transformed indicators
pd.plotting.scatter_matrix(np.log(rnd_all))
plt.show()

In [None]:
# Same log scatter-plot matrix, with kernel density estimates on the diagonal;
# the trailing semicolon suppresses the textual repr of the returned axes
pd.plotting.scatter_matrix(np.log(rnd_all), diagonal='kde');
# plt.show()

In [None]:
import seaborn as sns
# Regression plot of log patent applications ('pat') on log R&D expenditure ('rnd')
sns.lmplot(x='rnd', y='pat', data=np.log(rnd_all))
plt.show()

In [None]:
# Logs are taken once and reused for both the regression plot and the labels
log_rnd = np.log(rnd_all)

# Marker size 0 hides the scatter points so the country codes act as the markers
fgrid = sns.lmplot(x='res', y='pub', data=log_rnd, scatter_kws={"s": 0})
ax = fgrid.axes[0, 0]  # fgrid.axes is an array of all axes in the figure, so we index it

# Label only countries whose two coordinates are both finite; a NaN/inf point
# has no position on the axes, so there is nothing meaningful to draw for it
labelled = log_rnd[['res', 'pub']]
labelled = labelled[np.isfinite(labelled).all(axis=1)]
for country, (log_res, log_pub) in zip(labelled.index, labelled.to_numpy()):
    ax.annotate(country, (log_res, log_pub),
                ha='center', va='center', fontsize=6)

# Re-draw one country of interest in a larger, coloured font on top of its small
# label; skipped (instead of raising KeyError) when that country is unavailable
highlight = 'POL'
if highlight in labelled.index:
    ax.annotate(highlight, tuple(labelled.loc[highlight]),
                ha='center', va='center', fontsize=10, color='C3')

plt.show()

# plt.savefig('test.pdf')  # if enabled, move above plt.show() so the drawn figure is saved

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Log-transform the indicators, keep complete rows only and exclude 'VEN'
# NOTE(review): the reason for excluding VEN is not stated here - presumably an outlier; confirm
x = np.log(rnd_all).dropna().drop('VEN')

# Fit a PCA with all components to see how the variance is spread across them
pca = PCA().fit(x)
print(pca.explained_variance_ratio_)

# Plot against 1..k (not the default 0-based position) so that the x-axis
# really is the "number of components" its label announces
cumulative_share = np.cumsum(pca.explained_variance_ratio_)
n_components = np.arange(1, len(cumulative_share) + 1)
plt.plot(n_components, cumulative_share)
plt.xticks(n_components)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')
plt.show()

In [None]:
# Project the log indicators onto their first two principal components
pca = PCA(n_components=2)
pca_res = pca.fit_transform(x)
print(pca.explained_variance_ratio_)

# Wrap the scores in a frame that carries the same row labels as x
pca_df = pd.DataFrame(pca_res, index=x.index, columns=['PC1', 'PC2'])
pca_df

In [None]:
# plt.scatter(pca_df['PC1'], pca_df['PC2']);

In [None]:
# Attach the first principal component scores to x (both share the same index).
# NOTE(review): this mutates x, the frame the PCA cell fits on - re-running that
# cell after this one would feed the 'pca' column back into the PCA.
x['pca'] = pca_df['PC1']

x

In [None]:
# Pairwise correlation matrix of the log indicators and the first principal component
x.corr()

In [None]:
# Scatter-plot matrix of every column currently in x, with density curves on the diagonal
pd.plotting.scatter_matrix(x, diagonal='kde')
plt.show()

In [None]:
# Add the PWT series (aligned on the country-code index), then keep only the
# rows for which every column is available
x = x.assign(
    l_tfp=np.log(tfp),
    tfp=tfp,
    gdppw=np.log(gdppw),  # stored in logs despite the plain column name
    pop=pop,
).dropna()

In [None]:
# Scatter-plot matrix of all columns except the last one ('pop', which was added last)
pd.plotting.scatter_matrix(x.iloc[:, :-1], diagonal='kde', figsize=(10, 10))
plt.show()

In [None]:
# Pairwise correlation matrix of all columns except the last one ('pop')
x.iloc[:, :-1].corr()

In [None]:
# Regression plot of the 'gdppw' column (log GDP per worker) on the first principal component
sns.lmplot(x='pca', y='gdppw', data=x)
plt.show()

In [None]:
# Regression plot of log TFP ('l_tfp') on the first principal component
sns.lmplot(x='pca', y='l_tfp', data=x)
plt.show()

In [None]:
# from matplotlib import rc, rcParams

# rc('font', family='serif')
# rc('text', usetex=True)

# rcParams['axes.autolimit_mode'] = 'round_numbers'
# rcParams['axes.xmargin'] = 0
# rcParams['axes.ymargin'] = 0

# rcParams['xtick.direction'] = 'in'
# rcParams['ytick.direction'] = 'in'
# rcParams['xtick.top'] = True
# rcParams['ytick.right'] = True

# rcParams['legend.frameon'] = False

In [None]:
# plt.subplots(figsize = (5, 3))

# plt.scatter(x['pca'], x['gdppw'], s=x['pop'], alpha=0.5);

# plt.xlabel('Main PCA component of innovative activity')
# plt.ylabel('Log of real GDP per worker in 2019') # (2017 \$)

# plt.savefig('GDP_RnD.pdf', transparent=True)

In [None]:
fig, ax = plt.subplots(figsize=(8, 4))

# One NaN-free frame feeds the regression, the marker sizes and the labels,
# so all three always describe the same set of countries
plot_data = x.dropna()

# Marker size is taken directly from the 'pop' column.
# Untried alternatives kept for reference: lowess=True, order=3, robust=True,
# line_kws={'color': 'C1'}
sns.regplot(ax=ax, x='pca', y='gdppw', data=plot_data,
            scatter_kws={'s': plot_data['pop'], 'alpha': 0.25})

# Write each country code at its data point
for country, pc1, log_gdppw in zip(plot_data.index, plot_data['pca'], plot_data['gdppw']):
    ax.annotate(country, (pc1, log_gdppw),
                ha='center', va='center', fontsize=6)

ax.set_xlabel('Main PCA component of innovative activity')
ax.set_ylabel('Log of real GDP per worker in 2019 (2017 $)')

# Axis limits are fixed by hand for this figure
ax.set_xlim(-8, 7)
ax.set_ylim(7, 13)

plt.show()

# plt.savefig('GDP_RnD_regplot.pdf', bbox_inches='tight', pad_inches=0.05)