In [2]:
import matplotlib.pyplot as plt
import numpy as np
import scipy.linalg
import pickle
import pandas as pd
import sklearn 
import sys
import pandas as pd
import os
# Resolve project directories from the REPO_DIR environment variable, which
# must name the directory that contains the "code/" and "data/" folders.
repo_dir = os.environ.get("REPO_DIR")
if repo_dir is None:
    # Fail fast with an actionable message; os.path.join(None, ...) would
    # otherwise raise an opaque TypeError.
    raise RuntimeError(
        "The REPO_DIR environment variable is not set; "
        "set it to the directory containing 'code/' and 'data/' before running this notebook."
    )
code_dir = os.path.join(repo_dir, "code/")
data_dir = os.path.join(repo_dir, "data/")
# Work from the code directory for the rest of the notebook.
os.chdir(code_dir)


In [3]:
def df_to_demeaned_y_vars(task,
                          df,
                          method="calc",
                          log_before_diff=False):
    """Return the ``task`` column with its per-country mean removed.

    Rows are grouped by the ``ISO_Code`` column of ``df`` and the group mean
    of ``task`` is subtracted from every row of that group. ``df`` itself is
    not modified.

    Parameters
    ----------
    task : str
        Name of the column in ``df`` to demean.
    df : pandas.DataFrame
        Frame containing ``task`` and an ``ISO_Code`` grouping column.
    method : str, default "calc"
        How the country means are obtained. Only ``"calc"`` (compute them
        from ``df``) is implemented.
    log_before_diff : bool, default False
        If True, return ``log(y) - log(country mean of y)`` instead of
        ``y - country mean of y``. Note the mean is taken before the log.

    Returns
    -------
    pandas.Series
        Series named ``"demeaned_y_true"`` sharing the index of ``df``.

    Raises
    ------
    NotImplementedError
        If ``method`` is anything other than ``"calc"``.
    KeyError
        If ``task`` or ``ISO_Code`` is not a column of ``df``.
    """
    # Validate before doing any work. NotImplementedError is still an
    # Exception subclass, so existing `except Exception` handlers keep working.
    if method != "calc":
        raise NotImplementedError("NotImplemented  - Invalid method input")

    for required in (task, "ISO_Code"):
        if required not in df.columns:
            raise KeyError("Column not found in df: {!r}".format(required))

    y_true = df[task]
    # transform("mean") broadcasts each country's mean back onto its rows and
    # keeps df's index, so no copy/merge of the whole frame is needed.
    y_bar_country = df.groupby("ISO_Code")[task].transform("mean")

    if log_before_diff:
        demeaned = np.log(y_true) - np.log(y_bar_country)
    else:
        demeaned = y_true - y_bar_country

    return demeaned.rename("demeaned_y_true")

In [4]:
# Load the cleaned indicator table from the data directory.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
df = pd.read_pickle(
    os.path.join(data_dir, "int/GDL_HDI/HDI_indicators_and_indices_clean.p")
)

In [5]:
# Indicator columns whose pairwise correlations are tabulated below.
tasks = [
    "Sub-national HDI",
    "Life expectancy",
    "Mean years schooling",
    "Expected years schooling",
    "GNI per capita in thousands of US$ (2011 PPP)",
]

# Keep the country code next to the indicators; it is needed later for the
# within-country demeaning.
mat_df = df[[*tasks, "ISO_Code"]]

In [6]:
# Shorten the two longest column names so the tables stay readable.
mat_df = mat_df.rename(
    columns={
        "Sub-national HDI": "HDI",
        "GNI per capita in thousands of US$ (2011 PPP)": "GNIpc",
    }
)

In [7]:
# Set the country codes aside (removing them from mat_df) so that only the
# indicator columns enter the correlation; they are re-attached further down.
mat_df_countries = mat_df.pop("ISO_Code")

# Pairwise correlations, labelled by the indicator names on both axes.
corr = mat_df.corr()
corr.index = corr.columns = mat_df.columns

# Square each coefficient to report r^2 instead of r.
corr = corr.pow(2)

In [8]:
# Print the squared-correlation matrix as LaTeX, rounded to two decimals,
# with any NaN entries rendered as empty cells.
print(corr.round(2).replace(np.nan, "").to_latex())

\begin{tabular}{lrrrrr}
\toprule
{} &   HDI &  Life expectancy &  Mean years schooling &  Expected years schooling &  GNIpc \\
\midrule
HDI                      &  1.00 &             0.79 &                  0.84 &                      0.83 &   0.63 \\
Life expectancy          &  0.79 &             1.00 &                  0.54 &                      0.60 &   0.44 \\
Mean years schooling     &  0.84 &             0.54 &                  1.00 &                      0.62 &   0.51 \\
Expected years schooling &  0.83 &             0.60 &                  0.62 &                      1.00 &   0.46 \\
GNIpc                    &  0.63 &             0.44 &                  0.51 &                      0.46 &   1.00 \\
\bottomrule
\end{tabular}



In [9]:

# Blank out the diagonal and the upper triangle so that each pair of
# indicators appears only once (in the lower triangle).
mask = np.triu(np.ones(corr.shape, dtype=bool))
corr[mask] = np.nan

# Styled view of the lower triangle. `Styler.set_precision` and the
# `null_color=` keyword were removed in pandas 2.0, so the colour is passed
# positionally and the precision is set through `format`, both of which work
# on old and new pandas alike.
# NOTE(review): the values are squared correlations (within [0, 1]) while the
# gradient spans [-1, 1]; confirm whether vmin=0 was intended.
a = (corr
 .style
 .background_gradient(axis=None, vmin=-1, vmax=1)
 .highlight_null('#f1f1f1')  # Color NaNs grey
 .format("{:.2f}"))

In [10]:
# Round to two decimals; this rebinds `corr` to a new frame, so the Styler
# built earlier keeps pointing at the unrounded data.
corr = corr.round(2)

In [11]:
# Replace the masked (NaN) cells with empty strings for printing. Cast to
# object first: writing a string in place into float64 columns is a silent
# upcast that pandas 2.1+ deprecates.
corr = corr.astype(object).where(corr.notnull(), "")

In [12]:
# Display the rounded lower-triangle table of squared correlations.
corr

Unnamed: 0,HDI,Life expectancy,Mean years schooling,Expected years schooling,GNIpc
HDI,,,,,
Life expectancy,0.79,,,,
Mean years schooling,0.84,0.54,,,
Expected years schooling,0.83,0.6,0.62,,
GNIpc,0.63,0.44,0.51,0.46,


In [13]:
# Print the lower-triangle table as LaTeX source.
print(corr.to_latex())

\begin{tabular}{llllll}
\toprule
{} &   HDI & Life expectancy & Mean years schooling & Expected years schooling & GNIpc \\
\midrule
HDI                      &       &                 &                      &                          &       \\
Life expectancy          &  0.79 &                 &                      &                          &       \\
Mean years schooling     &  0.84 &            0.54 &                      &                          &       \\
Expected years schooling &  0.83 &             0.6 &                 0.62 &                          &       \\
GNIpc                    &  0.63 &            0.44 &                 0.51 &                     0.46 &       \\
\bottomrule
\end{tabular}



In [14]:
# Display the colour-graded Styler built from the masked squared correlations.
a

Unnamed: 0,HDI,Life expectancy,Mean years schooling,Expected years schooling,GNIpc
HDI,,,,,
Life expectancy,0.79,,,,
Mean years schooling,0.84,0.54,,,
Expected years schooling,0.83,0.6,0.62,,
GNIpc,0.63,0.44,0.51,0.46,


In [15]:
# Re-attach the country codes; the demeaning helper groups on ISO_Code.
mat_df["ISO_Code"] = mat_df_countries

# Replace every indicator column with its within-country (demeaned) version.
# Columns are chosen by name instead of relying on ISO_Code being the last
# column, and columns that already carry the prefix are skipped so that
# re-running this cell does not demean and re-prefix them a second time.
for task in [
    col for col in mat_df.columns
    if col != "ISO_Code" and not col.startswith("Within-ADM0 ")
]:
    mat_df["Within-ADM0 " + task] = df_to_demeaned_y_vars(task, mat_df, method="calc")
    mat_df.pop(task)

In [16]:
# Remove the country codes again so only the demeaned indicators are correlated.
mat_df_countries = mat_df.pop("ISO_Code")

# Pairwise correlations of the within-country series, labelled on both axes.
corr = mat_df.corr()
corr.index = corr.columns = mat_df.columns

# Square each coefficient to report r^2 instead of r.
corr = corr.pow(2)

In [17]:

# Blank out the diagonal and the upper triangle so that each pair of
# indicators appears only once (in the lower triangle).
mask = np.triu(np.ones(corr.shape, dtype=bool))
corr[mask] = np.nan

# Styled view of the lower triangle. `Styler.set_precision` and the
# `null_color=` keyword were removed in pandas 2.0, so the colour is passed
# positionally and the precision is set through `format`, both of which work
# on old and new pandas alike.
# NOTE(review): the values are squared correlations (within [0, 1]) while the
# gradient spans [-1, 1]; confirm whether vmin=0 was intended.
b=(corr
 .style
 .background_gradient(axis=None, vmin=-1, vmax=1)
 .highlight_null('#f1f1f1')  # Color NaNs grey
 .format("{:.2f}"))

In [18]:
# Round to two decimals (rebinding `corr`, so the Styler built earlier keeps
# the unrounded data), then replace the masked (NaN) cells with empty strings.
# Cast to object first: writing a string in place into float64 columns is a
# silent upcast that pandas 2.1+ deprecates.
corr = corr.round(2)
corr = corr.astype(object).where(corr.notnull(), "")

In [19]:
# Display the rounded lower-triangle table of within-country squared correlations.
corr

Unnamed: 0,Within-ADM0 HDI,Within-ADM0 Life expectancy,Within-ADM0 Mean years schooling,Within-ADM0 Expected years schooling,Within-ADM0 GNIpc
Within-ADM0 HDI,,,,,
Within-ADM0 Life expectancy,0.32,,,,
Within-ADM0 Mean years schooling,0.83,0.14,,,
Within-ADM0 Expected years schooling,0.65,0.11,0.46,,
Within-ADM0 GNIpc,0.2,0.04,0.11,0.1,


In [20]:
# Print the within-country lower-triangle table as LaTeX source.
print(corr.to_latex())

\begin{tabular}{llllll}
\toprule
{} & Within-ADM0 HDI & Within-ADM0 Life expectancy & Within-ADM0 Mean years schooling & Within-ADM0 Expected years schooling & Within-ADM0 GNIpc \\
\midrule
Within-ADM0 HDI                      &                 &                             &                                  &                                      &                   \\
Within-ADM0 Life expectancy          &            0.32 &                             &                                  &                                      &                   \\
Within-ADM0 Mean years schooling     &            0.83 &                        0.14 &                                  &                                      &                   \\
Within-ADM0 Expected years schooling &            0.65 &                        0.11 &                             0.46 &                                      &                   \\
Within-ADM0 GNIpc                    &             0.2 &                        0.