In [1]:
import mitoolspro as mtp
import pandas as pd
from pandas import DataFrame
from mitoolspro.project import Project
from mitoolspro.utils.objects import StringMapper
from mitoolspro import regressions

In [None]:
# Load the project (with auto_load=True) and the string mappings stored at the
# project's "string_map" path.
pr = Project.load(auto_load=True)
str_mapper = StringMapper.load_mappings(pr.get_path("string_map"))

In [3]:
def significances_map(x):
    """Translate a p-value into significance stars.

    ``x`` is converted with ``float``. The result is ``'***'`` for values
    up to 0.001, ``'**'`` up to 0.01, ``'*'`` up to 0.05, and an empty
    string for anything larger (or for NaN, which compares false).
    """
    p_value = float(x)
    for cutoff, stars in ((0.001, '***'), (0.01, '**'), (0.05, '*')):
        if p_value <= cutoff:
            return stars
    return ''

In [4]:
# Read the final dataset and index it by the observation identifiers.
data_name = pr.get_path("final_data")
data = (
    pd.read_parquet(data_name)
    .reset_index()
    .set_index(['Year', 'Country', 'Continent', 'Income Group', 'Current Income Group'])
)

In [None]:
# List the columns whose name contains ' SCI', leaving out the '_square' terms.
[col for col in data.columns if '_square' not in col and ' SCI' in col]

In [None]:
# Model settings.
regression_type, regression_degree = 'ols', 'linear'
transform = True
eci_tag = 'SCI'
levels_var, id_var = 'Income Group', 'Country'
dependent_variable = 'CO2 emissions (metric tons per capita)'

# Regressors: every column ending in ' <eci_tag>', plus its '_square'
# counterpart when a quadratic specification is requested, in sorted order.
independent_variables = sorted(
    term
    for column in data.columns
    if column.endswith(f' {eci_tag}')
    for term in ((f'{column}_square', column) if regression_degree == 'quadratic' else (column,))
)

control_variables = [
    'Economic Globalisation',
    'Energy Consumption per Capita',
    'Environmental Patents per Capita',
    'Renewable energy consumption (% of total final energy consumption)',
    'Total natural resources rents (% of GDP)',
    'Urban population (% of total population)',
]
income_levels = [
    'All income',
    'High income',
    'Upper middle income',
    'Lower middle income',
    'Low income',
]
independent_variables

In [7]:
def get_transformed_variable(data: DataFrame, var: str, transformation: str) -> str:
    """Return the column name to use for ``var`` under ``transformation``.

    Args:
        data: Frame whose columns are checked for the transformed name.
        var: Base variable name.
        transformation: ``"log"``, ``"square"``, ``"boxcox"``, or ``None``
            to keep the variable untransformed.

    Returns:
        ``f"{var}_{transformation}"`` for a supported transformation, or
        ``var`` itself when ``transformation`` is ``None``.

    Raises:
        ValueError: The transformed column is not present in ``data``.
        NotImplementedError: ``transformation`` is not a supported value.
    """
    if transformation in ("log", "square", "boxcox"):
        transformed = f"{var}_{transformation}"
        if transformed not in data.columns:
            raise ValueError(f"{transformed} not in data columns")
        return transformed
    if transformation is not None:
        raise NotImplementedError(f"{transformation} not implemented")
    return var

In [8]:
# Variables listed here are swapped for their transformed column; anything
# not listed is looked up with a transformation of None and kept as-is.
transformations = {'CO2 emissions (metric tons per capita)': 'log'}

if transform:
    dependent_variable = get_transformed_variable(
        data, dependent_variable, transformations.get(dependent_variable)
    )
    control_variables = [
        get_transformed_variable(data, control, transformations.get(control))
        for control in control_variables
    ]
control_variables.sort()

In [9]:
# Column order for the regression frame: regressors, controls, then the dependent variable.
variables = [*independent_variables, *control_variables, dependent_variable]

In [10]:
# Restrict the dataset to the columns used in the regressions.
reg_data = data[variables]

# Statistical Testing

In [None]:
# Build one complete-case sample per income level; 'All income' keeps every row.
test_data = {}
for income_level in income_levels:
    if income_level == 'All income':
        income_data = reg_data
    else:
        income_data = reg_data.loc[reg_data.index.get_level_values('Income Group') == income_level]
    income_data = income_data.dropna(axis=0, how='any')
    print(income_level, len(income_data))
    test_data[income_level] = income_data

In [None]:
# Count the distinct values in the 'Country' index level of the full dataset.
data.index.get_level_values('Country').nunique()

## Descriptive Statistics

In [13]:
def create_data_description(data, income_level=None):
    """Build a descriptive-statistics table with one row per numeric column.

    Args:
        data: DataFrame to summarise.
        income_level: Optional label. When truthy, the result's columns
            become a two-level MultiIndex ``(income_level, statistic)``.

    Returns:
        DataFrame with the columns "N° Observations", "Mean", "Std. Dev.",
        "Min", "Median", "Max", "Kurtosis" and "Skewness". The observation
        count is cast to int; every other statistic is rounded to 7 decimals.
    """
    data_stats = data.describe(percentiles=[0.5]).T
    data_stats.columns = [
        "N° Observations",
        "Mean",
        "Std. Dev.",
        "Min",
        "Median",
        "Max",
    ]
    # describe() leaves non-numeric columns out of a mixed frame; restrict the
    # higher moments the same way so a stray text column does not raise
    # (pandas >= 2.0 no longer drops such columns silently in reductions).
    data_stats["Kurtosis"] = data.kurtosis(numeric_only=True)
    data_stats["Skewness"] = data.skew(numeric_only=True)
    data_stats["N° Observations"] = data_stats["N° Observations"].astype(int)
    numeric_cols = [c for c in data_stats.columns if c != "N° Observations"]
    data_stats[numeric_cols] = data_stats[numeric_cols].round(7)
    if income_level:
        data_stats.columns = pd.MultiIndex.from_product(
            [[income_level], data_stats.columns]
        )

    return data_stats

In [None]:
# Describe the pooled 'All income' sample, write the table to the project's
# 'descriptive_statistics' path as Excel, and display it.
all_income_stats = create_data_description(test_data['All income'], 'All income')
all_income_stats.to_excel(pr.get_path('descriptive_statistics'))
all_income_stats

In [None]:
# Print the pooled statistics through dataframe_to_latex; values are rounded
# to 3 decimals and cast to str before the call.
print(mtp.pandas_utils.dataframe_to_latex(all_income_stats.round(3).astype(str)))

In [16]:
# Describe each income group separately, prefix the row index with the group
# label, stack the tables, and write them to Excel.
incomes_stats = []
for income_level in ('High income', 'Upper middle income', 'Lower middle income', 'Low income'):
    income_stats = create_data_description(test_data[income_level])
    income_stats.index = pd.MultiIndex.from_product([[income_level], list(income_stats.index)])
    incomes_stats.append(income_stats)
incomes_stats = pd.concat(incomes_stats)
incomes_stats.to_excel(pr.get_path('descriptive_statistics_by_income'))

In [None]:
# Print the per-income statistics through dataframe_to_latex; values are
# rounded to 3 decimals and cast to str before the call.
print(mtp.pandas_utils.dataframe_to_latex(incomes_stats.round(3).astype(str)))

#### Shapiro-Wilk Test

In [None]:
# Run the Shapiro-Wilk helper on every income-level sample; the dict keys
# (income levels) become the outer index level of the concatenated frame.
shapiro_tests = pd.concat({k: regressions.statistical_tests.shapiro_tests(v) for k, v in test_data.items()})
shapiro_tests.columns = pd.MultiIndex.from_tuples([('Shapiro-Wilk', col) for col in shapiro_tests.columns])
# Move the income level into the columns and order the levels as
# (income, test, statistic).
shapiro_tests = shapiro_tests.unstack(0).reorder_levels([2, 0, 1], axis=1)
# Keep 'statistic' and 'p-value' exactly once per (income, test) pair.
# Looping over every existing column would request each key once per
# statistic column and return duplicated columns, so the pairs are
# de-duplicated (in first-seen order) before expanding them.
shapiro_tests = shapiro_tests[
    [
        (income, test, stat)
        for income, test in dict.fromkeys(col[:2] for col in shapiro_tests.columns)
        for stat in ['statistic', 'p-value']
    ]
]
stat_names = {
    'statistic': 'Statistic',
    'p-value': 'p-value'
}
# Replace the raw statistic names with their display labels.
shapiro_tests.columns = pd.MultiIndex.from_tuples([(c[0], c[1], stat_names.get(c[-1], c[-1])) for c in shapiro_tests.columns])
shapiro_tests.index.name = ""

shapiro_tests

In [None]:
# Print the Shapiro-Wilk table through dataframe_to_latex.
print(mtp.pandas_utils.dataframe_to_latex(shapiro_tests))

#### Anderson-Darling Test

In [None]:
# Run the Anderson-Darling helper (criteria=0.01) on every income-level
# sample; the dict keys (income levels) become the outer index level.
anderson_tests = pd.concat({k: regressions.statistical_tests.anderson_tests(v, criteria=0.01) for k, v in test_data.items()})
anderson_tests.columns = pd.MultiIndex.from_tuples([('Anderson-Darling', col) for col in anderson_tests.columns])
# Move the income level into the columns and order the levels as
# (income, test, statistic).
anderson_tests = anderson_tests.unstack(0).reorder_levels([2, 0, 1], axis=1)
# Keep 'statistic' and 'critical_value' exactly once per (income, test) pair.
# Looping over every existing column would request each key once per
# statistic column and return duplicated columns, so the pairs are
# de-duplicated (in first-seen order) before expanding them.
anderson_tests = anderson_tests[
    [
        (income, test, stat)
        for income, test in dict.fromkeys(col[:2] for col in anderson_tests.columns)
        for stat in ['statistic', 'critical_value']
    ]
]
stat_names = {
    'statistic': 'Statistic',
    'critical_value': 'Critical Value'
}
# Replace the raw statistic names with their display labels.
anderson_tests.columns = pd.MultiIndex.from_tuples([(c[0], c[1], stat_names.get(c[-1], c[-1])) for c in anderson_tests.columns])
anderson_tests.index.name = ""

anderson_tests

In [None]:
# Print the Anderson-Darling table through dataframe_to_latex.
print(mtp.pandas_utils.dataframe_to_latex(anderson_tests))

In [None]:
# Put both normality tests side by side, rounded to two decimals.
skewdness_tests = pd.concat([shapiro_tests, anderson_tests], axis=1).round(2)

# Column layout: for each income level, Anderson-Darling first, then Shapiro-Wilk.
income_levels = ["All income", "High income", "Upper middle income", "Lower middle income", "Low income"]
statistical_tests = [
    ['Anderson-Darling', 'Statistic'],
    ['Anderson-Darling', 'Critical Value'],
    ['Shapiro-Wilk', 'Statistic'],
    ['Shapiro-Wilk', 'p-value'],
]
sorted_cols = pd.MultiIndex.from_tuples(
    [
        (income, test_name, stat_name)
        for income in income_levels
        for test_name, stat_name in statistical_tests
    ]
)
skewdness_tests = skewdness_tests[sorted_cols]
skewdness_tests.to_excel(pr.get_path('skewdness_tests'))
skewdness_tests

In [None]:
# Print the combined normality table through dataframe_to_latex, with every
# value cast to str first.
print(mtp.pandas_utils.dataframe_to_latex(skewdness_tests.astype(str)))

#### Augmented Dickey-Fuller Test

In [None]:
# Run the ADF helper (critical_value=5, regression='ctt') on every
# income-level sample and round the results to two decimals.
adf_tests = pd.concat(
    {k: regressions.statistical_tests.adf_tests(v, critical_value=5, regression='ctt') for k, v in test_data.items()}
).round(2)

# Display labels for the helper's output columns.
stat_names = {
    'statistic': 'Statistic',
    'p-value': 'p-value',
    "critical_value_5%": 'Critical Value (5%)'
}
adf_tests.columns = pd.MultiIndex.from_tuples(
    [('ADF-Test', stat_names.get(col, col)) for col in adf_tests.columns]
)
# Move the income level into the columns -> (income, test, statistic).
adf_tests = adf_tests.unstack(0).reorder_levels([2, 0, 1], axis=1)

# Column layout: statistic, p-value and critical value for each income level.
income_levels = ["All income", "High income", "Upper middle income", "Lower middle income", "Low income"]
statistical_tests = [
    ['ADF-Test', 'Statistic'],
    ['ADF-Test', 'p-value'],
    ['ADF-Test', 'Critical Value (5%)'],
]
sorted_cols = pd.MultiIndex.from_tuples(
    [
        (income, test_name, stat_name)
        for income in income_levels
        for test_name, stat_name in statistical_tests
    ]
)

adf_tests.index.name = ""
adf_tests = adf_tests[sorted_cols]
adf_tests.to_excel(pr.get_path('adf_tests'))
adf_tests

In [None]:
# Print the ADF table through dataframe_to_latex.
print(mtp.pandas_utils.dataframe_to_latex(adf_tests))

#### Variance Inflation Factor

In [None]:
# Compute VIFs for every income-level sample. The dependent variable comes
# from the configured `dependent_variable` (already resolved to its
# transformed column when `transform` is on) rather than a hard-coded name,
# so the cell keeps working if the transformation settings change.
vif_tests = pd.concat({k: regressions.statistical_tests.calculate_vif(v, dependent_variable=dependent_variable, threshold=10) for k, v in test_data.items()})
vif_tests.columns = pd.MultiIndex.from_tuples([('VIF', col) for col in vif_tests.columns])
# Move the income level into the columns -> ('VIF', income, helper column).
vif_tests = vif_tests.unstack(0).reorder_levels([0, 2, 1], axis=1)
# Keep only the helper's "VIF" column and drop that now-redundant last level.
vif_tests = vif_tests.loc[:, vif_tests.columns.get_level_values(-1) == "VIF"].droplevel(-1, axis=1)
vif_tests.index.name = ""
vif_tests.to_excel(pr.get_path('vif_tests'))
vif_tests

In [None]:
# Print the VIF table through dataframe_to_latex.
print(mtp.pandas_utils.dataframe_to_latex(vif_tests))

#### Breusch-Pagan Test

In [None]:
# Run the Breusch-Pagan helper on every income-level sample. The dependent
# variable comes from the configured `dependent_variable` (already resolved
# to its transformed column when `transform` is on) rather than a hard-coded
# name, so the cell keeps working if the transformation settings change.
bp_tests = pd.concat({k: regressions.statistical_tests.breusch_pagan_test(v, dependent_variable) for k, v in test_data.items()})
bp_tests.columns = pd.MultiIndex.from_tuples([('Breusch-Pagan Test', col) for col in bp_tests.columns])
bp_tests.to_excel(pr.get_path('bp_tests'))
bp_tests

In [None]:
# Print the Breusch-Pagan table through dataframe_to_latex.
print(mtp.pandas_utils.dataframe_to_latex(bp_tests))

#### White Test

In [None]:
# Run the White helper on every income-level sample. The dependent variable
# comes from the configured `dependent_variable` (already resolved to its
# transformed column when `transform` is on) rather than a hard-coded name,
# so the cell keeps working if the transformation settings change.
white_tests = pd.concat({k: regressions.statistical_tests.white_test(v, dependent_variable) for k, v in test_data.items()})
white_tests.columns = pd.MultiIndex.from_tuples([('White Test', col) for col in white_tests.columns])
white_tests.index.name = ""
white_tests.to_excel(pr.get_path('white_tests'))
white_tests

In [None]:
# Print the White test table through dataframe_to_latex.
print(mtp.pandas_utils.dataframe_to_latex(white_tests))

***