In [None]:
# Imports

import os
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import skew
import networkx as nx
import plotly.subplots as sp
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import seaborn as sns

In [None]:
# Download the CO2 / greenhouse-gas dataset published by Our World In Data.
#   Landing page: https://ourworldindata.org/co2-and-other-greenhouse-gas-emissions
#   Repository:   https://github.com/owid/co2-data

url = 'https://nyc3.digitaloceanspaces.com/owid-public/data/co2/owid-co2-data.csv'
df_co2 = pd.read_csv(filepath_or_buffer=url)

### Let's start with gathering historic data for each country each year 
#### Panel data

In [None]:
# Keep the identifier columns plus the three measures used downstream, and
# expose the `iso_code` column under the name `iso3`.

df_co2_historic = (
    df_co2[['country', 'year', 'iso_code', 'population', 'co2', 'gdp']]
    .copy()
    .rename(columns={'iso_code': 'iso3'})
)

In [None]:
# Convert total CO2 to tonnes and derive per-capita CO2 and per-capita GDP.
#
# The scaled `co2` column is always derived from the untouched `df_co2` frame
# (aligned on the shared row index) rather than from `df_co2_historic['co2']`
# itself, so re-running this cell does not multiply the values a second time.

TONNES_PER_MILLION_TONNES = 1_000_000  # scale factor so that `co2` ends up in tonnes

df_co2_historic['co2'] = df_co2['co2'] * TONNES_PER_MILLION_TONNES
df_co2_historic['co2_per_capita'] = df_co2_historic['co2'] / df_co2_historic['population']
df_co2_historic['gdp_per_capita'] = df_co2_historic['gdp'] / df_co2_historic['population']

In [None]:
# Summary statistics of the numeric columns, as a quick sanity check of the derived values.
df_co2_historic.describe()

### Extracting the QOG data to get values for climate policies and climate change damage.
#### Reminder for what the values represent:

| Explanation | QOG variable name | Our variable name |
| --- | ----------- | ----------- |
| Number of people affected by natural disasters | emdat_ntotaff | people_affected |
| Total Climate Change Laws implemented |ccl_nlp | cc_laws_tot |
| Climate Change Laws implemented per year |ccl_lpp | cc_laws_year |

In [None]:
# Load the prepared QoG / Polity extract from the course repository.
url_qog = 'https://raw.githubusercontent.com/edoardochiarotti/class_datascience/main/Notebooks/Assignment/individual_assignment/clean_data_prepared_EDA/df_qog_polity_merged.csv'
df_qog = pd.read_csv(filepath_or_buffer=url_qog)

In [None]:
# Subset the QoG frame to the climate variables and give them readable names:
#   emdat_ntotaff -> people_affected  (people affected by natural disasters, per year)
#   ccl_nlp       -> cc_laws_tot      (accumulated number of climate change laws)
#   ccl_lpp       -> cc_laws_year     (climate change laws implemented that year)

df_climate = (
    df_qog[['country', 'year', 'iso3', 'emdat_ntotaff', 'ccl_nlp', 'ccl_lpp']]
    .rename(columns={'emdat_ntotaff': 'people_affected',
                     'ccl_nlp': 'cc_laws_tot',
                     'ccl_lpp': 'cc_laws_year'})
)

# Align country names with the CO2 dataset, because `country` is one of the
# merge keys later on. The replacement is restricted to the `country` column
# so that no other column can be touched by accident.
# ("United Kingdom of Great Britain and Northern Ireland" is simply the long
# form of "United Kingdom"; the Republic of Ireland is a separate country.)
df_climate['country'] = df_climate['country'].replace({
    'United States of America': 'United States',
    'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
})

In [None]:
# Summary statistics of the numeric climate columns (year, people affected, law counts).
df_climate.describe()

In [None]:
# Attach the QoG climate columns to every CO2 row. The join keys are spelled
# out (they are exactly the columns the two frames share); CO2 rows without a
# QoG match are kept with NaN climate values, and `validate` guards against
# duplicated keys on either side.

df_historic = df_co2_historic.merge(
    df_climate,
    on=['country', 'year', 'iso3'],
    how='left',
    validate='1:1',
)
df_historic

In [None]:
# Keep only the country-years for which both law counts are known.
law_columns = ['cc_laws_tot', 'cc_laws_year']
df_historic_clean = df_historic.dropna(subset=law_columns).copy()

### First visualization 

In [None]:
# CO2 per capita over time: one colour per country, marker size also encodes
# the per-capita value.
fig = px.scatter(
    df_historic_clean,
    x="year",
    y="co2_per_capita",
    color="country",
    size="co2_per_capita",
    hover_name="country",
)

fig.update_layout(
    title="CO2 emission per capita per country",
    xaxis_title="Year",
    yaxis_title="CO2 emission per capita (tons)",
    legend_title="Countries",
    font={"family": "Courier New, monospace", "size": 12, "color": "RebeccaPurple"},
)
fig.show()

In [None]:
# Outlier handling: Kuwait is an extreme case in the plot above, so its rows
# are removed from the cleaned panel.
kuwait_index = df_historic_clean.index[df_historic_clean.country == 'Kuwait']
df_historic_clean = df_historic_clean.drop(index=kuwait_index)

In [None]:
# Climate change laws implemented per year, one colour per country; marker
# size repeats the yearly count.
fig = px.scatter(
    df_historic_clean,
    x="year",
    y="cc_laws_year",
    color="country",
    size="cc_laws_year",
    hover_name="country",
)

fig.update_layout(
    title="Climate change laws per year",
    xaxis_title="Year",
    yaxis_title="Number of Laws",
    legend_title="Countries",
    font={"family": "Courier New, monospace", "size": 12, "color": "RebeccaPurple"},
)
fig.show()

In [None]:
# Accumulated number of climate change laws over time, one colour per country;
# marker size repeats the accumulated count.
fig = px.scatter(
    df_historic_clean,
    x="year",
    y="cc_laws_tot",
    color="country",
    size="cc_laws_tot",
    hover_name="country",
)

fig.update_layout(
    title="Accumulated Climate change laws",
    xaxis_title="Year",
    yaxis_title="Number of Laws",
    legend_title="Countries",
    font={"family": "Courier New, monospace", "size": 12, "color": "RebeccaPurple"},
)
fig.show()

In [None]:

# People affected per year, drawn on a logarithmic y-axis because the counts
# span several orders of magnitude.

fig = px.scatter(
    df_historic_clean,
    x="year",
    y="people_affected",
    color="country",
    hover_name="country",
    log_y=True,
)

fig.update_layout(
    title="Yearly People Affected by Natural disasters",
    xaxis_title="Year",
    yaxis_title="People affected",
    legend_title="Countries",
    font={"family": "Courier New, monospace", "size": 12, "color": "RebeccaPurple"},
)
fig.show()

#### Impressions after cleaning and first visualizations
Our data seems to be complete and clean. To develop further intuition, we will now create cross-sectional data: we aggregate the data set so that each country ends up with a single value per variable. Let's see if we can find some correlations. After the cross-sectional analysis, we will move back to the panel data and perform a more rigorous within-country analysis.

We want: 
1. Total Co2 emissions per country from the beginning of the countries emissions to 2010. 
   1. df_accumulative_co2_2010
2. Number of climate change laws passed from 2012 -> 2020
   1. df_accumulative_climate


### Creating total accumulated data for each country
#### Cross-sectional data
Starting with accumulated emissions

In [None]:
# Accumulated emissions per country (total and per capita) up to and including
# the cutoff year, plus each country's GDP in the cutoff year.
CO2_CUTOFF_YEAR = 2010

df_accumulative_co2_2010 = df_historic.copy()

# Only rows up to the cutoff year may enter the sums; the aggregation below
# therefore runs on this filtered frame, not on the full history.
co2_until_cutoff = df_accumulative_co2_2010[df_accumulative_co2_2010.year <= CO2_CUTOFF_YEAR]

# One row per (iso3, country) with both sums computed in a single pass.
# Rows without an iso3 code are dropped by the groupby.
df_accumulative_co2_2010_final = (
    co2_until_cutoff
    .groupby(['iso3', 'country'])
    .agg(tot_co2_per_capita_up_till_2010=('co2_per_capita', 'sum'),
         tot_co2_up_till_2010=('co2', 'sum'))
    .reset_index()
)

# GDP in the cutoff year for the countries kept above; 'temp' marks the frame
# as an intermediate. Not every country has a GDP value, so NaNs can remain.
df_2010_temp = df_accumulative_co2_2010.loc[
    (df_accumulative_co2_2010.year == CO2_CUTOFF_YEAR)
    & (df_accumulative_co2_2010.iso3.isin(df_accumulative_co2_2010_final.iso3)),
    ['country', 'iso3', 'gdp'],
]

df_accumulative_co2_2010_final = pd.merge(
    left=df_accumulative_co2_2010_final,
    right=df_2010_temp,
    on=['iso3', 'country'],
    how='left',
    validate='1:1',
)

# Result: accumulated CO2 (total and per capita) and cutoff-year GDP per country.

Now, let's move on to the climate section of the data and collect the total number of climate change policies implemented in each country, and then merge these numbers with `df_accumulative_co2_2010_final` into a dataset we call `df_acc_final`.

In [None]:
# Restrict the cleaned panel to 2012 onwards and aggregate per country.
df_between_2012_2020 = df_historic_clean.loc[df_historic_clean.year >= 2012].copy()

per_country = df_between_2012_2020.groupby(['iso3', 'country'])

# Sum of the yearly new climate change laws in that window
df_laws_between_2012_2020 = (
    per_country['cc_laws_year'].sum().reset_index(name='laws_impl_2012_2020')
)

# Mean population in that window
df_mean_population_2012_2020 = (
    per_country['population'].mean().reset_index(name='mean_population_2012_2020')
)

In [None]:
# Add the 2012+ law totals and mean population to the accumulated-CO2 table,
# joining on the two columns the frames share, then keep only countries that
# have a law total.
df_accumulative_final = (
    df_accumulative_co2_2010_final
    .merge(df_laws_between_2012_2020, on=['iso3', 'country'], how='left', validate='1:1')
    .merge(df_mean_population_2012_2020, on=['iso3', 'country'], how='left', validate='1:1')
)

df_accumulative_final_clean = df_accumulative_final.dropna(subset=['laws_impl_2012_2020'])
df_accumulative_final_clean


### Calculating total people affected up until 2010

TODO: We need to investigate this dataset further, for example by expressing the number of people affected relative to the population.

In [None]:
# Total number of people affected per country, over all years up to and including 2010.
df_affected = (
    df_climate.loc[df_climate.year <= 2010]
    .groupby(['iso3', 'country'])['people_affected']
    .sum()
    .reset_index(name='people_affected_until_2010')
)
df_affected

In [None]:
# Final cross-sectional table: accumulated CO2, GDP, law totals and mean
# population, extended with the people-affected totals.

df_acc_final = df_accumulative_final_clean.merge(
    df_affected,
    on=['iso3', 'country'],
    how='left',
    validate='1:1',
)
df_acc_final

In [None]:
# Summary statistics of the cross-sectional table before plotting it.
df_acc_final.describe()

### Exploratory visualization
Now we can visualize total emissions, see where people are mostly affected and display the number of new policies implemented between 2012 and 2020.

In [None]:
# Non-logarithmic scale: world map coloured by `tot_co2_per_capita_up_till_2010`.

df = df_acc_final

fig = go.Figure(data=go.Choropleth(
    locations=df['iso3'],
    z=df['tot_co2_per_capita_up_till_2010'].round(0),
    text=df['country'],
    colorscale='Reds',
    autocolorscale=False,
    reversescale=False,
    marker_line_color='darkgray',
    marker_line_width=0.5,
    colorbar_title='CO2 per inhabitant<br>(tons)',
))

fig.update_layout(
    title_text='Total Co2 Emissions per capita by country (Until 2010)',
    geo=dict(
        showframe=True,
        showcoastlines=False,
        projection_type='equirectangular',
    ),
    margin=dict(l=20, r=20, t=60, b=20),
    annotations=[dict(
        x=0.5,
        y=0,  # a negative value made the caption disappear, so it stays at the bottom edge
        xref='paper',
        yref='paper',
        # The caption credits the site the link points to (Our World in Data).
        # Adjacent string literals are used so no stray indentation ends up in the text.
        text=('Source: <a href="https://ourworldindata.org/co2-and-other-greenhouse-gas-emissions">'
              'Our World in Data</a>'),
        showarrow=False,
    )],
)

fig.show()

In [None]:
# Logarithmic scale: world map coloured by log10 of `tot_co2_per_capita_up_till_2010`.

df = df_acc_final

fig = go.Figure(data=go.Choropleth(
    locations=df['iso3'],
    # NOTE(review): a rounded value of 0 becomes -inf under log10.
    z=np.log10(df['tot_co2_per_capita_up_till_2010'].round(0)),
    text=df['country'],
    colorscale='Reds',
    autocolorscale=False,
    reversescale=False,
    marker_line_color='darkgray',
    marker_line_width=0.5,
    colorbar_title='CO2 per inhabitant<br>Log10 Scale',
))

fig.update_layout(
    title_text='Total Co2 Emissions per capita by country (Until 2010 in Log10 scale)',
    geo=dict(
        showframe=True,
        showcoastlines=False,
        projection_type='equirectangular',
    ),
    margin=dict(l=20, r=20, t=60, b=20),
    annotations=[dict(
        x=0.5,
        y=0,  # a negative value made the caption disappear, so it stays at the bottom edge
        xref='paper',
        yref='paper',
        # The caption credits the site the link points to (Our World in Data).
        # Adjacent string literals are used so no stray indentation ends up in the text.
        text=('Source: <a href="https://ourworldindata.org/co2-and-other-greenhouse-gas-emissions">'
              'Our World in Data</a>'),
        showarrow=False,
    )],
)

fig.show()

In [None]:
# Non-logarithmic scale: world map coloured by `tot_co2_up_till_2010`.

df = df_acc_final

fig = go.Figure(data=go.Choropleth(
    locations=df['iso3'],
    z=df['tot_co2_up_till_2010'].round(0),
    text=df['country'],
    colorscale='Reds',
    autocolorscale=False,
    reversescale=False,
    marker_line_color='darkgray',
    marker_line_width=0.5,
    colorbar_title='CO2 total<br>(tons)',
))

fig.update_layout(
    title_text='Total Co2 Emissions by country (Until 2010)',
    geo=dict(
        showframe=True,
        showcoastlines=False,
        projection_type='equirectangular',
    ),
    margin=dict(l=20, r=20, t=60, b=20),
    annotations=[dict(
        x=0.5,
        y=0,  # a negative value made the caption disappear, so it stays at the bottom edge
        xref='paper',
        yref='paper',
        # The caption credits the site the link points to (Our World in Data).
        # Adjacent string literals are used so no stray indentation ends up in the text.
        text=('Source: <a href="https://ourworldindata.org/co2-and-other-greenhouse-gas-emissions">'
              'Our World in Data</a>'),
        showarrow=False,
    )],
)

fig.show()

In [None]:
# Each country's share (in percent) of the summed `tot_co2_up_till_2010`
# across the countries in `df_acc_final`, shown as a one-level sunburst.
df = df_acc_final
df_sorted = df.sort_values(by='tot_co2_up_till_2010', ascending=False)

emissions_all_countries = df_sorted['tot_co2_up_till_2010'].sum()
df_sorted['%'] = (df_sorted['tot_co2_up_till_2010'] / emissions_all_countries) * 100

fig = px.sunburst(
    df_sorted,
    path=['country'],
    values='%',
    color='country',
    hover_data=['%', 'tot_co2_up_till_2010'],
    title="Co2 Per Country Emissions In Percent of Total Global",
)
fig.show()


In [None]:
# Logarithmic scale: world map coloured by log10 of `tot_co2_up_till_2010`.

df = df_acc_final

fig = go.Figure(data=go.Choropleth(
    locations=df['iso3'],
    # NOTE(review): a rounded value of 0 becomes -inf under log10.
    z=np.log10(df['tot_co2_up_till_2010'].round(0)),
    text=df['country'],
    colorscale='Reds',
    autocolorscale=False,
    reversescale=False,
    marker_line_color='darkgray',
    marker_line_width=0.5,
    colorbar_title='CO2 total<br>Log10 Scale',
))

fig.update_layout(
    title_text='Total Co2 Emissions by country (Until 2010 - Log10 Scale)',
    geo=dict(
        showframe=True,
        showcoastlines=False,
        projection_type='equirectangular',
    ),
    margin=dict(l=20, r=20, t=60, b=20),
    annotations=[dict(
        x=0.5,
        y=0,  # a negative value made the caption disappear, so it stays at the bottom edge
        xref='paper',
        yref='paper',
        # The caption credits the site the link points to (Our World in Data).
        # Adjacent string literals are used so no stray indentation ends up in the text.
        text=('Source: <a href="https://ourworldindata.org/co2-and-other-greenhouse-gas-emissions">'
              'Our World in Data</a>'),
        showarrow=False,
    )],
)

fig.show()

In [None]:
# World map coloured by `laws_impl_2012_2020` (blue scale).
df = df_acc_final

trace = go.Choropleth(
    locations=df['iso3'],
    z=df['laws_impl_2012_2020'].round(0),
    text=df['country'],
    colorscale='Blues',
    autocolorscale=False,
    reversescale=False,
    marker_line_color='darkgray',
    marker_line_width=0.5,
    colorbar_title='Number of laws',
)
fig = go.Figure(data=trace)

fig.update_layout(
    title_text='Number of climate change laws implemented from 2012-2020',
    geo={'showframe': True, 'showcoastlines': False, 'projection_type': 'equirectangular'},
    margin={'l': 20, 'r': 20, 't': 60, 'b': 20},
    annotations=[dict(
        x=0.5,
        y=0,  # a negative value made the caption disappear, so it stays at the bottom edge
        xref='paper',
        yref='paper',
        showarrow=False,
        text='Source: <a href="https://www.gu.se/en/quality-government/qog-data/data-downloads/environmental-indicators-dataset">\
            QOG</a>',
    )],
)

fig.show()

In [None]:
# Non-logarithmic scale: world map coloured by `people_affected_until_2010`.

df = df_acc_final

trace = go.Choropleth(
    locations=df['iso3'],
    z=df['people_affected_until_2010'].round(0),
    text=df['country'],
    colorscale='Reds',
    autocolorscale=False,
    reversescale=False,
    marker_line_color='darkgray',
    marker_line_width=0.5,
    colorbar_title='People affected',
)
fig = go.Figure(data=trace)

fig.update_layout(
    title_text='Number of people affected by natural disasters until 2010',
    geo={'showframe': True, 'showcoastlines': False, 'projection_type': 'equirectangular'},
    margin={'l': 20, 'r': 20, 't': 60, 'b': 20},
    annotations=[dict(
        x=0.5,
        y=0,  # a negative value made the caption disappear, so it stays at the bottom edge
        xref='paper',
        yref='paper',
        showarrow=False,
        text='Source: <a href="https://www.gu.se/en/quality-government/qog-data/data-downloads/environmental-indicators-dataset">\
            QOG</a>',
    )],
)

fig.show()

# On a linear scale the differences are impossible to see; the following maps
# use a logarithmic scale or divide by population instead.

In [None]:
# World map coloured by `people_affected_until_2010` divided by
# `mean_population_2012_2020`.

df = df_acc_final

affected_per_inhabitant = df['people_affected_until_2010'] / df['mean_population_2012_2020']

trace = go.Choropleth(
    locations=df['iso3'],
    z=affected_per_inhabitant,
    text=df['country'],
    colorscale='Reds',
    autocolorscale=False,
    reversescale=False,
    marker_line_color='darkgray',
    marker_line_width=0.5,
    colorbar_title='People affected<br>Per mean population from 2012-2020',
)
fig = go.Figure(data=trace)

fig.update_layout(
    title_text='Total number of people affected by natural disasters until 2010 per mean population from 2012-2020',
    geo={'showframe': True, 'showcoastlines': False, 'projection_type': 'equirectangular'},
    margin={'l': 20, 'r': 20, 't': 60, 'b': 20},
    annotations=[dict(
        x=0.5,
        y=0,  # a negative value made the caption disappear, so it stays at the bottom edge
        xref='paper',
        yref='paper',
        showarrow=False,
        text='Source: <a href="https://www.gu.se/en/quality-government/qog-data/data-downloads/environmental-indicators-dataset">\
            QOG</a>',
    )],
)

fig.show()

In [None]:
# Logarithmic scale: world map coloured by log10 of `people_affected_until_2010`.

df = df_acc_final

log_people_affected = np.log10(df['people_affected_until_2010'].round(0))

trace = go.Choropleth(
    locations=df['iso3'],
    z=log_people_affected,
    text=df['country'],
    colorscale='Reds',
    autocolorscale=False,
    reversescale=False,
    marker_line_color='darkgray',
    marker_line_width=0.5,
    colorbar_title='People affected<br>Log10 Scale',
)
fig = go.Figure(data=trace)

fig.update_layout(
    title_text='Total number of people affected by natural disasters until 2010',
    geo={'showframe': True, 'showcoastlines': False, 'projection_type': 'equirectangular'},
    margin={'l': 20, 'r': 20, 't': 60, 'b': 20},
    annotations=[dict(
        x=0.5,
        y=0,  # a negative value made the caption disappear, so it stays at the bottom edge
        xref='paper',
        yref='paper',
        showarrow=False,
        text='Source: <a href="https://www.gu.se/en/quality-government/qog-data/data-downloads/environmental-indicators-dataset">\
            QOG</a>',
    )],
)

fig.show()

#### Let's use some scatterplots and a fitted line to see if any of these values correlate

In [None]:
# Cross-section: accumulated per-capita CO2 (x) against the law total (y),
# with an OLS trendline. Marker size and colour both repeat the x value.
df = df_acc_final

axis_labels = {
    'laws_impl_2012_2020': 'Climate change laws implemented 2012-2020',
    'tot_co2_per_capita_up_till_2010': 'Tot Co2 emissions<br> per capita until 2010',
}
hover_formats = {
    'laws_impl_2012_2020': ':.1f',
    'tot_co2_per_capita_up_till_2010': ':.1f',
}

fig = px.scatter(
    df,
    x="tot_co2_per_capita_up_till_2010",
    y="laws_impl_2012_2020",
    size="tot_co2_per_capita_up_till_2010",
    color="tot_co2_per_capita_up_till_2010",
    hover_name="country",
    hover_data=hover_formats,
    labels=axis_labels,
    trendline='ols',
)

fig.update_layout(
    title="Historic Per Capita Emissions (->2010) vs Climate Change Polices (2012->2020) - with OLS trendline",
    xaxis_title="Co2 Emissions",
    yaxis_title="Climate Change Policies",
    font={"family": "Courier New, monospace", "size": 10, "color": "RebeccaPurple"},
)

# Keep the fitted trendline model(s) so the next cell can print the summary.
results = px.get_trendline_results(fig)
fig.show()


In [None]:
# Print the regression summary of the first (here: only) trendline fit stored in `results`.
print(results.px_fit_results.iloc[0].summary())

#### Initial reflections based on preliminary analysis

- t-value: `0.874` - well below the roughly `1.96` needed at the 5% level, so the slope is not statistically significant.
- R-squared: Ranges from `0-1`, with a higher value indicating a better fit. In this case, the R-squared value is `0.005`, which suggests that the model does not fit the data well.
- F-statistic: In our case, the F-statistic is `0.7640`, which suggests that the model explains hardly more than a model with only an intercept.
- Prob (F-statistic): The p-value is `0.383`: if the true slope were zero, we would still observe an F-statistic at least this large about 38% of the time. We therefore cannot reject `H0` at the 5% significance level.

We could have anticipated that our statistical model would not be very robust. There are several cases that we need to address. First of all - there are several omitted variables present that can drive climate change policies. As of now - climate polices can dampen the economic prosperity of a country - thus, requiring a country to have a solid GDP to afford to incorporate climate change laws. To illustrate that, we can first of all include a simple emission driver illustration created by Our World In Data, displaying this driving force. Thereafter we will do a scatterplot using `gdp` as a predictor for new `climate change policies` from 2012-2020. Our guess is that there is a positive coefficient to be found.

<img src='https://ourworldindata.org/uploads/2020/07/Kaya-identity.png' />

GDP and CO2 emissions' relationship is complicated. And we can draw some simple circles and arrows to show how this relationship can complicate our analysis.

In [None]:
# Sketch the assumed causal links as a small directed graph.
G = nx.DiGraph()

# Adding the edges (the nodes are created implicitly). GDP and CO2 influence
# each other, so that pair gets one coloured edge per direction.
G.add_edge('GDP', 'CO2', color='red')
G.add_edge('CO2', 'GDP', color='blue')
G.add_edge('GDP', 'CC Polices')
G.add_edge('CO2', 'CC Polices')
G.add_edge('People Affected By Natural Disasters', 'CC Polices')

# The `color` edge attribute is not picked up automatically by nx.draw, so the
# colours are collected in edge order and passed explicitly; edges without the
# attribute are drawn black.
edge_colors = [attrs.get('color', 'black') for _, _, attrs in G.edges(data=True)]

# A seeded layout keeps the picture identical between runs. Slightly curved
# edges keep the two opposite GDP/CO2 arrows from being drawn on top of each other.
pos = nx.spring_layout(G, seed=42)
nx.draw(G, pos=pos, with_labels=True, node_size=5000,
        edge_color=edge_colors, connectionstyle='arc3,rad=0.15')


#### A complicated relationship
Let's plot GDP as a predictor for CO2

In [None]:
# GDP (log x-axis) against accumulated per-capita CO2 for the countries that
# have a GDP value; the OLS trendline is fitted on log(x) as well.
df = df_acc_final.dropna(subset=['gdp']).copy()

fig = px.scatter(
    df,
    x="gdp",
    y="tot_co2_per_capita_up_till_2010",
    hover_name="country",
    labels={
        'gdp': 'GDP in 2010 ',
        'tot_co2_per_capita_up_till_2010': 'CO2/capita -> 2010',
    },
    log_x=True,
    trendline='ols',
    trendline_options={'log_x': True},
)

fig.update_layout(
    title="GDP predicting CO2 Emissions",
    xaxis_title="GDP",
    yaxis_title="CO2 emissions",
    font={"family": "Courier New, monospace", "size": 10, "color": "RebeccaPurple"},
)

# Fitted trendline model(s) of this figure.
results = px.get_trendline_results(fig)
fig.show()


In [None]:
# GDP (log x-axis) against the law total for the countries that have a GDP
# value. Points are coloured per country, so `trendline_scope="overall"` is
# needed to get one trendline across all countries instead of one per colour.
df = df_acc_final.dropna(subset=['gdp']).copy()

fig3 = px.scatter(df, x="gdp", y="laws_impl_2012_2020", size='gdp', color='country',
                 hover_name="country", trendline='ols', trendline_options=dict(log_x=True), trendline_scope="overall",
                 labels={'gdp': 'GDP in 2010 ', 
                         'laws_impl_2012_2020': 'Climate Change Laws (2012-2020) '},
                 log_x=True)

fig3.update_layout(
    title="GDP in 2010 vs Climate Change Polices (2012->2020)<br>With OLS trendline",
    xaxis_title="GDP",
    yaxis_title="Climate Change Policies",
    font=dict(
        family="Courier New, monospace",
        size=10,
        color="RebeccaPurple"
    )
)

# Read the fit from *this* figure (`fig3`); `fig` still refers to the figure
# built in an earlier cell and would return that figure's regression instead.
results = px.get_trendline_results(fig3)
fig3.show()


What we see here is that there is a stronger correlation between GDP and Climate Change Policies than CO2 emissions on Climate Change Policies. 

So, GDP is just ONE example of omitted variables, there are so many more, such as:
1.  Government: Countries with stricter environmental regulations may have lower levels of CO2 emissions and stricter climate change laws.
2.  Geography & Climate: Countries with very warm climates may have higher levels of CO2 emissions and less strict climate change laws, and countries with very cold climates may likewise have higher levels of CO2 emissions and less strict climate change laws, due to e.g. domestic cooling or heating systems, respectively.
3. Energy-mix: Countries with a higher mix of fossil fuels may have higher levels of CO2 emissions and less strict climate change laws, while countries with a higher mix of renewable energy sources may have lower levels of CO2 emissions and stricter climate change laws. 

So, let's try to do some within-country analysis to escape this intricate web of biases.

#### Panel data - Within-country analysis

First, we can try to use clustered standard errors,  a method of adjusting the standard errors of the estimates in a linear regression model to account for the fact that the observations within each group  are correlated.

The idea behind clustered standard errors is that the errors within each cluster are more likely to be correlated than the errors between different clusters. Note that clustering only corrects the standard errors (and therefore the significance tests); the coefficient estimates themselves are unchanged, so clustering does not by itself remove omitted-variable bias.

To calculate clustered standard errors, we will first  specify the clusters in our data. We will of course use `country`. Standard errors of the estimates will then take into account the within-cluster correlations.

However, using clustered standard errors can also lead to larger standard errors, as the adjustment for the within-cluster correlations increases the variability of the estimates.

In [None]:
import statsmodels.api as sm


# Pooled OLS of yearly climate change laws on total CO2, CO2 per capita, GDP
# and people affected, with standard errors clustered by country.

# Complete cases only: every predictor and the outcome must be present.
df = df_historic.dropna(subset=['co2', 'co2_per_capita', 'cc_laws_year', 'gdp', 'people_affected']).copy()

# Readable names for the regression table. `gdp` is the GDP of the same
# country-year here, so it is labelled plainly as GDP.
features = ['const', 'CO2 tot', 'CO2/capita', 'GDP', 'PA']

# statsmodels' OLS does not add an intercept on its own; without one the fit
# is forced through the origin. `has_constant='add'` always prepends the
# column of ones so the design matrix matches `features`.
X = sm.add_constant(
    np.array(df[['co2', 'co2_per_capita', 'gdp', 'people_affected']]),
    has_constant='add',
)  # predictor variables (intercept first)
y = np.array(df['cc_laws_year'])  # dependent variable

# Estimate by OLS; the covariance is made robust to within-country
# correlation by clustering on `country`.
model = sm.OLS(y, X)
results = model.fit(cov_type='cluster', cov_kwds={'groups': df['country']})

# Print the results
print(results.summary(xname=features))


In [None]:
import statsmodels.api as sm


# Pooled OLS of yearly climate change laws on CO2 per capita and GDP, with
# standard errors clustered by country.

# Complete cases only: both predictors and the outcome must be present.
df = df_historic.dropna(subset=['co2_per_capita', 'cc_laws_year', 'gdp']).copy()

# Readable names for the regression table (intercept first).
features = ['const', 'CO2/capita', 'GDP']

# statsmodels' OLS does not add an intercept on its own; without one the fit
# is forced through the origin. `has_constant='add'` always prepends the
# column of ones so the design matrix matches `features`.
X = sm.add_constant(
    np.array(df[['co2_per_capita', 'gdp']]),
    has_constant='add',
)  # predictor variables (intercept first)
y = np.array(df['cc_laws_year'])  # dependent variable

# Estimate by OLS; the covariance is made robust to within-country
# correlation by clustering on `country`.
model = sm.OLS(y, X)
results = model.fit(cov_type='cluster', cov_kwds={'groups': df['country']})

# Print the results
print(results.summary(xname=features))

Short summary and interpretation of the clustered OLS table above.

* The model explains about 5.4% of the variance in Climate Change Laws.
* The F-statistic is 5.595, which suggests that at least one of the coefficients in the model is non-zero. 
* The p-value of the F-statistic is 0.0191, which indicates that the model is statistically significant.
* The independent variable (CO2/capita) has a coefficient estimate of 0.0249, which means that for each unit increase in CO2/capita, CC laws is expected to increase by 0.0249 units, holding all other variables constant. 
* The standard error of the coefficient estimate is 0.011, which is a measure of the uncertainty around the estimate. 
* The z-score of the coefficient estimate is 2.365, which is a measure of the number of standard deviations that the estimate is from zero. 
* The p-value of the coefficient estimate is 0.018, which indicates that the coefficient is statistically significant. 
* The 95% confidence interval of the coefficient estimate is [0.004, 0.046]: intervals constructed this way contain the true coefficient in 95% of repeated samples, and this one does not include zero.

In [None]:
import statsmodels.api as sm


# Linear mixed model of yearly climate change laws on CO2 per capita, GDP and
# people affected, grouped by country.

# Complete cases only: every predictor and the outcome must be present.
df = df_historic.dropna(subset=['co2', 'co2_per_capita', 'cc_laws_year', 'gdp', 'people_affected']).copy()

# Readable names for the fixed effects (intercept first). `gdp` is the GDP of
# the same country-year here, so it is labelled plainly as GDP.
features = ['const', 'CO2/capita', 'GDP', 'PA']

# MixedLM does not add a fixed intercept on its own, so a column of ones is
# prepended to the predictors.
X = sm.add_constant(
    np.array(df[['co2_per_capita', 'gdp', 'people_affected']]),
    has_constant='add',
)  # fixed-effect predictors (intercept first)
y = np.array(df['cc_laws_year'])  # dependent variable

# Fit the mixed model with `country` as the grouping variable.
model = sm.MixedLM(y, X, groups= df['country'])
results = model.fit()

# Print the results, labelling the fixed effects with the names above
print(results.summary(xname_fe=features))


In [None]:
# Norway: does the yearly change in per-capita emissions relate to the number
# of climate change laws passed in the FOLLOWING year?

lag_years = 1

# Full Norwegian history in year order. The yearly change is computed before
# any rows are dropped.
# NOTE(review): `.diff()` assumes one row per consecutive year - confirm for this dataset.
norway_all_years = df_historic[df_historic.country == 'Norway'].sort_values('year').copy()
norway_all_years['co2_capita_dif'] = norway_all_years['co2_per_capita'].diff()

# Un-lagged rows with both values present (kept under its original name).
df_historic_norway_shifted = norway_all_years.dropna(
    subset=['cc_laws_year', 'co2_capita_dif']).copy()

# Move every emission change forward by `lag_years`, matching on the year
# itself rather than on row position: the row for year t then holds the laws
# of year t and the emission change of year t - lag_years.
earlier_change = norway_all_years[['year', 'co2_capita_dif']].assign(
    year=lambda frame: frame['year'] + lag_years)

df_historic_norway = (
    norway_all_years.drop(columns='co2_capita_dif')
    .merge(earlier_change, on='year', how='inner')
    .dropna(subset=['cc_laws_year', 'co2_capita_dif'])
)
df = df_historic_norway.copy()

fig = px.scatter(df, x="co2_capita_dif", y="cc_laws_year", trendline='ols',
                 hover_name="co2_capita_dif",
                 labels={'cc_laws_year': '#CC laws',
                         'co2_capita_dif': 'Change in CO2 / capita'},
                 )
fig.update_layout(
    title="Yearly change in emissions on next years climate change policies for Norway",
    xaxis_title="CO2 emission change from last year",
    yaxis_title="CC Laws ",
    font=dict(
        family="Courier New, monospace",
        size=10,
        color="RebeccaPurple"
    )
)
results = px.get_trendline_results(fig)
fig.show()


In [None]:
# Norway again, but using the emission level instead of the yearly change:
# per-capita emissions of year t against the laws passed in year t + 2.

lag_years = 2

norway_all_years = df_historic[df_historic.country == 'Norway'].sort_values('year').copy()

# Un-lagged rows with both values present (kept under its original name).
df_historic_norway_shifted = norway_all_years.dropna(
    subset=['cc_laws_year', 'co2_per_capita']).copy()

# Move every emission value forward by `lag_years`, matching on the year
# itself rather than on row position: the row for year t then holds the laws
# of year t and the emissions of year t - lag_years.
earlier_co2 = norway_all_years[['year', 'co2_per_capita']].assign(
    year=lambda frame: frame['year'] + lag_years)

df_historic_norway = (
    norway_all_years.drop(columns='co2_per_capita')
    .merge(earlier_co2, on='year', how='inner')
    .dropna(subset=['cc_laws_year', 'co2_per_capita'])
)
df = df_historic_norway.copy()

fig = px.scatter(df, x="co2_per_capita", y="cc_laws_year", trendline='ols', trendline_color_override='red',
                 hover_name="co2_per_capita",
                 labels={'cc_laws_year': '#CC laws',
                         'co2_per_capita': 'CO2 year / capita / year'},
                 )
fig.update_layout(
    title="Yearly emissions vs 2 Years Later Climate Change laws <br> (Norway)",
    xaxis_title="CO2 per capita",
    yaxis_title="CC Laws per year",
    font=dict(
        family="Courier New, monospace",
        size=10,
        color="RebeccaPurple"
    )
)
results = px.get_trendline_results(fig)
fig.show()


#### A possible way forward

We have now found a new method to calculate the relationship between emission change and climate change laws. This way we can internalize a lot of country differences, let's try to do this on the large dataset.


In [None]:
# Work in progress
# All countries: per-capita emissions of year t against the laws passed in
# year t + num_years_to_shift.
num_years_to_shift = 2

# Rows that have a law count; Kuwait is excluded as an outlier.
laws_rows = df_historic.dropna(subset=['cc_laws_year'])
laws_rows = laws_rows[laws_rows.country != 'Kuwait']

# Move every emission value forward by the lag and join it back on
# country AND year. Matching on the keys (instead of shifting rows of the
# whole frame) guarantees a value is only ever paired with the same country,
# and that the pairing is "earlier emissions -> later laws".
earlier_co2 = (
    df_historic[['iso3', 'country', 'year', 'co2_per_capita']]
    .assign(year=lambda frame: frame['year'] + num_years_to_shift)
)

df_historic_wip = (
    laws_rows.drop(columns='co2_per_capita')
    .merge(earlier_co2, on=['iso3', 'country', 'year'], how='inner', validate='1:1')
    .dropna(subset=['co2_per_capita'])  # marker size needs a value in every row
)

df = df_historic_wip.copy()

fig = px.scatter(df, x="co2_per_capita", y="cc_laws_year", trendline='ols', trendline_color_override='red', trendline_options=dict(log_x=False),
                 hover_name="country",
                 size="co2_per_capita",
                 log_x=False,
                 labels={'cc_laws_year': '#CC laws',
                         'co2_per_capita': 'CO2 year / Capita'},
                 )
fig.update_layout(
    title=f"Yearly Emissions vs {num_years_to_shift} Years Later climate change policies <br> Global",
    xaxis_title="CO2 per capita",
    yaxis_title="CC Laws per year",
    font=dict(
        family="Courier New, monospace",
        size=10,
        color="RebeccaPurple"
    )
)
results = px.get_trendline_results(fig)
fig.show()

#### A short reflection on the panel data outcome. 

We can see that there is a small negative correlation between CO2 per capita and the climate change laws implemented two years later. The reasoning behind two years is that, first of all, emissions data are complicated to calculate, and thereafter we have to expect political lag. So it may be that 2 years is even a little optimistic. But let's see how changing `num_years_to_shift` changes the graph. After investigating the difference when we choose a `1,2,3,4,[...]` year lag, it seems to be pretty much the same. 

To gain further understanding, we could divide the timeframe into three:
1. Pre Kyoto Protocol (< 1997), 
2. Pre Paris Agreement (1997 - 2016)
3. Post Paris Agreement (> 2016)

But there can still be other variances such as political change in these periods, for instance, Donald Trump was elected president in 2016, and cut down on Climate Change laws.

In [None]:
# Work in progress
# Same lagged comparison as before, split into three periods by year.
num_years_to_shift = 2

laws_rows = df_historic_clean.dropna(subset=['cc_laws_year'])

# Move every emission value forward by the lag and join it back on
# country AND year, so a value is only ever paired with the same country and
# the pairing is "earlier emissions -> later laws".
earlier_co2 = (
    df_historic_clean[['iso3', 'country', 'year', 'co2_per_capita']]
    .assign(year=lambda frame: frame['year'] + num_years_to_shift)
)

df_historic_wip = (
    laws_rows.drop(columns='co2_per_capita')
    .merge(earlier_co2, on=['iso3', 'country', 'year'], how='inner', validate='1:1')
    .dropna(subset=['co2_per_capita'])  # marker size needs a value in every row
)


# Creating a new column with a marker for pre_kyo, pre_par and post_par
df = df_historic_wip.copy()

conditions = [
    df['year'] < 1997,
    ((df['year'] >= 1997 ) & (df['year'] <= 2016)),
    df['year'] > 2016,
]

choices = ['pre_kyo', 'pre_par', 'post_par']

# An explicit string default keeps the result a string array; the implicit
# default of 0 cannot be combined with string choices on recent NumPy.
df['protocol'] = np.select(conditions, choices, default='unknown')

fig = px.scatter(df, x="co2_per_capita", y="cc_laws_year", facet_col='protocol', trendline='ols', trendline_color_override='red',
                category_orders={"protocol": ["pre_kyo", "pre_par", "post_par"]},
                hover_name="country",
                size="co2_per_capita",
                log_x=False,
                labels={'cc_laws_year': '#CC laws',
                        'co2_per_capita': 'CO2 year / Capita'},
                )
fig.update_layout(
    title=f"Yearly Emissions vs {num_years_to_shift} Years Later climate change policies <br> Global",
    xaxis_title="CO2 per capita",
    yaxis_title="CC Laws per year",
    font=dict(
        family="Courier New, monospace",
        size=10,
        color="RebeccaPurple"
    )
)
results = px.get_trendline_results(fig)
fig.show()

This is interesting: it seems that the Paris Agreement has really put pressure on the countries to take action. We could divide the countries into those with a higher-than-average GDP and those with a lower-than-average GDP, but since we have so little GDP data after 2016, we will focus on really high-emitting countries. Hence we divide into two groups: 
1. Above mean `co2_per_capita`
2. Below mean `co2_per_capita`

In [None]:
# Work in progress
# Same lagged comparison, split by period (columns) and by whether the
# country-year is below or above the mean per-capita emissions (rows).
import math

num_years_to_shift = 2

laws_rows = df_historic_clean.dropna(subset=['cc_laws_year'])

# Move every emission value forward by the lag and join it back on
# country AND year, so a value is only ever paired with the same country and
# the pairing is "earlier emissions -> later laws".
earlier_co2 = (
    df_historic_clean[['iso3', 'country', 'year', 'co2_per_capita']]
    .assign(year=lambda frame: frame['year'] + num_years_to_shift)
)

df_historic_wip = (
    laws_rows.drop(columns='co2_per_capita')
    .merge(earlier_co2, on=['iso3', 'country', 'year'], how='inner', validate='1:1')
    .dropna(subset=['co2_per_capita'])  # marker size needs a value in every row
)


# Creating a new column with a marker for pre_kyo, pre_par and post_par,
# and a second one marking below / above mean emissions.
df = df_historic_wip.copy()

# NOTE: despite the `_gdp` names (kept so other cells keep working), the
# threshold below is the mean of `co2_per_capita`, not of GDP.
mean_gdp = df.co2_per_capita.mean()

conditions = [
    df['year'] < 1997,
    ((df['year'] >= 1997) & (df['year'] <= 2016)),
    df['year'] > 2016,
]

conditions_gdp = [
    df['co2_per_capita'] < mean_gdp,
    (df['co2_per_capita'] >= mean_gdp) | (
        np.isclose(df['co2_per_capita'], mean_gdp))
]

choices = ['pre_kyo', 'pre_par', 'post_par']
choices_gdp = ['below_mean', 'above_mean']

# An explicit string default keeps the results string arrays; the implicit
# default of 0 cannot be combined with string choices on recent NumPy.
df['protocol'] = np.select(conditions, choices, default='unknown')
df['co2_rel'] = np.select(conditions_gdp, choices_gdp, default='unknown')

fig = px.scatter(df, x="co2_per_capita", y="cc_laws_year", facet_row='co2_rel', facet_col='protocol', trendline='ols', trendline_color_override='red',
                 category_orders={"protocol": [
                     "pre_kyo", "pre_par", "post_par"]},
                 hover_name="country",
                 size="co2_per_capita",
                 log_x=False,
                 labels={'cc_laws_year': '#CC laws',
                         'co2_per_capita': 'CO2 year / Capita'},
                 )
fig.update_layout(
    title=f"Yearly Emissions vs {num_years_to_shift} Years Later climate change policies (Global)",
    xaxis_title="CO2 per capita",
    yaxis_title="CC Laws per year",
    font=dict(
        family="Courier New, monospace",
        size=10,
        color="RebeccaPurple"
    )
)

results = px.get_trendline_results(fig)
fig.show()


In [None]:

# Work in progress
# Lagged scatter restricted to observations whose (shifted) CO2 per capita is
# below the mean, one facet per agreement era.
import math  # NOTE(review): unused in this cell; belongs in the imports cell if still needed

num_years_to_shift = 2

# Keep only rows that have both variables, ordered so that each country's
# observations are contiguous and chronological before shifting.
df_historic_wip = (
    df_historic_clean
    .dropna(subset=['cc_laws_year', 'co2_per_capita'])
    .sort_values(['country', 'year'])
)

# Shift *within* each country: a frame-wide shift would fill the last rows of
# one country with the first rows of the next country in the panel.
# The shift is by rows, so it equals `num_years_to_shift` years only where a
# country's remaining rows are consecutive years -- TODO confirm.
# NOTE(review): this pairs a year's law count with CO2 per capita observed
# `num_years_to_shift` rows later, while the title reads the other way round;
# confirm the intended direction of the lag.
df_historic_wip['co2_per_capita'] = (
    df_historic_wip
    .groupby('country')['co2_per_capita']
    .shift(-num_years_to_shift)
)
# Rows with no later observation have no x value; dropping them also keeps
# NaN out of the below/above-mean classification.
df_historic_wip = df_historic_wip.dropna(subset=['co2_per_capita'])


# Creating new columns: an era marker (pre_kyo, pre_par, post_par) and a
# below/above-mean CO2 marker.
df = df_historic_wip.copy()
# Despite its name, this is the mean of (shifted) CO2 per capita, not of GDP.
mean_gdp = df.co2_per_capita.mean()

conditions = [
    df['year'] < 1997,
    ((df['year'] >= 1997) & (df['year'] <= 2016)),
    df['year'] > 2016,
]

# np.select takes the first matching condition, so a plain `>=` is enough for
# the second bucket.
conditions_gdp = [
    df['co2_per_capita'] < mean_gdp,
    df['co2_per_capita'] >= mean_gdp,
]

choices = ['pre_kyo', 'pre_par', 'post_par']
choices_gdp = ['below_mean', 'above_mean']

# Explicit string default: the implicit integer default (0) cannot be combined
# with string choices on newer NumPy versions.
df['protocol'] = np.select(conditions, choices, default='unknown')
df['co2_rel'] = np.select(conditions_gdp, choices_gdp, default='unknown')

# Keep only the below-mean observations for this figure.
df = df[df.co2_rel == 'below_mean']

fig = px.scatter(df, x="co2_per_capita", y="cc_laws_year", facet_col='protocol', trendline='ols', trendline_color_override='red',
                category_orders={"protocol": ["pre_kyo", "pre_par", "post_par"]},
                hover_name="country",
                size="co2_per_capita",
                log_x=False,
                labels={'cc_laws_year': '#CC laws',
                        'co2_per_capita': 'CO2 year / Capita'},
                )
fig.update_layout(
    title=f"Yearly Emissions vs {num_years_to_shift} Years Later climate change policies (Global) <br> Countries below mean CO2/Capita",
    xaxis_title="CO2 per capita",
    yaxis_title="CC Laws per year",
    font=dict(
        family="Courier New, monospace",
        size=10,
        color="RebeccaPurple"
    )
)
# Fitted OLS models, one row per facet.
results = px.get_trendline_results(fig)
fig.show()


In [None]:

# Work in progress
# Lagged scatter restricted to observations whose (shifted) CO2 per capita is
# at or above the mean, with Kuwait excluded, one facet per agreement era.
import math  # NOTE(review): unused in this cell; belongs in the imports cell if still needed

num_years_to_shift = 2

# Keep only rows that have both variables, ordered so that each country's
# observations are contiguous and chronological before shifting.
df_historic_wip = (
    df_historic_clean
    .dropna(subset=['cc_laws_year', 'co2_per_capita'])
    .sort_values(['country', 'year'])
)

# Exclude Kuwait (presumably an outlier that distorts the plot -- confirm).
# A boolean filter removes exactly the Kuwait rows even if index labels repeat.
df_historic_wip = df_historic_wip[df_historic_wip.country != 'Kuwait'].copy()

# Shift *within* each country: a frame-wide shift would fill the last rows of
# one country with the first rows of the next country in the panel.
# The shift is by rows, so it equals `num_years_to_shift` years only where a
# country's remaining rows are consecutive years -- TODO confirm.
# NOTE(review): this pairs a year's law count with CO2 per capita observed
# `num_years_to_shift` rows later, while the title reads the other way round;
# confirm the intended direction of the lag.
df_historic_wip['co2_per_capita'] = (
    df_historic_wip
    .groupby('country')['co2_per_capita']
    .shift(-num_years_to_shift)
)
# Rows with no later observation have no x value; dropping them also keeps
# NaN out of the below/above-mean classification.
df_historic_wip = df_historic_wip.dropna(subset=['co2_per_capita'])


# Creating new columns: an era marker (pre_kyo, pre_par, post_par) and a
# below/above-mean CO2 marker.
df = df_historic_wip.copy()
# Despite its name, this is the mean of (shifted) CO2 per capita, not of GDP.
mean_gdp = df.co2_per_capita.mean()

conditions = [
    df['year'] < 1997,
    ((df['year'] >= 1997) & (df['year'] <= 2016)),
    df['year'] > 2016,
]

# np.select takes the first matching condition, so a plain `>=` is enough for
# the second bucket.
conditions_gdp = [
    df['co2_per_capita'] < mean_gdp,
    df['co2_per_capita'] >= mean_gdp,
]

choices = ['pre_kyo', 'pre_par', 'post_par']
choices_gdp = ['below_mean', 'above_mean']

# Explicit string default: the implicit integer default (0) cannot be combined
# with string choices on newer NumPy versions.
df['protocol'] = np.select(conditions, choices, default='unknown')
df['co2_rel'] = np.select(conditions_gdp, choices_gdp, default='unknown')

# Keep only the above-mean observations for this figure.
df = df[df.co2_rel == 'above_mean']

fig = px.scatter(df, x="co2_per_capita", y="cc_laws_year", facet_col='protocol', trendline='ols', trendline_color_override='red',
                category_orders={"protocol": ["pre_kyo", "pre_par", "post_par"]},
                hover_name="country",
                size="co2_per_capita",
                log_x=False,
                labels={'cc_laws_year': '#CC laws',
                        'co2_per_capita': 'CO2 year / Capita'},
                )
fig.update_layout(
    title=f"Yearly Emissions vs {num_years_to_shift} Years Later climate change policies (Global) <br> Countries above mean CO2/Capita",
    xaxis_title="CO2 per capita",
    yaxis_title="CC Laws per year",
    font=dict(
        family="Courier New, monospace",
        size=10,
        color="RebeccaPurple"
    )
)
# Fitted OLS models, one row per facet.
results = px.get_trendline_results(fig)
fig.show()
