In [58]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn import metrics

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

import seaborn as sns

## Theory
Energy is crucial for production. A rise in a country's production levels will result in an increase in energy consumption. The majority of that energy will come from unclean sources, which will culminate in higher CO2 emission levels. All else being equal, a larger population consumes more energy, which also leads to higher emission levels. This notebook investigates the relationship between GDP, population size and CO2 emissions in South Africa, using data from the World Bank.

## Findings 
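The coefficients are 1.26962844e-07 for GDP (current US$) and 6.75158649e-03 for population size, respectively. Taken at face value, this indicates that population size has a greater per-unit effect on CO2 emissions than GDP; note, however, that the two predictors are measured in different units (US dollars versus persons), so the raw coefficients are not directly comparable. Both coefficients are positive, which is in line with theory. The intercept is 11660 and the R² value is 0.88, which indicates that the model explains about 88% of the variation in CO2 emissions.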

In [93]:
# Indicator codes used in the regression, mapped to the column labels that the
# modelling code selects by name.
INDICATOR_COLUMNS = {
    'EN.ATM.CO2E.KT': 'CO2 emissions (kt)',
    'NY.GDP.MKTP.CD': 'GDP (current US$)',
    'SP.POP.TOTL': 'Population, total',
}

# Read CSV
df = pd.read_csv('SA Statistics.csv')

# Transform from wide form to long form data, leaving out the 2019 and 2020 columns.
# DataFrame.drop returns a new frame; the result used to be discarded, so the
# columns were never actually removed. `df` itself stays exactly as read from disk.
df1 = pd.melt(df.drop(columns=['2019', '2020']),
              id_vars=['Indicator Name', 'Indicator Code'],
              var_name='Year', value_name='Value')

# Select the GDP, population and CO2 rows
df2 = df1.loc[df1['Indicator Code'].isin(list(INDICATOR_COLUMNS))]

# One row per year, one column per indicator.
# Pivoting on the indicator code and renaming through the mapping ties every label
# to its own series, instead of assigning labels by position and relying on the
# alphabetical column order that pivot happens to produce. pivot only reads the
# three columns named here, so no other column needs to be dropped beforehand.
df3 = (
    df2.pivot(index='Year', columns='Indicator Code', values='Value')
       .rename(columns=INDICATOR_COLUMNS)
       .rename_axis(columns=None)
)

# Fix the column order; raises KeyError if an expected indicator is absent from the file
df3 = df3[list(INDICATOR_COLUMNS.values())]

df4 = df3.reset_index()

# Keep only the years for which all three indicators are available
df5 = df4.dropna(axis=0)

# Set variables: two predictors (GDP, population) and the target (CO2 emissions)
X = df5[['GDP (current US$)', 'Population, total']]
y = df5['CO2 emissions (kt)']

# Auto-correlation
# NOTE(review): this assignment originally had no right-hand side, which is a
# SyntaxError and prevents the whole cell from running. The lag-1 autocorrelation
# of the target series is assumed to be what was intended -- confirm, or compute it
# on the regression residuals instead if that was the goal.
# Rows follow the order of df5, so years removed by dropna leave gaps between
# neighbouring observations.
serial_corr1 = y.autocorr(lag=1)

# Fitting
# Fit a linear regression of y on the columns of X; fit() returns the estimator itself.
lin_reg_model = LinearRegression().fit(X, y)

# In-sample predictions, scored against the observed values
y_true = y
y_predict = lin_reg_model.predict(X)

# Round only for display; the fitted model keeps full precision
intercept_rounded = round(lin_reg_model.intercept_)
r2_rounded = round(r2_score(y_true, y_predict), 2)

print('Coefficients:', lin_reg_model.coef_)
print('Intercept:', intercept_rounded)
print('R2:', r2_rounded)

Coefficients: [1.26962844e-07 6.75158649e-03]
Intercept: 11660
R2: 0.88
