In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

### load datasets

* Total population by country
* GDP (Current US$)
* % population who have access to electricity
* % electricity from Coal, Oil or Gas
* % electricity from Nuclear Power
* Fertility rate
* Smoking rate

In [5]:
# read csv file
df_raw = pd.read_csv('../data/ADS_project_dataset2.csv')

In [124]:
# set index and exclude rows before 1990 or after 2012 from the main data
df = df_raw[(df_raw['Year'] >= 1990) & (df_raw['Year'] <= 2012)] \
    .drop(['SmokingRate', 'EduSpend', 'RoadTrans'], axis=1) \
    .set_index(['Country', 'Year'])

### explore data

In [125]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Key,Group,AveTemperature,CO2,CO2prox1000,CO2prox2000,CO2prox3000,CO2prox4000,CO2prox5000,CO2prox6000,...,CO2prox10000,CO2prox11000,CO2prox12000,CO2prox13000,Population,GDP,FertilityRate,ElecFossi,ElecNuclear,ForestArea
Country,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Afghanistan,1990,1990Afghanistan,Rest,14.993333,2.915,183.1442,870.7274,4738.8492,3762.8294,0.0,511.4036,...,0.0,0.0,0.0,0.0,12249114,611392.9882,7.466,,,13500.0
Albania,1990,1990Albania,Rest,13.468667,6.0437,2223.4138,901.7874,0.0,2772.0448,2428.295,6806.8569,...,0.0,0.0,0.0,0.0,3286542,259410.371,2.978,13.592233,,7888.0
Algeria,1990,1990Algeria,Rest,24.014417,68.0497,68.0497,2633.0391,305.8892,180.2292,3219.5983,7456.4885,...,263.6924,0.0,0.0,0.0,25912367,22325.67501,4.726,99.161699,,16670.0
Andorra,1990,1990Andorra,Rest,12.455583,0.4067,2282.387,490.4241,346.7531,0.0,8047.9495,2896.8677,...,263.6924,0.0,0.0,0.0,54509,5959.528408,,,,160.0
Angola,1990,1990Angola,Rest,22.466417,6.3459,6.3459,247.7112,0.0,895.3406,2988.4974,0.0,...,0.0,0.0,0.0,0.0,12171441,8894.395056,7.247,13.793103,,609760.0


In [126]:
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 4025 entries, (Afghanistan, 1990) to (Zimbabwe, 2012)
Data columns (total 23 columns):
Key               4025 non-null object
Group             4025 non-null object
AveTemperature    4025 non-null float64
CO2               3989 non-null float64
CO2prox1000       4023 non-null float64
CO2prox2000       4025 non-null float64
CO2prox3000       4025 non-null float64
CO2prox4000       4025 non-null float64
CO2prox5000       4025 non-null float64
CO2prox6000       4025 non-null float64
CO2prox7000       4025 non-null float64
CO2prox8000       4025 non-null float64
CO2prox9000       4025 non-null float64
CO2prox10000      4025 non-null float64
CO2prox11000      4025 non-null float64
CO2prox12000      4025 non-null float64
CO2prox13000      4025 non-null float64
Population        4025 non-null int64
GDP               3820 non-null float64
FertilityRate     3924 non-null float64
ElecFossi         3000 non-null float64
ElecNuclear       675 non-n

In [130]:
df.describe(include='all')

Unnamed: 0,Key,Group,AveTemperature,CO2,CO2prox1000,CO2prox2000,CO2prox3000,CO2prox4000,CO2prox5000,CO2prox6000,...,CO2prox10000,CO2prox11000,CO2prox12000,CO2prox13000,Population,GDP,FertilityRate,ElecFossi,ElecNuclear,ForestArea
count,4025,4025,4025.0,3989.0,4023.0,4025.0,4025.0,4025.0,4025.0,4025.0,...,4025.0,4025.0,4025.0,4025.0,4025.0,3820.0,3924.0,3000.0,675.0,3979.0
unique,4025,2,,,,,,,,,...,,,,,,,,,,
top,2009Nigeria,Rest,,,,,,,,,...,,,,,,,,,,
freq,1,3565,,,,,,,,,...,,,,,,,,,,
mean,,,19.264666,143.1623,728.881568,1488.895796,1715.600335,2297.327609,2909.833251,3128.833032,...,970.948211,341.045048,170.476765,33.629087,34805740.0,308044.1,3.316458,60.579229,27.459496,230306.6
std,,,8.615288,603.469319,1114.235958,2121.825549,2121.142422,2338.290799,3301.448303,2921.665919,...,1970.562104,1170.636752,901.629472,409.476996,128537000.0,1168507.0,1.75618,34.048486,20.941268,826226.3
min,,,-19.000417,0.0,0.0037,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,122.4586,1.076,0.001735,0.021151,2.2
25%,,,11.919417,1.7734,2.67035,142.3985,316.3865,535.0396,552.895,889.4362,...,0.0,0.0,0.0,0.0,2327075.0,5265.031,1.82925,32.319701,6.71016,4774.0
50%,,,22.68075,11.2304,255.1588,523.7731,923.0375,1667.9168,1774.8929,1981.2495,...,264.6071,0.0,0.0,0.0,7759258.0,21764.54,2.764,66.3106,26.980217,31140.0
75%,,,26.496333,64.3993,1149.75505,1906.6536,2254.9835,2944.5852,3683.3846,5463.3949,...,724.61,263.6924,0.0,0.0,22377000.0,150521.1,4.644,93.470967,40.618223,120971.0


### Fill missing values

In [131]:
# fill missing values
df_fillna = df.copy()

In [132]:
df_fillna.ElecNuclear = df_fillna.ElecNuclear.fillna(0)

In [134]:
df_fillna = df_fillna.fillna(method='backfill')

In [146]:
df_clean = df_fillna.reset_index().drop(['Key', 'Group'], axis=1)

In [147]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4025 entries, 0 to 4024
Data columns (total 23 columns):
Country           4025 non-null object
Year              4025 non-null int64
AveTemperature    4025 non-null float64
CO2               4025 non-null float64
CO2prox1000       4025 non-null float64
CO2prox2000       4025 non-null float64
CO2prox3000       4025 non-null float64
CO2prox4000       4025 non-null float64
CO2prox5000       4025 non-null float64
CO2prox6000       4025 non-null float64
CO2prox7000       4025 non-null float64
CO2prox8000       4025 non-null float64
CO2prox9000       4025 non-null float64
CO2prox10000      4025 non-null float64
CO2prox11000      4025 non-null float64
CO2prox12000      4025 non-null float64
CO2prox13000      4025 non-null float64
Population        4025 non-null int64
GDP               4025 non-null float64
FertilityRate     4025 non-null float64
ElecFossi         4025 non-null float64
ElecNuclear       4025 non-null float64
ForestArea        

In [148]:
df_clean.reset_index().describe(include='all')

Unnamed: 0,index,Country,Year,AveTemperature,CO2,CO2prox1000,CO2prox2000,CO2prox3000,CO2prox4000,CO2prox5000,...,CO2prox10000,CO2prox11000,CO2prox12000,CO2prox13000,Population,GDP,FertilityRate,ElecFossi,ElecNuclear,ForestArea
count,4025.0,4025,4025.0,4025.0,4025.0,4025.0,4025.0,4025.0,4025.0,4025.0,...,4025.0,4025.0,4025.0,4025.0,4025.0,4025.0,4025.0,4025.0,4025.0,4025.0
unique,,175,,,,,,,,,...,,,,,,,,,,
top,,Ireland,,,,,,,,,...,,,,,,,,,,
freq,,23,,,,,,,,,...,,,,,,,,,,
mean,2012.0,,2001.0,19.264666,142.08334,728.520844,1488.895796,1715.600335,2297.327609,2909.833251,...,970.948211,341.045048,170.476765,33.629087,34805740.0,298950.4,3.323388,61.854565,4.605009,228430.0
std,1162.061745,,6.634074,8.615288,600.872095,1114.07653,2121.825549,2121.142422,2338.290799,3301.448303,...,1970.562104,1170.636752,901.629472,409.476996,128537000.0,1139578.0,1.762488,34.439878,13.368769,821676.0
min,0.0,,1990.0,-19.000417,0.0,0.0037,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,122.4586,1.076,0.001735,0.0,2.2
25%,1006.0,,1995.0,11.919417,1.7922,2.6708,142.3985,316.3865,535.0396,552.895,...,0.0,0.0,0.0,0.0,2327075.0,5345.388,1.84,33.407178,0.0,4910.0
50%,2012.0,,2001.0,22.68075,11.5949,254.7848,523.7731,923.0375,1667.9168,1774.8929,...,264.6071,0.0,0.0,0.0,7759258.0,22125.12,2.762,67.726093,0.0,32286.0
75%,3018.0,,2007.0,26.496333,63.4434,1149.746,1906.6536,2254.9835,2944.5852,3683.3846,...,724.61,263.6924,0.0,0.0,22377000.0,156873.8,4.652,95.139581,0.0,119082.0


In [149]:
# split train and test
last_year = df_clean.Year.max()

In [150]:
df_train_raw = df_clean[df_clean.Year != last_year]
df_test_raw = df_clean[df_clean.Year == last_year]

In [152]:
# get X and y
y_train = df_train_raw['AveTemperature']
X_train = df_train_raw.iloc[:,3:]
y_test = df_test_raw['AveTemperature']
X_test = df_test_raw.iloc[:,3:]

In [159]:
# save datasets
y_train.to_csv('../data/y_train.csv')
X_train.to_csv('../data/X_train.csv')
y_test.to_csv('../data/y_test.csv')
X_test.to_csv('../data/X_test.csv')

### Modeling

In [154]:
# Create linear regression object
regr = linear_model.LinearRegression()

In [155]:
# Train the model using the training sets
regr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [156]:
# Make predictions using the testing set
y_pred = regr.predict(X_test)

In [157]:
# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(y_test, y_pred))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_test, y_pred))

Coefficients: 
 [ 3.29091918e-04 -2.52426690e-03 -2.30945960e-04  1.73165734e-04
 -8.66124443e-05 -2.86949232e-05  2.78828657e-05  1.70548383e-04
  8.20898394e-04  8.33917011e-04  1.05249699e-03  1.02436973e-03
 -1.61836583e-04 -2.02029374e-03  4.52291794e-09  1.12848297e-07
  1.25867591e+00  4.74189755e-02  9.17316219e-03 -1.64135276e-06]
Mean squared error: 33.08
Variance score: 0.56


In [162]:
import statsmodels.api as sm
X = sm.add_constant(X_train)
model = sm.OLS(y_train, X).fit()
model.summary()

0,1,2,3
Dep. Variable:,AveTemperature,R-squared:,0.651
Model:,OLS,Adj. R-squared:,0.65
Method:,Least Squares,F-statistic:,357.7
Date:,"Sun, 30 Sep 2018",Prob (F-statistic):,0.0
Time:,16:14:30,Log-Likelihood:,-11725.0
No. Observations:,3850,AIC:,23490.0
Df Residuals:,3829,BIC:,23620.0
Df Model:,20,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,9.7012,0.646,15.012,0.000,8.434,10.968
CO2,0.0003,0.000,1.373,0.170,-0.000,0.001
CO2prox1000,-0.0025,0.000,-21.041,0.000,-0.003,-0.002
CO2prox2000,-0.0002,5.16e-05,-4.473,0.000,-0.000,-0.000
CO2prox3000,0.0002,4.84e-05,3.575,0.000,7.82e-05,0.000
CO2prox4000,-8.661e-05,4.34e-05,-1.994,0.046,-0.000,-1.45e-06
CO2prox5000,-2.869e-05,3.7e-05,-0.775,0.438,-0.000,4.39e-05
CO2prox6000,2.788e-05,3.83e-05,0.728,0.467,-4.72e-05,0.000
CO2prox7000,0.0002,3.57e-05,4.772,0.000,0.000,0.000

0,1,2,3
Omnibus:,1081.33,Durbin-Watson:,2.129
Prob(Omnibus):,0.0,Jarque-Bera (JB):,4411.881
Skew:,-1.326,Prob(JB):,0.0
Kurtosis:,7.525,Cond. No.,1040000000.0
