### Load packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

### Load datasets

In [14]:
# read data
df = pd.read_csv('../data/df_clean.csv')

In [16]:
# split train and test
last_year = df.Year.max()

In [17]:
df_train_raw = df[df.Year != last_year]
df_test_raw = df[df.Year == last_year]

In [18]:
# get X and y
y_train = df_train_raw['AveTemperature']
X_train = df_train_raw.iloc[:,3:]
y_test = df_test_raw['AveTemperature']
X_test = df_test_raw.iloc[:,3:]

In [153]:
# # save datasets
# y_train.to_csv('../data/y_train.csv')
# X_train.to_csv('../data/X_train.csv')
# y_test.to_csv('../data/y_test.csv')
# X_test.to_csv('../data/X_test.csv')

### Data clensing

In [3]:
# read csv file
df_raw = pd.read_csv('../data/ADS_project_dataset2.csv')

In [42]:
col_list = ['Country', 'Year', 'AveTemperature', 'Population', 'GDP', 'FertilityRate', 'ElecFossi', 'ElecNuclear', 
            'EduSpend', 'ForestArea']

In [43]:
# filter data by year
START_YEAR = 1990   # some countries are missing key predictors prior to 1990
END_YEAR = 2012     # no temperature data after 2012
NUM_YEAR = END_YEAR - START_YEAR + 1  # number of years to be included in the dataset
# COL_DROP = ['SmokingRate', 'EduSpend', 'RoadTrans']   # columns to drop because the majority of the data is missing

In [44]:
# set index and exclude rows before 1990 or after 2012 from the main data
df = df_raw[(df_raw['Year'] >= START_YEAR) & (df_raw['Year'] <= END_YEAR) & (df_raw['Group'] == 'Top20')] \
    .loc[:,col_list]

In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 460 entries, 5264 to 9274
Data columns (total 10 columns):
Country           460 non-null object
Year              460 non-null int64
AveTemperature    460 non-null float64
Population        460 non-null int64
GDP               449 non-null float64
FertilityRate     460 non-null float64
ElecFossi         460 non-null float64
ElecNuclear       298 non-null float64
EduSpend          308 non-null float64
ForestArea        460 non-null float64
dtypes: float64(7), int64(2), object(1)
memory usage: 59.5+ KB


In [61]:
# fill missing values
df_fillna = df.sort_values(['Country', 'Year']).copy()
df_fillna.ElecNuclear = df_fillna.ElecNuclear.fillna(0)
df_fillna = df_fillna.fillna(method='backfill')

In [62]:
df_fillna.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 460 entries, 5264 to 9274
Data columns (total 10 columns):
Country           460 non-null object
Year              460 non-null int64
AveTemperature    460 non-null float64
Population        460 non-null int64
GDP               460 non-null float64
FertilityRate     460 non-null float64
ElecFossi         460 non-null float64
ElecNuclear       460 non-null float64
EduSpend          460 non-null float64
ForestArea        460 non-null float64
dtypes: float64(7), int64(2), object(1)
memory usage: 39.5+ KB


In [63]:
X = df_fillna.iloc[:,3:]
y = df_fillna['AveTemperature']

### Modeling

In [66]:
import statsmodels.api as sm
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
model.summary()

  from pandas.core import datetools


0,1,2,3
Dep. Variable:,AveTemperature,R-squared:,0.5
Model:,OLS,Adj. R-squared:,0.492
Method:,Least Squares,F-statistic:,64.57
Date:,"Sun, 14 Oct 2018",Prob (F-statistic):,3.9400000000000004e-64
Time:,15:38:33,Log-Likelihood:,-1487.2
No. Observations:,460,AIC:,2990.0
Df Residuals:,452,BIC:,3023.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,14.7792,2.102,7.032,0.000,10.649,18.910
Population,-4.58e-11,9.38e-10,-0.049,0.961,-1.89e-09,1.8e-09
GDP,8.881e-08,2.59e-07,0.343,0.732,-4.2e-07,5.98e-07
FertilityRate,5.2319,0.386,13.550,0.000,4.473,5.991
ElecFossi,-0.0771,0.016,-4.720,0.000,-0.109,-0.045
ElecNuclear,-0.1291,0.022,-5.804,0.000,-0.173,-0.085
EduSpend,-0.5094,0.272,-1.875,0.061,-1.043,0.025
ForestArea,-1.796e-06,1.7e-07,-10.551,0.000,-2.13e-06,-1.46e-06

0,1,2,3
Omnibus:,6.596,Durbin-Watson:,0.144
Prob(Omnibus):,0.037,Jarque-Bera (JB):,6.479
Skew:,-0.254,Prob(JB):,0.0392
Kurtosis:,3.283,Cond. No.,2830000000.0
