In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [6]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

### load datasets

* Total population by country
* GDP (Current US$)
* % of population with access to electricity
* % electricity from Coal, Oil or Gas
* % electricity from Nuclear Power
* Fertility rate
* Smoking rate

In [7]:
# Main dataset; later cells rely on its 'Country', 'Year' and 'Ave Temperature' columns
df_raw = pd.read_csv('../data/ADS_project_dataset1.csv')

In [8]:
# Total population by country (reshaped later via trans_data, which reads
# 'Country Name' plus one column per year)
df_pop_raw = pd.read_csv('../data/Populationfrom1960.csv')

In [9]:
# GDP (current US$) by country
df_gdp_raw = pd.read_csv('../data/GDP1960topresent.csv')

In [10]:
# % of electricity produced from coal, oil or gas, by country
df_elec_non_nuclear_raw = pd.read_csv('../data/ElectOilandCoal1960topresent.csv')

In [11]:
# % of electricity produced from nuclear power, by country
df_elec_nuclear_raw = pd.read_csv('../data/ElectfromNuclear1960present.csv')

In [12]:
# Fertility rate by country
df_fertil_raw = pd.read_csv('../data/Fertilityrates1960present.csv')

In [13]:
# Smoking rate by country
df_smoke_raw = pd.read_csv('../data/Smoking1960topresent.csv')

### transform data

In [56]:
# Restrict the main data to the years 1960-2012 (both bounds inclusive),
# then index the result by (Country, Year)
df = df_raw.loc[df_raw['Year'].between(1960, 2012)].set_index(['Country', 'Year'])

In [15]:
# Column labels to pull from each indicator table:
# the year columns '1960' .. '2017' (as strings) and the country-name column
year_col_names = [str(year) for year in range(1960, 2018)]
country_col_name = ['Country Name']

In [16]:
def trans_data(df, country_col_name, year_col_names, var_col_name):
    """Reshape a wide country-by-year table into a long (Country, Year) frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table containing the columns listed in ``country_col_name`` and
        ``year_col_names``.
    country_col_name : list of str
        Identifier column(s); the first entry is renamed to ``'Country'``.
    year_col_names : list of str
        Year column labels to unpivot. They must be parseable by
        ``pd.to_numeric`` because they become the numeric ``'Year'`` level.
    var_col_name : str
        Name given to the column that holds the unpivoted values.

    Returns
    -------
    pandas.DataFrame
        Long-format frame indexed by ``['Country', 'Year']`` with the values
        in ``var_col_name``.
    """
    # Keep only the identifier and year columns before unpivoting
    wide = df.loc[:, country_col_name + year_col_names]
    long_form = wide.melt(
        id_vars=country_col_name,
        value_vars=year_col_names,
        value_name=var_col_name,
    )
    long_form = long_form.rename(
        columns={country_col_name[0]: 'Country', 'variable': 'Year'}
    )
    # Year labels come out of melt as strings; convert them to numbers
    long_form['Year'] = pd.to_numeric(long_form['Year'])
    return long_form.set_index(['Country', 'Year'])

In [17]:
# Long-format population table indexed by (Country, Year)
df_pop = trans_data(
    df=df_pop_raw,
    country_col_name=country_col_name,
    year_col_names=year_col_names,
    var_col_name='population',
)

In [18]:
# Long-format GDP table indexed by (Country, Year)
df_gdp = trans_data(
    df=df_gdp_raw,
    country_col_name=country_col_name,
    year_col_names=year_col_names,
    var_col_name='GDP',
)

In [19]:
# Long-format table of non-nuclear electricity share, indexed by (Country, Year)
df_elec_non_nuclear = trans_data(
    df=df_elec_non_nuclear_raw,
    country_col_name=country_col_name,
    year_col_names=year_col_names,
    var_col_name='electricity_others',
)

In [20]:
# Long-format table of nuclear electricity share, indexed by (Country, Year)
df_elec_nuclear = trans_data(
    df=df_elec_nuclear_raw,
    country_col_name=country_col_name,
    year_col_names=year_col_names,
    var_col_name='electricity_nuclear',
)

In [21]:
# Long-format fertility table indexed by (Country, Year)
df_fertil = trans_data(
    df=df_fertil_raw,
    country_col_name=country_col_name,
    year_col_names=year_col_names,
    var_col_name='fertility',
)

In [22]:
# Long-format smoking table indexed by (Country, Year)
df_smoke = trans_data(
    df=df_smoke_raw,
    country_col_name=country_col_name,
    year_col_names=year_col_names,
    var_col_name='smoke',
)

In [23]:
# Attach every indicator table to the main dataset by inner-joining on the
# shared (Country, Year) index; only index pairs present in all tables survive
df_comb = (
    df
    .join(df_pop, how='inner')
    .join(df_gdp, how='inner')
    .join(df_elec_non_nuclear, how='inner')
    .join(df_elec_nuclear, how='inner')
    .join(df_fertil, how='inner')
    .join(df_smoke, how='inner')
)

In [24]:
# Export the combined dataset to CSV so it can be reused without re-running
# the load/transform cells
df_comb.to_csv('../data/df_comb.csv')

### explore data

In [25]:
# Peek at the last rows of the combined dataset
df_comb.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,Key,Group,Ave Temperature,CO2 (MtCO2),prox1000,prox2000,prox3000,prox4000,prox5000,prox6000,...,prox10000,prox11000,prox12000,prox13000,population,GDP,electricity_others,electricity_nuclear,fertility,smoke
Country,Year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Zimbabwe,2008,2008Zimbabwe,Rest,21.545583,7.6479,437.5703,0.0,0.0,920.1856,3643.4216,1267.5012,...,1027.9804,0.0,0.0,0.0,13558469.0,4415703000.0,24.104918,0.0,4.014,
Zimbabwe,2009,2009Zimbabwe,Rest,21.37725,7.5281,412.8215,0.0,0.0,960.5159,3680.9984,1213.4997,...,981.7444,0.0,0.0,0.0,13810599.0,8621574000.0,24.358799,0.0,4.024,
Zimbabwe,2010,2010Zimbabwe,Rest,21.98625,9.6294,421.7848,0.0,0.0,999.7063,3850.6933,1288.8009,...,1001.4111,0.0,0.0,0.0,14086317.0,10141860000.0,31.990767,0.0,4.028,16.4
Zimbabwe,2011,2011Zimbabwe,Rest,21.602417,11.5683,411.7169,0.0,0.0,1029.8112,3894.3432,1285.4005,...,1024.4059,0.0,0.0,0.0,14386649.0,12098450000.0,42.144097,0.0,4.019,16.4
Zimbabwe,2012,2012Zimbabwe,Rest,21.521333,12.1765,425.6263,0.0,0.0,1069.3231,4029.0282,1357.4317,...,1025.2629,0.0,0.0,0.0,14710826.0,14242490000.0,39.60665,0.0,3.996,16.2


In [26]:
# Column dtypes and non-null counts, to see which columns have missing values
df_comb.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 8162 entries, (Afghanistan, 1960) to (Zimbabwe, 2012)
Data columns (total 23 columns):
Key                    8162 non-null object
Group                  8162 non-null object
Ave Temperature        8162 non-null float64
CO2 (MtCO2)            7896 non-null float64
prox1000               8126 non-null float64
prox2000               8162 non-null float64
prox3000               8162 non-null float64
prox4000               8162 non-null float64
prox5000               8162 non-null float64
prox6000               8162 non-null float64
prox7000               8162 non-null float64
prox8000               8162 non-null float64
prox9000               8162 non-null float64
prox10000              8162 non-null float64
prox11000              8162 non-null float64
prox12000              8162 non-null float64
prox13000              8162 non-null float64
population             8128 non-null float64
GDP                    6610 non-null float64
electricit

In [27]:
# Summary statistics for every column (include='all' also covers non-numeric ones)
df_comb.describe(include='all')

Unnamed: 0,Key,Group,Ave Temperature,CO2 (MtCO2),prox1000,prox2000,prox3000,prox4000,prox5000,prox6000,...,prox10000,prox11000,prox12000,prox13000,population,GDP,electricity_others,electricity_nuclear,fertility,smoke
count,8162,8162,8162.0,7896.0,8126.0,8162.0,8162.0,8162.0,8162.0,8162.0,...,8162.0,8162.0,8162.0,8162.0,8128.0,6610.0,4781.0,4781.0,7911.0,625.0
unique,8162,2,,,,,,,,,...,,,,,,,,,,
top,1973Hungary,Rest,,,,,,,,,...,,,,,,,,,,
freq,1,7314,,,,,,,,,...,,,,,,,,,,
mean,,,19.025063,81.553389,621.884517,1088.191233,1336.327269,1773.896944,2234.471546,2493.217849,...,730.307399,251.309234,120.668146,20.543344,27662310.0,121498000000.0,59.597517,4.367776,4.188009,24.61648
std,,,8.633881,334.746561,984.403335,1619.267785,1759.731902,1904.90848,2737.662054,2502.332864,...,1578.843434,928.589824,687.260162,296.732264,111967300.0,473692400000.0,34.853787,12.661576,2.062566,11.160664
min,,,-19.85775,-0.0806,-0.0806,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,9642.0,9122751.0,0.0,0.0,1.085,2.4
25%,,,11.697021,1.252475,1.9655,20.7276,90.0647,381.9947,329.5981,599.4481,...,0.0,0.0,0.0,0.0,1911810.0,1597157000.0,28.060553,0.0,2.2135,16.3
50%,,,22.316833,7.6964,94.33015,353.0483,668.60855,1346.24605,1339.9264,1661.7983,...,103.24785,0.0,0.0,0.0,5573018.0,7692343000.0,66.046923,0.0,3.952,24.3
75%,,,26.304417,49.155875,945.311725,1560.2308,2026.107525,2524.350975,2735.1952,3763.28465,...,434.2663,60.39,0.0,0.0,14992090.0,47305510000.0,93.681806,0.0,6.1735,31.9


In [28]:
# For each country, count the (Country, Year) rows in which every column other
# than 'smoke' is non-null ('smoke' is ignored because it is mostly missing)
df_groupby = (
    df_comb
    .drop(['smoke'], axis=1)
    .reset_index()
    .dropna()
    .groupby('Country')[['Year']]
    .count()
)

In [29]:
# Countries without any missing data: they keep all 53 rows, which presumably
# corresponds to one complete row for every year from 1960 through 2012
country_list = df_groupby.index[df_groupby['Year'] == 53].tolist()
country_list

['Australia',
 'Austria',
 'Belgium',
 'Canada',
 'Denmark',
 'Finland',
 'France',
 'Greece',
 'Iceland',
 'Ireland',
 'Italy',
 'Japan',
 'Netherlands',
 'New Zealand',
 'Norway',
 'Portugal',
 'Spain',
 'Sweden',
 'Turkey',
 'United Kingdom']

In [30]:
# Keep only the complete-data countries, drop the columns not used for
# modelling, and turn the (Country, Year) index back into ordinary columns
df_clean = (
    df_comb
    .drop(['Key', 'Group', 'smoke'], axis=1)
    .loc[country_list]
    .reset_index()
)

In [49]:
# Inspect the column names of the cleaned dataset and the order they appear in
df_clean.columns

Index(['Country', 'Year', 'Ave Temperature', 'CO2 (MtCO2)', 'prox1000',
       'prox2000', 'prox3000', 'prox4000', 'prox5000', 'prox6000', 'prox7000',
       'prox8000', 'prox9000', 'prox10000', 'prox11000', 'prox12000',
       'prox13000', 'population', 'GDP', 'electricity_others',
       'electricity_nuclear', 'fertility'],
      dtype='object')

In [31]:
# Confirm that the cleaned dataset has no missing values left
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1060 entries, 0 to 1059
Data columns (total 22 columns):
Country                1060 non-null object
Year                   1060 non-null int64
Ave Temperature        1060 non-null float64
CO2 (MtCO2)            1060 non-null float64
prox1000               1060 non-null float64
prox2000               1060 non-null float64
prox3000               1060 non-null float64
prox4000               1060 non-null float64
prox5000               1060 non-null float64
prox6000               1060 non-null float64
prox7000               1060 non-null float64
prox8000               1060 non-null float64
prox9000               1060 non-null float64
prox10000              1060 non-null float64
prox11000              1060 non-null float64
prox12000              1060 non-null float64
prox13000              1060 non-null float64
population             1060 non-null float64
GDP                    1060 non-null float64
electricity_others     1060 non-null floa

In [32]:
# Hold out the rows for 2012 as the test set; every other year is training data
df_train_raw = df_clean.loc[df_clean['Year'] != 2012]
df_test_raw = df_clean.loc[df_clean['Year'] == 2012]

In [33]:
# get X and y
# Target: 'Ave Temperature'. Features: every remaining column except the
# identifiers. The non-feature columns are dropped by name instead of slicing
# by position (iloc[:, 3:]), so a change in column order cannot silently leak
# the target or the Country/Year identifiers into the feature matrix.
non_feature_cols = ['Country', 'Year', 'Ave Temperature']
y_train = df_train_raw['Ave Temperature']
X_train = df_train_raw.drop(non_feature_cols, axis=1)
y_test = df_test_raw['Ave Temperature']
X_test = df_test_raw.drop(non_feature_cols, axis=1)

### Modeling

In [34]:
# Create linear regression object (ordinary least squares, default settings)
regr = linear_model.LinearRegression()

In [35]:
# Train the model using the training sets
# (as the last expression in the cell, the fitted estimator is displayed)
regr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [36]:
# Make predictions using the testing set (the held-out 2012 rows)
y_pred = regr.predict(X_test)

In [37]:
# The fitted coefficients, one per feature column of X_train
print('Coefficients: \n', regr.coef_)
# The mean squared error between the test targets and the predictions
print("Mean squared error: %.2f"
      % mean_squared_error(y_test, y_pred))
# r2_score is the coefficient of determination (R^2): 1 is perfect prediction.
# NOTE(review): the printed label says "Variance score", but this value is R^2,
# not sklearn's explained_variance_score -- consider relabelling the output.
print('Variance score: %.2f' % r2_score(y_test, y_pred))

Coefficients: 
 [-2.88433496e-02  4.32222764e-04  2.21413417e-04  1.61848536e-03
 -1.58570693e-04 -2.24181319e-04  1.61005870e-03 -1.57259150e-03
 -1.42441822e-03 -1.18915915e-02  9.33570959e-03  3.36493814e-02
  1.53213292e-02  1.73472348e-16  3.41523179e-07 -9.15790013e-13
  2.35891734e-02  9.32003807e-02  1.56701214e+00]
Mean squared error: 49.62
Variance score: 0.26
