In [17]:
# Configures matplotlib
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Allows for interactive shell - outputs all non variable statements
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Common imports
import numpy as np
import numpy.random as rnd
import os

# to make this notebook's output stable across runs
rnd.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

# Where to save the figures
PROJECT_ROOT_DIR = "."
DATA_DIR = './handson-ml/datasets/lifesat/'
CHAPTER_ID = "fundamentals"

# Frame the problem
# Data
## Get

In [18]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn

# Both files use ',' as the thousands separator in numeric fields.
oecd_bli = pd.read_csv(DATA_DIR + "oecd_bli_2015.csv", thousands=",")

# The GDP file is tab-delimited, latin1-encoded, and marks missing values as 'n/a'.
gdp_per_capita = pd.read_csv(
    DATA_DIR + "gdp_per_capita.csv",
    delimiter="\t",
    thousands=",",
    na_values="n/a",
    encoding="latin1",
)

# Keep references to the frames as loaded, so later cells can start again
# from them after rebinding `oecd_bli` / `gdp_per_capita`.
backup = (oecd_bli, gdp_per_capita)

## Explore

### Column types

In [40]:
# Start from the frame loaded from oecd_bli_2015.csv (first element of `backup`).
oecd_bli = backup[0]
# Print column names, dtypes and non-null counts.
oecd_bli.info()
# Show only the rows whose LOCATION code is "MEX".
oecd_bli.loc[oecd_bli["LOCATION"].eq("MEX")]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3292 entries, 0 to 3291
Data columns (total 17 columns):
LOCATION                 3292 non-null object
Country                  3292 non-null object
INDICATOR                3292 non-null object
Indicator                3292 non-null object
MEASURE                  3292 non-null object
Measure                  3292 non-null object
INEQUALITY               3292 non-null object
Inequality               3292 non-null object
Unit Code                3292 non-null object
Unit                     3292 non-null object
PowerCode Code           3292 non-null int64
PowerCode                3292 non-null object
Reference Period Code    0 non-null float64
Reference Period         0 non-null float64
Value                    3292 non-null float64
Flag Codes               1120 non-null object
Flags                    1120 non-null object
dtypes: float64(3), int64(1), object(13)
memory usage: 437.3+ KB


Unnamed: 0,LOCATION,Country,INDICATOR,Indicator,MEASURE,Measure,INEQUALITY,Inequality,Unit Code,Unit,PowerCode Code,PowerCode,Reference Period Code,Reference Period,Value,Flag Codes,Flags
17,MEX,Mexico,HO_BASE,Dwellings without basic facilities,L,Value,TOT,Total,PC,Percentage,0,units,,,4.200000,,
54,MEX,Mexico,HO_BASE,Dwellings without basic facilities,L,Value,MN,Men,PC,Percentage,0,units,,,4.200000,E,Estimated value
90,MEX,Mexico,HO_BASE,Dwellings without basic facilities,L,Value,WMN,Women,PC,Percentage,0,units,,,4.200000,E,Estimated value
126,MEX,Mexico,HO_HISH,Housing expenditure,L,Value,TOT,Total,PC,Percentage,0,units,,,21.000000,,
163,MEX,Mexico,HO_HISH,Housing expenditure,L,Value,MN,Men,PC,Percentage,0,units,,,21.000000,E,Estimated value
199,MEX,Mexico,HO_HISH,Housing expenditure,L,Value,WMN,Women,PC,Percentage,0,units,,,21.000000,E,Estimated value
235,MEX,Mexico,HO_NUMR,Rooms per person,L,Value,TOT,Total,RATIO,Ratio,0,units,,,1.000000,,
272,MEX,Mexico,HO_NUMR,Rooms per person,L,Value,MN,Men,RATIO,Ratio,0,units,,,1.000000,E,Estimated value
308,MEX,Mexico,HO_NUMR,Rooms per person,L,Value,WMN,Women,RATIO,Ratio,0,units,,,1.000000,E,Estimated value
344,MEX,Mexico,IW_HADI,Household net adjusted disposable income,L,Value,TOT,Total,USD,US Dollar,0,units,,,13085.000000,,


In [47]:
# Starting from the frame stored in `backup`, keep only the rows whose
# INEQUALITY code is "TOT", then reshape to one row per Country and one
# column per Indicator, filled with the matching Value.
oecd_bli = (
    backup[0]
    .loc[lambda frame: frame["INEQUALITY"] == "TOT"]
    .pivot(index="Country", columns="Indicator", values="Value")
)
# Preview the first two rows of the reshaped table.
oecd_bli.head(2)

Indicator,Air pollution,Assault rate,Consultation on rule-making,Dwellings without basic facilities,Educational attainment,Employees working very long hours,Employment rate,Homicide rate,Household net adjusted disposable income,Household net financial wealth,...,Long-term unemployment rate,Personal earnings,Quality of support network,Rooms per person,Self-reported health,Student skills,Time devoted to leisure and personal care,Voter turnout,Water quality,Years in education
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Australia,13.0,2.1,10.5,1.1,76.0,14.02,72.0,0.8,31588.0,47657.0,...,1.08,50449.0,92.0,2.3,85.0,512.0,14.41,93.0,91.0,19.4
Austria,27.0,3.4,7.1,1.0,83.0,7.61,72.0,0.4,31173.0,49887.0,...,1.19,45199.0,89.0,1.6,69.0,500.0,14.46,75.0,94.0,17.0


In [48]:
# Print the index, column dtypes and non-null counts of the current `oecd_bli` frame.
oecd_bli.info()
# Per-column summary statistics (count, mean, std, min, quartiles, max).
oecd_bli.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 37 entries, Australia to United States
Data columns (total 24 columns):
Air pollution                                37 non-null float64
Assault rate                                 37 non-null float64
Consultation on rule-making                  37 non-null float64
Dwellings without basic facilities           37 non-null float64
Educational attainment                       37 non-null float64
Employees working very long hours            37 non-null float64
Employment rate                              37 non-null float64
Homicide rate                                37 non-null float64
Household net adjusted disposable income     37 non-null float64
Household net financial wealth               37 non-null float64
Housing expenditure                          37 non-null float64
Job security                                 37 non-null float64
Life expectancy                              37 non-null float64
Life satisfaction                     

Indicator,Air pollution,Assault rate,Consultation on rule-making,Dwellings without basic facilities,Educational attainment,Employees working very long hours,Employment rate,Homicide rate,Household net adjusted disposable income,Household net financial wealth,...,Long-term unemployment rate,Personal earnings,Quality of support network,Rooms per person,Self-reported health,Student skills,Time devoted to leisure and personal care,Voter turnout,Water quality,Years in education
count,37.0,37.0,37.0,37.0,37.0,37.0,37.0,37.0,37.0,37.0,...,37.0,37.0,37.0,37.0,37.0,37.0,37.0,37.0,37.0,37.0
mean,19.891892,4.045946,7.075676,2.52973,75.351351,9.246486,66.351351,2.862162,24170.0,40999.027027,...,3.514595,36055.567568,89.594595,1.664865,67.918919,494.135135,14.883784,70.108108,82.648649,17.497297
std,8.085597,2.206253,2.639382,3.698188,16.340502,8.240518,7.487828,5.707274,7177.493074,32296.37649,...,3.836257,12870.738038,5.035758,0.427016,14.180147,29.9566,0.534137,12.008756,10.462144,1.27682
min,9.0,1.3,2.0,0.0,34.0,0.16,49.0,0.3,11664.0,3251.0,...,0.01,16193.0,72.0,0.9,30.0,402.0,13.42,49.0,56.0,14.4
25%,15.0,2.6,5.1,0.2,72.0,3.66,61.0,0.6,18575.0,14579.0,...,1.37,22655.0,87.0,1.4,65.0,484.0,14.61,62.0,77.0,16.4
50%,18.0,3.8,7.1,0.9,78.0,6.98,67.0,0.9,23965.0,31580.0,...,1.97,36118.0,90.0,1.7,69.0,498.0,14.95,69.0,85.0,17.6
75%,24.0,5.0,9.0,3.7,86.0,12.51,72.0,1.5,28799.0,60328.0,...,3.88,47590.0,94.0,1.9,76.0,515.0,15.11,78.0,91.0,18.4
max,46.0,12.8,11.5,15.1,94.0,40.86,82.0,25.5,41355.0,145769.0,...,18.39,56340.0,96.0,2.5,90.0,542.0,16.06,93.0,97.0,19.8


## Prepare

# Learn
## Shortlist different models

## Fine-tune / combine your models
# Present
# Launch