In [2]:
import os

import numpy as np
import pandas as pd
import pandas.io.sql as psql
import psycopg2 as pg
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR

In [3]:
# Open the PostgreSQL connection used by the rest of the notebook.
# Every setting can be overridden through the standard libpq environment
# variables (PGDATABASE, PGUSER, PGPASSWORD, PGHOST) so that real credentials
# never have to be written into the notebook; the fallbacks below are the
# values this notebook has always used for the local development database.
db_conn = pg.connect(
    dbname=os.environ.get("PGDATABASE", "wheretolive"),
    user=os.environ.get("PGUSER", "wheretolive"),
    password=os.environ.get("PGPASSWORD", "wheretolive"),
    host=os.environ.get("PGHOST", "127.0.0.1"))

In [4]:
# Query that puts the tax rate of four household profiles side by side.
# `tax_rate` is filtered once per profile ('single', 'married_no_children',
# 'married_2_children', 'married_2_children_2_salaries') and the four subsets
# are inner-joined on (bfs_nr, min_income, max_income), so a bracket missing
# for any one profile is dropped from the result. The state of each bfs_nr is
# taken from the distinct (bfs_nr, state) pairs in `town`.
# Result columns: state, bfs_nr, min_income, s_rate, m0c_rate, m2c_rate,
# m2c2s_rate, ordered by bfs_nr and then min_income.
# NOTE(review): if `town` ever holds more than one state for the same bfs_nr,
# the last join duplicates that bfs_nr's rows -- confirm the pairs are unique.
sql_query = """
select bfs_nr_state.state, s.bfs_nr, s.min_income, s.rate as s_rate, m0c.rate as m0c_rate, m2c.rate as m2c_rate, m2c2s.rate as m2c2s_rate from
(select * from tax_rate
where profile = 'single') as s
join
(select * from tax_rate
where profile = 'married_no_children') as m0c
on m0c.bfs_nr = s.bfs_nr and m0c.min_income = s.min_income and m0c.max_income = s.max_income
join
(select * from tax_rate
where profile = 'married_2_children') as m2c
on m2c.bfs_nr = s.bfs_nr and m2c.min_income = s.min_income and m2c.max_income = s.max_income
join
(select * from tax_rate
where profile = 'married_2_children_2_salaries') as m2c2s
on m2c2s.bfs_nr = s.bfs_nr and m2c2s.min_income = s.min_income and m2c2s.max_income = s.max_income
join 
(select distinct bfs_nr, state from town) as bfs_nr_state
on bfs_nr_state.bfs_nr = s.bfs_nr
order by s.bfs_nr, s.min_income
"""

In [5]:
# Run the profile-comparison query and load the whole result set into a
# DataFrame; the trailing expression previews the first rows.
df = pd.read_sql(sql=sql_query, con=db_conn)
df.head()

Unnamed: 0,state,bfs_nr,min_income,s_rate,m0c_rate,m2c_rate,m2c2s_rate
0,ZH,1,12500,0.59328,0.384,0.384,0.384
1,ZH,1,15000,1.107467,0.32,0.32,0.32
2,ZH,1,17500,1.618057,0.274286,0.274286,0.274286
3,ZH,1,20000,2.1055,0.4072,0.24,0.24
4,ZH,1,25000,2.82972,1.06144,0.192,0.192


In [6]:
# Each "effect" is the change in tax rate between two household profiles,
# expressed as (rate of the richer profile description) - (rate of the
# baseline profile):
#   2c_effect: married with two children  vs. married without children
#   m_effect:  married without children   vs. single
#   2s_effect: two salaries               vs. one salary (both married, 2 children)
effect_definitions = {
    '2c_effect': ('m2c_rate', 'm0c_rate'),
    'm_effect': ('m0c_rate', 's_rate'),
    '2s_effect': ('m2c2s_rate', 'm2c_rate'),
}
for effect_name, (rate_col, baseline_col) in effect_definitions.items():
    df[effect_name] = df[rate_col] - df[baseline_col]
df.head(n=24)

Unnamed: 0,state,bfs_nr,min_income,s_rate,m0c_rate,m2c_rate,m2c2s_rate,2c_effect,m_effect,2s_effect
0,ZH,1,12500,0.59328,0.384,0.384,0.384,0.0,-0.20928,0.0
1,ZH,1,15000,1.107467,0.32,0.32,0.32,0.0,-0.787467,0.0
2,ZH,1,17500,1.618057,0.274286,0.274286,0.274286,0.0,-1.343771,0.0
3,ZH,1,20000,2.1055,0.4072,0.24,0.24,-0.1672,-1.6983,0.0
4,ZH,1,25000,2.82972,1.06144,0.192,0.192,-0.86944,-1.76828,0.0
5,ZH,1,30000,3.6121,1.741433,0.16,0.16,-1.581433,-1.870667,0.0
6,ZH,1,35000,4.302314,2.083829,0.137143,0.137143,-1.946686,-2.218486,0.0
7,ZH,1,40000,4.757275,2.695925,0.13045,0.12,-2.565475,-2.06135,-0.01045
8,ZH,1,45000,5.357289,3.083756,0.533956,0.106667,-2.5498,-2.273533,-0.427289
9,ZH,1,50000,5.92508,3.49852,0.96962,0.18796,-2.5289,-2.42656,-0.78166


In [86]:
# Summarise every effect per (min_income, state) group: mean, standard
# deviation and variance across all rows of the group.
#
# The aggregations are named by string rather than passed as np.mean / np.std /
# np.var: pandas has deprecated NumPy callables in groupby.agg and will, in a
# future version, call them directly. That would silently switch std/var from
# the pandas default (sample statistics, ddof=1) to NumPy's default (ddof=0).
# The string names keep today's results and produce the same column labels
# ('mean', 'std', 'var').
effect_columns = ["2c_effect", "m_effect", "2s_effect"]
df_effects = df.groupby(by=['min_income', 'state']).agg(
    {effect: ["mean", "std", "var"] for effect in effect_columns})
df_effects.head(n=24)

Unnamed: 0_level_0,Unnamed: 1_level_0,2c_effect,2c_effect,2c_effect,m_effect,m_effect,m_effect,2s_effect,2s_effect,2s_effect
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,var,mean,std,var,mean,std,var
min_income,state,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
12500,AG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12500,AI,-0.540715,0.038672,0.001496,-1.104285,0.07897818,0.006237553,0.0,0.0,0.0
12500,AR,0.0,0.0,0.0,-0.456682,0.02622192,0.0006875892,0.0,0.0,0.0
12500,BE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12500,BL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12500,BS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12500,FR,0.0,0.0,0.0,-0.372397,0.04958549,0.00245872,0.0,0.0,0.0
12500,GE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12500,GL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12500,GR,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [87]:
# Give every row of `df` an `<effect>_est` column holding the mean of that
# effect over the row's (min_income, state) group, as stored in `df_effects`.
#
# The lookup is done once per effect with a vectorised reindex instead of a
# Python loop over every effect x state x income combination. Besides being
# far cheaper, this no longer raises KeyError when some (min_income, state)
# pair does not occur in the data; such rows simply get NaN.
group_keys = ['min_income', 'state']  # same order as the df_effects index levels

# Use the column order of df_effects rather than a set so that the new columns
# are appended in the same order on every run.
effects = list(df_effects.columns.get_level_values(0).unique())

# Not needed for the lookup any more; still defined in case other cells of the
# notebook reference them.
states = set(df_effects.index.get_level_values(1))
incomes = set(df_effects.index.get_level_values(0))

# One (min_income, state) key per row of df, used to pick the group mean.
row_groups = pd.MultiIndex.from_frame(df[group_keys])
for effect in effects:
    group_means = df_effects[(effect, 'mean')]
    # to_numpy() drops the MultiIndex so the values are assigned positionally.
    df[effect + '_est'] = group_means.reindex(row_groups).to_numpy()
df.head(n=24)

Unnamed: 0,state,bfs_nr,min_income,s_rate,m0c_rate,m2c_rate,m2c2s_rate,2c_effect,m_effect,2s_effect,m_effect_est,2c_effect_est,2s_effect_est
0,ZH,1,12500,0.59328,0.384,0.384,0.384,0.0,-0.20928,0.0,-0.229671,0.0,0.0
1,ZH,1,15000,1.107467,0.32,0.32,0.32,0.0,-0.787467,0.0,-0.834562,0.0,0.0
2,ZH,1,17500,1.618057,0.274286,0.274286,0.274286,0.0,-1.343771,0.0,-1.416979,0.0,0.0
3,ZH,1,20000,2.1055,0.4072,0.24,0.24,-0.1672,-1.6983,0.0,-1.787154,-0.176269,0.0
4,ZH,1,25000,2.82972,1.06144,0.192,0.192,-0.86944,-1.76828,0.0,-1.859478,-0.912934,0.0
5,ZH,1,30000,3.6121,1.741433,0.16,0.16,-1.581433,-1.870667,0.0,-1.965639,-1.660088,0.0
6,ZH,1,35000,4.302314,2.083829,0.137143,0.137143,-1.946686,-2.218486,0.0,-2.330255,-2.04301,0.0
7,ZH,1,40000,4.757275,2.695925,0.13045,0.12,-2.565475,-2.06135,-0.01045,-2.165944,-2.690302,-0.011679
8,ZH,1,45000,5.357289,3.083756,0.533956,0.106667,-2.5498,-2.273533,-0.427289,-2.387153,-2.675003,-0.448906
9,ZH,1,50000,5.92508,3.49852,0.96962,0.18796,-2.5289,-2.42656,-0.78166,-2.547639,-2.652647,-0.820328


In [88]:
# Report how well the per-group mean (`<effect>_est`) reproduces each observed
# effect: mean absolute error, mean squared error and R^2 over all rows.
#
# The effects are taken in df_effects column order instead of from a set, so
# the printed lines appear in the same order after every kernel restart
# (iteration order of a set of strings is not stable between sessions).
effects = list(df_effects.columns.get_level_values(0).unique())
for effect in effects:
    observed = df[effect]
    estimated = df[effect + '_est']
    mae = mean_absolute_error(observed, estimated)
    mse = mean_squared_error(observed, estimated)
    r2 = r2_score(observed, estimated)
    print('{}:\tMAE: {:.2f}\tMSE: {:.2f}\tR2: {:.2f}'.format(effect, mae, mse, r2))

m_effect:	MAE: 0.13	MSE: 0.04	R2: 0.99
2c_effect:	MAE: 0.09	MSE: 0.03	R2: 0.99
2s_effect:	MAE: 0.02	MSE: 0.00	R2: 0.99


In [89]:
# Sanity check for the combined "married with two children" effect.
# The observed value is m2c_rate - s_rate; the estimate is the sum of the two
# estimated partial effects (children + marriage). The three scores show how
# much error accumulates when the partial estimates are added up.
observed_col, estimated_col = 'm2c_effect', 'm2c_effect_est'
df[observed_col] = df['m2c_rate'] - df['s_rate']
df[estimated_col] = df['2c_effect_est'] + df['m_effect_est']
mae, mse, r2 = (
    metric(df[observed_col], df[estimated_col])
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
print('m2c_effect:\tMAE: {:.2f}\tMSE: {:.2f}\tR2: {:.2f}'.format(mae, mse, r2))

m2c_effect:	MAE: 0.22	MSE: 0.12	R2: 0.99


In [90]:
# Sanity check for the combined "married, two children, two salaries" effect.
# The observed value is m2c2s_rate - s_rate; the estimate adds up all three
# estimated partial effects (children + marriage + second salary).
observed_col, estimated_col = 'm2c2s_effect', 'm2c2s_effect_est'
df[observed_col] = df['m2c2s_rate'] - df['s_rate']
df[estimated_col] = (
    df['2c_effect_est'] + df['m_effect_est'] + df['2s_effect_est']
)
mae, mse, r2 = (
    metric(df[observed_col], df[estimated_col])
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
print('m2c2s_effect:\tMAE: {:.2f}\tMSE: {:.2f}\tR2: {:.2f}'.format(mae, mse, r2))

m2c2s_effect:	MAE: 0.24	MSE: 0.13	R2: 0.99
