In [None]:
import os

import numpy as np
import pandas as pd
import pandas.io.sql as psql
import psycopg2 as pg
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR

In [None]:
# Open the PostgreSQL connection used by the rest of the notebook.
# Connection settings are read from the environment so that real credentials
# never have to be saved inside the notebook; each fallback is the original
# local-development value, so an unconfigured machine behaves as before.
db_conn = pg.connect(
    dbname=os.environ.get("WHERETOLIVE_DB_NAME", "wheretolive"),
    user=os.environ.get("WHERETOLIVE_DB_USER", "wheretolive"),
    password=os.environ.get("WHERETOLIVE_DB_PASSWORD", "wheretolive"),
    host=os.environ.get("WHERETOLIVE_DB_HOST", "127.0.0.1"),
)

In [None]:
# Query producing one row per (bfs_nr, income bracket) with the tax rate of
# each of the four profiles side by side:
#   s_rate      -> profile 'single'
#   m0c_rate    -> profile 'married_no_children'
#   m2c_rate    -> profile 'married_2_children'
#   m2c2s_rate  -> profile 'married_2_children_2_salaries'
# The four tax_rate subsets are inner-joined on bfs_nr, min_income and
# max_income, so a bracket missing for any profile is dropped. The state is
# looked up from the distinct (bfs_nr, state) pairs of the town table.
# Rows are ordered by bfs_nr, then min_income.
sql_query = """
select bfs_nr_state.state, s.bfs_nr, s.min_income, s.rate as s_rate, m0c.rate as m0c_rate, m2c.rate as m2c_rate, m2c2s.rate as m2c2s_rate from
(select * from tax_rate
where profile = 'single') as s
join
(select * from tax_rate
where profile = 'married_no_children') as m0c
on m0c.bfs_nr = s.bfs_nr and m0c.min_income = s.min_income and m0c.max_income = s.max_income
join
(select * from tax_rate
where profile = 'married_2_children') as m2c
on m2c.bfs_nr = s.bfs_nr and m2c.min_income = s.min_income and m2c.max_income = s.max_income
join
(select * from tax_rate
where profile = 'married_2_children_2_salaries') as m2c2s
on m2c2s.bfs_nr = s.bfs_nr and m2c2s.min_income = s.min_income and m2c2s.max_income = s.max_income
join 
(select distinct bfs_nr, state from town) as bfs_nr_state
on bfs_nr_state.bfs_nr = s.bfs_nr
order by s.bfs_nr, s.min_income
"""

In [None]:
# Run the query over the open connection and load the full result set into a
# DataFrame; the trailing expression previews its first five rows.
df = pd.read_sql(sql=sql_query, con=db_conn)
df.head(5)

In [None]:
# Each effect column is the difference between two profile rates on the same
# row: new column -> (rate to subtract from, rate being subtracted).
rate_pairs = {
    '2c_effect': ('m2c_rate', 'm0c_rate'),    # married 2 children vs. married no children
    'm_effect': ('m0c_rate', 's_rate'),       # married no children vs. single
    '2s_effect': ('m2c2s_rate', 'm2c_rate'),  # 2 salaries vs. married 2 children
}
for effect_name, (minuend, subtrahend) in rate_pairs.items():
    df[effect_name] = df[minuend] - df[subtrahend]
df.head(24)

In [None]:
# Summarise every effect per (min_income, state) group: mean, standard
# deviation and variance.
# The statistics are requested by name rather than as NumPy callables:
# passing np.mean / np.std / np.var to groupby.agg is deprecated in recent
# pandas, and the string names pin the pandas implementations (ddof=1 for
# std/var) while keeping the 'mean' / 'std' / 'var' column labels unchanged.
summary_stats = ["mean", "std", "var"]
df_effects = df.groupby(by=['min_income', 'state']).agg(
    {
        "2c_effect": summary_stats,
        "m_effect": summary_stats,
        "2s_effect": summary_stats
    })
df_effects.head(n=24)

In [None]:
# Attach to every row of df the mean effect of its (min_income, state) group,
# taken from df_effects, as '<effect>_est'.
effects = set(df_effects.columns.get_level_values(0))
# Not needed by the vectorised lookup below; retained in case later cells
# reference these names.
states = set(df_effects.index.get_level_values(1))
incomes = set(df_effects.index.get_level_values(0))

# One (min_income, state) key per row of df, in row order. Looking the keys up
# with reindex replaces the nested effect x state x income loop: it is a single
# pass per effect and cannot hit a KeyError for (min_income, state) pairs that
# never occur together in the data.
row_keys = pd.MultiIndex.from_frame(df[['min_income', 'state']])
# sorted() gives the new columns a deterministic order (set order is not).
for effect in sorted(effects):
    group_means = df_effects[(effect, 'mean')]
    # to_numpy() drops the MultiIndex so the values are assigned positionally.
    df[effect + '_est'] = group_means.reindex(row_keys).to_numpy()
df.head(n=24)

In [None]:
# Report how well the group-mean estimate reproduces each observed effect:
# mean absolute error, mean squared error and R^2, one line per effect.
effects = set(df_effects.columns.get_level_values(0))
report_line = '{}:\tMAE: {:.2f}\tMSE: {:.2f}\tR2: {:.2f}'
for effect in effects:
    observed = df[effect]
    estimated = df[effect + '_est']
    mae, mse, r2 = (
        metric(observed, estimated)
        for metric in (mean_absolute_error, mean_squared_error, r2_score)
    )
    print(report_line.format(effect, mae, mse, r2))

In [None]:
# Sanity check for m2c_effect: the observed difference between the
# 'married_2_children' and 'single' rates is compared against the sum of the
# two estimated partial effects (marriage + two children).
observed_col, estimate_col = 'm2c_effect', 'm2c_effect_est'
df[observed_col] = df['m2c_rate'] - df['s_rate']
df[estimate_col] = df['2c_effect_est'] + df['m_effect_est']
mae, mse, r2 = (
    metric(df[observed_col], df[estimate_col])
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
print('m2c_effect:\tMAE: {:.2f}\tMSE: {:.2f}\tR2: {:.2f}'.format(mae, mse, r2))

In [None]:
# Sanity check for m2c2s_effect: the observed difference between the
# 'married_2_children_2_salaries' and 'single' rates is compared against the
# sum of all three estimated partial effects.
df['m2c2s_effect'] = df['m2c2s_rate'] - df['s_rate']
partial_estimates = [df['2c_effect_est'], df['m_effect_est'], df['2s_effect_est']]
combined_estimate = partial_estimates[0]
for partial in partial_estimates[1:]:
    combined_estimate = combined_estimate + partial
df['m2c2s_effect_est'] = combined_estimate

y_true = df['m2c2s_effect']
y_pred = df['m2c2s_effect_est']
mae = mean_absolute_error(y_true, y_pred)
mse = mean_squared_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)
print('m2c2s_effect:\tMAE: {:.2f}\tMSE: {:.2f}\tR2: {:.2f}'.format(mae, mse, r2))