In [41]:
import pandas as pd
import psycopg2 as pg
import pandas.io.sql as psql
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

In [4]:
db_conn = pg.connect(
    dbname="wheretolive",
    user="wheretolive",
    password="wheretolive",
    host="127.0.0.1")

In [17]:
sql_query = """
select s.bfs_nr, s.min_income, s.rate as s_rate, m0c.rate as m0c_rate, m2c.rate as m2c_rate, m2c2s.rate as m2c2s_rate from
(select * from tax_rate
where profile = 'single') as s
join
(select * from tax_rate
where profile = 'married_no_children') as m0c
on m0c.bfs_nr = s.bfs_nr and m0c.min_income = s.min_income and m0c.max_income = s.max_income
join
(select * from tax_rate
where profile = 'married_2_children') as m2c
on m2c.bfs_nr = s.bfs_nr and m2c.min_income = s.min_income and m2c.max_income = s.max_income
join
(select * from tax_rate
where profile = 'married_2_children_2_salaries') as m2c2s
on m2c2s.bfs_nr = s.bfs_nr and m2c2s.min_income = s.min_income and m2c2s.max_income = s.max_income
order by s.bfs_nr, s.min_income
"""

sql_query = """
select bfs_nr, profile, min_income, rate from tax_rate
order by bfs_nr, profile, min_income
"""

In [18]:
df = pd.read_sql(sql_query, db_conn)

In [26]:
df['children'] = df['profile'].map(lambda x: 2 if '2_children' in x else 0)
df['salaries'] = df['profile'].map(lambda x: 2 if '2_salaries' in x else 1)
df['num_taxed'] = df['profile'].map(lambda x: 2 if 'married' in x else 1)
df = df.drop(['profile'], axis=1)

Unnamed: 0,bfs_nr,min_income,rate,children,salaries,num_taxed
0,1,12500,0.384000,2,1,2
1,1,15000,0.320000,2,1,2
2,1,17500,0.274286,2,1,2
3,1,20000,0.240000,2,1,2
4,1,25000,0.192000,2,1,2
...,...,...,...,...,...,...
195,3,20000,0.240000,2,1,2
196,3,25000,0.192000,2,1,2
197,3,30000,0.160000,2,1,2
198,3,35000,0.137143,2,1,2


In [29]:
X_train, X_test, y_train, y_test = train_test_split(df[['bfs_nr', 'min_income', 'children', 'salaries', 'num_taxed']], df['rate'], test_size=0.2, random_state=0)

In [30]:
X_train.head()

Unnamed: 0,bfs_nr,min_income,children,salaries,num_taxed
2971,37,250000,0,1,1
53157,1081,400000,0,1,2
163939,5487,250000,0,1,2
41456,867,45000,0,1,1
71616,2183,12500,2,1,2


In [None]:
param_grid = {
    'kernel': ['rbf'],
    'C': [0.1, 1, 10],
    'epsilon': [0.1]
}
svr = SVR()
grid_search = GridSearchCV(
    estimator=svr,
    param_grid=param_grid,
    scoring=['r2', 'neg_mean_squared_error'],
    n_jobs=4,
    pre_dispatch=6,
    refit='r2',
    verbose=50)
grid_search = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed: 36.9min
[Parallel(n_jobs=4)]: Done   2 tasks      | elapsed: 52.7min
[Parallel(n_jobs=4)]: Done   3 tasks      | elapsed: 53.3min
[Parallel(n_jobs=4)]: Done   4 tasks      | elapsed: 58.3min
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed: 71.4min
[Parallel(n_jobs=4)]: Done   6 tasks      | elapsed: 92.5min
[Parallel(n_jobs=4)]: Done   7 tasks      | elapsed: 99.6min


In [33]:
clf = grid_search.best_estimator_ # use optimal parameters from grid_search
# clf = clf.fit(X_train, y_train)

In [34]:
y_pred = clf.predict(X_test)

In [40]:
print(f'Mean Squared Error: {mean_squared_error(y_test, y_pred)}')
print(f'R^2 Score: {r2_score(y_test, y_pred)}')

Mean Squared Error: 10.253291269914145
R^2 Score: 0.8295861731145563
