# Machine Learning Engineer Nanodegree
# Lin Muqing
# Capstone Model Iteration

In [2]:
import warnings
from imp import reload
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import os
import seaborn as sns
import pickle as pkl
from IPython.display import display

%matplotlib inline

In [3]:
import data_prep
import models
import features
import cross_validation

In [6]:
# Re-import the project modules so that edits made to their source files after
# the first import take effect without restarting the kernel.
# Note: data_prep is imported above but is not reloaded here.
reload(models)
reload(features)
# Last expression of the cell: reload() returns the module object, which the
# notebook echoes as the cell output.
reload(cross_validation)

<module 'cross_validation' from 'E:\\udacityMLND\\projects\\capstone\\project_submit\\cross_validation.py'>

In [4]:
def format_cv_output(out_data, label):
    """Save one row of CV/leaderboard results to CSV and display it.

    Parameters
    ----------
    out_data : sequence
        One row of seven values, matched positionally to the columns
        cv_avg, cv_public_LB, cv_private_LB, score_public_LB,
        rank_public_LB, score_private_LB, rank_private_LB.
    label : str
        Suffix interpolated into the output file name
        ``models/cv_res_<label>.csv``.

    Side effects
    ------------
    Sets the global pandas float display format to 7 decimal places (this
    persists after the call), writes the CSV file, and renders the frame
    through IPython's ``display``.
    """
    result_columns = [
        'cv_avg',
        'cv_public_LB',
        'cv_private_LB',
        'score_public_LB',
        'rank_public_LB',
        'score_private_LB',
        'rank_private_LB',
    ]
    # Global option: affects every frame displayed afterwards, not just this one.
    pd.options.display.float_format = '{:,.7f}'.format
    result_frame = pd.DataFrame([out_data], columns=result_columns)
    csv_path = 'models/cv_res_%s.csv' % label
    result_frame.to_csv(csv_path)
    display(result_frame)

### Parameter Search

Run 100 random-search iterations and manually pick a parameter set. The iterations take quite some time to run, so the search code is commented out below; the search results are committed, and I show the first few rows of them here directly.

In [11]:
# The search call is left commented out because it is slow to run; the saved
# results file is loaded instead.
# import lgb_models
# lgb_models.param_search_raw()
search_res = pd.read_csv('param_search/lgb_random_raw.csv')
# Show only the first five rows of the saved search results.
display(search_res.head(5))

Unnamed: 0,l1-mean,l1-stdv,n_rounds,num_leaves,min_data_in_leaf,learning_rate
0,0.067906,0.000259,10775,35,135,0.002373
1,0.067908,0.000263,10785,44,147,0.001895
2,0.067909,0.000271,2235,76,168,0.006207
3,0.06791,0.000262,4661,54,167,0.0041
4,0.067911,0.00027,10939,74,192,0.001252


In [3]:
# choose seasonality CV target months
m = models.ModelLGBRaw()
m.train(data_prep.train_x, data_prep.train_y, False, False)
is_pred_y = m.predict(data_prep.train_x)
y_diff = data_prep.train_y - is_pred_y
y_diff_group_med = y_diff.groupby(data_prep.train_x['sale_month']).median()
display(y_diff_group_med)

sale_month
1     0.0030938
2     0.0011820
3    -0.0024694
4    -0.0026113
5    -0.0017349
6    -0.0010402
7     0.0000633
8     0.0009286
9     0.0030595
10    0.0033645
11    0.0038973
12    0.0029188
Name: logerror, dtype: float64

In [4]:
# first train all the models, make submission and record LB scores.
model_median = models.ModelMedian()
model_lgb_raw = models.ModelLGBRaw()

# model_median.submit()
# model_lgb_raw.submit()

In [5]:
# Three more model classes from the project's models module; a later cell
# calls .analysis() on each of them alongside the baseline models.
# NOTE(review): what each variant changes is defined in models.py, which is
# not visible here -- see the class definitions for details.
model_lgb_raw_sub_col = models.ModelLGBRawSubCol()
model_lgb_raw_inc_mon = models.ModelLGBRawIncMon()
model_lgb_raw_inc_mon_rm_outlier = models.ModelLGBRawIncMonOutlierRm()

# Submission calls are left disabled in this notebook.
# model_lgb_raw_sub_col.submit()
# model_lgb_raw_inc_mon.submit()
# model_lgb_raw_inc_mon_rm_outlier.submit()

In [13]:
# Pair each row label of the comparison table with the model that produces
# that row.
named_models = [
    ('median', model_median),
    ('lgb_raw', model_lgb_raw),
    ('lgb_raw_sub_col', model_lgb_raw_sub_col),
    ('lgb_raw_inc_mon', model_lgb_raw_inc_mon),
    ('lgb_raw_inc_mon_rm_outlier', model_lgb_raw_inc_mon_rm_outlier),
]
cv_df_index = [row_label for row_label, _ in named_models]
# analysis() is called once per model, in the same order as the labels above;
# each call supplies one row of the table.
cv_res = [mdl.analysis() for _, mdl in named_models]

cv_columns = [
    'cv_avg',
    'cv_public_LB',
    'cv_private_LB',
    'score_public_LB',
    'rank_public_LB',
    'score_private_LB',
    'rank_private_LB',
]
cv_df = pd.DataFrame(cv_res, index=cv_df_index, columns=cv_columns)
cv_df.to_csv('models/cv_res_lgb_raw.csv')
# Global display option: floats are shown with 7 decimal places from here on.
pd.options.display.float_format = '{:,.7f}'.format
display(cv_df)

Unnamed: 0,cv_avg,cv_public_LB,cv_private_LB,score_public_LB,rank_public_LB,score_private_LB,rank_private_LB
median,0.0688602,0.0686197,0.0670265,0.0653607,3257,0.0763265,2839
lgb_raw,0.0678963,0.068046,0.0660475,0.0643716,763,0.0752874,501
lgb_raw_sub_col,0.0679994,0.0680998,0.0661279,0.06448,1406,0.075371,680
lgb_raw_inc_mon,0.0678196,0.0678556,0.0659152,0.0641573,232,0.0750084,81
lgb_raw_inc_mon_rm_outlier,0.0678245,0.0678746,0.0659274,0.0641336,181,0.0749581,58


In [7]:
# Build the models.ModelLGBOneStep object; its submission call is left disabled.
model_lgb_1step = models.ModelLGBOneStep()
# model_lgb_1step.submit()
# Write the analysis row to models/cv_res_lgb_1step.csv and display it
# (format_cv_output is defined earlier in this notebook).
format_cv_output(model_lgb_1step.analysis(), 'lgb_1step')

Unnamed: 0,cv_avg,cv_public_LB,cv_private_LB,score_public_LB,rank_public_LB,score_private_LB,rank_private_LB
0,0.0677845,0.0678174,0.0658441,0.0641632,242,0.0749579,58


In [10]:
# Total fraction of target values treated as outliers; half of it is cut from
# each tail of the distribution.
outlier_frac = 0.01
# Percentile ranks (on a 0-100 scale) that mark the two cut-offs.
lower_pct = outlier_frac / 2 * 100
upper_pct = (1 - outlier_frac / 2) * 100
lower_bound = np.percentile(data_prep.train_y, lower_pct)
upper_bound = np.percentile(data_prep.train_y, upper_pct)
print('outlier upper bound: %.4f' % upper_bound)
print('outlier lower bound: %.4f' % lower_bound)

outlier upper bound: 0.7566
outlier lower bound: -0.4865


In [12]:
# Load the pickled model from disk. The file is opened in a `with` block so
# the handle is closed as soon as unpickling finishes (the previous bare
# open() call left it open).
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open('models/model_lgb_raw.pkl', 'rb') as model_file:
    model_lgb_raw_model = pkl.load(model_file)
# Feature-importance frame from the project's features module, reduced to
# the name, class and rank columns for display.
feature_info = features.feature_importance(model_lgb_raw_model)
feature_info = feature_info[['features', 'class', 'avg_rank', 'split_rank', 'gain_rank']]
# Global display option: floats are shown with 2 decimal places from here on.
pd.options.display.float_format = '{:,.2f}'.format
display(feature_info)

Unnamed: 0,features,class,avg_rank,split_rank,gain_rank
0,area_lot,1,1.5,1,2
1,area_living_finished_calc,1,1.5,2,1
2,dollar_taxvalue_structure,1,4.0,5,3
3,latitude,1,4.0,4,4
4,year_built,1,4.0,3,5
5,longitude,1,7.0,7,7
6,dollar_tax,1,7.0,8,6
7,code_zip_lgb,3,8.0,6,10
8,dollar_taxvalue_land,1,8.5,9,8
9,dollar_taxvalue_total,1,9.5,10,9
