In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
import numpy as np
import pandas as pd
import gc

In [3]:
ncpu = 8 # Thread count handed to CatBoost via thread_count; adjust it for the machine the notebook runs on (the original note said "midawy", presumably the Midway cluster - TODO confirm)

In [4]:
store = pd.HDFStore('../Data/store1.h5')
print store

<class 'pandas.io.pytables.HDFStore'>
File path: ../Data/store1.h5
/miss             frame        (shape->[11437,3])    
/prop             frame        (shape->[2883630,254])
/train            frame        (shape->[90275,256])  


In [7]:
# Pull the two working tables out of the store: "train" (rows that carry a
# logerror target) and "prop" (the table predictions are later generated for).
train, prop = store["train"], store["prop"]

In [8]:
dd = ['type_architectural',
 'area_basement',
 'num_bathroom',
 'num_bedroom',
 'type_framing',
 'num_bathroom_calc',
 'type_deck',
 'area_liveperi_finished',
 'num_bath',
 'pooltypeid10',
 'region_county',
 'type_story',
 'type_material',
 'area_shed',
 'tax_year',
 'num_rot75_X',
 'num_rot75_Y']

train = train.drop(dd + ["area_total_calc"], axis=1)
prop = prop.drop(dd + ["area_total_calc"], axis=1)

print "prop shape " + str(prop.shape)
print "train shape " + str(train.shape)

prop shape (2883630, 236)
train shape (90275, 238)


In [9]:
y = train.logerror
mid = np.percentile(y, 50)
y = y - mid
q1 = np.percentile(y, 25)
q3 = np.percentile(y, 75)
print q1, q3
interval = q3 - q1
fac = 8.0
interval = interval * fac / 2.
hi = interval + mid
lo = -interval + mid
print hi, lo

-0.0313 0.0332
0.264 -0.252


In [10]:
# split the data to 9 months for train and 3 months for test
x1 = train[train.month < 10]    # use for train
x0 = train[train.month > 9]     # use for test
print "Size of the x1 data frame: ", x1.shape
print "Size of the x0 data frame: ", x0.shape

y1 = x1['logerror'].values
y0 = x0['logerror'].values

index_hi = y1 > hi   # drop 1480 points
index_lo = y1 < lo    # drop 947 points
print sum(index_hi), sum(index_lo)

y1 = y1[(~index_lo) & (~index_hi)]
x1 = x1[(~index_lo) & (~index_hi)]

print "Size of the x1 data frame: ", x1.shape
print "Size of the x0 data frame: ", x0.shape

Size of the x1 data frame:  (81733, 238)
Size of the x0 data frame:  (8542, 238)
2088 1432
Size of the x1 data frame:  (78213, 238)
Size of the x0 data frame:  (8542, 238)


In [11]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error

In [12]:
col = ["rate", "depth", "nround", "subsample", "colsample", "child", "L2", "score"]
result = pd.DataFrame(columns=col)

for rate in [0.01]:
    for depth in [9]:
        for nround in [10]:
            for sample in [1.0]:
                for colsample in [0.5]:
                    for child in [1]:
                        for L2 in [3]:
                            model = CatBoostRegressor(loss_function='MAE', eval_metric='MAE',
                                                            iterations=nround, 
                                                            learning_rate=rate,
                                                            depth=depth,
                                                            bagging_temperature=sample,
                                                            rsm=colsample,
                                                            l2_leaf_reg=L2, 
                                                            thread_count = ncpu,
                                                            random_seed=123)

                            model.fit(x1.drop(["id_parcel", "month", "logerror"], axis=1), y1)

                            score = mean_absolute_error(y0, model.predict(x0.drop(["id_parcel", "month", "logerror"], axis=1)))
                            
                            print score
                            
                            result = result.append(pd.DataFrame([[rate, depth, nround, sample, colsample,
                                                                      child, L2, score]], columns=col))

0.065887853208


In [13]:
result = result.sort_values("score")
result.to_csv("result.csv")
print result

   rate  depth  nround  subsample  colsample  child   L2     score
0  0.01    9.0    10.0        1.0        0.5    1.0  3.0  0.065888


In [14]:
# Rebuild the regressor from the first row of `result` (the lowest score once
# the table has been sorted). The row comes back as floats, so the integer
# parameters are cast with the builtin int(); np.int was only an alias for it
# and is no longer available in newer NumPy releases.
best = result.iloc[0]
model = CatBoostRegressor(loss_function='MAE', eval_metric='MAE',
        calc_feature_importance=True,
        iterations=int(best["nround"]),
        learning_rate=best["rate"],
        depth=int(best["depth"]),
        bagging_temperature=best["subsample"],
        rsm=best["colsample"],
        l2_leaf_reg=int(best["L2"]),
        thread_count = ncpu,
        random_seed=123)

In [15]:
model.fit(x1.drop(["id_parcel", "month", "logerror"], axis=1), y1) # Train the model without outliers

print "Error on training data ", mean_absolute_error(y1, model.predict(x1.drop(["id_parcel", "month", "logerror"], axis=1)))
print "Error on 3 months test ", mean_absolute_error(y0, model.predict(x0.drop(["id_parcel", "month", "logerror"], axis=1)))

Error on training data  0.0463481486513
Error on 3 months test  0.065887853208


In [16]:
import operator

# Pair every model input column with the importance value the fitted model
# exposes for it.
feature_names = x1.drop(["id_parcel", "month", "logerror"], axis=1).columns
importance = dict(zip(feature_names, model.feature_importance))

# Turn the mapping into (feature, value) pairs ordered by ascending value.
importance = sorted(importance.items(), key=operator.itemgetter(1))

# Normalise the values so they sum to one, then write the table to disk.
df = pd.DataFrame(importance, columns=['feature', 'fscore'])
df['fscore'] /= df['fscore'].sum()

df.to_csv("importance")

# Cross-validation (CV)

In [17]:
# we have duplicate in train :(
# we can have three simple strategies
# 1- keep first one; 2- keep last one; 3- average
# I think the logerror reduce from first to last but I am not sure
# it is a very important point
duplicate = train["id_parcel"].duplicated(keep='first')
train = train[~duplicate]
print "Size of the train data frame: ", train.shape
print "Size of the prop data frame: ", prop.shape

Size of the train data frame:  (90150, 238)
Size of the prop data frame:  (2883630, 236)


In [18]:
y = train.logerror
mid = np.percentile(y, 50)
y = y - mid
q1 = np.percentile(y, 25)
q3 = np.percentile(y, 75)
print q1, q3

fac = 8.0
interval = q3 - q1
interval = interval * fac / 2.
hi_train = interval + mid
lo_train = -interval + mid

fac = 65.0
interval = q3 - q1
interval = interval * fac / 2.
hi_test = interval + mid
lo_test = -interval + mid

print lo_train, hi_train
print lo_test, hi_test

-0.0313 0.0332
-0.252 0.264
-2.09025 2.10225


In [19]:
y = train['logerror'].values
x = train.drop(['month', 'logerror'], axis=1)
print "Size of the train data frame: ", x.shape
print "Size of the prop  data frame: ", prop.shape

print("Generate a list of outliers should be droped for training")
index_hi = y > hi_train
index_lo = y < lo_train
print sum(index_hi), sum(index_lo)

outliers_train = []
for ii in range(y.shape[0]):
    if index_hi[ii] or index_lo[ii]:
        outliers_train.append(ii)

print("Generate a list of outliers should be droped for testing")
index_hi = y > hi_test
index_lo = y < lo_test
print sum(index_hi), sum(index_lo)

outliers_test = []
for ii in range(y.shape[0]):
    if index_hi[ii] or index_lo[ii]:
        outliers_test.append(ii)

Size of the train data frame:  (90150, 236)
Size of the prop  data frame:  (2883630, 236)
Generate a list of outliers should be droped for training
2310 1568
Generate a list of outliers should be droped for testing
51 46


In [20]:
def splitDataFrameIntoSmaller(df, chunkSize = 100000):
    """Return the row boundaries that cut ``df`` into consecutive chunks.

    The result is a list ``[0, chunkSize, 2*chunkSize, ..., len(df)]``;
    chunk ``i`` covers rows ``bounds[i]:bounds[i+1]``. Despite the name, no
    data is copied - only the boundary positions are returned.

    Parameters
    ----------
    df : any object supporting ``len()``
        The frame (or sequence) to be chunked.
    chunkSize : int, default 100000
        Maximum number of rows per chunk; must be at least 1.

    Returns
    -------
    list of int
        Strictly increasing boundaries. An empty ``df`` yields ``[0]``
        (no chunks at all).

    Raises
    ------
    ValueError
        If ``chunkSize`` is smaller than 1.
    """
    if chunkSize < 1:
        raise ValueError("chunkSize must be a positive integer")

    total = len(df)
    # range() stops before `total`, so the end boundary is appended exactly
    # once - also when `total` is an exact multiple of chunkSize, which
    # otherwise produces a duplicated boundary and an empty last chunk.
    listOfDf = list(range(0, total, chunkSize))
    listOfDf.append(total)
    return listOfDf

# Row boundaries used to run predictions over prop chunk by chunk
# (default chunk size of splitDataFrameIntoSmaller).
split_index = splitDataFrameIntoSmaller(prop)

In [22]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits = 10, shuffle = True, random_state = 44)

train_pred = np.zeros(train.shape[0], dtype=np.float16)
prop_pred = np.zeros(prop.shape[0], dtype=np.float16)
scores1 = []; scores2 = []

for train_index, test_index in skf.split(x, y):

    train_index_wo = [ix for ix in train_index if ix not in outliers_train]
    test_index_wo = [ix for ix in test_index if ix not in outliers_test]

    x1, x0 = x.iloc[train_index_wo], x.iloc[test_index_wo]
    y1, y0 = y[train_index_wo], y[test_index_wo]

    model.fit(x1.drop(["id_parcel"], axis=1), y1) # Train the model without outliers

    #calculate score without second outliers
    scores1.append(mean_absolute_error(y0, model.predict(x0.drop(["id_parcel"], axis=1))))
    print "Score without outliers for the ", len(scores1), " fold is ", scores1[len(scores1)-1]

    #calculate score with outliers
    x0 = x.iloc[test_index]
    y0 = y[test_index]

    pred = model.predict(x0.drop(["id_parcel"], axis=1))
    scores2.append(mean_absolute_error(y0, pred))
#    print "Score with outliers for the ", len(scores2), " fold is ", scores2[len(scores2)-1]

    for ii, idx in enumerate(test_index):
        train_pred[idx] = pred[ii]

    for ii in range(0, len(split_index)-1):
        n1 = split_index[ii]; n2 = split_index[ii+1]
        pred = model.predict(prop.iloc[n1:n2].drop(['id_parcel'], axis=1))
        prop_pred[n1:n2] += pred

print "Average score without outliers over all folds : " , np.mean(scores1), " ", np.std(scores1)
print "Average score with    outliers over all folds : " , np.mean(scores2), " ", np.std(scores2)

Score without outliers for the  1  fold is  0.0647679709375
Score without outliers for the  2  fold is  0.0657277238894
Score without outliers for the  3  fold is  0.0644547384655
Score without outliers for the  4  fold is  0.0652583988152
Score without outliers for the  5  fold is  0.0644142335129


KeyboardInterrupt: 

In [None]:
from datetime import datetime

# Prediction columns written to the submission file.
months = ["201610", "201611", "201612", "201710", "201711", "201712"]

# prop_pred holds the SUM of the per-fold predictions, so divide by the number
# of folds configured on the splitter instead of a hard-coded 10. The division
# is done once, vectorised and in float64, rather than element by element for
# every month column.
n_folds = skf.get_n_splits()
prop_mean = prop_pred.astype(np.float64) / n_folds

# Predictions for every property: the same averaged value in each month column.
out = pd.DataFrame()
out["ParcelId"] = prop["id_parcel"]
for col in months:
    out[col] = prop_mean

# Out-of-fold predictions for the training parcels.
# NOTE: a constant +0.02 offset on these predictions is mentioned in the
# original notebook but is disabled; the raw predictions are used.
out_train = pd.DataFrame()
out_train["ParcelId"] = train["id_parcel"]
for col in months:
    out_train[col] = train_pred

# Rows from the "miss" table get the median training logerror in every month.
print("Read the missing")
miss = store["miss"]

med = train.logerror.median()
for col in months:
    miss[col] = med

miss = miss[["id_parcel"]+months]
miss.columns = ["ParcelId"] + months

out = pd.concat([out, out_train, miss], axis=0)

# The file name template needs a "{}" placeholder; without it str.format
# silently discards the timestamp and every run overwrites the same file.
out.to_csv('cat_{}.csv'.format(datetime.now().strftime('%Y%m%d_%H%M%S')), index=False, float_format='%.4f')