# Caterpillar Tube Pricing
## Environment : Python 3
## Author : Arion

# import packages

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import time

# Widen pandas' display limits so wide/long frames are rendered without
# truncation (up to 500 columns, 500 characters per cell, 1000 rows).
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_rows', 1000)

import warnings
# Silences every warning for the rest of the session. NOTE(review): this also
# hides deprecation warnings from the libraries used below.
warnings.filterwarnings('ignore')
%matplotlib inline 

# import data

In [2]:
# Load the modelling table from a path relative to the working directory.
# NOTE(review): presumably written by an earlier merge/feature step - confirm.
train = pd.read_csv("../output/combination.csv")

# data preparation 

In [3]:
# There is no common tube_assembly_id between the train and test data, so the
# identifier carries no transferable signal and is dropped.
# The frame is rebound (no inplace=True) and errors="ignore" is used so that
# re-running this cell is a no-op instead of raising KeyError once the column
# has already been removed.
train = train.drop("tube_assembly_id", axis=1, errors="ignore")

In [4]:
# Preview the first rows, transposed so that each column is shown as a row.
train.head().T

Unnamed: 0,0,1,2,3,4
supplier,S-0066,S-0066,S-0066,S-0066,S-0066
quote_date,2013-07-07,2013-07-07,2013-07-07,2013-07-07,2013-07-07
annual_usage,0,0,0,0,0
min_order_quantity,0,0,0,0,0
bracket_pricing,Yes,Yes,Yes,Yes,Yes
quantity,1,2,5,10,25
cost,21.9059,12.3412,6.60183,4.68777,3.54156
material_id,SP-0019,SP-0019,SP-0019,SP-0019,SP-0019
diameter,6.35,6.35,6.35,6.35,6.35
wall,0.71,0.71,0.71,0.71,0.71


In [5]:
# Convert the quote dates to datetimes (the .dt accessor is used right after).
train["quote_date"] = pd.to_datetime(train["quote_date"])

In [6]:
# Derive numeric calendar features from the parsed quote date.
quote_dt = train["quote_date"].dt

train["year"] = quote_dt.year
train["month"] = quote_dt.month
train["day"] = quote_dt.day
train["day_of_week"] = quote_dt.dayofweek

In [7]:
# Only use numeric data.
# np.number is the documented way to select every numeric dtype; the string
# aliases 'int' / 'float' resolve to platform- and version-dependent widths
# and can silently leave out numeric columns stored with another width.
data = train.select_dtypes(include=[np.number])

In [8]:
# Replace every missing value with 0, producing a new frame.
data = data.fillna(0)

# Utility Functions

In [9]:
def rmsle_score(preds, true):
    """Root mean squared logarithmic error between predictions and targets.

    Computes ``sqrt(sum((log1p(preds) - log1p(true)) ** 2) / len(true))``.

    Both inputs are converted to plain NumPy arrays before subtracting, so the
    comparison is strictly positional. Without this, two pandas Series with
    different index labels would be aligned by label, turn into NaN, be
    skipped by the sum and yield a misleadingly perfect score of 0.

    Parameters
    ----------
    preds : array-like
        Predicted values, on the original (non-log) scale.
    true : array-like
        Ground-truth values, same shape as ``preds``.

    Returns
    -------
    float
        The RMSLE; lower is better.

    Raises
    ------
    ValueError
        If ``preds`` and ``true`` do not have the same shape.
    """
    preds_arr = np.asarray(preds, dtype=float)
    true_arr = np.asarray(true, dtype=float)

    if preds_arr.shape != true_arr.shape:
        raise ValueError(
            "preds and true must have the same shape, got {} and {}".format(
                preds_arr.shape, true_arr.shape
            )
        )

    squared_log_error = (np.log1p(preds_arr) - np.log1p(true_arr)) ** 2
    return float((np.sum(squared_log_error) / len(true_arr)) ** 0.5)

In [10]:
# Define an evaluation metric (scorer) usable by scikit-learn's search classes.
# make_scorer is imported from the public sklearn.metrics namespace: the
# private sklearn.metrics.scorer module was deprecated and later removed.
from sklearn.metrics import make_scorer

# NOTE: make_scorer is left at its default greater_is_better=True, so the
# search objects report the raw (positive) RMSLE as the score. The cells that
# read the search results pick the candidate with the *lowest* score by hand;
# do not rely on best_params_ / best_estimator_ with this scorer.
RMSLE = make_scorer(rmsle_score)

In [11]:
def compare_result(preds, true):
    """Build a row-by-row comparison of predicted and real costs.

    Parameters
    ----------
    preds : array-like
        Predicted costs, in the same order as ``true``.
    true : pandas.Series
        Real costs; its index is kept as the ``test_id`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``test_id``, ``real_cost``, ``pred_cost`` and
        ``error_percent_(%)`` (absolute error relative to the real cost, in
        percent), with a fresh 0..n-1 index.
    """
    ordered_columns = ["test_id", "real_cost", "pred_cost"]

    frame = pd.DataFrame({"test_id": true.index,
                          "real_cost": true,
                          "pred_cost": preds})
    frame = frame[ordered_columns].reset_index(drop=True)

    absolute_error = np.abs(frame.real_cost - frame.pred_cost)
    frame["error_percent_(%)"] = absolute_error / frame.real_cost * 100

    return frame

# Application of the XGB (eXtreme Gradient Boosting) model

### simple XGB regression

In [12]:
import xgboost as xgb
from xgboost import XGBRegressor

In [13]:
# Split for the machine learning model: hold out 20% of the rows for validation.
# A fixed random_state makes the split (and therefore every score computed
# from it) reproducible across kernel restarts.
train_data, valid_data = train_test_split(data, test_size=0.2, random_state=42)

label = "cost"

# Every column except the target is used as a feature.
data_labels = [column for column in train_data.columns if column != label]

train_df = train_data[data_labels]
valid_df = valid_data[data_labels]
train_label = train_data[label]
valid_label = valid_data[label]

In [14]:
# Baseline XGB regression, timed end to end (fit, predict, score, compare).
start = time.time()

xgb_regressor = XGBRegressor(learning_rate=0.1, n_estimators=300, max_depth=3)

# The model is trained on log1p(cost) ...
label_log = np.log1p(train_label)
model = xgb_regressor.fit(train_df, label_log)

# ... so its predictions are mapped back to the cost scale with expm1.
xgb_preds1 = model.predict(valid_df)
xgb_preds = np.expm1(xgb_preds1)

rmsle_xgb = rmsle_score(xgb_preds, valid_label)
print("XGB RMSLE is : {}".format(rmsle_xgb))

compare_xgb = compare_result(preds=xgb_preds, true=valid_label)

end = time.time()
duration = end - start
print("It takes {} seconds".format(duration))

XGB RMSLE is : 0.3255426899948521
It takes 0.9886560440063477 seconds


In [15]:
# Display the per-row comparison of real vs. predicted cost on the validation split.
compare_xgb

Unnamed: 0,test_id,real_cost,pred_cost,error_percent_(%)
0,22720,5.814503,6.658740,14.519488
1,23025,2.612353,2.806032,7.413946
2,1258,6.679724,6.979096,4.481804
3,19392,2.042031,5.848816,186.421479
4,17342,7.633970,9.628932,26.132693
5,24634,6.173389,6.921050,12.111030
6,11158,6.281889,7.125575,13.430445
7,5294,6.423774,7.955406,23.843176
8,4123,4.365051,5.162497,18.268892
9,27156,3.616677,4.413052,22.019532


### XGB Regression and GridSearch

In [16]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid: 3 depths x 3 ensemble sizes = 9 candidates.
parameters = {"max_depth": [3, 5, 7], "n_estimators": [100, 300, 500]}

# Exhaustive search over the grid with 5-fold cross-validation, scored with
# the RMSLE scorer. RMSLE is built with make_scorer's defaults, so a *lower*
# reported score is the better one.
xgb_gridsearch = GridSearchCV(
    estimator=xgb_regressor,
    param_grid=parameters,
    scoring=RMSLE,
    cv=5,
)

In [17]:
# Grid search experiment: time the complete cross-validated sweep.
# The raw label is passed here; rmsle_score applies log1p to both sides when
# scoring.
# NOTE(review): candidates are therefore fitted on the raw cost, whereas the
# final model is fitted on log1p(cost) - confirm the tuned values transfer.
start = time.time()
xgb_gridsearch.fit(train_df, train_label)
end = time.time()

duration = end - start
print("It takes {} seconds".format(duration))

It takes 105.01680016517639 seconds


In [18]:
# Get/show the best parameters.
# `grid_scores_` no longer exists in scikit-learn; `cv_results_` exposes the
# same information (one entry per candidate). The score is the raw RMSLE, so
# the candidate with the LOWEST mean test score is the best one. nanargmin
# skips candidates whose score is NaN instead of letting them win the pick.
cv_results = xgb_gridsearch.cv_results_
best_index = int(np.nanargmin(cv_results["mean_test_score"]))
best_parameters = cv_results["params"][best_index]
score = cv_results["mean_test_score"][best_index]
print('score:', score)

for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))


# Use the best parameters to train a fresh model and predict.
start = time.time()
xgb_regressor = XGBRegressor(max_depth=best_parameters["max_depth"], n_estimators=best_parameters["n_estimators"], learning_rate=0.1)

# Train on log1p(cost); predictions are mapped back with expm1 below.
label_log = np.log1p(train_label)

model = xgb_regressor.fit(train_df, label_log)
xgb_preds1 = model.predict(valid_df)

xgb_preds = np.expm1(xgb_preds1)

rmsle_xgb = rmsle_score(xgb_preds, valid_label)
print ("XGB RMSLE is : {}".format(rmsle_xgb))

end = time.time()
duration = end - start
print ("It takes {} seconds".format(duration))

compare_xgb = compare_result(preds=xgb_preds, true=valid_label)

score: 0.315792288829
max_depth: 7
n_estimators: 500
XGB RMSLE is : 0.22984824572059886
It takes 6.564118146896362 seconds


In [19]:
# Display the real vs. predicted comparison for the model refitted with the grid-search parameters.
compare_xgb

Unnamed: 0,test_id,real_cost,pred_cost,error_percent_(%)
0,22720,5.814503,6.091280,4.760098
1,23025,2.612353,2.610927,0.054591
2,1258,6.679724,7.015801,5.031303
3,19392,2.042031,2.050822,0.430487
4,17342,7.633970,9.256715,21.256892
5,24634,6.173389,6.536420,5.880576
6,11158,6.281889,6.726801,7.082448
7,5294,6.423774,7.372171,14.763852
8,4123,4.365051,4.616990,5.771730
9,27156,3.616677,4.421338,22.248624


### XGB Regression and Random Grid Search

In [20]:
from sklearn.model_selection import RandomizedSearchCV

# Set parameters. This is defined BEFORE the search object that consumes it,
# so the cell no longer depends on a `parameters` variable left over from an
# earlier cell.
parameters = {
    'max_depth': [3, 5, 7],
    "n_estimators": [100, 300, 500],
}

# Define the XGB randomized search model.
# n_iter is the number of parameter settings sampled from `parameters`: with
# n_iter=3 only 3 of the 9 possible combinations are cross-validated, which is
# what makes this cheaper than the exhaustive grid search.
# random_state fixes which combinations get sampled, so reruns are repeatable.
xgb_randomsearch = RandomizedSearchCV(
    xgb_regressor,
    parameters,
    scoring=RMSLE,
    cv=5,
    n_iter=3,
    random_state=42,
)

In [21]:
# Randomized search experiment: time the cross-validated sampling run.
# The raw label is passed here; rmsle_score applies log1p to both sides when
# scoring.
start = time.time()
xgb_randomsearch.fit(train_df, train_label)
end = time.time()

duration = end - start
print("It takes {} seconds".format(duration))

It takes 17.312834978103638 seconds


In [22]:
# Get/show the best parameters.
# `grid_scores_` no longer exists in scikit-learn; `cv_results_` exposes the
# same information (one entry per sampled candidate). The score is the raw
# RMSLE, so the candidate with the LOWEST mean test score is the best one.
# nanargmin skips candidates whose score is NaN.
cv_results = xgb_randomsearch.cv_results_
best_index = int(np.nanargmin(cv_results["mean_test_score"]))
best_parameters = cv_results["params"][best_index]
score = cv_results["mean_test_score"][best_index]
print('score:', score)

for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

# Use the best parameters to train a fresh model and predict.
start = time.time()
xgb_regressor = XGBRegressor(max_depth=best_parameters["max_depth"], n_estimators=best_parameters["n_estimators"], learning_rate=0.1)

# Train on log1p(cost); predictions are mapped back with expm1 below.
label_log = np.log1p(train_label)

model = xgb_regressor.fit(train_df, label_log)
xgb_preds1 = model.predict(valid_df)

xgb_preds = np.expm1(xgb_preds1)

rmsle_xgb = rmsle_score(xgb_preds, valid_label)
print ("XGB RMSLE is : {}".format(rmsle_xgb))

end = time.time()
duration = end - start
print ("It takes {} seconds".format(duration))

compare_xgb = compare_result(preds=xgb_preds, true=valid_label)

score: 0.376823158016
max_depth: 5
n_estimators: 300
XGB RMSLE is : 0.27480943459853013
It takes 1.4680047035217285 seconds


In [23]:
# Display the real vs. predicted comparison for the model refitted with the randomized-search parameters.
compare_xgb

Unnamed: 0,test_id,real_cost,pred_cost,error_percent_(%)
0,22720,5.814503,6.389507,9.889131
1,23025,2.612353,2.827005,8.216785
2,1258,6.679724,7.473504,11.883422
3,19392,2.042031,3.297149,61.464169
4,17342,7.633970,8.879421,16.314593
5,24634,6.173389,6.804862,10.228952
6,11158,6.281889,7.593151,20.873677
7,5294,6.423774,7.119533,10.830988
8,4123,4.365051,5.019281,14.987919
9,27156,3.616677,4.236654,17.142187
