# Caterpillar Tube Pricing
## Environment : Python 3
## Author : Arion

# import packages

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import time

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_rows', 1000)

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline 

# import data

In [2]:
train = pd.read_csv("../output/combination.csv")

# data preparation 

In [3]:
#There is no common tube_assembly_id between train and test data. So we drop this variable.
train.drop("tube_assembly_id", axis=1, inplace=True)

In [4]:
train.head().transpose()

Unnamed: 0,0,1,2,3,4
supplier,S-0066,S-0066,S-0066,S-0066,S-0066
quote_date,2013-07-07,2013-07-07,2013-07-07,2013-07-07,2013-07-07
annual_usage,0,0,0,0,0
min_order_quantity,0,0,0,0,0
bracket_pricing,Yes,Yes,Yes,Yes,Yes
quantity,1,2,5,10,25
cost,21.9059,12.3412,6.60183,4.68777,3.54156
material_id,SP-0019,SP-0019,SP-0019,SP-0019,SP-0019
diameter,6.35,6.35,6.35,6.35,6.35
wall,0.71,0.71,0.71,0.71,0.71


In [5]:
train.quote_date = pd.to_datetime(train.quote_date)

In [6]:
#add new numeric time features

train["year"] = train.quote_date.dt.year
train["month"] = train.quote_date.dt.month
train["day"] = train.quote_date.dt.day
train["day_of_week"] = train.quote_date.dt.dayofweek

In [7]:
#only use numeric data
data = train.select_dtypes(include=['int', 'float'])

In [8]:
#fill null by 0
data.replace(np.nan, 0, inplace=True)

# Utility Functions

In [9]:
#define a evaluation function

def rmsle_score(preds, true):
    rmsle_score = (np.sum((np.log1p(preds)-np.log1p(true))**2)/len(true))**0.5
    return rmsle_score

In [10]:
#Define a evaluation matrix 
from sklearn.metrics.scorer import make_scorer

RMSLE = make_scorer(rmsle_score)

In [11]:
# define a function for comparing predictions and true data.

def compare_result(preds, true):
    compare = pd.DataFrame({"test_id": true.index,
                           "real_cost": true,
                           "pred_cost": preds})
    compare = compare[["test_id", "real_cost", "pred_cost"]].reset_index(drop=True)
    
    compare["error_percent_(%)"] = np.abs(compare.real_cost - compare.pred_cost) / compare.real_cost * 100
    
    return compare

# application of XGB(eXtreme Gradient Boosting) model

### simple XGB regression

In [12]:
import xgboost as xgb
from xgboost import XGBRegressor

In [13]:
# split for machine learning model

train_data, valid_data = train_test_split(data, test_size = 0.2)

label = "cost"

data_labels = train_data.columns.tolist()
data_labels.remove(label)

train_df = train_data[data_labels]
valid_df = valid_data[data_labels]
train_label = train_data[label]
valid_label = valid_data[label]

In [16]:
#XGB regression

start = time.time()
xgb_regressor=XGBRegressor(max_depth=3, n_estimators=300, learning_rate=0.1)


label_log=np.log1p(train_label)

model = xgb_regressor.fit(train_df, label_log)
xgb_preds1 = model.predict(valid_df)

xgb_preds = np.expm1(xgb_preds1)
        
rmsle_xgb = rmsle_score(xgb_preds, valid_label)
print ("XGB RMSLE is : {}".format(rmsle_xgb))

compare_xgb = compare_result(preds=xgb_preds, true=valid_label)

end = time.time()
duration = end - start
print ("It takes {} seconds".format(duration))

XGB RMSLE is : 0.32213513809943617
It takes 0.9424402713775635 seconds


In [17]:
compare_xgb

Unnamed: 0,test_id,real_cost,pred_cost,error_percent_(%)
0,29194,6.482198,6.448730,0.516300
1,20012,5.530733,7.338700,32.689451
2,6907,13.033947,15.978783,22.593586
3,16536,3.369074,3.966884,17.744073
4,20563,2.826572,3.565568,26.144618
5,29230,6.409864,7.559137,17.929753
6,23753,5.425015,5.891052,8.590521
7,15811,1.658107,2.947505,77.763264
8,14252,5.653144,5.831585,3.156490
9,6134,4.117447,3.936363,4.397984


### XGB Regression and GridSearch

In [19]:
from sklearn.model_selection import GridSearchCV
#set parameters
parameters = {
    'max_depth': [3, 5, 7],
    "n_estimators": [100, 300, 500],
}

#define XGB Grid Search model
xgb_gridsearch = GridSearchCV(xgb_regressor, parameters, scoring=RMSLE, cv=5)

In [20]:
#grid search experiment
start = time.time()

#label_log=np.log1p(train_label)

xgb_gridsearch.fit(train_df, train_label)

end = time.time()
duration = end-start
print ("It takes {} seconds".format(duration))

It takes 58.76397490501404 seconds


In [21]:
#get/show the best parameters
best_parameters, score, _ = min(xgb_gridsearch.grid_scores_, key=lambda x: x[1])
print('score:', score)

for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))
    

#use best model to predict
start = time.time()
xgb_regressor = XGBRegressor(max_depth=best_parameters["max_depth"], n_estimators=best_parameters["n_estimators"], learning_rate=0.1)

label_log = np.log1p(train_label)

model = xgb_regressor.fit(train_df, label_log)
xgb_preds1 = model.predict(valid_df)

xgb_preds = np.expm1(xgb_preds1)
        
rmsle_xgb = rmsle_score(xgb_preds, valid_label)
print ("XGB RMSLE is : {}".format(rmsle_xgb))

end = time.time()
duration = end - start
print ("It takes {} seconds".format(duration))

compare_xgb = compare_result(preds=xgb_preds, true=valid_label)

score: 0.315398028632
max_depth: 7
n_estimators: 500
XGB RMSLE is : 0.22504466730722944
It takes 3.5135741233825684 seconds


In [22]:
compare_xgb

Unnamed: 0,test_id,real_cost,pred_cost,error_percent_(%)
0,29194,6.482198,5.989542,7.600140
1,20012,5.530733,6.878868,24.375325
2,6907,13.033947,13.974409,7.215485
3,16536,3.369074,3.713573,10.225354
4,20563,2.826572,2.312587,18.184023
5,29230,6.409864,7.397852,15.413554
6,23753,5.425015,6.527256,20.317756
7,15811,1.658107,1.471904,11.229877
8,14252,5.653144,5.776119,2.175342
9,6134,4.117447,3.848173,6.539835


### XGB Regression and Random Grid Search

In [23]:
from sklearn.model_selection import RandomizedSearchCV
#define XGB Random Grid Search model
xgb_randomsearch = RandomizedSearchCV(xgb_regressor, parameters, scoring=RMSLE, cv=5, n_iter=3) #n_iter works for what?

#set parameters
parameters = {
    'max_depth': [3, 5, 7],
    "n_estimators": [100, 300, 500],
}

In [24]:
#Random Grid Search experiment
start = time.time()

#label_log=np.log1p(train_label)

xgb_randomsearch.fit(train_df, train_label)

end = time.time()
duration = end-start
print ("It takes {} seconds".format(duration))

It takes 19.190407276153564 seconds


In [25]:
#get/show the best parameters
best_parameters, score, _ = min(xgb_randomsearch.grid_scores_, key=lambda x: x[1])
print('score:', score)

for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))
    
#use best model to predict
start = time.time()
xgb_regressor = XGBRegressor(max_depth=best_parameters["max_depth"], n_estimators=best_parameters["n_estimators"], learning_rate=0.1)

label_log = np.log1p(train_label)

model = xgb_regressor.fit(train_df, label_log)
xgb_preds1 = model.predict(valid_df)

xgb_preds = np.expm1(xgb_preds1)
        
rmsle_xgb = rmsle_score(xgb_preds, valid_label)
print ("XGB RMSLE is : {}".format(rmsle_xgb))

end = time.time()
duration = end - start
print ("It takes {} seconds".format(duration))

compare_xgb = compare_result(preds=xgb_preds, true=valid_label)

score: 0.322098342605
max_depth: 7
n_estimators: 300
XGB RMSLE is : 0.23874539782666315
It takes 2.2239298820495605 seconds


In [26]:
compare_xgb

Unnamed: 0,test_id,real_cost,pred_cost,error_percent_(%)
0,29194,6.482198,6.009648,7.289955
1,20012,5.530733,7.241231,30.927138
2,6907,13.033947,14.314043,9.821249
3,16536,3.369074,3.702716,9.903089
4,20563,2.826572,2.324458,17.764075
5,29230,6.409864,7.431536,15.939061
6,23753,5.425015,6.739486,24.229805
7,15811,1.658107,1.509204,8.980339
8,14252,5.653144,5.764462,1.969134
9,6134,4.117447,3.922324,4.738931
