# Caterpillar Tube Pricing
## Environment : Python 3
## Author : Arion

In this notebook, we will have a look at how to prepare data for machine learning model and implement some basic models with these data.

# import packages

In [57]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import time

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_rows', 1000)

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline 

# import data

In [58]:
train = pd.read_csv("../output/combination.csv")

# data preparation 

In [59]:
train.head()

Unnamed: 0,tube_assembly_id,supplier,quote_date,annual_usage,min_order_quantity,bracket_pricing,quantity,cost,material_id,diameter,...,end_x_1x,end_x_2x,end_a,end_x,num_boss,num_bracket,other,type_totals,component_totals,spec_totals
0,TA-00002,S-0066,2013-07-07,0,0,Yes,1,21.905933,SP-0019,6.35,...,N,N,EF-008,EF-008,0,0,0,1,2.0,0
1,TA-00002,S-0066,2013-07-07,0,0,Yes,2,12.341214,SP-0019,6.35,...,N,N,EF-008,EF-008,0,0,0,1,2.0,0
2,TA-00002,S-0066,2013-07-07,0,0,Yes,5,6.601826,SP-0019,6.35,...,N,N,EF-008,EF-008,0,0,0,1,2.0,0
3,TA-00002,S-0066,2013-07-07,0,0,Yes,10,4.68777,SP-0019,6.35,...,N,N,EF-008,EF-008,0,0,0,1,2.0,0
4,TA-00002,S-0066,2013-07-07,0,0,Yes,25,3.541561,SP-0019,6.35,...,N,N,EF-008,EF-008,0,0,0,1,2.0,0


In [60]:
#There is no common tube_assembly_id between train and test data. So we drop this variable.
train.drop("tube_assembly_id", axis=1, inplace=True)

In [61]:
train.head().transpose()

Unnamed: 0,0,1,2,3,4
supplier,S-0066,S-0066,S-0066,S-0066,S-0066
quote_date,2013-07-07,2013-07-07,2013-07-07,2013-07-07,2013-07-07
annual_usage,0,0,0,0,0
min_order_quantity,0,0,0,0,0
bracket_pricing,Yes,Yes,Yes,Yes,Yes
quantity,1,2,5,10,25
cost,21.9059,12.3412,6.60183,4.68777,3.54156
material_id,SP-0019,SP-0019,SP-0019,SP-0019,SP-0019
diameter,6.35,6.35,6.35,6.35,6.35
wall,0.71,0.71,0.71,0.71,0.71


In [62]:
train.quote_date = pd.to_datetime(train.quote_date)

In [63]:
#add new numeric time features

train["year"] = train.quote_date.dt.year
train["month"] = train.quote_date.dt.month
train["day"] = train.quote_date.dt.day
train["day_of_week"] = train.quote_date.dt.dayofweek

In [64]:
#only use numeric data
data = train.select_dtypes(include=['int', 'float'])

In [65]:
#fill null by 0
data.replace(np.nan, 0, inplace=True)

# split for machine learning model

In [66]:
train_data, valid_data = train_test_split(data, test_size = 0.3)

In [67]:
label = "cost"

In [68]:
data_labels = train_data.columns.tolist()
data_labels.remove(label)

In [69]:
train_df = train_data[data_labels]
valid_df = valid_data[data_labels]
train_label = train_data[label]
valid_label = valid_data[label]

# application of machine learning model

In [70]:
#define a evaluation function

def rmse_score(preds, true):
    rmse_score = (np.sum((np.log1p(preds)-np.log1p(true))**2)/len(true))**0.5
    return rmse_score

In [71]:
# define a function for comparing predictions and true data.

def compare_result(preds, true):
    compare = pd.DataFrame({"test_id": true.index,
                           "real_cost": true,
                           "pred_cost": preds})
    compare = compare[["test_id", "real_cost", "pred_cost"]].reset_index(drop=True)
    
    compare["error_percent_(%)"] = np.abs(compare.real_cost - compare.pred_cost) / compare.real_cost * 100
    
    return compare

In [72]:
# sklearn LinearRegression
# Preprocess: do feature scaling or not 
from sklearn.linear_model import LinearRegression

def linear_learning(labels, train, test, preprocess):
    
    if preprocess == False:
        label_log=np.log1p(labels)
        linear=LinearRegression()
        model=linear.fit(train, label_log)
        preds1=model.predict(test)
        preds=np.expm1(preds1)
        
    elif preprocess == True:
        
        train = preprocessing.scale(train)
        test = preprocessing.scale(test)
        
        label_log=np.log1p(labels)
        linear=LinearRegression()
        model=linear.fit(train, label_log)
        preds1=model.predict(test)
        preds=np.expm1(preds1)
        
    return preds

In [73]:
# sklearn svm regression 
# Preprocess: do feature scaling or not
from sklearn import svm

def svm_learning(labels, train, test, preprocess):
    
    if preprocess == False:    
        label_log=np.log1p(labels)
        clf=svm.SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma="auto",
            kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
        model=clf.fit(train, label_log)
        preds1=model.predict(test)
        preds=np.expm1(preds1)
        
    elif preprocess == True:
        
        train = preprocessing.scale(train)
        test = preprocessing.scale(test)
        
        label_log=np.log1p(labels)
        clf=svm.SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma="auto",
            kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
        model=clf.fit(train, label_log)
        preds1=model.predict(test)
        preds=np.expm1(preds1)
    return preds

In [74]:
# sklearn random forest regression
# Preprocess: do feature scaling or not
from sklearn.ensemble import RandomForestRegressor

def rf_learning(labels, train, test, preprocess):
    
    if preprocess == False:
        label_log=np.log1p(labels)
        clf=RandomForestRegressor(n_estimators=50, n_jobs=-1)
        model=clf.fit(train, label_log)
        preds1=model.predict(test)
        preds=np.expm1(preds1)
        
    elif preprocess == True:
        
        train = preprocessing.scale(train)
        test = preprocessing.scale(test)
        
        label_log=np.log1p(labels)
        clf=RandomForestRegressor(n_estimators=50, n_jobs=-1)
        model=clf.fit(train, label_log)
        preds1=model.predict(test)
        preds=np.expm1(preds1)
    return preds

In [75]:
# K-nearest neighbor regression
# Preprocess: do feature scaling or not
from sklearn.neighbors import KNeighborsRegressor

def knn_learning(labels, train, test, n, preprocess):
    
    if preprocess == False:
        label_log=np.log1p(labels)
        clf=KNeighborsRegressor(n_neighbors=n, n_jobs=-1)
        model=clf.fit(train, label_log)
        preds1=model.predict(test)
        preds=np.expm1(preds1)
        
    elif preprocess == True:
        
        train = preprocessing.scale(train)
        test = preprocessing.scale(test)
        
        label_log=np.log1p(labels)
        clf=KNeighborsRegressor(n_neighbors=n, n_jobs=-1)
        model=clf.fit(train, label_log)
        preds1=model.predict(test)
        preds=np.expm1(preds1)
    return preds

Following machine learning models are trained by part of train data and tested with the rest of train data (**local validation data**)

In [76]:
start = time.time()

#linear_preds = linear_learning(labels=train_label, train=train_df, test=valid_df, preprocess=False)
linear_preds = linear_learning(labels=train_label, train=train_df, test=valid_df, preprocess=True)

rmse_linear = rmse_score(linear_preds, valid_label)
print ("Linear RMSLE is : {}".format(rmse_linear))

compare_linear = compare_result(preds=linear_preds, true=valid_label)

end = time.time()
duration = end - start
print ("It takes {} seconds".format(duration))

Linear RMSLE is : 0.6535462576078592
It takes 0.028548002243041992 seconds


In [77]:
start = time.time()

#svm_preds = svm_learning(train_label, train_df, valid_df, False)
svm_preds = svm_learning(train_label, train_df, valid_df, True)

rmse_svm = rmse_score(svm_preds, valid_label)
print ("SVM RMSLE is : {}".format(rmse_svm))

compare_svm = compare_result(preds=svm_preds, true=valid_label)

end = time.time()
duration = end - start
print ("It takes {} seconds".format(duration))

SVM RMSLE is : 0.5277956355546624
It takes 26.552299976348877 seconds


In [78]:
start = time.time()

#rf_preds = rf_learning(train_label, train_df, valid_df, False)
rf_preds = rf_learning(train_label, train_df, valid_df, True)

rmse_rf = rmse_score(rf_preds, valid_label)
print ("RF RMSLE is : {}".format(rmse_rf))

compare_rf = compare_result(preds=rf_preds, true=valid_label)

end = time.time()
duration = end - start
print ("It takes {} seconds".format(duration))

RF RMSLE is : 0.4128730827171936
It takes 1.1927931308746338 seconds


In [92]:
start = time.time()

#knn_preds = knn_learning(train_label, train_df, valid_df, 3, False)
knn_preds = knn_learning(train_label, train_df, valid_df, 3, True)

rmse_knn = rmse_score(knn_preds, valid_label)
print ("KNN RMSLE is : {}".format(rmse_knn))

compare_knn = compare_result(preds=svm_preds, true=valid_label)

end = time.time()
duration = end - start
print ("It takes {} seconds".format(duration))

KNN RMSLE is : 0.4725182906356169
It takes 0.44101810455322266 seconds


Results shown as following:

  | Preprocessing | Time (second)| RMSLE Score)
:----:         |:------:|:----:|-:---:
_Linear_       | True   | 0.029| 0.654
_Linear_       | False  | 0.017| 0.648
_SVM_          | True   | 26.55| 0.528
_SVM_          | False  | 90.06| 0.593
_RandomForest_ | True   | 1.19 | 0.413
_RandomForest_ | False  | 0.97 | __0.270__
_KNN_          | True   | 1.69 | 0.573
_KNN_          | False  | 0.33 | 0.469

From blow table, we know that RandomForest without any preprocessing could get the best score.

In [80]:
compare_linear.head()

Unnamed: 0,test_id,real_cost,pred_cost,error_percent_(%)
0,15794,4.838001,5.8038,19.962767
1,10167,3.683446,7.802193,111.817742
2,16760,12.046316,6.143395,49.001872
3,2589,16.639495,13.760012,17.305109
4,7161,31.907435,52.076048,63.209757


In [81]:
compare_svm.head()

Unnamed: 0,test_id,real_cost,pred_cost,error_percent_(%)
0,15794,4.838001,8.169096,68.852716
1,10167,3.683446,4.946538,34.291035
2,16760,12.046316,6.181421,48.686214
3,2589,16.639495,13.093322,21.311781
4,7161,31.907435,51.242553,60.59753


In [82]:
compare_rf.head()

Unnamed: 0,test_id,real_cost,pred_cost,error_percent_(%)
0,15794,4.838001,5.866108,21.250672
1,10167,3.683446,4.59783,24.824152
2,16760,12.046316,15.929105,32.232176
3,2589,16.639495,27.470716,65.093445
4,7161,31.907435,70.039041,119.506959


In [83]:
compare_knn.head()

Unnamed: 0,test_id,real_cost,pred_cost,error_percent_(%)
0,15794,4.838001,8.169096,68.852716
1,10167,3.683446,4.946538,34.291035
2,16760,12.046316,6.181421,48.686214
3,2589,16.639495,13.093322,21.311781
4,7161,31.907435,51.242553,60.59753
