# Caterpillar Tube Pricing
## Environment : Python 3
## Author : Arion

# import packages

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import time

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_rows', 1000)

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline 

# import data

In [2]:
train = pd.read_csv("../output/combination.csv")

# data preparation 

In [3]:
#There is no common tube_assembly_id between train and test data. So we drop this variable.
train.drop("tube_assembly_id", axis=1, inplace=True)

In [4]:
train.head().transpose()

Unnamed: 0,0,1,2,3,4
supplier,S-0066,S-0066,S-0066,S-0066,S-0066
quote_date,2013-07-07,2013-07-07,2013-07-07,2013-07-07,2013-07-07
annual_usage,0,0,0,0,0
min_order_quantity,0,0,0,0,0
bracket_pricing,Yes,Yes,Yes,Yes,Yes
quantity,1,2,5,10,25
cost,21.9059,12.3412,6.60183,4.68777,3.54156
material_id,SP-0019,SP-0019,SP-0019,SP-0019,SP-0019
diameter,6.35,6.35,6.35,6.35,6.35
wall,0.71,0.71,0.71,0.71,0.71


In [5]:
train.quote_date = pd.to_datetime(train.quote_date)

In [6]:
#add new numeric time features

train["year"] = train.quote_date.dt.year
train["month"] = train.quote_date.dt.month
train["day"] = train.quote_date.dt.day
train["day_of_week"] = train.quote_date.dt.dayofweek

In [7]:
#only use numeric data
data = train.select_dtypes(include=['int', 'float'])

In [8]:
#fill null by 0
data.replace(np.nan, 0, inplace=True)

# Utility Functions

In [9]:
#define a evaluation function

def rmsle_score(preds, true):
    rmsle_score = (np.sum((np.log1p(preds)-np.log1p(true))**2)/len(true))**0.5
    return rmsle_score

In [10]:
#Define a evaluation matrix 
from sklearn.metrics.scorer import make_scorer

RMSLE = make_scorer(rmsle_score)

In [11]:
# define a function for comparing predictions and true data.

def compare_result(preds, true):
    compare = pd.DataFrame({"test_id": true.index,
                           "real_cost": true,
                           "pred_cost": preds})
    compare = compare[["test_id", "real_cost", "pred_cost"]].reset_index(drop=True)
    
    compare["error_percent_(%)"] = np.abs(compare.real_cost - compare.pred_cost) / compare.real_cost * 100
    
    return compare

# application of machine learning model

### Simple Linear Regression

In [12]:
# split for machine learning model

train_data, valid_data = train_test_split(data, test_size = 0.2)

label = "cost"

data_labels = train_data.columns.tolist()
data_labels.remove(label)

train_df = train_data[data_labels]
valid_df = valid_data[data_labels]
train_label = train_data[label]
valid_label = valid_data[label]

In [13]:
#Linear regression

from sklearn.linear_model import LinearRegression

start = time.time()
linear=LinearRegression()


label_log=np.log1p(train_label)

model=linear.fit(train_df, label_log)
linear_preds1 = model.predict(valid_df)

linear_preds=np.expm1(linear_preds1)
        
rmsle_linear = rmsle_score(linear_preds, valid_label)
print ("Linear RMSLE is : {}".format(rmsle_linear))

compare_linear_log = compare_result(preds=linear_preds, true=valid_label)

end = time.time()
duration = end - start
print ("It takes {} seconds".format(duration))

Linear RMSLE is : 0.6548706833534235
It takes 0.2538750171661377 seconds


### Linear Regression with KFold object

In [14]:
# split for cross_val_score machine learning model

label = "cost"

data_labels = data.columns.tolist()
data_labels.remove(label)

X = data[data_labels]
y = data[label]

In [18]:
#Linear regression and KFolder
from sklearn.model_selection import KFold

from sklearn.linear_model import LinearRegression

start = time.time()
linear=LinearRegression()
scores = []

kf = KFold(n_splits=5)

for i, (train_index, test_index) in enumerate(kf.split(X)):
    #print("TRAIN:", train_index[:10], "TEST:", test_index[:10])
    X_train, X_test = X.loc[train_index, :], X.loc[test_index, :]
    y_train, y_test = y[train_index], y[test_index]
    
    label_log=np.log1p(y_train)

    model=linear.fit(X_train, label_log)
    linear_preds1 = model.predict(X_test)

    linear_preds = np.expm1(linear_preds1)
        
    rmlse_linear = rmsle_score(linear_preds, y_test)
    scores.append(rmsle_linear)
    print ("Folder {}, Linear RMSLE is : {}".format(i, rmlse_linear))

print("Mean RMSLE is : {}".format(np.mean(scores)))
    
end = time.time()
duration = end - start
print ("It takes {} seconds".format(duration))

Folder 0, Linear RMSLE is : 0.6354630402756541
Folder 1, Linear RMSLE is : 0.6010368627808833
Folder 2, Linear RMSLE is : 0.6169496765063249
Folder 3, Linear RMSLE is : 0.7515635354643725
Folder 4, Linear RMSLE is : 0.6621153784463271
Mean RMSLE is : 0.6548706833534235
It takes 0.14751696586608887 seconds


### Simple RandomForest Regression

In [16]:
# RandomForest Regression 
from sklearn.ensemble import RandomForestRegressor

start = time.time()
rf=RandomForestRegressor(random_state=0)

label_log=np.log1p(train_label)

model=rf.fit(train_df, label_log)
rf_preds1 = model.predict(valid_df)

rf_preds=np.expm1(rf_preds1)
        
rmsle_rf = rmsle_score(rf_preds, valid_label)
print ("Random Forest RMSLE is : {}".format(rmsle_rf))

compare_rf_log = compare_result(preds=rf_preds, true=valid_label)

end = time.time()
duration = end - start
print ("It takes {} seconds".format(duration))

Random Forest RMSLE is : 0.2781450302275326
It takes 0.9106957912445068 seconds


### RandomForest and KFold object

In [19]:
#RandomForest Regression and KFold
from sklearn.model_selection import KFold

from sklearn.ensemble import RandomForestRegressor

start = time.time()
rf=RandomForestRegressor(random_state=0)
scores = []

kf = KFold(n_splits=5)

for i, (train_index, test_index) in enumerate(kf.split(X)):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.loc[train_index, :], X.loc[test_index, :]
    y_train, y_test = y[train_index], y[test_index]
    
    label_log=np.log1p(y_train)

    model=rf.fit(X_train, label_log)
    rf_preds1=model.predict(X_test)

    rf_preds=np.expm1(rf_preds1)
        
    rmsle_rf = rmsle_score(rf_preds, y_test)
    print ("Folder cv {}, Random Forest RMSLE is : {}".format(i, rmsle_rf))
    scores.append(rmsle_rf)

print("Mean RMSLE is {}".format(np.mean(scores)))
    
end = time.time()
duration = end - start
print ("It takes {} seconds".format(duration))

Folder cv 0, Random Forest RMSLE is : 0.37742607420957774
Folder cv 1, Random Forest RMSLE is : 0.32100682342020415
Folder cv 2, Random Forest RMSLE is : 0.3661389362945202
Folder cv 3, Random Forest RMSLE is : 0.46853517499641656
Folder cv 4, Random Forest RMSLE is : 0.36004388217409716
Mean RMSLE is 0.37863017821896316
It takes 4.368503093719482 seconds
