# Caterpillar Tube Pricing
## Environment : Python 3
## Author : Arion

# import packages

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import time

# Raise the pandas display limits so wide or long frames are shown without truncation.
pd.options.display.max_columns = 500
pd.options.display.max_colwidth = 500
pd.options.display.max_rows = 1000

# Suppress every warning for the remainder of the session.
import warnings
warnings.filterwarnings(action='ignore')
%matplotlib inline 

# import data

In [2]:
# Read the input table; the path is relative to the notebook's working directory.
train = pd.read_csv(filepath_or_buffer="../output/combination.csv")

# data preparation 

In [3]:
# There is no common tube_assembly_id between train and test data, so this variable is dropped.
# The frame is rebound (no inplace=True) and errors="ignore" is passed so that
# re-running this cell after the column is already gone does not raise KeyError.
train = train.drop("tube_assembly_id", axis=1, errors="ignore")

In [4]:
# Show the first rows with columns laid out vertically (one row per column name).
train.head().T

Unnamed: 0,0,1,2,3,4
supplier,S-0066,S-0066,S-0066,S-0066,S-0066
quote_date,2013-07-07,2013-07-07,2013-07-07,2013-07-07,2013-07-07
annual_usage,0,0,0,0,0
min_order_quantity,0,0,0,0,0
bracket_pricing,Yes,Yes,Yes,Yes,Yes
quantity,1,2,5,10,25
cost,21.9059,12.3412,6.60183,4.68777,3.54156
material_id,SP-0019,SP-0019,SP-0019,SP-0019,SP-0019
diameter,6.35,6.35,6.35,6.35,6.35
wall,0.71,0.71,0.71,0.71,0.71


In [5]:
# Convert quote_date to datetime so the .dt accessor can be used on it.
train["quote_date"] = pd.to_datetime(train["quote_date"])

In [6]:
# Derive numeric calendar features from the parsed quote date.

train["year"] = train["quote_date"].dt.year
train["month"] = train["quote_date"].dt.month
train["day"] = train["quote_date"].dt.day
# dayofweek is 0 for Monday through 6 for Sunday.
train["day_of_week"] = train["quote_date"].dt.dayofweek

In [7]:
# Keep only the integer and float columns; every other dtype is left out of `data`.
data = train.select_dtypes(include=['int', 'float'], exclude=None)

In [8]:
# Replace missing values with 0 in the numeric frame.
data = data.fillna(0)

# Utility Functions

In [9]:
# define an evaluation function

def rmsle_score(preds, true):
    """Return the root mean squared logarithmic error between two value sets.

    Both inputs are converted to plain NumPy arrays first, so values are
    compared position by position. Without this, two pandas Series carrying
    different indexes would be aligned by label during the subtraction and
    the score would be silently wrong.

    Parameters
    ----------
    preds : array-like
        Predicted values.
    true : array-like
        Observed values; ``len(true)`` is the divisor of the mean.

    Returns
    -------
    numpy.float64
        ``sqrt(sum((log1p(preds) - log1p(true)) ** 2) / len(true))``.
        The metric is symmetric in its two arguments.
    """
    pred_values = np.asarray(preds, dtype=float)
    true_values = np.asarray(true, dtype=float)
    log_error = np.log1p(pred_values) - np.log1p(true_values)
    return (np.sum(log_error ** 2) / len(true)) ** 0.5

In [10]:
# Define an evaluation metric (scorer) usable as the `scoring=` argument.
# make_scorer is imported from the public sklearn.metrics namespace; the
# private sklearn.metrics.scorer module was deprecated and later removed.
from sklearn.metrics import make_scorer

# NOTE(review): make_scorer defaults to greater_is_better=True, so this scorer
# reports the raw (positive) RMSLE. That is fine for printing scores, but a
# search that maximises the score would pick the worst model; build the scorer
# with greater_is_better=False before using it for model selection.
# The scorer calls rmsle_score(y_true, y_pred); the swapped argument order is
# harmless because the metric is symmetric.
RMSLE = make_scorer(rmsle_score)

In [11]:
# define a function for comparing predictions and true data.

def compare_result(preds, true):
    """Build a side-by-side table of true and predicted costs.

    Parameters
    ----------
    preds : array-like or pandas.Series
        Predicted costs, in the same order as ``true``. A Series is reduced
        to its underlying values so it is paired with ``true`` by position;
        otherwise pandas would align the two by index label and the frame
        construction would fail or mix rows.
    true : pandas.Series
        Observed costs. Its index is kept in the ``test_id`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``test_id``, ``real_cost``, ``pred_cost`` and
        ``error_percent_(%)`` (absolute error as a percentage of the real
        cost), with a fresh 0..n-1 index. A real cost of 0 yields an
        infinite or NaN percentage.
    """
    pred_values = preds.values if isinstance(preds, pd.Series) else preds

    compare = pd.DataFrame({"test_id": true.index,
                           "real_cost": true,
                           "pred_cost": pred_values})
    # Fix the column order explicitly and discard the original row labels,
    # which are already preserved in test_id.
    compare = compare[["test_id", "real_cost", "pred_cost"]].reset_index(drop=True)

    compare["error_percent_(%)"] = np.abs(compare.real_cost - compare.pred_cost) / compare.real_cost * 100

    return compare

# application of machine learning model

### Simple Linear Regression

In [12]:
# split for machine learning model

# random_state pins the shuffle so the 70/30 hold-out split, and every score
# computed from it, is the same on each Restart & Run All.
train_data, valid_data = train_test_split(data, test_size=0.3, random_state=0)

label = "cost"

# Every column except the target is used as a feature.
data_labels = train_data.columns.tolist()
data_labels.remove(label)

train_df = train_data[data_labels]
valid_df = valid_data[data_labels]
train_label = train_data[label]
valid_label = valid_data[label]

In [13]:
# Peek at the first five training feature rows.
train_df.head(n=5)

Unnamed: 0,annual_usage,min_order_quantity,quantity,diameter,wall,length,num_bends,bend_radius,num_boss,num_bracket,other,type_totals,component_totals,spec_totals,year,month,day,day_of_week
10894,0,0,10,6.35,0.71,263.0,10,19.05,0,0,0,2,4.0,0,2013,8,4,6
11808,0,0,100,6.35,0.71,93.0,5,19.05,0,0,0,1,2.0,0,2013,6,30,6
25791,0,0,100,12.7,1.24,30.0,2,25.4,0,0,0,2,4.0,0,2013,6,1,5
21636,1,0,6,6.35,0.71,117.0,4,19.05,0,0,0,3,3.0,0,2012,11,20,1
7226,0,0,1,6.35,0.71,43.0,2,19.05,0,0,0,1,2.0,0,2013,8,11,6


In [14]:
# Peek at the first five validation feature rows.
valid_df.head(n=5)

Unnamed: 0,annual_usage,min_order_quantity,quantity,diameter,wall,length,num_bends,bend_radius,num_boss,num_bracket,other,type_totals,component_totals,spec_totals,year,month,day,day_of_week
21739,1,20,20,9.52,0.89,141.0,5,19.05,0,0,0,2,4.0,0,2014,8,18,0
9701,0,0,10,25.4,3.05,47.0,2,50.8,0,0,0,3,3.0,0,2013,10,1,1
14714,34,0,5,9.52,0.89,20.0,0,0.0,0,0,0,3,3.0,7,2008,6,10,1
5207,0,0,1,6.35,0.71,85.0,4,19.05,0,0,0,1,2.0,0,2013,7,21,6
29593,0,0,10,6.35,0.71,186.0,3,19.05,0,0,0,1,2.0,0,2013,8,11,6


In [15]:
# Linear regression on the hold-out split

from sklearn.linear_model import LinearRegression

start = time.time()

# Train on log1p(cost); predictions are mapped back with expm1 before scoring.
label_log = np.log1p(train_label)
linear = LinearRegression()
model = linear.fit(train_df, label_log)

linear_preds1 = model.predict(valid_df)   # predictions in log1p space
linear_preds = np.expm1(linear_preds1)    # predictions on the cost scale

rmsle_linear = rmsle_score(linear_preds, valid_label)
print("Linear RMSLE is : {}".format(rmsle_linear))

# Per-row table of real vs. predicted cost for later inspection.
compare_linear_log = compare_result(preds=linear_preds, true=valid_label)

# The elapsed time covers fitting, prediction, scoring and the comparison table.
end = time.time()
duration = end - start
print("It takes {} seconds".format(duration))

Linear RMSLE is : 0.6419710406256278
It takes 0.20874595642089844 seconds


### Linear Regression with cross_val_score

In [16]:
# Feature matrix and target over the whole frame, for cross_val_score

label = "cost"

# Every column of `data` except the target is a feature.
data_labels = data.columns.tolist()
data_labels.remove(label)

X = data.loc[:, data_labels]
y = data.loc[:, label]

In [22]:
#Linear regression and cross_val_score
# cross_val_score comes from sklearn.model_selection, the module this notebook
# already uses for train_test_split; the old sklearn.cross_validation module
# was removed in scikit-learn 0.20 and no longer imports.
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LinearRegression

start = time.time()
linear=LinearRegression()

# NOTE(review): the model is fit on the raw `cost` target here, whereas the
# hold-out linear fit earlier in the notebook trains on log1p(cost), so the two
# RMSLE figures are not directly comparable. A raw-scale linear model can also
# predict values <= -1, for which log1p yields NaN.
rmsle_scores = cross_val_score(linear, X, y, scoring=RMSLE, cv=5)
print("RMSLE are:{}".format(rmsle_scores))
print("Mean RMSLE score:{}".format(np.mean(rmsle_scores)))

end = time.time()
duration = end - start
print ("It takes {} seconds".format(duration))

RMSLE are:[ 0.82270528  0.82014801  0.87274333  0.85288201  0.86124792]
Mean RMSLE score:0.845945309783391
It takes 0.08819389343261719 seconds


### Simple RandomForest Regression

In [18]:
# Random forest regression on the hold-out split
from sklearn.ensemble import RandomForestRegressor

start = time.time()

# Train on log1p(cost); predictions are mapped back with expm1 before scoring.
label_log = np.log1p(train_label)
rf = RandomForestRegressor(random_state=0)   # fixed seed for the forest
model = rf.fit(train_df, label_log)

rf_preds1 = model.predict(valid_df)   # predictions in log1p space
rf_preds = np.expm1(rf_preds1)        # predictions on the cost scale

rmsle_rf = rmsle_score(rf_preds, valid_label)
print("Random Forest RMSLE is : {}".format(rmsle_rf))

# Per-row table of real vs. predicted cost for later inspection.
compare_rf_log = compare_result(preds=rf_preds, true=valid_label)

# The elapsed time covers fitting, prediction, scoring and the comparison table.
end = time.time()
duration = end - start
print("It takes {} seconds".format(duration))

Random Forest RMSLE is : 0.2854051091772355
It takes 0.8479020595550537 seconds


### RandomForest and cross_val_score

In [25]:
#RandomForest Regression and cross_val_score
# cross_val_score comes from sklearn.model_selection, the module this notebook
# already uses for train_test_split; the old sklearn.cross_validation module
# was removed in scikit-learn 0.20 and no longer imports.
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

start = time.time()
rf=RandomForestRegressor(random_state=0)

# NOTE(review): the forest is fit on the raw `cost` target here, whereas the
# hold-out random-forest fit earlier in the notebook trains on log1p(cost), so
# the two RMSLE figures are not directly comparable.
rmsle_scores = cross_val_score(rf, X, y, scoring=RMSLE, cv=5)
print("RMSLE are:{}".format(rmsle_scores))
print("Mean RMSLE score:{}".format(np.mean(rmsle_scores)))

end = time.time()
duration = end - start
print ("It takes {} seconds".format(duration))

RMSLE are:[ 0.39838117  0.36137671  0.42144332  0.51476111  0.39939352]
Mean RMSLE score:0.4190711670228226
It takes 4.74569296836853 seconds
