# Caterpillar Tube Pricing
## Environment : Python 3
## Author : Arion

In this notebook, we will see how to merge the useful data tables together and do some advanced feature engineering to build a more complete dataset.

### import packages 

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import time

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_rows', 1000)

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline 

### read all 21 csvs

In [2]:
# Load every competition CSV from ../input into its own DataFrame.

# Bill of materials and the per-type component tables.
bill_of_materials_df = pd.read_csv('../input/bill_of_materials.csv')
comp_adaptor_df = pd.read_csv('../input/comp_adaptor.csv')
comp_boss_df = pd.read_csv('../input/comp_boss.csv')
comp_elbow_df = pd.read_csv('../input/comp_elbow.csv')
comp_float_df = pd.read_csv('../input/comp_float.csv')
comp_hfl_df = pd.read_csv('../input/comp_hfl.csv')
comp_nut_df = pd.read_csv('../input/comp_nut.csv')
comp_other_df = pd.read_csv('../input/comp_other.csv')
comp_sleeve_df = pd.read_csv('../input/comp_sleeve.csv')
comp_straight_df = pd.read_csv('../input/comp_straight.csv')
comp_tee_df = pd.read_csv('../input/comp_tee.csv')
comp_threaded_df = pd.read_csv('../input/comp_threaded.csv')
components_df = pd.read_csv('../input/components.csv')
# Specs, the train/test quote tables and the tube tables.
specs_df = pd.read_csv('../input/specs.csv')
test_set_df = pd.read_csv('../input/test_set.csv')
train_set_df = pd.read_csv('../input/train_set.csv')
tube_end_form_df = pd.read_csv('../input/tube_end_form.csv')
tube_df = pd.read_csv('../input/tube.csv')
# Lookup tables for the type codes.
type_component_df = pd.read_csv('../input/type_component.csv')
type_connection_df = pd.read_csv('../input/type_connection.csv')
type_end_form_df = pd.read_csv('../input/type_end_form.csv')

### Merge data

In [3]:
train_set_df.head()

Unnamed: 0,tube_assembly_id,supplier,quote_date,annual_usage,min_order_quantity,bracket_pricing,quantity,cost
0,TA-00002,S-0066,2013-07-07,0,0,Yes,1,21.905933
1,TA-00002,S-0066,2013-07-07,0,0,Yes,2,12.341214
2,TA-00002,S-0066,2013-07-07,0,0,Yes,5,6.601826
3,TA-00002,S-0066,2013-07-07,0,0,Yes,10,4.68777
4,TA-00002,S-0066,2013-07-07,0,0,Yes,25,3.541561


In [4]:
train_set_df.shape

(30213, 8)

In [5]:
test_set_df.head()

Unnamed: 0,id,tube_assembly_id,supplier,quote_date,annual_usage,min_order_quantity,bracket_pricing,quantity
0,1,TA-00001,S-0066,2013-06-23,0,0,Yes,1
1,2,TA-00001,S-0066,2013-06-23,0,0,Yes,2
2,3,TA-00001,S-0066,2013-06-23,0,0,Yes,5
3,4,TA-00001,S-0066,2013-06-23,0,0,Yes,10
4,5,TA-00001,S-0066,2013-06-23,0,0,Yes,25


In [6]:
# Split the target off the training frame and discard the test-only "id"
# column, so both frames carry the same feature columns from here on.
train_label = train_set_df.pop("cost")
del test_set_df["id"]

I did some basic feature engineering during the data merge: some of the tables are too sparse to use directly, so I extracted summary information from them instead.

One example is `bill_of_materials_df`.

In [7]:
#merge1: train + tube_df

# A left join keeps every quote row, in its original order, even if a tube
# assembly were missing from tube_df. The default inner join would silently
# drop such rows, and the labels / submission ids are matched by row later on.
merge1 = pd.merge(train_set_df, tube_df, how="left", on="tube_assembly_id")
test_merge1 = pd.merge(test_set_df, tube_df, how="left", on="tube_assembly_id")

# Duplicate tube_assembly_id keys in tube_df would multiply quote rows.
assert len(merge1) == len(train_set_df), "tube_df merge changed the number of train rows"
assert len(test_merge1) == len(test_set_df), "tube_df merge changed the number of test rows"

In [8]:
#merge2: train + tube_df + bill_of_materials_df(bill_of_materials_summary_df)

# bill_of_materials_df stores up to eight (component_id_N, quantity_N) slot
# pairs per tube assembly. Select the slots by column name rather than by
# position so the summary does not depend on the column order of the CSV.
component_id_cols = ["component_id_" + str(slot) for slot in range(1, 9)]
quantity_cols = ["quantity_" + str(slot) for slot in range(1, 9)]

# How many different components each tube assembly uses (filled id slots).
bill_comp_types_df = bill_of_materials_df[component_id_cols]
bill_comp_types_logical_df = ~bill_comp_types_df.isnull()
component_series = bill_comp_types_logical_df.sum(axis = 1)

# Total number of component pieces needed for the assembly (NaN counts as 0).
bill_comp_quants_df = bill_of_materials_df[quantity_cols]
quants_series = bill_comp_quants_df.sum(axis = 1)

bill_of_materials_summary_df = bill_of_materials_df.copy()
bill_of_materials_summary_df['type_totals'] = component_series
bill_of_materials_summary_df['component_totals'] = quants_series
# Average quantity per component type. Assemblies without any component give
# 0 / 0 = NaN here.
# NOTE(review): the column name says "quality" but holds an average quantity;
# the name is kept so downstream columns stay unchanged.
bill_of_materials_summary_df['component_average_quality'] = bill_of_materials_summary_df["component_totals"] / bill_of_materials_summary_df["type_totals"]

# Left joins preserve every quote row and its order; an inner join would
# silently drop quotes whose assembly has no bill of materials.
merge2 = pd.merge(merge1, bill_of_materials_summary_df, how="left", on="tube_assembly_id")
test_merge2 = pd.merge(test_merge1, bill_of_materials_summary_df, how="left", on="tube_assembly_id")

assert len(merge2) == len(merge1), "bill_of_materials merge changed the number of train rows"
assert len(test_merge2) == len(test_merge1), "bill_of_materials merge changed the number of test rows"

In [9]:
#merge3: train + tube_df + bill_of_materials_df(bill_of_materials_summary_df) + specs_df(totals_spec)

# Count how many of the spec columns (positions 1..10 of specs_df) are filled
# for each tube assembly.
specs_only_df = specs_df.iloc[:, 1:11]
specs_logical_df = ~specs_only_df.isnull()
specs_totals = specs_logical_df.sum(axis=1)

specs_with_totals_df = specs_df.copy()
specs_with_totals_df['spec_totals'] = specs_totals

# Only the count is carried forward. Left joins keep every quote row in order;
# the default inner join would silently drop quotes without a specs entry.
spec_totals_df = specs_with_totals_df[['tube_assembly_id', 'spec_totals']]
merge3 = pd.merge(merge2, spec_totals_df, how="left", on="tube_assembly_id")
test_merge3 = pd.merge(test_merge2, spec_totals_df, how="left", on="tube_assembly_id")

assert len(merge3) == len(merge2), "specs merge changed the number of train rows"
assert len(test_merge3) == len(test_merge2), "specs merge changed the number of test rows"

In [10]:
merge3.head()

Unnamed: 0,tube_assembly_id,supplier,quote_date,annual_usage,min_order_quantity,bracket_pricing,quantity,material_id,diameter,wall,length,num_bends,bend_radius,end_a_1x,end_a_2x,end_x_1x,end_x_2x,end_a,end_x,num_boss,num_bracket,other,component_id_1,quantity_1,component_id_2,quantity_2,component_id_3,quantity_3,component_id_4,quantity_4,component_id_5,quantity_5,component_id_6,quantity_6,component_id_7,quantity_7,component_id_8,quantity_8,type_totals,component_totals,component_average_quality,spec_totals
0,TA-00002,S-0066,2013-07-07,0,0,Yes,1,SP-0019,6.35,0.71,137.0,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.0,0
1,TA-00002,S-0066,2013-07-07,0,0,Yes,2,SP-0019,6.35,0.71,137.0,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.0,0
2,TA-00002,S-0066,2013-07-07,0,0,Yes,5,SP-0019,6.35,0.71,137.0,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.0,0
3,TA-00002,S-0066,2013-07-07,0,0,Yes,10,SP-0019,6.35,0.71,137.0,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.0,0
4,TA-00002,S-0066,2013-07-07,0,0,Yes,25,SP-0019,6.35,0.71,137.0,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.0,0


In [11]:
#tube_end_form_df.columns = ["end_a", "end_x_forming"]
#merge4 = pd.merge(merge3, tube_end_form_df, on="end_a")
#test_merge4 = pd.merge(test_merge3, tube_end_form_df, on="end_x")

In [12]:
#tube_end_form_df.columns = ["end_x", "end_x_forming"]
#merge5 = pd.merge(merge4, tube_end_form_df, on="end_x")
#test_merge5 = pd.merge(test_merge4, tube_end_form_df, on="end_x")

The data merge is done here.

In [13]:
# Independent copies of the merged frames; the feature-engineering cells below
# add columns to these copies rather than to merge3 / test_merge3.
result = merge3.copy()
test_result = test_merge3.copy()

In [14]:
result.shape

(30213, 42)

In [15]:
# Show only the first rows; a bare `result` asks the notebook to render the whole frame.
result.head(10)

Unnamed: 0,tube_assembly_id,supplier,quote_date,annual_usage,min_order_quantity,bracket_pricing,quantity,material_id,diameter,wall,length,num_bends,bend_radius,end_a_1x,end_a_2x,end_x_1x,end_x_2x,end_a,end_x,num_boss,num_bracket,other,component_id_1,quantity_1,component_id_2,quantity_2,component_id_3,quantity_3,component_id_4,quantity_4,component_id_5,quantity_5,component_id_6,quantity_6,component_id_7,quantity_7,component_id_8,quantity_8,type_totals,component_totals,component_average_quality,spec_totals
0,TA-00002,S-0066,2013-07-07,0,0,Yes,1,SP-0019,6.35,0.710,137.0,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.000000,0
1,TA-00002,S-0066,2013-07-07,0,0,Yes,2,SP-0019,6.35,0.710,137.0,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.000000,0
2,TA-00002,S-0066,2013-07-07,0,0,Yes,5,SP-0019,6.35,0.710,137.0,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.000000,0
3,TA-00002,S-0066,2013-07-07,0,0,Yes,10,SP-0019,6.35,0.710,137.0,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.000000,0
4,TA-00002,S-0066,2013-07-07,0,0,Yes,25,SP-0019,6.35,0.710,137.0,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.000000,0
5,TA-00002,S-0066,2013-07-07,0,0,Yes,50,SP-0019,6.35,0.710,137.0,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.000000,0
6,TA-00002,S-0066,2013-07-07,0,0,Yes,100,SP-0019,6.35,0.710,137.0,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.000000,0
7,TA-00002,S-0066,2013-07-07,0,0,Yes,250,SP-0019,6.35,0.710,137.0,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.000000,0
8,TA-00004,S-0066,2013-07-07,0,0,Yes,1,SP-0019,6.35,0.710,137.0,9,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.000000,0
9,TA-00004,S-0066,2013-07-07,0,0,Yes,2,SP-0019,6.35,0.710,137.0,9,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.000000,0


### Create new features

In [16]:
#create new date features

# Parse the quote date and derive year / month, identically for train and test.
# (day and dayofweek are deliberately not generated.)
for quotes in (result, test_result):
    quotes["quote_date"] = pd.to_datetime(quotes["quote_date"])
    quote_dates = quotes["quote_date"].dt
    quotes["year"] = quote_dates.year
    quotes["month"] = quote_dates.month

In [17]:
#create new numeric features from the relationships between tube dimensions

# Bend radius and diameter expressed relative to the wall thickness,
# computed the same way for the train and test frames.
for tubes in (result, test_result):
    wall = tubes["wall"]
    tubes['bend_radius_div_wall'] = tubes["bend_radius"].div(wall)
    tubes['diameter_div_wall'] = tubes["diameter"].div(wall)

Whether `end_a` and `end_x` are the same is a useful variable.

In [18]:
# Boolean flag: both tube ends use the same end form.
for tubes in (result, test_result):
    tubes["same_end_form"] = tubes["end_a"].eq(tubes["end_x"])

There were small clusters of tubes with similar prices and similar IDs. This means that the ordering of the tube IDs has predictive value, so we extract the numeric part of the ID columns as features.

In [19]:
def catch_num_tube_assembly_id(row):
    """Return the last five characters of row["tube_assembly_id"] as an int.

    For example "TA-00002" gives 2.
    """
    trailing_digits = row["tube_assembly_id"][-5:]
    return int(trailing_digits)

def catch_num_supplier(row):
    """Return the last four characters of row["supplier"] as an int.

    For example "S-0066" gives 66.
    """
    trailing_digits = row["supplier"][-4:]
    return int(trailing_digits)

def catch_num_material_id(row):
    """Return the numeric part of row["material_id"], or NaN when it is missing.

    A string id such as "SP-0019" gives 19 (its last four characters as an
    int). A float value is returned unchanged; this is how a missing
    material_id (NaN) passes through. None is reported as NaN as well.
    """
    material = row["material_id"]

    if material is None:
        return float("nan")

    # isinstance (rather than an exact type comparison) also recognises NumPy
    # float scalars such as np.float64("nan"), which cannot be sliced below.
    if isinstance(material, float):
        return material

    return int(material[-4:])

In [20]:
#create new numeric features based on some categorical features

# (new column, row-wise extractor) pairs, applied in this order to the train
# frame first and then to the test frame.
id_extractors = [
    ('num_tube_assembly_id', catch_num_tube_assembly_id),
    ('num_supplier', catch_num_supplier),
    ('num_material_id', catch_num_material_id),
]

for frame in (result, test_result):
    for new_col, extractor in id_extractors:
        frame[new_col] = frame.apply(extractor, axis=1)

The weight of a tube has a close relationship with its price, so we calculate the total component weight of each tube assembly and add it as a new feature.

In [21]:
# For each component-type table, attach its "weight" column to components_df.
# These are inner joins on component_id, so each dfN only holds the components
# that appear in that particular type table.
df1 = pd.merge(components_df, comp_adaptor_df[["component_id", "weight"]], on="component_id")
df2 = pd.merge(components_df, comp_boss_df[["component_id", "weight"]], on="component_id")
df3 = pd.merge(components_df, comp_elbow_df[["component_id", "weight"]], on="component_id")
df4 = pd.merge(components_df, comp_float_df[["component_id", "weight"]], on="component_id")
df5 = pd.merge(components_df, comp_hfl_df[["component_id", "weight"]], on="component_id")
df6 = pd.merge(components_df, comp_nut_df[["component_id", "weight"]], on="component_id")
df7 = pd.merge(components_df, comp_other_df[["component_id", "weight"]], on="component_id")
df8 = pd.merge(components_df, comp_sleeve_df[["component_id", "weight"]], on="component_id")
df9 = pd.merge(components_df, comp_straight_df[["component_id", "weight"]], on="component_id")
df10 = pd.merge(components_df, comp_tee_df[["component_id", "weight"]], on="component_id")
df11 = pd.merge(components_df, comp_threaded_df[["component_id", "weight"]], on="component_id")

In [22]:
# Stack the per-type tables into one component_id -> weight lookup,
# ordered by component_id with a fresh 0..n-1 index.
frames = [df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11]

stacked_components = pd.concat(frames)
ordered_components = stacked_components.sort_values("component_id", ascending=True)
ordered_components = ordered_components.reset_index(drop=True)
component_weight = ordered_components[["component_id", "weight"]]

In [23]:
component_weight.head()

Unnamed: 0,component_id,weight
0,C-0001,0.013
1,C-0002,0.005
2,C-0003,0.014
3,C-0004,0.014
4,C-0005,0.206


In [24]:
# Total component weight per tube assembly: the sum over the eight
# bill-of-materials slots of (unit weight of the component) * (quantity used).
tube_weight = bill_of_materials_df.copy()
tube_weight["weight"] = 0

for i in range(1, 9):
    id_col = "component_id_" + str(i)
    quantity_col = "quantity_" + str(i)
    weight_col = "weight_" + str(i)

    # Rename a copy of the lookup for this slot, so the shared
    # component_weight frame keeps its own column names.
    slot_weights = component_weight.copy()
    slot_weights.columns = [id_col, weight_col]

    tube_weight = pd.merge(tube_weight, slot_weights, how="left", on=id_col)

    # Empty slots and components without a known weight contribute 0.
    # fillna is applied to the expression instead of in place on a column
    # selection, which has no effect under pandas Copy-on-Write.
    slot_total = tube_weight[weight_col].fillna(0) * tube_weight[quantity_col].fillna(0)
    tube_weight["weight"] = slot_total + tube_weight["weight"]

tube_weight = tube_weight[["tube_assembly_id", "weight"]]

In [25]:
tube_weight.head()

Unnamed: 0,tube_assembly_id,weight
0,TA-00001,0.096
1,TA-00002,0.018
2,TA-00003,0.018
3,TA-00004,0.018
4,TA-00005,0.21


In [26]:
#add weight feature of each tube

# Left joins keep every quote row in its original order; the row counts are
# checked because the labels and the submission are matched by row later on.
rows_before, test_rows_before = len(result), len(test_result)

result = pd.merge(result, tube_weight, how="left", on="tube_assembly_id")

test_result = pd.merge(test_result, tube_weight, how="left", on="tube_assembly_id")

assert len(result) == rows_before, "tube_weight merge changed the number of train rows"
assert len(test_result) == test_rows_before, "tube_weight merge changed the number of test rows"

Feature engineering is done here.

In [27]:
# Show only the first rows; a bare `result` asks the notebook to render the whole frame.
result.head(10)

Unnamed: 0,tube_assembly_id,supplier,quote_date,annual_usage,min_order_quantity,bracket_pricing,quantity,material_id,diameter,wall,length,num_bends,bend_radius,end_a_1x,end_a_2x,end_x_1x,end_x_2x,end_a,end_x,num_boss,num_bracket,other,component_id_1,quantity_1,component_id_2,quantity_2,component_id_3,quantity_3,component_id_4,quantity_4,component_id_5,quantity_5,component_id_6,quantity_6,component_id_7,quantity_7,component_id_8,quantity_8,type_totals,component_totals,component_average_quality,spec_totals,year,month,bend_radius_div_wall,diameter_div_wall,same_end_form,num_tube_assembly_id,num_supplier,num_material_id,weight
0,TA-00002,S-0066,2013-07-07,0,0,Yes,1,SP-0019,6.35,0.710,137.0,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.000000,0,2013,7,26.830986,8.943662,True,2,66,19.0,0.018
1,TA-00002,S-0066,2013-07-07,0,0,Yes,2,SP-0019,6.35,0.710,137.0,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.000000,0,2013,7,26.830986,8.943662,True,2,66,19.0,0.018
2,TA-00002,S-0066,2013-07-07,0,0,Yes,5,SP-0019,6.35,0.710,137.0,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.000000,0,2013,7,26.830986,8.943662,True,2,66,19.0,0.018
3,TA-00002,S-0066,2013-07-07,0,0,Yes,10,SP-0019,6.35,0.710,137.0,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.000000,0,2013,7,26.830986,8.943662,True,2,66,19.0,0.018
4,TA-00002,S-0066,2013-07-07,0,0,Yes,25,SP-0019,6.35,0.710,137.0,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.000000,0,2013,7,26.830986,8.943662,True,2,66,19.0,0.018
5,TA-00002,S-0066,2013-07-07,0,0,Yes,50,SP-0019,6.35,0.710,137.0,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.000000,0,2013,7,26.830986,8.943662,True,2,66,19.0,0.018
6,TA-00002,S-0066,2013-07-07,0,0,Yes,100,SP-0019,6.35,0.710,137.0,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.000000,0,2013,7,26.830986,8.943662,True,2,66,19.0,0.018
7,TA-00002,S-0066,2013-07-07,0,0,Yes,250,SP-0019,6.35,0.710,137.0,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.000000,0,2013,7,26.830986,8.943662,True,2,66,19.0,0.018
8,TA-00004,S-0066,2013-07-07,0,0,Yes,1,SP-0019,6.35,0.710,137.0,9,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.000000,0,2013,7,26.830986,8.943662,True,4,66,19.0,0.018
9,TA-00004,S-0066,2013-07-07,0,0,Yes,2,SP-0019,6.35,0.710,137.0,9,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.000000,0,2013,7,26.830986,8.943662,True,4,66,19.0,0.018


### data preparation 

In [28]:
#drop useless features

# The raw assembly id and the raw quote date are removed from both frames.
useless_cols = ["tube_assembly_id", "quote_date"]
for frame in (result, test_result):
    frame.drop(useless_cols, axis=1, inplace=True)

In [29]:
result.head()

Unnamed: 0,supplier,annual_usage,min_order_quantity,bracket_pricing,quantity,material_id,diameter,wall,length,num_bends,bend_radius,end_a_1x,end_a_2x,end_x_1x,end_x_2x,end_a,end_x,num_boss,num_bracket,other,component_id_1,quantity_1,component_id_2,quantity_2,component_id_3,quantity_3,component_id_4,quantity_4,component_id_5,quantity_5,component_id_6,quantity_6,component_id_7,quantity_7,component_id_8,quantity_8,type_totals,component_totals,component_average_quality,spec_totals,year,month,bend_radius_div_wall,diameter_div_wall,same_end_form,num_tube_assembly_id,num_supplier,num_material_id,weight
0,S-0066,0,0,Yes,1,SP-0019,6.35,0.71,137.0,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.0,0,2013,7,26.830986,8.943662,True,2,66,19.0,0.018
1,S-0066,0,0,Yes,2,SP-0019,6.35,0.71,137.0,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.0,0,2013,7,26.830986,8.943662,True,2,66,19.0,0.018
2,S-0066,0,0,Yes,5,SP-0019,6.35,0.71,137.0,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.0,0,2013,7,26.830986,8.943662,True,2,66,19.0,0.018
3,S-0066,0,0,Yes,10,SP-0019,6.35,0.71,137.0,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.0,0,2013,7,26.830986,8.943662,True,2,66,19.0,0.018
4,S-0066,0,0,Yes,25,SP-0019,6.35,0.71,137.0,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.0,0,2013,7,26.830986,8.943662,True,2,66,19.0,0.018


### handle categorical features

In [30]:
# From here on train_set_df / test_set_df no longer hold the raw CSV contents:
# the names are rebound to copies of the engineered frames, which the encoding
# cells below then modify.
train_set_df = result.copy()

test_set_df = test_result.copy()

In [31]:
train_set_df.head()

Unnamed: 0,supplier,annual_usage,min_order_quantity,bracket_pricing,quantity,material_id,diameter,wall,length,num_bends,bend_radius,end_a_1x,end_a_2x,end_x_1x,end_x_2x,end_a,end_x,num_boss,num_bracket,other,component_id_1,quantity_1,component_id_2,quantity_2,component_id_3,quantity_3,component_id_4,quantity_4,component_id_5,quantity_5,component_id_6,quantity_6,component_id_7,quantity_7,component_id_8,quantity_8,type_totals,component_totals,component_average_quality,spec_totals,year,month,bend_radius_div_wall,diameter_div_wall,same_end_form,num_tube_assembly_id,num_supplier,num_material_id,weight
0,S-0066,0,0,Yes,1,SP-0019,6.35,0.71,137.0,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.0,0,2013,7,26.830986,8.943662,True,2,66,19.0,0.018
1,S-0066,0,0,Yes,2,SP-0019,6.35,0.71,137.0,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.0,0,2013,7,26.830986,8.943662,True,2,66,19.0,0.018
2,S-0066,0,0,Yes,5,SP-0019,6.35,0.71,137.0,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.0,0,2013,7,26.830986,8.943662,True,2,66,19.0,0.018
3,S-0066,0,0,Yes,10,SP-0019,6.35,0.71,137.0,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.0,0,2013,7,26.830986,8.943662,True,2,66,19.0,0.018
4,S-0066,0,0,Yes,25,SP-0019,6.35,0.71,137.0,8,19.05,N,N,N,N,EF-008,EF-008,0,0,0,C-1312,2.0,,,,,,,,,,,,,,,1,2.0,2.0,0,2013,7,26.830986,8.943662,True,2,66,19.0,0.018


In [32]:
test_set_df.head()

Unnamed: 0,supplier,annual_usage,min_order_quantity,bracket_pricing,quantity,material_id,diameter,wall,length,num_bends,bend_radius,end_a_1x,end_a_2x,end_x_1x,end_x_2x,end_a,end_x,num_boss,num_bracket,other,component_id_1,quantity_1,component_id_2,quantity_2,component_id_3,quantity_3,component_id_4,quantity_4,component_id_5,quantity_5,component_id_6,quantity_6,component_id_7,quantity_7,component_id_8,quantity_8,type_totals,component_totals,component_average_quality,spec_totals,year,month,bend_radius_div_wall,diameter_div_wall,same_end_form,num_tube_assembly_id,num_supplier,num_material_id,weight
0,S-0066,0,0,Yes,1,SP-0035,12.7,1.65,164.0,5,38.1,N,N,N,N,EF-003,EF-003,0,0,0,C-1622,2.0,C-1629,2.0,,,,,,,,,,,,,2,4.0,2.0,0,2013,6,23.090909,7.69697,True,1,66,35.0,0.096
1,S-0066,0,0,Yes,2,SP-0035,12.7,1.65,164.0,5,38.1,N,N,N,N,EF-003,EF-003,0,0,0,C-1622,2.0,C-1629,2.0,,,,,,,,,,,,,2,4.0,2.0,0,2013,6,23.090909,7.69697,True,1,66,35.0,0.096
2,S-0066,0,0,Yes,5,SP-0035,12.7,1.65,164.0,5,38.1,N,N,N,N,EF-003,EF-003,0,0,0,C-1622,2.0,C-1629,2.0,,,,,,,,,,,,,2,4.0,2.0,0,2013,6,23.090909,7.69697,True,1,66,35.0,0.096
3,S-0066,0,0,Yes,10,SP-0035,12.7,1.65,164.0,5,38.1,N,N,N,N,EF-003,EF-003,0,0,0,C-1622,2.0,C-1629,2.0,,,,,,,,,,,,,2,4.0,2.0,0,2013,6,23.090909,7.69697,True,1,66,35.0,0.096
4,S-0066,0,0,Yes,25,SP-0035,12.7,1.65,164.0,5,38.1,N,N,N,N,EF-003,EF-003,0,0,0,C-1622,2.0,C-1629,2.0,,,,,,,,,,,,,2,4.0,2.0,0,2013,6,23.090909,7.69697,True,1,66,35.0,0.096


### Binary encoding

In [33]:
def binary_encoding(train_df, test_df, feat):
    """Binary-encode one integer-coded categorical feature in train and test.

    The feature must already be encoded as non-negative integers (for example
    by LabelEncoder), with -1 optionally marking a missing value. Each level
    is written in base 2, zero-padded to the width of the largest level, and
    every digit becomes its own uint8 column named "<feat>_bin_<position>",
    with position 0 holding the most significant bit.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames that both contain the column `feat`. They are not modified.
    feat : str
        Name of the integer-coded column to encode.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Copies of `train_df` and `test_df` with the bit columns appended.
        The `feat` column is kept, with any -1 replaced by the highest code
        found in either frame plus one.
    """
    # Work on copies so the sentinel replacement below never leaks into the
    # frames owned by the caller.
    train_df = train_df.copy()
    test_df = test_df.copy()

    # Highest code used by either frame; the next integer stands for "missing".
    feat_max = max(train_df[feat].max(), test_df[feat].max())
    missing_code = feat_max + 1
    train_df.loc[train_df[feat] == -1, feat] = missing_code
    test_df.loc[test_df[feat] == -1, feat] = missing_code

    # Every level that occurs in either frame, sorted ascending.
    union_val = np.union1d(train_df[feat].unique(), test_df[feat].unique())

    # Number of binary digits needed to write the largest level.
    max_bin_len = len("{0:b}".format(union_val.max()))

    # One row of 0/1 digits per level, built in a single pass instead of
    # constructing a Series for every level.
    bit_rows = [
        [int(bit) for bit in "{0:b}".format(level).zfill(max_bin_len)]
        for level in union_val
    ]
    bit_columns = [feat + '_bin_' + str(position) for position in range(max_bin_len)]

    # Lookup table: one row per level, the level itself first, then its bits.
    bin_df = pd.DataFrame(np.array(bit_rows, dtype=np.uint8), columns=bit_columns)
    bin_df.insert(0, feat, union_val)

    # Attach the bit columns to both frames by joining on the level.
    train_df = pd.merge(train_df, bin_df, how='left', on=[feat])
    test_df = pd.merge(test_df, bin_df, how='left', on=[feat])
    return train_df, test_df

In [34]:
# Categorical columns to binary-encode: the quote / tube attributes followed
# by the eight bill-of-materials component id slots.
cat_cols = [
    "supplier", "bracket_pricing", "material_id",
    "end_a_1x", "end_a_2x", "end_x_1x", "end_x_2x",
    "end_a", "end_x", "same_end_form",
] + ["component_id_{}".format(slot) for slot in range(1, 9)]

In [35]:
#do binary encoding for each category

for col in cat_cols:
    print("is handling {}".format(col))

    # Replace missing categories with a single-space placeholder so that
    # LabelEncoder sees one consistent "missing" level. The filled column is
    # assigned back explicitly: an in-place call on a column selection does
    # not update the frame under pandas Copy-on-Write.
    train_set_df[col] = train_set_df[col].fillna(' ')
    test_set_df[col] = test_set_df[col].fillna(' ')

    # Fit on train + test together so both share one integer code per level.
    le = LabelEncoder()
    le.fit(list(train_set_df[col]) + list(test_set_df[col]))
    train_set_df[col] = le.transform(train_set_df[col])
    test_set_df[col] = le.transform(test_set_df[col])

    train_set_df, test_set_df = binary_encoding(train_set_df, test_set_df, col)

    # Keep only the bit columns; the integer code itself is dropped.
    train_set_df = train_set_df.drop(col, axis=1)
    test_set_df = test_set_df.drop(col, axis=1)

is handling supplier
is handling bracket_pricing
is handling material_id
is handling end_a_1x
is handling end_a_2x
is handling end_x_1x
is handling end_x_2x
is handling end_a
is handling end_x
is handling same_end_form
is handling component_id_1
is handling component_id_2
is handling component_id_3
is handling component_id_4
is handling component_id_5
is handling component_id_6
is handling component_id_7
is handling component_id_8


In [36]:
#fill all remaining nulls with 0

train_set_df, test_set_df = train_set_df.fillna(0), test_set_df.fillna(0)

In [37]:
train_set_df.shape

(30213, 110)

## xgb model with Kfold

In [38]:
# Re-attach the target by row position. The check makes a row that was lost or
# duplicated during the merges fail loudly instead of silently pairing
# features with the wrong cost.
assert len(train_set_df) == len(train_label), "feature rows and cost labels are out of sync"
train_set_df["cost"] = train_label.values

data = train_set_df.copy()

In [39]:
#define a evaluation function

def rmsle_score(preds, true):
    """Root mean squared logarithmic error between `preds` and `true`.

    Both inputs are passed through log1p; the squared differences are summed,
    divided by len(true) and the square root of that mean is returned.
    """
    log_gap = np.log1p(preds) - np.log1p(true)
    mean_squared_gap = np.sum(log_gap ** 2) / len(true)
    return mean_squared_gap ** 0.5

In [40]:
#Define an evaluation scorer
# make_scorer is imported from the public sklearn.metrics namespace;
# sklearn.metrics.scorer was a private module and is gone from newer releases.
from sklearn.metrics import make_scorer

# RMSLE is an error (lower is better), so the scorer must be flagged as a loss;
# otherwise model selection tools would favour the worst model.
# rmsle_score is symmetric in its two arguments, so the (y_true, y_pred) order
# used by scikit-learn is fine.
RMSLE = make_scorer(rmsle_score, greater_is_better=False)

In [41]:
# split for cross_val_score machine learning model

label = "cost"

# Every column except the target is a feature.
data_labels = list(data.columns)
data_labels.remove(label)

X, y = data[data_labels], data[label]

In [42]:
#XGB Regression and KFold
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import KFold

start = time.time()

# NOTE(review): "reg:linear" is the legacy name of the squared-error objective;
# newer XGBoost releases spell it "reg:squarederror" - confirm against the
# installed version before changing it.
xgb_regressor=XGBRegressor(max_depth=7, 
                           n_estimators=500, 
                           objective="reg:linear", 
                           min_child_weight = 6,
                           subsample = 0.87,
                           colsample_bytree = 0.50,
                           scale_pos_weight = 1.0,                       
                           learning_rate=0.1)
scores = []

kf = KFold(n_splits=5)

for i, (train_index, test_index) in enumerate(kf.split(X, y)):
    # KFold yields positional indices, so select rows with .iloc; .loc would
    # look the numbers up as index labels and only works on a 0..n-1 index.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train on log1p(cost) so that squared error on the transformed target
    # corresponds to the RMSLE metric.
    y_log = np.log1p(y_train)

    # No eval_metric is passed: without an eval_set nothing would be evaluated
    # during training, and the scikit-learn scorer object is not a metric
    # XGBoost understands.
    model = xgb_regressor.fit(X_train, y_log)
    xgb_preds1 = model.predict(X_test)

    # Back-transform the predictions to the original cost scale.
    xgb_preds = np.expm1(xgb_preds1)

    rmsle_xgb = rmsle_score(xgb_preds, y_test)
    print ("Folder cv {}, XGB RMSLE is : {}".format(i+1, rmsle_xgb))
    scores.append(rmsle_xgb)

print("Mean RMSLE is : {}".format(np.mean(scores)))

end = time.time()
duration = end - start
print ("It takes {} seconds".format(duration))

Folder cv 1, XGB RMSLE is : 0.27345663637689244
Folder cv 2, XGB RMSLE is : 0.24317219516128924
Folder cv 3, XGB RMSLE is : 0.2597193205346117
Folder cv 4, XGB RMSLE is : 0.34049950650441674
Folder cv 5, XGB RMSLE is : 0.277681659526914
Mean RMSLE is : 0.27890586362082487
It takes 39.863399267196655 seconds


## make submission and see the LB score situation

In [43]:
label = "cost"

# Features are all columns except the target; the whole training set is used
# for the final fit.
data_labels = list(train_set_df.columns)
data_labels.remove(label)

train_df, train_label = train_set_df[data_labels], train_set_df[label]

test = test_set_df.copy()

In [44]:
#XGB regression

start = time.time()

# Same hyper-parameters as the cross-validated model.
xgb_params = dict(
    max_depth=7,
    n_estimators=500,
    objective="reg:linear",
    min_child_weight=6,
    subsample=0.87,
    colsample_bytree=0.50,
    scale_pos_weight=1.0,
    learning_rate=0.1,
)
xgb_regressor = XGBRegressor(**xgb_params)

# Fit on log1p(cost) and map the test predictions back with expm1.
label_log = np.log1p(train_label)
model = xgb_regressor.fit(train_df, label_log)
xgb_preds = np.expm1(model.predict(test))

end = time.time()
duration = end - start
print("It takes {} seconds".format(duration))

It takes 10.449810028076172 seconds


In [45]:
# NOTE(review): this file is read from "../" while every other CSV comes from
# "../input/" - confirm the path is intended.
sample_submission = pd.read_csv("../sample_submission.csv")

In [46]:
# One prediction per submission row is required; fail early on a mismatch.
assert len(sample_submission) == len(xgb_preds), "prediction count does not match the submission template"

# Bracket assignment always writes a DataFrame column; attribute assignment
# only does so when the column already exists.
sample_submission["cost"] = xgb_preds

sample_submission.to_csv("../output/submission.csv", index=False)

no binary encoding, CV: 0.2822   LB private score : 0.226609/0.196556   450/1323

binary encoding without drop, cv: 0.28043   LB private score : 0.225591; public score: 0.232261

binary encoding with drop, cv: 0.2789       LB private score : 0.225249; public score: 0.229514