# Caterpillar Tube Pricing
## Environment : Python 3
## Author : Arion

In this notebook, we will try to make a competition submission use our model and see the LeadBoard score.

We use the whole train data to set up a RandomForest model and do a submission with test dataset.

### import packages 

In [139]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import time

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.max_rows', 1000)

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline 

### read all 21 csvs

In [180]:
bill_of_materials_df = pd.read_csv('../input/bill_of_materials.csv')
comp_adaptor_df = pd.read_csv('../input/comp_adaptor.csv')
comp_boss_df = pd.read_csv('../input/comp_boss.csv')
comp_elbow_df = pd.read_csv('../input/comp_elbow.csv')
comp_float_df = pd.read_csv('../input/comp_float.csv')
comp_hfl_df = pd.read_csv('../input/comp_hfl.csv')
comp_nut_df = pd.read_csv('../input/comp_nut.csv')
comp_other_df = pd.read_csv('../input/comp_other.csv')
comp_sleeve_df = pd.read_csv('../input/comp_sleeve.csv')
comp_straight_df = pd.read_csv('../input/comp_straight.csv')
comp_tee_df = pd.read_csv('../input/comp_tee.csv')
comp_threaded_df = pd.read_csv('../input/comp_threaded.csv')
components_df = pd.read_csv('../input/components.csv')
specs_df = pd.read_csv('../input/specs.csv')
test_set_df = pd.read_csv('../input/test_set.csv')
train_set_df = pd.read_csv('../input/train_set.csv')
tube_end_form_df = pd.read_csv('../input/tube_end_form.csv')
tube_df = pd.read_csv('../input/tube.csv')
type_component_df = pd.read_csv('../input/type_component.csv')
type_connection_df = pd.read_csv('../input/type_connection.csv')
type_end_form_df = pd.read_csv('../input/type_end_form.csv')

In [141]:
test_set_df.head()

Unnamed: 0,id,tube_assembly_id,supplier,quote_date,annual_usage,min_order_quantity,bracket_pricing,quantity
0,1,TA-00001,S-0066,2013-06-23,0,0,Yes,1
1,2,TA-00001,S-0066,2013-06-23,0,0,Yes,2
2,3,TA-00001,S-0066,2013-06-23,0,0,Yes,5
3,4,TA-00001,S-0066,2013-06-23,0,0,Yes,10
4,5,TA-00001,S-0066,2013-06-23,0,0,Yes,25


### import sample_submission csv

In [142]:
sample_submission = pd.read_csv("../sample_submission.csv")

In [143]:
sample_submission.head()

Unnamed: 0,id,cost
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0


# data cleaning and preparation

In [144]:
train_set_df.quote_date = pd.to_datetime(train_set_df.quote_date)

In [145]:
test_set_df.quote_date = pd.to_datetime(test_set_df.quote_date)

In [146]:
#bill_of_materials_df

#how to replace data that has null id but numeric quantity???

In [147]:
components_df.component_id.replace("9999", "other", inplace=True)

In [148]:
# replace 9999.0 entries in bend_radius column with np.nan entries
tube_df = tube_df.replace(9999.0, np.nan)
tube_df = tube_df.replace('9999', 'other')
print (tube_df.shape)

(21198, 16)


### Following Datasets are data that we have not found how to merge.

In [149]:
tube_end_form_df.head()

Unnamed: 0,end_form_id,forming
0,EF-001,Yes
1,EF-002,No
2,EF-003,No
3,EF-004,No
4,EF-005,Yes


In [150]:
bill_of_materials_df.head()

Unnamed: 0,tube_assembly_id,component_id_1,quantity_1,component_id_2,quantity_2,component_id_3,quantity_3,component_id_4,quantity_4,component_id_5,quantity_5,component_id_6,quantity_6,component_id_7,quantity_7,component_id_8,quantity_8
0,TA-00001,C-1622,2.0,C-1629,2.0,,,,,,,,,,,,
1,TA-00002,C-1312,2.0,,,,,,,,,,,,,,
2,TA-00003,C-1312,2.0,,,,,,,,,,,,,,
3,TA-00004,C-1312,2.0,,,,,,,,,,,,,,
4,TA-00005,C-1624,1.0,C-1631,1.0,C-1641,1.0,,,,,,,,,,


In [151]:
comp_adaptor_df.head()

Unnamed: 0,component_id,component_type_id,adaptor_angle,overall_length,end_form_id_1,connection_type_id_1,length_1,thread_size_1,thread_pitch_1,nominal_size_1,end_form_id_2,connection_type_id_2,length_2,thread_size_2,thread_pitch_2,nominal_size_2,hex_size,unique_feature,orientation,weight
0,C-0005,CP-028,,58.4,A-001,B-001,,1.312,12.0,,A-001,B-004,,1.0,11.5,,34.93,No,No,0.206
1,C-0006,CP-028,,34.8,A-001,B-001,,0.437,20.0,,A-001,B-005,,0.75,16.0,,22.2,No,No,0.083
2,C-1435,CP-028,,20.3,A-007,B-004,,,,15.88,A-001,B-007,,0.875,18.0,,22.22,No,No,0.023
3,C-1546,CP-028,,26.4,A-007,B-004,,0.125,27.0,,A-001,B-004,,0.125,27.0,,15.88,No,No,0.026
4,C-1583,CP-028,,44.5,A-001,B-005,,1.312,12.0,,A-007,B-005,,1.062,12.0,,38.1,No,No,0.256


In [152]:
components_df.head()

Unnamed: 0,component_id,name,component_type_id
0,other,OTHER,OTHER
1,C-0001,SLEEVE,CP-024
2,C-0002,SLEEVE,CP-024
3,C-0003,SLEEVE-FLARED,CP-024
4,C-0004,NUT,CP-026


In [153]:
type_component_df.head()

Unnamed: 0,component_type_id,name
0,CP-001,4-bolt Tig Straight
1,CP-002,4-bolt MJ Straight
2,CP-003,4-bolt Braze/Weld Straight
3,CP-004,2-bolt Braze/Weld Straight
4,CP-005,2-bolt MJ Straight


In [154]:
type_connection_df.head()

Unnamed: 0,connection_type_id,name
0,B-001,37 deg Flare-SAE J514
1,B-002,ORFS-SAE J1453
2,B-003,Hi-Duty
3,B-004,NPTF-SAE J476/J514
4,B-005,SAE STOR-SAE J1926


In [155]:
type_end_form_df.head()

Unnamed: 0,end_form_id,name
0,A-001,Male (Stud)
1,A-002,Male (Swivel)
2,A-003,Braze-Weld Boss
3,A-004,Braze-Weld Socket
4,A-005,Swivel Nut


### Merge data

I did some basic feature engineering during data merge. Because some data metrics are too sparse to use directly, so I extracted information from them.

In [156]:
#merge1: train + tube_df

merge1 = train_set_df.merge(tube_df)

test_merge1 = test_set_df.merge(tube_df)

In [157]:
#merge2: train + tube_df + bill_of_materials_df(bill_of_materials_summary_df)
bill_comp_types_df = bill_of_materials_df.iloc[:,[1,3,5,7,9,11,13,15]]
bill_comp_types_logical_df = ~bill_comp_types_df.isnull()
component_series = bill_comp_types_logical_df.sum(axis = 1)

bill_comp_quants_df = bill_of_materials_df.iloc[:,[2,4,6,8,10,12,14,16]]
quants_series = bill_comp_quants_df.sum(axis = 1)

bill_of_materials_summary_df = bill_of_materials_df.copy()
bill_of_materials_summary_df['type_totals'] = component_series
bill_of_materials_summary_df['component_totals'] = quants_series

merge2 = merge1.merge(bill_of_materials_summary_df[['tube_assembly_id', 'type_totals', 'component_totals']])

test_merge2 = test_merge1.merge(bill_of_materials_summary_df[['tube_assembly_id', 'type_totals', 'component_totals']])

In [158]:
#merge3: train + tube_df + bill_of_materials_df(bill_of_materials_summary_df) + specs_df(totals_spec)
specs_only_df = specs_df.iloc[:, 1:11]
specs_logical_df = ~specs_only_df.isnull()
specs_totals = specs_logical_df.sum(axis=1)

specs_with_totals_df = specs_df.copy()
specs_with_totals_df['spec_totals'] = specs_totals

merge3 = merge2.merge(specs_with_totals_df[['tube_assembly_id', 'spec_totals']])

test_merge3 = test_merge2.merge(specs_with_totals_df[['tube_assembly_id', 'spec_totals']])

In [159]:
merge3.head()

Unnamed: 0,tube_assembly_id,supplier,quote_date,annual_usage,min_order_quantity,bracket_pricing,quantity,cost,material_id,diameter,...,end_x_1x,end_x_2x,end_a,end_x,num_boss,num_bracket,other,type_totals,component_totals,spec_totals
0,TA-00002,S-0066,2013-07-07,0,0,Yes,1,21.905933,SP-0019,6.35,...,N,N,EF-008,EF-008,0,0,0,1,2.0,0
1,TA-00002,S-0066,2013-07-07,0,0,Yes,2,12.341214,SP-0019,6.35,...,N,N,EF-008,EF-008,0,0,0,1,2.0,0
2,TA-00002,S-0066,2013-07-07,0,0,Yes,5,6.601826,SP-0019,6.35,...,N,N,EF-008,EF-008,0,0,0,1,2.0,0
3,TA-00002,S-0066,2013-07-07,0,0,Yes,10,4.68777,SP-0019,6.35,...,N,N,EF-008,EF-008,0,0,0,1,2.0,0
4,TA-00002,S-0066,2013-07-07,0,0,Yes,25,3.541561,SP-0019,6.35,...,N,N,EF-008,EF-008,0,0,0,1,2.0,0


In [160]:
test_merge3.head()

Unnamed: 0,id,tube_assembly_id,supplier,quote_date,annual_usage,min_order_quantity,bracket_pricing,quantity,material_id,diameter,...,end_x_1x,end_x_2x,end_a,end_x,num_boss,num_bracket,other,type_totals,component_totals,spec_totals
0,1,TA-00001,S-0066,2013-06-23,0,0,Yes,1,SP-0035,12.7,...,N,N,EF-003,EF-003,0,0,0,2,4.0,0
1,2,TA-00001,S-0066,2013-06-23,0,0,Yes,2,SP-0035,12.7,...,N,N,EF-003,EF-003,0,0,0,2,4.0,0
2,3,TA-00001,S-0066,2013-06-23,0,0,Yes,5,SP-0035,12.7,...,N,N,EF-003,EF-003,0,0,0,2,4.0,0
3,4,TA-00001,S-0066,2013-06-23,0,0,Yes,10,SP-0035,12.7,...,N,N,EF-003,EF-003,0,0,0,2,4.0,0
4,5,TA-00001,S-0066,2013-06-23,0,0,Yes,25,SP-0035,12.7,...,N,N,EF-003,EF-003,0,0,0,2,4.0,0


In [161]:
train = merge3.copy()

test = test_merge3.copy()

In [162]:
train.drop("tube_assembly_id", axis=1, inplace=True)

test.drop("tube_assembly_id", axis=1, inplace=True)
test.drop("id", axis=1, inplace=True)

In [163]:
train.head().transpose()

Unnamed: 0,0,1,2,3,4
supplier,S-0066,S-0066,S-0066,S-0066,S-0066
quote_date,2013-07-07 00:00:00,2013-07-07 00:00:00,2013-07-07 00:00:00,2013-07-07 00:00:00,2013-07-07 00:00:00
annual_usage,0,0,0,0,0
min_order_quantity,0,0,0,0,0
bracket_pricing,Yes,Yes,Yes,Yes,Yes
quantity,1,2,5,10,25
cost,21.9059,12.3412,6.60183,4.68777,3.54156
material_id,SP-0019,SP-0019,SP-0019,SP-0019,SP-0019
diameter,6.35,6.35,6.35,6.35,6.35
wall,0.71,0.71,0.71,0.71,0.71


In [164]:
test.head().transpose()

Unnamed: 0,0,1,2,3,4
supplier,S-0066,S-0066,S-0066,S-0066,S-0066
quote_date,2013-06-23 00:00:00,2013-06-23 00:00:00,2013-06-23 00:00:00,2013-06-23 00:00:00,2013-06-23 00:00:00
annual_usage,0,0,0,0,0
min_order_quantity,0,0,0,0,0
bracket_pricing,Yes,Yes,Yes,Yes,Yes
quantity,1,2,5,10,25
material_id,SP-0035,SP-0035,SP-0035,SP-0035,SP-0035
diameter,12.7,12.7,12.7,12.7,12.7
wall,1.65,1.65,1.65,1.65,1.65
length,164,164,164,164,164


In [165]:
train.quote_date = pd.to_datetime(train.quote_date)

test.quote_date = pd.to_datetime(train.quote_date)

In [166]:
train["year"] = train.quote_date.dt.year
train["month"] = train.quote_date.dt.month
train["day"] = train.quote_date.dt.day
train["day_of_week"] = train.quote_date.dt.dayofweek

test["year"] = test.quote_date.dt.year
test["month"] = test.quote_date.dt.month
test["day"] = test.quote_date.dt.day
test["day_of_week"] = test.quote_date.dt.dayofweek

In [167]:
#only use numeric data
data = train.select_dtypes(include=['int', 'float'])

test = test.select_dtypes(include=["int", "float"])

In [168]:
#fill null by 0
data.replace(np.nan, 0, inplace=True)

test.replace(np.nan, 0, inplace=True)

# split for machine learning models

In [169]:
train_data, valid_data = train_test_split(data, test_size = 0)

In [170]:
label = "cost"

In [171]:
data_labels = train_data.columns.tolist()
data_labels.remove(label)

In [172]:
train_df = train_data[data_labels]
valid_df = valid_data[data_labels]
train_label = train_data[label]
valid_label = valid_data[label]

In [173]:
train_df.shape

(30213, 18)

In [174]:
test.shape

(30235, 18)

# application of machine learning models

In [175]:
# sklearn random forest regression
from sklearn.ensemble import RandomForestRegressor

def rf_learning(labels, train, test):
    label_log=np.log1p(labels)
    clf=RandomForestRegressor(n_estimators=50, n_jobs=-1)
    model=clf.fit(train, label_log)
    preds1=model.predict(test)
    preds=np.expm1(preds1)
    return  preds

# submit submission 

In [176]:
rf_preds = rf_learning(train_label, train_df, test)
sample_submission.cost = rf_preds

In [177]:
sample_submission.to_csv("../output/submission.csv", index=False)

In [179]:
#LB with 100% train data is 970/1323 0.354972/0.196556