In [1]:
# mac475의 ipython 표준 style을 적용함
from IPython.core.display import HTML
styles = open("../styles/custom.css", "r").read()
HTML( styles )

#1. component 계열의 dataset 전체통합

- component dataset의 merge 필요성

    → tube → bill에서 발견되는 component 정보는 type별로 adaptor~threaded까지 다양한 종류별로 특성화된 feature를 보유
    → 확보하여 분석목적
    
- 단, 각 component dataset을 보완필요 있는지 사전확인후, 적절한 작업의 병행필요

-<font color='red'><b>본 dataset merge는 meta dataset들간의 통합임</b></font>
	
<img src="images/04.comp.family.png" style="display:inline; width: 100%" />

#2. 각 comp_ dataset내 feature name의 변경

* 궁극적으로 components와 comp_ 로 시작하는 component family dataset들을 하나로 통합하기 위한 목적
* pandas dataset merge시 동일한 column name에 대해서는 임의로 부여되기 때문에 이를 방지하기 위해 prefix 식별자를 추가함

In [2]:
import pandas as pd
import numpy as np
import os

base_location = './dataset/01.original.dataset/'

for root, dirs, files in os.walk( base_location ) :    # 특정 directory 하위를 순환
    for file in files :
        if file.startswith( 'comp_' ) :    # comp_ 계열 file만 대상으로 수행
            prefix = file[5:-4] + '_'    # prefix 생성
            df = pd.read_csv( base_location + file )
            col_list = df.columns.values.tolist()    # dataframe의 column name list 생성
            for col in col_list[1:] :
                df.rename( columns = { col : prefix + col }, inplace = True )    # column name 변경  
            df.to_csv( './dataset/comp.colname.changed/' + file, index = False )    # output
            df = None

#3. 각 dataset의 개별적 검토/ merge통한 feature 확장 및 누락값 처리

#3.1 comp_adaptor

In [3]:
df = pd.read_csv( './dataset/comp.colname.changed/comp_adaptor.csv' )
for col in df.columns.values.tolist() :    # 각 column의 검토
    print( col )

component_id
adaptor_component_type_id
adaptor_adaptor_angle
adaptor_overall_length
adaptor_end_form_id_1
adaptor_connection_type_id_1
adaptor_length_1
adaptor_thread_size_1
adaptor_thread_pitch_1
adaptor_nominal_size_1
adaptor_end_form_id_2
adaptor_connection_type_id_2
adaptor_length_2
adaptor_thread_size_2
adaptor_thread_pitch_2
adaptor_nominal_size_2
adaptor_hex_size
adaptor_unique_feature
adaptor_orientation
adaptor_weight


In [4]:
df.count()

component_id                    25
adaptor_component_type_id       25
adaptor_adaptor_angle            1
adaptor_overall_length          25
adaptor_end_form_id_1           25
adaptor_connection_type_id_1    24
adaptor_length_1                 1
adaptor_thread_size_1           17
adaptor_thread_pitch_1          17
adaptor_nominal_size_1           8
adaptor_end_form_id_2           25
adaptor_connection_type_id_2    24
adaptor_length_2                 1
adaptor_thread_size_2           23
adaptor_thread_pitch_2          23
adaptor_nominal_size_2           2
adaptor_hex_size                17
adaptor_unique_feature          25
adaptor_orientation             25
adaptor_weight                  23
dtype: int64

* 누락값 많은 feature 제외

In [5]:
df.drop( [ 'adaptor_component_type_id',
#            'adaptor_adaptor_angle' ,    # 개수가 적음 : 1개
#            'adaptor_length_1',    # 개수가 적음 : 1개
#            'adaptor_nominal_size_1',    # 개수가 적음 : 8개
#            'adaptor_length_2',    # 개수가 적음 : 1개
#            'adaptor_nominal_size_2'    # 개수가 적음 : 2개
         ], axis = 1, inplace = True )

In [6]:
df.count()

component_id                    25
adaptor_adaptor_angle            1
adaptor_overall_length          25
adaptor_end_form_id_1           25
adaptor_connection_type_id_1    24
adaptor_length_1                 1
adaptor_thread_size_1           17
adaptor_thread_pitch_1          17
adaptor_nominal_size_1           8
adaptor_end_form_id_2           25
adaptor_connection_type_id_2    24
adaptor_length_2                 1
adaptor_thread_size_2           23
adaptor_thread_pitch_2          23
adaptor_nominal_size_2           2
adaptor_hex_size                17
adaptor_unique_feature          25
adaptor_orientation             25
adaptor_weight                  23
dtype: int64

* numeric은 median, categorical은 판단하여 fill

In [7]:
################################
# na_num_val = '0'
na_num_val = 'median'
# na_num_val = '-1'

na_cat_val = 'NONE'
# na_cat_val = 'top'
################################

def replace_na_value( sz, is_num ) :
    global na_num_val
    global na_cat_val
    
    if is_num == True :
        if na_num_val == '0' :
            val = 0
        elif na_num_val == 'median' :
            val = sz.median()
        elif na_num_val == '-1' :
            val = -1
    else :
        if na_cat_val == 'NONE' :
            val = 'NONE'
        elif na_cat_val == 'top' :
            val = sz.value_counts()
    
    return val

def replace_9999_value( p_df, colname ) :
    sz = p_df[ colname ][ ( p_df[ colname ].notnull() ) & ( p_df[ colname ] != 9999 ) ]
    val = sz.median()
    return val

In [8]:
df[ 'adaptor_adaptor_angle' ].fillna( replace_na_value( df[ 'adaptor_adaptor_angle' ], True ), inplace = True )
# df[ 'adaptor_overall_length' ].fillna( replace_na_value( df[ 'adaptor_overall_length' ], True ), inplace = True )
df[ 'adaptor_connection_type_id_1' ].fillna( replace_na_value( df[ 'adaptor_connection_type_id_1' ], False ), inplace = True )
df[ 'adaptor_length_1' ].fillna( replace_na_value( df[ 'adaptor_length_1' ], True ), inplace = True )
df[ 'adaptor_thread_size_1' ].fillna( replace_na_value( df[ 'adaptor_thread_size_1' ], True ), inplace = True )
df[ 'adaptor_thread_pitch_1' ].fillna( replace_na_value( df[ 'adaptor_thread_pitch_1' ], True ), inplace = True )
df[ 'adaptor_nominal_size_1' ].fillna( replace_na_value( df[ 'adaptor_nominal_size_1' ], True ), inplace = True )
df[ 'adaptor_connection_type_id_2' ].fillna( replace_na_value( df[ 'adaptor_connection_type_id_2' ], False ), inplace = True )
df[ 'adaptor_length_2' ].fillna( replace_na_value( df[ 'adaptor_nominal_size_1' ], True ), inplace = True )
df[ 'adaptor_thread_size_2' ].fillna( replace_na_value( df[ 'adaptor_thread_size_2' ], True ), inplace = True )
df[ 'adaptor_thread_size_2' ].replace( to_replace = 9999,
                                       value = replace_9999_value( df, 'adaptor_thread_size_2' ),
                                       inplace = True )
df[ 'adaptor_thread_pitch_2' ].fillna( replace_na_value( df[ 'adaptor_thread_pitch_2' ], True ), inplace = True )
df[ 'adaptor_thread_pitch_2' ].replace( to_replace = 9999,
                                        value = replace_9999_value( df, 'adaptor_thread_pitch_2' ),
                                        inplace = True )
df[ 'adaptor_nominal_size_2' ].fillna( replace_na_value( df[ 'adaptor_nominal_size_2' ], True ), inplace = True )
df[ 'adaptor_hex_size' ].fillna( replace_na_value( df[ 'adaptor_hex_size' ], True ), inplace = True )
dic = { 'Yes' : 'Y', 'No' : 'N' }
df[ 'adaptor_unique_feature' ].replace( dic, inplace = True )
df[ 'adaptor_orientation' ].replace( dic, inplace = True )
df[ 'adaptor_weight' ].fillna( replace_na_value( df[ 'adaptor_weight' ], True ), inplace = True )

In [9]:
df.count()

component_id                    25
adaptor_adaptor_angle           25
adaptor_overall_length          25
adaptor_end_form_id_1           25
adaptor_connection_type_id_1    25
adaptor_length_1                25
adaptor_thread_size_1           25
adaptor_thread_pitch_1          25
adaptor_nominal_size_1          25
adaptor_end_form_id_2           25
adaptor_connection_type_id_2    25
adaptor_length_2                25
adaptor_thread_size_2           25
adaptor_thread_pitch_2          25
adaptor_nominal_size_2          25
adaptor_hex_size                25
adaptor_unique_feature          25
adaptor_orientation             25
adaptor_weight                  25
dtype: int64

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25 entries, 0 to 24
Data columns (total 19 columns):
component_id                    25 non-null object
adaptor_adaptor_angle           25 non-null float64
adaptor_overall_length          25 non-null float64
adaptor_end_form_id_1           25 non-null object
adaptor_connection_type_id_1    25 non-null object
adaptor_length_1                25 non-null float64
adaptor_thread_size_1           25 non-null float64
adaptor_thread_pitch_1          25 non-null float64
adaptor_nominal_size_1          25 non-null float64
adaptor_end_form_id_2           25 non-null object
adaptor_connection_type_id_2    25 non-null object
adaptor_length_2                25 non-null float64
adaptor_thread_size_2           25 non-null float64
adaptor_thread_pitch_2          25 non-null float64
adaptor_nominal_size_2          25 non-null float64
adaptor_hex_size                25 non-null float64
adaptor_unique_feature          25 non-null object
adaptor_orientation

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25 entries, 0 to 24
Data columns (total 19 columns):
component_id                    25 non-null object
adaptor_adaptor_angle           25 non-null float64
adaptor_overall_length          25 non-null float64
adaptor_end_form_id_1           25 non-null object
adaptor_connection_type_id_1    25 non-null object
adaptor_length_1                25 non-null float64
adaptor_thread_size_1           25 non-null float64
adaptor_thread_pitch_1          25 non-null float64
adaptor_nominal_size_1          25 non-null float64
adaptor_end_form_id_2           25 non-null object
adaptor_connection_type_id_2    25 non-null object
adaptor_length_2                25 non-null float64
adaptor_thread_size_2           25 non-null float64
adaptor_thread_pitch_2          25 non-null float64
adaptor_nominal_size_2          25 non-null float64
adaptor_hex_size                25 non-null float64
adaptor_unique_feature          25 non-null object
adaptor_orientation

In [12]:
df.count()

component_id                    25
adaptor_adaptor_angle           25
adaptor_overall_length          25
adaptor_end_form_id_1           25
adaptor_connection_type_id_1    25
adaptor_length_1                25
adaptor_thread_size_1           25
adaptor_thread_pitch_1          25
adaptor_nominal_size_1          25
adaptor_end_form_id_2           25
adaptor_connection_type_id_2    25
adaptor_length_2                25
adaptor_thread_size_2           25
adaptor_thread_pitch_2          25
adaptor_nominal_size_2          25
adaptor_hex_size                25
adaptor_unique_feature          25
adaptor_orientation             25
adaptor_weight                  25
dtype: int64

In [13]:
df.to_csv( './dataset/comp.verified/comp_adaptor.verified.csv', index = False )

.

##3.2 comp_boss

In [14]:
df = pd.read_csv( './dataset/comp.colname.changed/comp_boss.csv' )
for col in df.columns.values.tolist() :    # 각 column의 검토
    print( col )    

component_id
boss_component_type_id
boss_type
boss_connection_type_id
boss_outside_shape
boss_base_type
boss_height_over_tube
boss_bolt_pattern_long
boss_bolt_pattern_wide
boss_groove
boss_base_diameter
boss_shoulder_diameter
boss_unique_feature
boss_orientation
boss_weight


In [15]:
df.count()

component_id               147
boss_component_type_id     147
boss_type                  124
boss_connection_type_id    147
boss_outside_shape         124
boss_base_type             124
boss_height_over_tube      147
boss_bolt_pattern_long      23
boss_bolt_pattern_wide      17
boss_groove                147
boss_base_diameter          57
boss_shoulder_diameter      30
boss_unique_feature        147
boss_orientation           147
boss_weight                145
dtype: int64

In [16]:
df.drop( [ 'boss_component_type_id',
#            'boss_bolt_pattern_long',    # 개수가 적음 : 23개
#            'boss_bolt_pattern_wide',    # 개수가 적음 : 17개
#            'boss_base_diameter',        # 개수가 적음 : 57개
#            'boss_shoulder_diameter',    # 개수가 적음 : 30개
#            'boss_orientation',    # Yes만 있음
         ],
           axis = 1, inplace = True )

In [17]:
df[ 'boss_type' ].fillna( replace_na_value( df[ 'boss_type' ], False ), inplace = True )
df[ 'boss_outside_shape' ].fillna( replace_na_value( df[ 'boss_outside_shape' ], False ), inplace = True )
df[ 'boss_base_type' ].fillna( replace_na_value( df[ 'boss_base_type' ], False ), inplace = True )
df[ 'boss_height_over_tube' ].replace( to_replace = 9999,
                                       value = replace_9999_value( df, 'boss_height_over_tube' ),
                                       inplace = True
                                     )
df[ 'boss_bolt_pattern_long' ].fillna( replace_na_value( df[ 'boss_bolt_pattern_long' ], True ), inplace = True )
df[ 'boss_bolt_pattern_wide' ].fillna( replace_na_value( df[ 'boss_bolt_pattern_wide' ], True ), inplace = True )
df[ 'boss_groove' ].replace( dic, inplace = True )
df[ 'boss_base_diameter' ].fillna( replace_na_value( df[ 'boss_base_diameter' ], True ), inplace = True )
df[ 'boss_shoulder_diameter' ].fillna( replace_na_value( df[ 'boss_shoulder_diameter' ], True ), inplace = True )
df[ 'boss_unique_feature' ].replace( dic, inplace = True )
df[ 'boss_orientation' ].replace( dic, inplace = True )
df[ 'boss_weight' ].fillna( replace_na_value( df[ 'boss_weight' ], True ), inplace = True )

In [18]:
# boss의 부피를 추정
# df[ 'boss_volume' ] = df[ 'boss_bolt_pattern_long' ] * df[ 'boss_bolt_pattern_wide' ] * df[ 'boss_height_over_tube' ]

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147 entries, 0 to 146
Data columns (total 14 columns):
component_id               147 non-null object
boss_type                  147 non-null object
boss_connection_type_id    147 non-null object
boss_outside_shape         147 non-null object
boss_base_type             147 non-null object
boss_height_over_tube      147 non-null float64
boss_bolt_pattern_long     147 non-null float64
boss_bolt_pattern_wide     147 non-null float64
boss_groove                147 non-null object
boss_base_diameter         147 non-null float64
boss_shoulder_diameter     147 non-null float64
boss_unique_feature        147 non-null object
boss_orientation           147 non-null object
boss_weight                147 non-null float64
dtypes: float64(6), object(8)
memory usage: 17.2+ KB


In [20]:
df.count()

component_id               147
boss_type                  147
boss_connection_type_id    147
boss_outside_shape         147
boss_base_type             147
boss_height_over_tube      147
boss_bolt_pattern_long     147
boss_bolt_pattern_wide     147
boss_groove                147
boss_base_diameter         147
boss_shoulder_diameter     147
boss_unique_feature        147
boss_orientation           147
boss_weight                147
dtype: int64

In [21]:
df.to_csv( './dataset/comp.verified/comp_boss.verified.csv', index = False )

.

##3.3 comp_elbow

In [22]:
df = pd.read_csv( './dataset/comp.colname.changed/comp_elbow.csv' )
for col in df.columns.values.tolist() :    # 각 column의 검토
    print( col )    

component_id
elbow_component_type_id
elbow_bolt_pattern_long
elbow_bolt_pattern_wide
elbow_extension_length
elbow_overall_length
elbow_thickness
elbow_drop_length
elbow_elbow_angle
elbow_mj_class_code
elbow_mj_plug_class_code
elbow_plug_diameter
elbow_groove
elbow_unique_feature
elbow_orientation
elbow_weight


In [23]:
df.count()

component_id                178
elbow_component_type_id     178
elbow_bolt_pattern_long     171
elbow_bolt_pattern_wide     138
elbow_extension_length      170
elbow_overall_length        175
elbow_thickness             171
elbow_drop_length           171
elbow_elbow_angle           130
elbow_mj_class_code          41
elbow_mj_plug_class_code     40
elbow_plug_diameter           7
elbow_groove                178
elbow_unique_feature        178
elbow_orientation           178
elbow_weight                176
dtype: int64

In [24]:
df.describe()

Unnamed: 0,elbow_bolt_pattern_long,elbow_bolt_pattern_wide,elbow_extension_length,elbow_overall_length,elbow_thickness,elbow_drop_length,elbow_elbow_angle,elbow_plug_diameter,elbow_weight
count,171.0,138.0,170.0,175.0,171.0,171.0,130.0,7.0,176.0
mean,63.624854,33.242826,48.724647,83.251657,46.552398,89.470526,91.618077,50.142857,1.759369
std,17.217936,12.497652,15.134788,28.720575,23.972189,762.389391,8.433971,19.445896,1.91664
min,36.07,17.48,16.5,28.0,7.0,7.9,90.0,25.0,0.064
25%,52.37,26.19,39.0,64.3,30.0,22.325,90.0,35.0,0.663
50%,58.7,30.18,49.16,78.1,46.0,26.92,90.0,50.0,1.1965
75%,69.9,35.7,54.875,100.0,58.75,35.1,90.0,65.0,1.88425
max,152.4,92.08,125.0,190.55,127.7,9999.0,169.25,76.0,10.19


In [25]:
df[ 'elbow_bolt_pattern_long' ].fillna( replace_na_value( df[ 'elbow_bolt_pattern_long' ], True ), inplace = True )
df[ 'elbow_bolt_pattern_wide' ].fillna( replace_na_value( df[ 'elbow_bolt_pattern_wide' ], True ), inplace = True )
df[ 'elbow_extension_length' ].fillna( replace_na_value( df[ 'elbow_extension_length' ], True ), inplace = True )
df[ 'elbow_overall_length' ].fillna( replace_na_value( df[ 'elbow_overall_length' ], True ), inplace = True )
df[ 'elbow_thickness' ].fillna( replace_na_value( df[ 'elbow_thickness' ], True ), inplace = True )
df[ 'elbow_drop_length' ].fillna( replace_na_value( df[ 'elbow_drop_length' ], True ), inplace = True )
df[ 'elbow_drop_length' ].replace( to_replace = 9999,
                                   value = replace_9999_value( df, 'elbow_drop_length' ),
                                   inplace = True )
df[ 'elbow_elbow_angle' ].fillna( replace_na_value( df[ 'elbow_elbow_angle' ], True ), inplace = True )
df[ 'elbow_mj_class_code' ].fillna( replace_na_value( df[ 'elbow_mj_class_code' ], False ), inplace = True )
df[ 'elbow_mj_plug_class_code' ].fillna( replace_na_value( df[ 'elbow_mj_plug_class_code' ], False ), inplace = True )
df[ 'elbow_plug_diameter' ].fillna( replace_na_value( df[ 'elbow_plug_diameter' ], True ), inplace = True )
df[ 'elbow_groove' ].replace( dic, inplace = True )
df[ 'elbow_unique_feature' ].replace( dic, inplace = True )
df[ 'elbow_orientation' ].replace( dic, inplace = True )
df[ 'elbow_weight' ].fillna( replace_na_value( df[ 'elbow_weight' ], True ), inplace = True )

In [26]:
# elbow의 부피를 추정
# df[ 'elbow_volume' ] = df[ 'elbow_bolt_pattern_long' ] * df[ 'elbow_bolt_pattern_wide' ] * df[ 'elbow_thickness' ]

In [27]:
df.drop( [ 'elbow_component_type_id',
#            'elbow_mj_class_code',    # 개수가 적음 : 41개
#            'elbow_mj_plug_class_code',    # 개수가 적음 : 40개
#            'elbow_plug_diameter',    # 개수가 적음 : 7개
         ],
           axis = 1, inplace = True )

In [28]:
df.count()

component_id                178
elbow_bolt_pattern_long     178
elbow_bolt_pattern_wide     178
elbow_extension_length      178
elbow_overall_length        178
elbow_thickness             178
elbow_drop_length           178
elbow_elbow_angle           178
elbow_mj_class_code         178
elbow_mj_plug_class_code    178
elbow_plug_diameter         178
elbow_groove                178
elbow_unique_feature        178
elbow_orientation           178
elbow_weight                178
dtype: int64

In [29]:
df.describe()

Unnamed: 0,elbow_bolt_pattern_long,elbow_bolt_pattern_wide,elbow_extension_length,elbow_overall_length,elbow_thickness,elbow_drop_length,elbow_elbow_angle,elbow_plug_diameter,elbow_weight
count,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0,178.0
mean,63.43118,32.554551,48.744213,83.164831,46.530674,30.987753,91.181742,50.005618,1.753045
std,16.901317,11.069662,14.789081,28.483904,23.493628,13.882777,7.23605,3.580388,1.906709
min,36.07,17.48,16.5,28.0,7.0,7.9,90.0,25.0,0.064
25%,52.37,26.2,39.0,64.3875,31.8125,22.3875,90.0,50.0,0.6695
50%,58.7,30.18,49.16,78.1,46.0,26.92,90.0,50.0,1.1965
75%,69.9,31.8,54.0075,100.0,57.5,35.0,90.0,50.0,1.876
max,152.4,92.08,125.0,190.55,127.7,100.0,169.25,76.0,10.19


In [30]:
df.to_csv( './dataset/comp.verified/comp_elbow.verified.csv', index = False )

.

##3.4 comp_float

In [31]:
df = pd.read_csv( './dataset/comp.colname.changed/comp_float.csv' )
for col in df.columns.values.tolist() :    # 각 column의 검토
    print( col )    

component_id
float_component_type_id
float_bolt_pattern_long
float_bolt_pattern_wide
float_thickness
float_orientation
float_weight


In [32]:
df.count()

component_id               16
float_component_type_id    16
float_bolt_pattern_long    16
float_bolt_pattern_wide    16
float_thickness            16
float_orientation          16
float_weight               16
dtype: int64

In [33]:
df.describe()

Unnamed: 0,float_bolt_pattern_long,float_bolt_pattern_wide,float_thickness,float_weight
count,16.0,16.0,16.0,16.0
mean,80.31,43.288125,23.878125,1.147938
std,32.713751,23.786334,6.574115,1.095662
min,47.62,17.0,14.2,0.23
25%,57.14,26.1975,19.5,0.4325
50%,64.325,30.19,23.5,0.5575
75%,108.785,63.91,28.0,1.90575
max,148.0,96.0,36.5,4.06


In [34]:
df.drop( [ 'float_component_type_id' ], axis = 1, inplace = True )

In [35]:
df[ 'float_orientation' ].replace( dic, inplace = True )

In [36]:
# float의 부피를 추정한다
# df[ 'float_volume' ] = df[ 'float_bolt_pattern_long' ] * df[ 'float_bolt_pattern_wide' ] * df[ 'float_thickness' ]

In [37]:
df.to_csv( './dataset/comp.verified/comp_float.verified.csv', index = False )

.

##3.5 comp_htl

In [38]:
df = pd.read_csv( './dataset/comp.colname.changed/comp_hfl.csv' )
for col in df.columns.values.tolist() :    # 각 column의 검토
    print( col )    

component_id
hfl_component_type_id
hfl_hose_diameter
hfl_corresponding_shell
hfl_coupling_class
hfl_material
hfl_plating
hfl_orientation
hfl_weight


In [39]:
df.count()

component_id               6
hfl_component_type_id      6
hfl_hose_diameter          6
hfl_corresponding_shell    6
hfl_coupling_class         6
hfl_material               6
hfl_plating                6
hfl_orientation            6
hfl_weight                 6
dtype: int64

In [40]:
df.describe()

Unnamed: 0,hfl_hose_diameter,hfl_weight
count,6.0,6.0
mean,11.933333,0.055667
std,8.549542,0.073421
min,4.8,0.001
25%,4.8,0.01
50%,10.35,0.031
75%,15.9,0.06175
max,25.4,0.196


In [41]:
df[ 'hfl_plating' ].replace( dic, inplace = True )
df[ 'hfl_orientation' ].replace( dic, inplace = True )

In [42]:
def calculate_hfl_area( p_df ) :
    import math
    return np.pi * math.pow( p_df[ 'hfl_hose_diameter' ]/2, 2 )

In [43]:
# df[ 'hfl_area' ] = df.apply( calculate_hfl_area, axis = 1 )

In [44]:
df.drop( [ 'hfl_component_type_id',
         ],
           axis = 1, inplace = True )

In [45]:
df.to_csv( './dataset/comp.verified/comp_hfl.verified.csv', index = False )

.

##3.6 comp_nut

In [46]:
df = pd.read_csv( './dataset/comp.colname.changed/comp_nut.csv' )
for col in df.columns.values.tolist() :    # 각 column의 검토
    print( col )    

component_id
nut_component_type_id
nut_hex_nut_size
nut_seat_angle
nut_length
nut_thread_size
nut_thread_pitch
nut_diameter
nut_blind_hole
nut_orientation
nut_weight


In [47]:
df.count()

component_id             65
nut_component_type_id    65
nut_hex_nut_size         42
nut_seat_angle           15
nut_length               65
nut_thread_size          65
nut_thread_pitch         65
nut_diameter             23
nut_blind_hole           23
nut_orientation          65
nut_weight               64
dtype: int64

In [48]:
df.describe()

Unnamed: 0,nut_hex_nut_size,nut_seat_angle,nut_length,nut_thread_pitch,nut_diameter,nut_weight
count,42.0,15.0,65.0,65.0,23.0,64.0
mean,29.810476,38.6,26.451185,11.576923,19.8,0.077094
std,11.528083,3.312315,12.41031,6.230426,6.680692,0.068569
min,14.29,37.0,1.0,1.0,0.625,0.009
25%,22.055,37.0,20.0,12.0,17.5,0.027
50%,25.4,37.0,24.9,13.0,20.0,0.048
75%,35.73,37.0,27.8,16.0,25.0,0.10925
max,57.15,45.0,90.0,20.0,30.0,0.343


In [49]:
df[ 'nut_hex_nut_size' ].fillna( replace_na_value( df[ 'nut_hex_nut_size' ], True ), inplace = True )
df[ 'nut_seat_angle' ].fillna( replace_na_value( df[ 'nut_seat_angle' ], True ), inplace = True )
df[ 'nut_diameter' ].fillna( replace_na_value( df[ 'nut_diameter' ], True ), inplace = True )
df[ 'nut_blind_hole' ].fillna( replace_na_value( df[ 'nut_blind_hole' ], False ), inplace = True )
df[ 'nut_blind_hole' ].replace( dic, inplace = True )
df[ 'nut_orientation' ].replace( dic, inplace = True )
df[ 'nut_weight' ].fillna( replace_na_value( df[ 'nut_weight' ], True ), inplace = True )

In [50]:
def calculate_nut_volume( p_df ) :
    import math
    return np.pi * math.pow( p_df[ 'nut_diameter' ]/2, 2 ) * p_df[ 'nut_length' ]

In [51]:
# df[ 'nut_volume' ] = df.apply( calculate_nut_volume, axis = 1 )

In [52]:
df.drop( [ 'nut_component_type_id',
#            'nut_seat_angle',    # 개수가 적음 : 15개
#            'nut_diameter',    # 개수가 적음 : 23개
#            'nut_blind_hole',    # 개수가 적음 : 23개          
         ],
           axis = 1, inplace = True )

In [53]:
df.count()

component_id        65
nut_hex_nut_size    65
nut_seat_angle      65
nut_length          65
nut_thread_size     65
nut_thread_pitch    65
nut_diameter        65
nut_blind_hole      65
nut_orientation     65
nut_weight          65
dtype: int64

In [54]:
df.describe()

Unnamed: 0,nut_hex_nut_size,nut_seat_angle,nut_length,nut_thread_pitch,nut_diameter,nut_weight
count,65.0,65.0,65.0,65.0,65.0,65.0
mean,28.249846,37.369231,26.451185,11.576923,19.929231,0.076646
std,9.468579,1.691608,12.41031,6.230426,3.918088,0.068127
min,14.29,37.0,1.0,1.0,0.625,0.009
25%,23.81,37.0,20.0,12.0,20.0,0.027
50%,25.4,37.0,24.9,13.0,20.0,0.048
75%,30.0,37.0,27.8,16.0,20.0,0.109
max,57.15,45.0,90.0,20.0,30.0,0.343


In [55]:
df.to_csv( './dataset/comp.verified/comp_nut.verified.csv', index = False )

.

##3.7 comp_other

In [56]:
df = pd.read_csv( './dataset/comp.colname.changed/comp_other.2.csv', encoding = 'utf-8' )
for col in df.columns.values.tolist() :    # 각 column의 검토
    print( col )    

component_id
other_weight


In [57]:
df.count()

component_id    1001
other_weight     945
dtype: int64

In [58]:
df[ 'other_weight' ].fillna( replace_na_value( df[ 'other_weight' ], True ), inplace = True )

In [59]:
df.count()

component_id    1001
other_weight    1001
dtype: int64

In [60]:
df.to_csv( './dataset/comp.verified/comp_other.verified.csv', index = False )

.

##3.8 comp_sleeve

In [61]:
df = pd.read_csv( './dataset/comp.colname.changed/comp_sleeve.csv', encoding = 'utf-8' )
for col in df.columns.values.tolist() :    # 각 column의 검토
    print( col )    

component_id
sleeve_component_type_id
sleeve_connection_type_id
sleeve_length
sleeve_intended_nut_thread
sleeve_intended_nut_pitch
sleeve_unique_feature
sleeve_plating
sleeve_orientation
sleeve_weight


In [62]:
df.count()

component_id                  50
sleeve_component_type_id      50
sleeve_connection_type_id     50
sleeve_length                 50
sleeve_intended_nut_thread    50
sleeve_intended_nut_pitch     50
sleeve_unique_feature         50
sleeve_plating                50
sleeve_orientation            50
sleeve_weight                 50
dtype: int64

In [63]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50 entries, 0 to 49
Data columns (total 10 columns):
component_id                  50 non-null object
sleeve_component_type_id      50 non-null object
sleeve_connection_type_id     50 non-null object
sleeve_length                 50 non-null float64
sleeve_intended_nut_thread    50 non-null float64
sleeve_intended_nut_pitch     50 non-null int64
sleeve_unique_feature         50 non-null object
sleeve_plating                50 non-null object
sleeve_orientation            50 non-null object
sleeve_weight                 50 non-null float64
dtypes: float64(3), int64(1), object(6)
memory usage: 4.3+ KB


In [64]:
df.describe()

Unnamed: 0,sleeve_length,sleeve_intended_nut_thread,sleeve_intended_nut_pitch,sleeve_weight
count,50.0,50.0,50.0,50.0
mean,1611.8136,1.05716,14.36,0.02268
std,3697.633161,0.422217,2.545584,0.019932
min,8.5,0.437,12.0,0.001
25%,12.0,0.70275,12.0,0.00675
50%,14.5,1.0,14.0,0.0175
75%,18.8,1.40575,16.0,0.02975
max,9999.0,2.0,20.0,0.09


In [65]:
df[ 'sleeve_length' ].replace( to_replace = 9999,
                               value = replace_9999_value( df, 'sleeve_length' ),
                               inplace = True )
df[ 'sleeve_unique_feature' ].replace( dic, inplace = True )
df[ 'sleeve_plating' ].replace( dic, inplace = True )
df[ 'sleeve_orientation' ].replace( dic, inplace = True )

In [66]:
df.drop( [ 'sleeve_component_type_id',
#            'sleeve_orientation',    # No만 있음
         ],
           axis = 1, inplace = True )

In [67]:
df.count()

component_id                  50
sleeve_connection_type_id     50
sleeve_length                 50
sleeve_intended_nut_thread    50
sleeve_intended_nut_pitch     50
sleeve_unique_feature         50
sleeve_plating                50
sleeve_orientation            50
sleeve_weight                 50
dtype: int64

In [68]:
df.describe()

Unnamed: 0,sleeve_length,sleeve_intended_nut_thread,sleeve_intended_nut_pitch,sleeve_weight
count,50.0,50.0,50.0,50.0
mean,14.2296,1.05716,14.36,0.02268
std,3.70116,0.422217,2.545584,0.019932
min,8.5,0.437,12.0,0.001
25%,12.0,0.70275,12.0,0.00675
50%,14.1,1.0,14.0,0.0175
75%,15.5,1.40575,16.0,0.02975
max,28.4,2.0,20.0,0.09


In [69]:
df.to_csv( './dataset/comp.verified/comp_sleeve.verified.csv', index = False )

.

##3.9 comp_straight

In [70]:
df = pd.read_csv( './dataset/comp.colname.changed/comp_straight.csv' )
for col in df.columns.values.tolist() :    # 각 column의 검토
    print( col )    

component_id
straight_component_type_id
straight_bolt_pattern_long
straight_bolt_pattern_wide
straight_head_diameter
straight_overall_length
straight_thickness
straight_mj_class_code
straight_groove
straight_unique_feature
straight_orientation
straight_weight


In [71]:
df.count()

component_id                  361
straight_component_type_id    361
straight_bolt_pattern_long    291
straight_bolt_pattern_wide    204
straight_head_diameter         70
straight_overall_length        41
straight_thickness            361
straight_mj_class_code        120
straight_groove               361
straight_unique_feature       361
straight_orientation          361
straight_weight               354
dtype: int64

In [72]:
df.describe()

Unnamed: 0,straight_bolt_pattern_long,straight_bolt_pattern_wide,straight_head_diameter,straight_overall_length,straight_thickness,straight_weight
count,291.0,204.0,70.0,41.0,361.0,354.0
mean,71.77567,40.841225,58.414,27.906098,21.185734,0.813517
std,23.737576,18.915779,17.766911,9.428145,11.751907,0.883847
min,38.1,22.2,38.0,12.7,3.76,0.001
25%,52.4,26.2,45.245,19.5,10.0,0.2
50%,66.68,31.8,50.8,28.0,18.0,0.561
75%,79.4,50.8,65.325,30.0,28.0,1.20075
max,158.8,120.0,127.0,52.0,65.0,9.693


In [73]:
df[ 'straight_bolt_pattern_long' ].fillna( replace_na_value( df[ 'straight_bolt_pattern_long' ], True ), inplace = True )
df[ 'straight_bolt_pattern_wide' ].fillna( replace_na_value( df[ 'straight_bolt_pattern_wide' ], True ), inplace = True )
df[ 'straight_head_diameter' ].fillna( replace_na_value( df[ 'straight_head_diameter' ], True ), inplace = True )
df[ 'straight_overall_length' ].fillna( replace_na_value( df[ 'straight_overall_length' ], True ), inplace = True )
df[ 'straight_mj_class_code' ].fillna( replace_na_value( df[ 'straight_mj_class_code' ], False ), inplace = True )
df[ 'straight_groove' ].replace( dic, inplace = True )
df[ 'straight_unique_feature' ].replace( dic, inplace = True )
df[ 'straight_orientation' ].replace( dic, inplace = True )
df[ 'straight_weight' ].fillna( replace_na_value( df[ 'straight_weight' ], True ), inplace = True )

In [74]:
# straight의 부피를 추정한다
# df[ 'straight_volume' ] = df[ 'straight_bolt_pattern_long' ] * df[ 'straight_bolt_pattern_wide' ] * df[ 'straight_thickness' ]

In [75]:
df.drop( [ 'straight_component_type_id',
#            'straight_head_diameter',    # 개수가 적음 : 70개
#            'straight_overall_length',    # 개수가 적음 : 41개
#            'straight_mj_class_code',    # 개수가 적음 : 120개
         ],
           axis = 1, inplace = True )

In [76]:
df.count()

component_id                  361
straight_bolt_pattern_long    361
straight_bolt_pattern_wide    361
straight_head_diameter        361
straight_overall_length       361
straight_thickness            361
straight_mj_class_code        361
straight_groove               361
straight_unique_feature       361
straight_orientation          361
straight_weight               361
dtype: int64

In [77]:
df.describe()

Unnamed: 0,straight_bolt_pattern_long,straight_bolt_pattern_wide,straight_head_diameter,straight_overall_length,straight_thickness,straight_weight
count,361.0,361.0,361.0,361.0,361.0,361.0
mean,70.78759,36.909169,52.276399,27.989335,21.185734,0.80862
std,21.400428,14.896596,8.341991,3.142857,11.751907,0.875906
min,38.1,22.2,38.0,12.7,3.76,0.001
25%,57.15,31.75,50.8,28.0,10.0,0.2
50%,66.68,31.8,50.8,28.0,18.0,0.561
75%,77.8,35.71,50.8,28.0,28.0,1.2
max,158.8,120.0,127.0,52.0,65.0,9.693


In [78]:
df.to_csv( './dataset/comp.verified/comp_straight.verified.csv', index = False )

.

##3.10 comp_tee

In [79]:
df = pd.read_csv( './dataset/comp.colname.changed/comp_tee.csv' )
for col in df.columns.values.tolist() :    # 각 column의 검토
    print( col )    

component_id
tee_component_type_id
tee_bolt_pattern_long
tee_bolt_pattern_wide
tee_extension_length
tee_overall_length
tee_thickness
tee_drop_length
tee_mj_class_code
tee_mj_plug_class_code
tee_groove
tee_unique_feature
tee_orientation
tee_weight


In [80]:
df.count()

component_id              4
tee_component_type_id     4
tee_bolt_pattern_long     4
tee_bolt_pattern_wide     4
tee_extension_length      4
tee_overall_length        4
tee_thickness             4
tee_drop_length           4
tee_mj_class_code         4
tee_mj_plug_class_code    4
tee_groove                4
tee_unique_feature        4
tee_orientation           4
tee_weight                4
dtype: int64

In [81]:
df.describe()

Unnamed: 0,tee_bolt_pattern_long,tee_bolt_pattern_wide,tee_extension_length,tee_overall_length,tee_thickness,tee_drop_length,tee_weight
count,4.0,4.0,4.0,4.0,4.0,4.0,4.0
mean,57.13,29.195,53.6975,96.625,55.5,27.75,1.6995
std,3.153347,1.996689,6.798335,13.888694,3.0,1.5,0.464674
min,52.4,26.2,43.5,78.5,51.0,25.5,1.135
25%,57.125,29.185,53.6925,89.375,55.5,27.75,1.42825
50%,58.7,30.19,57.095,100.0,57.0,28.5,1.7395
75%,58.705,30.2,57.1,107.25,57.0,28.5,2.01075
max,58.72,30.2,57.1,108.0,57.0,28.5,2.184


In [82]:
df.drop( [ 'tee_component_type_id',
#            'tee_mj_class_code',    # 동일 data만 존재
#            'tee_groove',    # 동일 data만 존재
#            'tee_orientation',    # 동일 data만 존재
         ],
           axis = 1, inplace = True )

In [83]:
# df[ 'tee_mj_class_code' ] = 1
tmp_dic = { 'Threaded' : 'Y', 'MJ-005' : 'N' }
df[ 'tee_mj_plug_class_code' ].replace( tmp_dic, inplace = True )
df[ 'tee_groove' ].replace( dic, inplace = True )
df[ 'tee_unique_feature' ].replace( dic, inplace = True )
df[ 'tee_orientation' ].replace( dic, inplace = True )

In [84]:
# tee의 부피를 추정
# df[ 'tee_volume' ] = df[ 'tee_bolt_pattern_long' ] * df[ 'tee_bolt_pattern_wide' ] * df[ 'tee_thickness' ]

In [85]:
df.count()

component_id              4
tee_bolt_pattern_long     4
tee_bolt_pattern_wide     4
tee_extension_length      4
tee_overall_length        4
tee_thickness             4
tee_drop_length           4
tee_mj_class_code         4
tee_mj_plug_class_code    4
tee_groove                4
tee_unique_feature        4
tee_orientation           4
tee_weight                4
dtype: int64

In [86]:
df.describe()

Unnamed: 0,tee_bolt_pattern_long,tee_bolt_pattern_wide,tee_extension_length,tee_overall_length,tee_thickness,tee_drop_length,tee_weight
count,4.0,4.0,4.0,4.0,4.0,4.0,4.0
mean,57.13,29.195,53.6975,96.625,55.5,27.75,1.6995
std,3.153347,1.996689,6.798335,13.888694,3.0,1.5,0.464674
min,52.4,26.2,43.5,78.5,51.0,25.5,1.135
25%,57.125,29.185,53.6925,89.375,55.5,27.75,1.42825
50%,58.7,30.19,57.095,100.0,57.0,28.5,1.7395
75%,58.705,30.2,57.1,107.25,57.0,28.5,2.01075
max,58.72,30.2,57.1,108.0,57.0,28.5,2.184


In [87]:
df.to_csv( './dataset/comp.verified/comp_tee.verified.csv', index = False )

.

##3.11 comp_threaded

In [88]:
df = pd.read_csv( './dataset/comp.colname.changed/comp_threaded.csv' )
for col in df.columns.values.tolist() :    # 각 column의 검토
    print( col )    

component_id
threaded_component_type_id
threaded_adaptor_angle
threaded_overall_length
threaded_hex_size
threaded_end_form_id_1
threaded_connection_type_id_1
threaded_length_1
threaded_thread_size_1
threaded_thread_pitch_1
threaded_nominal_size_1
threaded_end_form_id_2
threaded_connection_type_id_2
threaded_length_2
threaded_thread_size_2
threaded_thread_pitch_2
threaded_nominal_size_2
threaded_end_form_id_3
threaded_connection_type_id_3
threaded_length_3
threaded_thread_size_3
threaded_thread_pitch_3
threaded_nominal_size_3
threaded_end_form_id_4
threaded_connection_type_id_4
threaded_length_4
threaded_thread_size_4
threaded_thread_pitch_4
threaded_nominal_size_4
threaded_unique_feature
threaded_orientation
threaded_weight


In [89]:
df.count()

component_id                     194
threaded_component_type_id       194
threaded_adaptor_angle            54
threaded_overall_length          121
threaded_hex_size                112
threaded_end_form_id_1           194
threaded_connection_type_id_1    135
threaded_length_1                 73
threaded_thread_size_1           135
threaded_thread_pitch_1          135
threaded_nominal_size_1           60
threaded_end_form_id_2           194
threaded_connection_type_id_2     63
threaded_length_2                 73
threaded_thread_size_2            63
threaded_thread_pitch_2           63
threaded_nominal_size_2          131
threaded_end_form_id_3            19
threaded_connection_type_id_3     10
threaded_length_3                 19
threaded_thread_size_3            10
threaded_thread_pitch_3           10
threaded_nominal_size_3            9
threaded_end_form_id_4             1
threaded_connection_type_id_4      1
threaded_length_4                  1
threaded_thread_size_4             1
t

In [90]:
df.describe()

Unnamed: 0,threaded_adaptor_angle,threaded_overall_length,threaded_hex_size,threaded_length_1,threaded_thread_size_1,threaded_thread_pitch_1,threaded_length_2,threaded_thread_size_2,threaded_thread_pitch_2,threaded_nominal_size_2,threaded_length_3,threaded_thread_size_3,threaded_thread_pitch_3,threaded_nominal_size_3,threaded_length_4,threaded_thread_size_4,threaded_thread_pitch_4,threaded_nominal_size_4,threaded_weight
count,54.0,121.0,112.0,73.0,135.0,135.0,73.0,63.0,63.0,131.0,19.0,10.0,10.0,9.0,1.0,1.0,1.0,0.0,193.0
mean,89.166667,40.093636,31.629018,39.567534,1.053807,13.937037,34.079178,1.122619,13.460317,96.769618,35.513158,1.2621,12.8,1128.806667,41.7,1.187,12.0,,0.25656
std,6.123724,14.340504,11.18356,13.412294,0.349045,2.174676,11.225542,0.307175,1.907755,871.863691,13.130247,0.269601,1.398412,3326.333176,,,,,0.230857
min,45.0,18.54,12.7,15.88,0.437,8.0,0.0,0.437,12.0,6.35,14.5,0.812,12.0,9.52,41.7,1.187,12.0,,0.005
25%,90.0,28.4,22.22,30.55,0.812,12.0,28.0,0.812,12.0,15.88,25.905,1.04675,12.0,15.88,41.7,1.187,12.0,,0.096
50%,90.0,38.5,31.75,37.3,1.0,14.0,35.5,1.187,12.0,19.05,37.0,1.312,12.0,15.88,41.7,1.187,12.0,,0.175
75%,90.0,48.4,38.1,43.0,1.187,16.0,41.4,1.437,16.0,25.4,41.7,1.437,13.5,30.0,41.7,1.187,12.0,,0.351
max,90.0,80.0,76.2,71.1,2.5,20.0,61.2,2.0,20.0,9999.0,71.2,1.687,16.0,9999.0,41.7,1.187,12.0,,1.17


In [91]:
df.drop( [ 'threaded_component_type_id',
#            'threaded_adaptor_angle',    # 개수가 적음 : 54개
#            'threaded_length_1',    # 개수가 적음 : 73개
#            'threaded_nominal_size_1',    # 개수가 적음 : 60개
#            'threaded_length_2',    # 개수가 적음 : 73개
#            'threaded_thread_size_2',    # 63개
#            'threaded_thread_pitch_2',    # 63개
#            'threaded_end_form_id_3',    # 19개
#            'threaded_length_3',    # 19개
#            'threaded_thread_size_3',    # 10개
#            'threaded_thread_pitch_3',    # 10개
#            'threaded_nominal_size_3',     # 9개
#            'threaded_end_form_id_4',     # 1개
#            'threaded_connection_type_id_4',    # 1개
#            'threaded_length_4',    # 1개
#            'threaded_thread_size_4',    # 1개
#            'threaded_thread_pitch_4',    # 1개
#            'threaded_nominal_size_4',   # 0개
#            'threaded_connection_type_id_3',    #4개
         ],
           axis = 1, inplace = True )

In [92]:
# df[ 'threaded_connection_type_id_1' ].value_counts().index[0]

In [93]:
df[ 'threaded_adaptor_angle' ].fillna( replace_na_value( df[ 'threaded_adaptor_angle' ], True ), inplace = True )
df[ 'threaded_overall_length' ].fillna( replace_na_value( df[ 'threaded_overall_length' ], True ), inplace = True )
df[ 'threaded_hex_size' ].fillna( replace_na_value( df[ 'threaded_hex_size' ], True ), inplace = True )
df[ 'threaded_connection_type_id_1' ].fillna( replace_na_value( df[ 'threaded_connection_type_id_1' ], False ), inplace = True )
df[ 'threaded_length_1' ].fillna( replace_na_value( df[ 'threaded_length_1' ], True ), inplace = True )
df[ 'threaded_thread_size_1' ].fillna( replace_na_value( df[ 'threaded_thread_size_1' ], True ), inplace = True )
df[ 'threaded_thread_pitch_1' ].fillna( replace_na_value( df[ 'threaded_thread_pitch_1' ], True ), inplace = True )
df[ 'threaded_nominal_size_1' ].replace( to_replace = 'See Drawing',
                                         value = np.nan,
                                         inplace = True )
df[ 'threaded_nominal_size_1' ].fillna( replace_na_value( df[ 'threaded_nominal_size_1' ], True ), inplace = True )
df[ 'threaded_connection_type_id_2' ].fillna( replace_na_value( df[ 'threaded_connection_type_id_2' ], False ), inplace = True )
df[ 'threaded_length_2' ].fillna( replace_na_value( df[ 'threaded_length_2' ], True ), inplace = True )
df[ 'threaded_thread_size_2' ].fillna( replace_na_value( df[ 'threaded_thread_size_2' ], True ), inplace = True )
df[ 'threaded_thread_pitch_2' ].fillna( replace_na_value( df[ 'threaded_thread_pitch_2' ], True ), inplace = True )
df[ 'threaded_nominal_size_2' ].fillna( replace_na_value( df[ 'threaded_nominal_size_2' ], True ), inplace = True )
df[ 'threaded_nominal_size_2' ].replace( to_replace = 9999,
                                         value = replace_9999_value( df, 'threaded_nominal_size_2' ),
                                         inplace = True )
df[ 'threaded_end_form_id_3' ].fillna( replace_na_value( df[ 'threaded_end_form_id_3' ], False ), inplace = True )
df[ 'threaded_connection_type_id_3' ].fillna( replace_na_value( df[ 'threaded_connection_type_id_3' ], False ), inplace = True )
df[ 'threaded_length_3' ].fillna( replace_na_value( df[ 'threaded_length_3' ], True ), inplace = True )
df[ 'threaded_thread_size_3' ].fillna( replace_na_value( df[ 'threaded_thread_size_3' ], True ), inplace = True )
df[ 'threaded_thread_pitch_3' ].fillna( replace_na_value( df[ 'threaded_thread_pitch_3' ], True ), inplace = True )
df[ 'threaded_nominal_size_3' ].fillna( replace_na_value( df[ 'threaded_nominal_size_3' ], True ), inplace = True )
df[ 'threaded_end_form_id_4' ].fillna( replace_na_value( df[ 'threaded_end_form_id_4' ], False ), inplace = True )
df[ 'threaded_connection_type_id_4' ].fillna( replace_na_value( df[ 'threaded_connection_type_id_4' ], False ), inplace = True )
df[ 'threaded_length_4' ].fillna( replace_na_value( df[ 'threaded_length_4' ], True ), inplace = True )
df[ 'threaded_thread_size_4' ].fillna( replace_na_value( df[ 'threaded_thread_size_4' ], True ), inplace = True )
df[ 'threaded_thread_pitch_4' ].fillna( replace_na_value( df[ 'threaded_thread_pitch_4' ], True ), inplace = True )
df[ 'threaded_nominal_size_4' ].fillna( replace_na_value( df[ 'threaded_nominal_size_4' ], True ), inplace = True )
df[ 'threaded_unique_feature' ].replace( dic, inplace = True )
df[ 'threaded_orientation' ].replace( dic, inplace = True )
df[ 'threaded_weight' ].fillna( replace_na_value( df[ 'threaded_weight' ], True ), inplace = True )

In [94]:
df.count()

component_id                     194
threaded_adaptor_angle           194
threaded_overall_length          194
threaded_hex_size                194
threaded_end_form_id_1           194
threaded_connection_type_id_1    194
threaded_length_1                194
threaded_thread_size_1           194
threaded_thread_pitch_1          194
threaded_nominal_size_1          194
threaded_end_form_id_2           194
threaded_connection_type_id_2    194
threaded_length_2                194
threaded_thread_size_2           194
threaded_thread_pitch_2          194
threaded_nominal_size_2          194
threaded_end_form_id_3           194
threaded_connection_type_id_3    194
threaded_length_3                194
threaded_thread_size_3           194
threaded_thread_pitch_3          194
threaded_nominal_size_3          194
threaded_end_form_id_4           194
threaded_connection_type_id_4    194
threaded_length_4                194
threaded_thread_size_4           194
threaded_thread_pitch_4          194
t

In [95]:
df.describe()

Unnamed: 0,threaded_adaptor_angle,threaded_overall_length,threaded_hex_size,threaded_length_1,threaded_thread_size_1,threaded_thread_pitch_1,threaded_length_2,threaded_thread_size_2,threaded_thread_pitch_2,threaded_nominal_size_2,threaded_length_3,threaded_thread_size_3,threaded_thread_pitch_3,threaded_nominal_size_3,threaded_length_4,threaded_thread_size_4,threaded_thread_pitch_4,threaded_nominal_size_4,threaded_weight
count,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194.0,194,0.0,194.0
mean,89.768041,39.493969,31.680155,38.153247,1.037443,13.956186,34.965361,1.166093,12.474227,20.087732,36.854381,1.309428,12.041237,67.510619,41.7,1.187,12,,0.256139
std,3.230812,11.334217,8.481521,8.265717,0.291897,1.812275,6.891023,0.176706,1.280324,7.44305,4.034277,0.059261,0.350204,716.736419,1e-06,3.432349e-08,0,,0.230332
min,45.0,18.54,12.7,15.88,0.437,8.0,0.0,0.437,12.0,6.35,14.5,0.812,12.0,9.52,41.7,1.187,12,,0.005
25%,90.0,34.2625,27.05,37.3,0.812,12.0,35.5,1.187,12.0,15.88,37.0,1.312,12.0,15.88,41.7,1.187,12,,0.09625
50%,90.0,38.5,31.75,37.3,1.0,14.0,35.5,1.187,12.0,19.05,37.0,1.312,12.0,15.88,41.7,1.187,12,,0.175
75%,90.0,40.95,31.75,37.3,1.187,16.0,35.5,1.187,12.0,22.22,37.0,1.312,12.0,15.88,41.7,1.187,12,,0.35075
max,90.0,80.0,76.2,71.1,2.5,20.0,61.2,2.0,20.0,63.5,71.2,1.687,16.0,9999.0,41.7,1.187,12,,1.17


In [96]:
df.to_csv( './dataset/comp.verified/comp_threaded.verified.csv', index = False )

In [97]:
del( df )