In [1]:
# mac475의 ipython 표준 style을 적용함
from IPython.core.display import HTML
styles = open("../styles/custom.css", "r").read()
HTML( styles )

#1. tube의 merge

- tube dataset의 merge 필요성

    → tube dataset만 활용시 modeling 활용 feature 제약, tube_assembly_id join하여 bill_of_materials dataset내의 tube 속성정보 modeling에 활용
    
    → component_id_1/ quantity_1 ~ component_id_8/ quantity_8,  16개 속성
    
- 단, bill_of_materials dataset을 보완필요 있는지 사전확인후, 적절한 작업의 병행필요
	
<img src="images/02.tube-bill-specs.png" style="display:inline; width: 60%" />

In [2]:
import pandas as pd
import numpy as np

#2. bill_of_materials 주요 feature 검사

In [3]:
df = pd.read_csv( './dataset/01.original.dataset/bill_of_materials.csv' )

##2.1 bill_of_materials 누락 feature 확인

In [4]:
df.count()    # tube의 전체적인 data 충실도를 확인

tube_assembly_id    21198
component_id_1      19149
quantity_1          19149
component_id_2      14786
quantity_2          14786
component_id_3       4791
quantity_3           4798
component_id_4        607
quantity_4            608
component_id_5         92
quantity_5             92
component_id_6         26
quantity_6             26
component_id_7          7
quantity_7              7
component_id_8          1
quantity_8              1
dtype: int64

* 확인결과
    
    - component_1은 id와 수량 문제 없음 : 확인필요
        
        → component_1도 존재하지 않는 경우가 가능할까? 2049개가 NA이다 9.6% : 이후 tube-bill-comp로 merge시 누락되어 분석한계 발생
    - component_2는 id와 수량 문제 없음
    - component_3는 id보다 수량이 7개 더 많음 : 확인필요
    - component_4는 id보다 수량이 1개 더 많음 : 확인필요
    - component_5는 id와 수량 문제 없음
    - component_6는 id와 수량 문제 없음
    - component_7는 id와 수량 문제 없음
    - component_8은 id와 수량 문제 없음

.

##2.2 각 features의 unique 구성 확인

In [5]:
def get_unique_elements_sorted( name, sz ) :    # series내의 unique 정보를 sorted 제공
    tmp = sz.sort( axis = 0, ascending = True, inplace = False ).unique()
#     print( 'name : ', name, tmp, 'count : ', len( tmp ) )
    sz = None
    tmp = None    

In [6]:
list_cols = df.columns.values.tolist()    # columns

for col in list_cols[1:] :
    get_unique_elements_sorted( col, df[ col ].copy() )

<font color = 'red'>* unique 확인결과, component_id_1에 포함된, 9999 == unknown을 제외하면 이상 data 문제없음</font>
    
    → component의 master인 components dataset 확인결과 9999 코드값이 존재하므로, 9999 역시 의미있는 정보로 인정

##2.3 component_1 확인

In [7]:
top_component_id = df[ 'component_id_1' ].value_counts().index[0]    # component_1 == NA를 채우기 위한 가장 빈번 component_id
top_component_qunatity = df[ 'quantity_1' ][ df[ 'component_id_1' ] == top_component_id ].value_counts().index[0].astype( int )
top_component_id, top_component_qunatity

('C-1621', 2)

* component_id_1 == null이 되면, tube-bill-comp merge가 불가능하므로, 가장 빈번한 comp_id/ quntity로 채운다 (임시방편임)

In [8]:
# component_id_1 == Null인것들 유지
idx_list = df[ df[ 'component_id_1' ].isnull() ].index.tolist()
for idx in idx_list :
    df.loc[ idx, 'component_id_1' ] = top_component_id
    df.loc[ idx, 'quantity_1' ] = top_component_qunatity

* component_id_1 == null인 경우가 2,049개 존재이며, id_1이 null이면 나머지도 모두 null임

In [9]:
df[ 'component_id_1' ].sort( axis = 0, ascending = True, inplace = False ).unique()
get_unique_elements_sorted( 'component_id_1', df[ 'component_id_1' ].copy() )

* 1,080개 존재하나, 9999 : unknown 존재함

In [10]:
df[ df[ 'component_id_1' ] == '9999' ]    # 처리해야 할 대상임 → comp master에 존재하는 값이므로 그대로 인정한다

Unnamed: 0,tube_assembly_id,component_id_1,quantity_1,component_id_2,quantity_2,component_id_3,quantity_3,component_id_4,quantity_4,component_id_5,quantity_5,component_id_6,quantity_6,component_id_7,quantity_7,component_id_8,quantity_8
21141,TA-21143,9999,1,,,,,,,,,,,,,,
21142,TA-21144,9999,1,,,,,,,,,,,,,,


In [11]:
# list_comp1_9999 = df[ df[ 'component_id_1' ] == '9999' ].index.values.tolist()
# top_of_comp1 = df[ 'component_id_1' ].value_counts().index[ 0 ]    # component_id_1에서 가장 빈번한 data를 9999에 배정예정

In [12]:
# 당초 component_id_1 == 9999 인 것들에 대해, 보정하려 했으나, component dataset을 보면 9999가 other처리되어 있으므로,
# 그대로 활용하는 것이 맞다고 판단됨
# for idx in list_comp1_9999 :    # component_id_1 == '9999'를 보정
#     df.loc[ idx, 'component_id_1' ] = top_of_comp1

In [13]:
len( df[ df[ 'component_id_1' ] == '9999' ] )    # 없어짐

2

##2.4 component_3 확인 : quantity_3와 개수가 상이함

In [14]:
df_comp3_problem = df[ ( df[ 'component_id_3' ].isnull() ) & ( df[ 'quantity_3' ].notnull() ) ]    # 이상현상 df

In [15]:
df_comp3_problem

Unnamed: 0,tube_assembly_id,component_id_1,quantity_1,component_id_2,quantity_2,component_id_3,quantity_3,component_id_4,quantity_4,component_id_5,quantity_5,component_id_6,quantity_6,component_id_7,quantity_7,component_id_8,quantity_8
2644,TA-02645,C-1623,1,C-1630,1,,1,,,,,,,,,,
2722,TA-02723,C-1623,1,C-1630,1,,1,,,,,,,,,,
2727,TA-02728,C-1623,1,C-1630,1,,1,,,,,,,,,,
2919,TA-02920,C-1624,1,C-1631,1,,1,,,,,,,,,,
2922,TA-02923,C-1625,1,C-1632,1,,1,,,,,,,,,,
4171,TA-04172,C-1623,1,C-1630,1,,1,,,,,,,,,,
4172,TA-04173,C-1623,1,C-1630,1,,1,,,,,,,,,,


In [16]:
for idx, data in df_comp3_problem.iterrows() :    # 이상판단되는 data를 순환하며, 각 조건과 일치하는 sampling중 가장 빈번사례 추출
    top_of_comp3_by_condition = df[ 'component_id_3' ][ ( df[ 'component_id_1' ] == data[ 'component_id_1' ] ) &
                                                        ( df[ 'component_id_2' ] == data[ 'component_id_2' ] ) &
                                                        ( df[ 'quantity_1' ] == data[ 'quantity_1' ] ) &
                                                        ( df[ 'quantity_2' ] == data[ 'quantity_2' ] )
                                  ].dropna().value_counts().index[ 0 ]
#     df.loc[ idx, 'component_id_3' ] = top_of_comp3_by_condition

    # component_3의 수량을 0로 만든다
    df.loc[ idx, 'quantity_3' ] = 0

In [17]:
len( df[ ( df[ 'component_id_3' ].isnull() ) & ( df[ 'quantity_3' ].notnull() ) ] )   # 없어짐

7

##2.5 component_4 확인 : quantity_4와 개수가 상이함

In [18]:
df_comp4_problem = df[ ( df[ 'component_id_4' ].isnull() ) & ( df[ 'quantity_4' ].notnull() ) ]    # 이상현상 df

In [19]:
df_comp4_problem

Unnamed: 0,tube_assembly_id,component_id_1,quantity_1,component_id_2,quantity_2,component_id_3,quantity_3,component_id_4,quantity_4,component_id_5,quantity_5,component_id_6,quantity_6,component_id_7,quantity_7,component_id_8,quantity_8
2814,TA-02815,C-1621,1,C-1622,1,C-1628,1,,1,,,,,,,,


In [20]:
for idx, data in df_comp4_problem.iterrows() :    # 이상판단되는 data를 순환하며, 각 조건과 일치하는 sampling중 가장 빈번사례 추출
    top_of_comp4_by_condition = df[ 'component_id_4' ][ ( df[ 'component_id_1' ] == data[ 'component_id_1' ] ) &
                                                        ( df[ 'component_id_2' ] == data[ 'component_id_2' ] ) &
                                                        ( df[ 'component_id_3' ] == data[ 'component_id_3' ] ) &
                                                        ( df[ 'quantity_1' ] == data[ 'quantity_1' ] ) &
                                                        ( df[ 'quantity_2' ] == data[ 'quantity_2' ] ) &
                                                        ( df[ 'quantity_3' ] == data[ 'quantity_3' ] )
                                  ].dropna().value_counts().index[ 0 ]
#     df.loc[ idx, 'component_id_4' ] = top_of_comp4_by_condition
    # component_4의 수량을 0로 만든다
    df.loc[ idx, 'quantity_4' ] = 0

In [21]:
len( df[ ( df[ 'component_id_4' ].isnull() ) & ( df[ 'quantity_4' ].notnull() ) ] )    # 없어짐

1

In [22]:
# df[ 'component_id_1' ].value_counts()

#3. 정련된 bill_of_materials 저장

In [23]:
def process_null( p_df ) :    # bill~내의 null값들은 NONE, 0으로 채운다
    
    for i in range( 2, 9 ) :
#     for i in range( 1, 9 ) :    # component_id_1 == Null인것들 유지
        comp_str = 'component_id_' + str( i )
        quan_str = 'quantity_' + str( i )
        
        p_df[ comp_str ].fillna( 'NONE', inplace = True )
        p_df[ quan_str ].fillna( 0, inplace = True )

In [24]:
process_null( df )    # null을 각각 채워둔다

In [25]:
df_comp = pd.read_csv( './dataset/02.ml.verified.dataset/components.verified.csv' )

In [26]:
df[ 'component_id_1' ][18], df[ 'component_id_1' ][19], df[ 'component_id_1' ][20]
if df[ 'component_id_1' ][18] is np.nan :
    print( 'np.nan check' )

In [27]:
def calculate_weight( p_df, i ) :    # 각 bill내의 component들에 대한 무게를 구한다
    global df_comp
    cond = p_df[ 'component_id_' + str( i ) ]
    if cond is not np.nan :
        ret = df_comp[ 'mac475_weight' ][ df_comp[ 'component_id' ] == cond ].iloc[ 0 ] * p_df[ 'quantity_' + str( i ) ]
    else :
        ret = 0
    return ret
        
for i in range( 1, 9 ) :    # 각 무게를 구한다
    df[ 'weight_' + str( i ) ] = df.apply( calculate_weight, axis = 1, args = (i,) )

In [28]:
def calculate_weight_sum( p_df ) :    # 각 무게의 합을 구한다
    sum = 0
    for i in range( 1, 9 ) :
        sum += p_df[ 'weight_' + str( i ) ]
    return sum

In [29]:
df[ 'weight_sum' ] = df.apply( calculate_weight_sum, axis = 1 )    # 각 무게의 합을 구한다

In [30]:
def calculate_length( p_df, i ) :    # 각 bill내의 component들에 대한 길이를 구한다
    global df_comp
    cond = p_df[ 'component_id_' + str( i ) ]
    if cond is not np.nan :
        ret = df_comp[ 'overall_length' ][ df_comp[ 'component_id' ] == cond ].iloc[ 0 ] * p_df[ 'quantity_' + str( i ) ]
    else :
        ret = 0
    return ret
        
for i in range( 1, 9 ) :    # 각 길이를 구한다
    df[ 'length_' + str( i ) ] = df.apply( calculate_length, axis = 1, args = (i,) )

In [31]:
def calculate_length_sum( p_df ) :    # 각 길이의 합을 구한다
    sum = 0
    for i in range( 1, 9 ) :
        sum += p_df[ 'length_' + str( i ) ]
    return sum

In [32]:
df[ 'length_sum' ] = df.apply( calculate_length_sum, axis = 1 )    # 각 길이의 합을 구한다

In [33]:
def calculate_uniqueness_count( p_df, p_type ) :    # uniqueness, orientation count를 계산한다
    global df_comp
    ret = 0
    for i in range( 1, 9 ) :
        cond = p_df[ 'component_id_' + str( i ) ]
        if cond != 'NONE' :
            data = df_comp[ p_type ][ df_comp[ 'component_id' ] == cond ].iloc[ 0 ]
            if data == 'Y' :
                ret +=  p_df[ 'quantity_' + str( i ) ]
        else :
            break
    return ret

#     p_df[ 'component_id_' + str( i ) ]

In [34]:
df[ 'uniqueness_count' ] = df.apply( calculate_uniqueness_count, axis = 1, args = ('uniqueness',)  )    # uniq count를 계산한다
df[ 'orientation_count' ] = df.apply( calculate_uniqueness_count, axis = 1, args = ('orientation',)  )    # orient 계산한다

In [35]:
df.head( 3 )

Unnamed: 0,tube_assembly_id,component_id_1,quantity_1,component_id_2,quantity_2,component_id_3,quantity_3,component_id_4,quantity_4,component_id_5,...,length_2,length_3,length_4,length_5,length_6,length_7,length_8,length_sum,uniqueness_count,orientation_count
0,TA-00001,C-1622,2,C-1629,2,NONE,0,NONE,0,NONE,...,29,0,0,0,0,0,0,69.0,0,0
1,TA-00002,C-1312,2,NONE,0,NONE,0,NONE,0,NONE,...,0,0,0,0,0,0,0,28.4,0,0
2,TA-00003,C-1312,2,NONE,0,NONE,0,NONE,0,NONE,...,0,0,0,0,0,0,0,28.4,0,0


In [36]:
def retrieve_component_type( p_df, i ) :    # 각 component_id_1~8의 type을 알아낸다
    global df_comp
    cond = p_df[ 'component_id_' + str( i ) ]
    return df_comp[ 'component_mac475' ][ df_comp[ 'component_id' ] == cond ].iloc[ 0 ]    

In [37]:
for i in range( 1, 9 ) :
    df[ 'comp_type_' + str( i ) ] = df.apply( retrieve_component_type, axis = 1, args = (i,) )    # 각 componenet의 type을 알아낸다

In [38]:
df.head( 5 )

Unnamed: 0,tube_assembly_id,component_id_1,quantity_1,component_id_2,quantity_2,component_id_3,quantity_3,component_id_4,quantity_4,component_id_5,...,uniqueness_count,orientation_count,comp_type_1,comp_type_2,comp_type_3,comp_type_4,comp_type_5,comp_type_6,comp_type_7,comp_type_8
0,TA-00001,C-1622,2,C-1629,2,NONE,0,NONE,0,NONE,...,0,0,nut,sleeve,NONE,NONE,NONE,NONE,NONE,NONE
1,TA-00002,C-1312,2,NONE,0,NONE,0,NONE,0,NONE,...,0,0,adaptor,NONE,NONE,NONE,NONE,NONE,NONE,NONE
2,TA-00003,C-1312,2,NONE,0,NONE,0,NONE,0,NONE,...,0,0,adaptor,NONE,NONE,NONE,NONE,NONE,NONE,NONE
3,TA-00004,C-1312,2,NONE,0,NONE,0,NONE,0,NONE,...,0,0,adaptor,NONE,NONE,NONE,NONE,NONE,NONE,NONE
4,TA-00005,C-1624,1,C-1631,1,C-1641,1,NONE,0,NONE,...,0,0,nut,sleeve,threaded,NONE,NONE,NONE,NONE,NONE


In [39]:
def get_sum_by_component_type( p_df, p_type, calc_type ) :    # 각 component들의 합을 구한다
    sum_adaptor = 0
    sum_boss = 0
    sum_elbow = 0
    sum_float = 0
    sum_hfl = 0
    sum_nut = 0
    sum_other = 0
    sum_sleeve = 0
    sum_straight = 0
    sum_tee = 0
    sum_threaded = 0
    
    for i in range( 1, 9 ) :
        data = p_df[ 'comp_type_' + str( i ) ]
        if  data == 'adaptor' :
            sum_adaptor += p_df[ calc_type + str( i ) ]
        elif data == 'boss' :
            sum_boss += p_df[ calc_type + str( i ) ]
        elif data == 'elbow' :
            sum_elbow += p_df[ calc_type + str( i ) ]
        elif data == 'float' :
            sum_float += p_df[ calc_type + str( i ) ]
        elif data == 'hfl' :
            sum_hfl += p_df[ calc_type + str( i ) ]
        elif data == 'nut' :
            sum_nut += p_df[ calc_type + str( i ) ]
        elif data == 'other' :
            sum_other += p_df[ calc_type + str( i ) ]
        elif data == 'sleeve' :
            sum_sleeve += p_df[ calc_type + str( i ) ]
        elif data == 'straight' :
            sum_straight += p_df[ calc_type + str( i ) ]
        elif data == 'tee' :
            sum_tee += p_df[ calc_type + str( i ) ]
        elif data == 'threaded' :
            sum_threaded += p_df[ calc_type + str( i ) ]

        ret_val = 0
        # return value 결정    
        if  p_type == 'adaptor' :
            ret_val = sum_adaptor
        elif p_type == 'boss' :
            ret_val = sum_boss
        elif p_type == 'elbow' :
            ret_val = sum_elbow
        elif p_type == 'float' :
            ret_val = sum_float
        elif p_type == 'hfl' :
            ret_val = sum_hfl
        elif p_type == 'nut' :
            ret_val = sum_nut
        elif p_type == 'other' :
            ret_val = sum_other
        elif p_type == 'sleeve' :
            ret_val = sum_sleeve
        elif p_type == 'straight' :
            ret_val = sum_straight
        elif p_type == 'tee' :
            ret_val = sum_tee
        elif p_type == 'threaded' :
            ret_val = sum_threaded
            
    return ret_val

In [40]:
# 각 component의 개수합을 구한다
df[ 'adaptor_sum' ] = df.apply( get_sum_by_component_type, axis = 1, args = ('adaptor', 'quantity_' ) )
df[ 'boss_sum' ] = df.apply( get_sum_by_component_type, axis = 1, args = ('boss', 'quantity_' ) )
df[ 'elbow_sum' ] = df.apply( get_sum_by_component_type, axis = 1, args = ('elbow', 'quantity_' ) )
df[ 'float_sum' ] = df.apply( get_sum_by_component_type, axis = 1, args = ('float', 'quantity_' ) )
df[ 'hfl_sum' ] = df.apply( get_sum_by_component_type, axis = 1, args = ('hfl', 'quantity_' ) )
df[ 'nut_sum' ] = df.apply( get_sum_by_component_type, axis = 1, args = ('nut', 'quantity_' ) )
df[ 'other_sum' ] = df.apply( get_sum_by_component_type, axis = 1, args = ('other', 'quantity_' ) )
df[ 'sleeve_sum' ] = df.apply( get_sum_by_component_type, axis = 1, args = ('sleeve', 'quantity_' ) )
df[ 'straight_sum' ] = df.apply( get_sum_by_component_type, axis = 1, args = ('straight', 'quantity_' ) )
df[ 'tee_sum' ] = df.apply( get_sum_by_component_type, axis = 1, args = ('tee', 'quantity_' ) )
df[ 'threaded_sum' ] = df.apply( get_sum_by_component_type, axis = 1, args = ('threaded', 'quantity_' ) )

In [41]:
# 각 component의 무게합을 구한다
df[ 'adaptor_weight_sum' ] = df.apply( get_sum_by_component_type, axis = 1, args = ('adaptor', 'weight_' ) )
df[ 'boss_weight_sum' ] = df.apply( get_sum_by_component_type, axis = 1, args = ('boss', 'weight_' ) )
df[ 'elbow_weight_sum' ] = df.apply( get_sum_by_component_type, axis = 1, args = ('elbow', 'weight_' ) )
df[ 'float_weight_sum' ] = df.apply( get_sum_by_component_type, axis = 1, args = ('float', 'weight_' ) )
df[ 'hfl_weight_sum' ] = df.apply( get_sum_by_component_type, axis = 1, args = ('hfl', 'weight_' ) )
df[ 'nut_weight_sum' ] = df.apply( get_sum_by_component_type, axis = 1, args = ('nut', 'weight_' ) )
df[ 'other_weight_sum' ] = df.apply( get_sum_by_component_type, axis = 1, args = ('other', 'weight_' ) )
df[ 'sleeve_weight_sum' ] = df.apply( get_sum_by_component_type, axis = 1, args = ('sleeve', 'weight_' ) )
df[ 'straight_weight_sum' ] = df.apply( get_sum_by_component_type, axis = 1, args = ('straight', 'weight_' ) )
df[ 'tee_weight_sum' ] = df.apply( get_sum_by_component_type, axis = 1, args = ('tee', 'weight_' ) )
df[ 'threaded_weight_sum' ] = df.apply( get_sum_by_component_type, axis = 1, args = ('threaded', 'weight_' ) )

In [42]:
df.head( 3 )

Unnamed: 0,tube_assembly_id,component_id_1,quantity_1,component_id_2,quantity_2,component_id_3,quantity_3,component_id_4,quantity_4,component_id_5,...,boss_weight_sum,elbow_weight_sum,float_weight_sum,hfl_weight_sum,nut_weight_sum,other_weight_sum,sleeve_weight_sum,straight_weight_sum,tee_weight_sum,threaded_weight_sum
0,TA-00001,C-1622,2,C-1629,2,NONE,0,NONE,0,NONE,...,0,0,0,0,0.072,0,0.024,0,0,0
1,TA-00002,C-1312,2,NONE,0,NONE,0,NONE,0,NONE,...,0,0,0,0,0.0,0,0.0,0,0,0
2,TA-00003,C-1312,2,NONE,0,NONE,0,NONE,0,NONE,...,0,0,0,0,0.0,0,0.0,0,0,0


In [43]:
for i in range( 1, 9 ) :    # component type feature 제거
    df.drop( [ 'comp_type_' + str( i ) ], axis = 1, inplace = True )

for i in range( 1, 9 ) :    # 무게합 산출위해 추가되었던 각 comp들의 weight feature를 제거한다
    df.drop( [ 'weight_' + str( i ) ], axis = 1, inplace = True )

In [44]:
df.head( 3 )

Unnamed: 0,tube_assembly_id,component_id_1,quantity_1,component_id_2,quantity_2,component_id_3,quantity_3,component_id_4,quantity_4,component_id_5,...,boss_weight_sum,elbow_weight_sum,float_weight_sum,hfl_weight_sum,nut_weight_sum,other_weight_sum,sleeve_weight_sum,straight_weight_sum,tee_weight_sum,threaded_weight_sum
0,TA-00001,C-1622,2,C-1629,2,NONE,0,NONE,0,NONE,...,0,0,0,0,0.072,0,0.024,0,0,0
1,TA-00002,C-1312,2,NONE,0,NONE,0,NONE,0,NONE,...,0,0,0,0,0.0,0,0.0,0,0,0
2,TA-00003,C-1312,2,NONE,0,NONE,0,NONE,0,NONE,...,0,0,0,0,0.0,0,0.0,0,0,0


In [45]:
# df = df[ df[ 'component_id_1' ].notnull() ]    # comp1이 null인 것들은 가장 빈번한 comp id로 대체하여 저장
df.to_csv( './dataset/02.ml.verified.dataset/bill_of_materials.verified.csv', index = False )
del( df )

.

#4. Merge 시도

In [46]:
df_tube = pd.read_csv( './dataset/02.ml.verified.dataset/tube.material_id.verified.csv' )
df_bill = pd.read_csv( './dataset/02.ml.verified.dataset/bill_of_materials.verified.csv' )

In [47]:
len( df_tube ), len( df_bill )

(21198, 21198)

In [48]:
df_tube_bill = df_tube.merge( df_bill, how = 'inner', on = 'tube_assembly_id' )

In [49]:
len( df_tube_bill )

21198

In [50]:
df_tube_bill[ [ 'component_id_1', 'component_id_2' ] ].head( 3 )
# df_tube_bill[ 'component_id_1' ].describe()

Unnamed: 0,component_id_1,component_id_2
0,C-1622,C-1629
1,C-1312,NONE
2,C-1312,NONE


##component type count와 component total count feature, tube volume 추가

In [51]:
def calculate_component_type_count( p_df ) :    # component type의 count를 계산
    type_count = 0    # type count 초기화
    for i in range( 1, 9 ) :
        comp_str = 'component_id_' + str( i )
        
        if p_df[ comp_str ] != 'NONE' :
            type_count += 1
            
    return type_count

def calculate_comp_total_count( p_df ) :    # component의 total count를 계산
    total_count = 0
    for i in range( 1, 9 ) :
        quan_str = 'quantity_' + str( i )
        if p_df[ quan_str ] > 0 :
            total_count += p_df[ quan_str ]
    return total_count

def calculate_tube_volume( p_df ) :    # tube의 volume을 대략적으로 계산
    import math
    
    outer_r = p_df[ 'diameter' ] / 2    # radius
    inner_r = ( p_df[ 'diameter' ] - p_df[ 'wall' ] * 2 ) / 2    # 내 식으로 판단
#     inner_r = ( p_df[ 'diameter' ] - p_df[ 'wall' ]  ) / 2    # forum의 공식
    len = p_df[ 'length' ]
    tube_vol = ( np.pi * math.pow( outer_r, 2 ) - np.pi * math.pow( inner_r, 2 ) ) * len
    return tube_vol

def calculate_tube_area( p_df ) :    # tube의 area를 대략적으로 계산
    import math
    outer_r = p_df[ 'diameter' ] / 2    # radius
    inner_r = ( p_df[ 'diameter' ] - p_df[ 'wall' ] * 2 ) / 2
    tube_area = np.pi * math.pow( p_df[ 'diameter' ]/2, 2 ) - np.pi * math.pow( p_df[ 'diameter' ]/2 - p_df[ 'wall' ], 2 )
    return tube_area

In [52]:
for i in range( 1, 9 ) :    # component/ quantity feature들에 대한 type casting 수행
    comp_str = 'component_id_' + str( i )
    quan_str = 'quantity_' + str( i )
    df_tube_bill[ comp_str ].astype( str )
#     df_tube_bill[ quan_str ].astype( int )

In [53]:
df_tube_bill[ 'comp_type_count' ] = df_tube_bill.apply( calculate_component_type_count, axis = 1 )    # comp type의 개수 확보
df_tube_bill[ 'comp_total_count' ] = df_tube_bill.apply( calculate_comp_total_count, axis = 1 )    # total comp 개수 확보
df_tube_bill[ 'tube_volume' ] = df_tube_bill.apply( calculate_tube_volume, axis = 1 )    # tube 부피 확보
df_tube_bill[ 'tube_area' ] = df_tube_bill.apply( calculate_tube_area, axis = 1 )    # tube 면적 확보

In [54]:
len( df_tube_bill )

21198

In [55]:
df_tube_bill.head( 3 )

Unnamed: 0,tube_assembly_id,material_id,diameter,wall,length,num_bends,bend_radius,end_a_1x,end_a_2x,end_x_1x,...,nut_weight_sum,other_weight_sum,sleeve_weight_sum,straight_weight_sum,tee_weight_sum,threaded_weight_sum,comp_type_count,comp_total_count,tube_volume,tube_area
0,TA-00001,SP-0035,12.7,1.65,164,5,38.1,0,0,0,...,0.072,0,0.024,0,0,0,2,4,9393.770441,57.279088
1,TA-00002,SP-0019,6.35,0.71,137,8,19.05,0,0,0,...,0.0,0,0.0,0,0,0,1,2,1723.486526,12.580194
2,TA-00003,SP-0019,6.35,0.71,127,7,19.05,0,0,0,...,0.0,0,0.0,0,0,0,1,2,1597.68459,12.580194


* Merged DF의 저장전에 categorical feature들을 string형으로 변환해둔다

In [56]:
list_cols = df_tube_bill.columns.values.tolist()

for col in list_cols :
    if df_tube_bill[ col ].dtype.name.endswith( 'object' ) :
        df_tube_bill.astype( str )

In [57]:
df_tube_bill.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21198 entries, 0 to 21197
Data columns (total 71 columns):
tube_assembly_id       21198 non-null object
material_id            21198 non-null object
diameter               21198 non-null float64
wall                   21198 non-null float64
length                 21198 non-null float64
num_bends              21198 non-null int64
bend_radius            21198 non-null float64
end_a_1x               21198 non-null int64
end_a_2x               21198 non-null int64
end_x_1x               21198 non-null int64
end_x_2x               21198 non-null int64
end_a                  21198 non-null object
end_x                  21198 non-null object
num_boss               21198 non-null int64
num_bracket            21198 non-null int64
other                  21198 non-null int64
bend_num_by_radius     21198 non-null float64
component_id_1         21198 non-null object
quantity_1             21198 non-null float64
component_id_2         21198 non-null 

In [58]:
df_tube_bill.to_csv( './dataset/03.merged/tube_bill_merged.csv', index = False )

In [59]:
df_tube_bill.count()

tube_assembly_id    21198
material_id         21198
diameter            21198
wall                21198
length              21198
num_bends           21198
bend_radius         21198
end_a_1x            21198
end_a_2x            21198
end_x_1x            21198
end_x_2x            21198
end_a               21198
end_x               21198
num_boss            21198
num_bracket         21198
...
adaptor_weight_sum     21198
boss_weight_sum        21198
elbow_weight_sum       21198
float_weight_sum       21198
hfl_weight_sum         21198
nut_weight_sum         21198
other_weight_sum       21198
sleeve_weight_sum      21198
straight_weight_sum    21198
tee_weight_sum         21198
threaded_weight_sum    21198
comp_type_count        21198
comp_total_count       21198
tube_volume            21198
tube_area              21198
Length: 71, dtype: int64

In [60]:
del( df_comp )

del( df_tube )
del( df_bill )
del( df_tube_bill )