In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc

from scipy.stats import skew,kurtosis
import datetime

In [2]:
%matplotlib inline
# to view all columns
pd.set_option('display.max_columns',500)
plt.style.use('seaborn')

In [3]:
def summary_object(arg_df):
    
    object_list=[]
    category_list=[]
    bool_list=[]
    unilabel_list=[]
    missing_list=[]
    
    for c in arg_df.columns:
        if arg_df[c].dtypes==object:
            object_list.append(c)
        elif str(arg_df[c].dtypes)=='category':
            category_list.append(c)
        elif arg_df[c].dtypes==bool:
            bool_list.append(c)
    if len(object_list)+len(category_list)+len(bool_list)>0:    
        index_list=['Count','Unique','Missing (%)','Top','Top (%)','Bottom','Bottom (%)']
        df_summary=pd.DataFrame(data=np.zeros((len(index_list),len(object_list))),index=index_list,columns=object_list)

        for col in object_list+category_list+bool_list:
            vc=arg_df[col].value_counts().reset_index()
            df_summary.loc['Count',col]=(arg_df[col].count())
            df_summary.loc['Unique',col]=len(arg_df[col].unique())
            df_summary.loc['Missing (%)',col]=arg_df[col].isna().mean()*100
            df_summary.loc['Top',col]=vc.iloc[0,0]
            df_summary.loc['Top (%)',col]=vc.iloc[0,1]/len(arg_df)*100
            if len(arg_df[col].unique())>2:
                df_summary.loc['Bottom',col]=vc.iloc[-1,0]
                df_summary.loc['Bottom (%)',col]=vc.iloc[-1,1]/len(arg_df)*100
            elif (len(arg_df[col].unique())==2) & (df_summary.loc['Missing (%)',col]==0):
                df_summary.loc['Bottom',col]=vc.iloc[-1,0]
                df_summary.loc['Bottom (%)',col]=vc.iloc[-1,1]/len(arg_df)*100
            else:
                unilabel_list.append(col)
            if df_summary.loc['Missing (%)',col]==100:
                missing_list.append(col)
                
        df_summary=df_summary.T.sort_values(['Missing (%)','Unique'],ascending=False)
        df_summary=df_summary[(df_summary['Unique']>1) & (df_summary['Missing (%)']!=100)]
        df_summary.reset_index(inplace=True)
        df_summary.index=df_summary.index+1
        df_summary.columns=['Attribute']+index_list
        
        print('SUMMARY OF {} NON-NUMERICAL ATTRIBUTES:\n'.format(
            len(object_list)+len(category_list)+len(bool_list)))
        if len(object_list)>0:
            print('{} Object Columns'.format(len(object_list)))
        if len(category_list)>0:
            print('{} Categorical Columns'.format(len(category_list)))
        if len(bool_list)>0:
            print('{} Bool Columns'.format(len(bool_list)))
        if len(unilabel_list)>0:
            print('\n{} Columns with Single Label : \n{}'.format(len(unilabel_list),unilabel_list))
        if len(missing_list)>0:
            print('\n{} Empty Columns: \n{}'.format(len(missing_list),missing_list))   
            
        del arg_df,object_list,vc,index_list,unilabel_list,missing_list
        gc.collect()
        return df_summary
    else:
        print('No Non-Numerical Attributes')
'============================================================='        
def summary_numerical(arg_df):

    target_list=[]
    missing_list=[]
    zero_skew_list=[]
    
    for c in arg_df.columns:
        datatype=arg_df[c].dtypes
        if datatype != object and datatype != bool and str(datatype) != 'category' and str(datatype) !='datetime64[ns]':
            target_list.append(c)
    if len(target_list)>0:
        from scipy.stats import skew,kurtosis
        
        index_list=['Count','Missing (%)','Mean','Median','Min','Max','Skewness','Kurtosis']
        df_summary=pd.DataFrame(data=np.zeros((len(index_list),len(target_list))),
                                index=index_list,columns=target_list)
        for col in target_list:
            df_summary.loc['Count',col]=arg_df[col].count()
            df_summary.loc['Missing (%)',col]=arg_df[col].isna().mean()*100
            if df_summary.loc['Missing (%)',col]!=100:
                df_summary.loc['Mean',col]=arg_df[col].mean()
                df_summary.loc['Median',col]=arg_df[col].median()
                df_summary.loc['Min',col]=arg_df[col].min()
                df_summary.loc['Max',col]=arg_df[col].max()
                df_summary.loc['Skewness',col]=skew(arg_df[col])
                if df_summary.loc['Skewness',col]==0:
                    zero_skew_list.append(col)
                df_summary.loc['Kurtosis',col]=kurtosis(arg_df[col])
            else:
                missing_list.append(col)
                
        df_summary=df_summary.T.sort_values(['Missing (%)','Skewness'],ascending=False)
        df_summary=df_summary[(df_summary['Skewness']!=0) & (df_summary['Missing (%)']!=100)]
        df_summary.reset_index(inplace=True)
        df_summary.index=df_summary.index+1
        df_summary.columns=['Attribute']+index_list
        
        print('SUMMARY OF {} NUMERICAL ATTRIBUTES:'.format(len(target_list)))
        if len(zero_skew_list)>0:
            print('\n{} Columns with Single Value: \n{}'.format(len(zero_skew_list),zero_skew_list))
        if len(missing_list)>0:
            print('\n{} Empty Columns: \n{}'.format(len(missing_list),missing_list))
        del arg_df,target_list,index_list
        gc.collect()

        return df_summary
    else:
        print('No Numerical Attributes')
'==================================================================='        
def drop_unilable_column(arg_df):
    
    target_list=[]
    object_list=[]
    number_list=[]
    for c in arg_df.columns:
        if (arg_df[c].dtypes==object) | (str(arg_df[c].dtypes)=='category') | (arg_df[c].dtypes==bool):
            object_list.append(c)
        elif str(arg_df[c].dtypes)!='datetime64[ns]':
            number_list.append(c)
    if len(object_list)>0:    
        for c in object_list:
            if len(arg_df[c].unique())==1:
                target_list.append(c)
            elif (len(arg_df[c].unique())==2) & (arg_df[c].isna().mean()>0):
                target_list.append(c)
    
    if len(number_list)>0:   
        from scipy.stats import skew
        for c in number_list:
            if skew(arg_df[c])==0:
                target_list.append(c)
                
    if len(target_list)>0:
        arg_df.drop(columns=target_list,axis='columns',inplace=True)
        print('Drop {} Columns with Single Label:\n{}'.format(len(target_list),target_list))
    else: 
        print('No Columns with Single Label/Value')

    del target_list,object_list
'===================================================================' 
def drop_empty_column(arg_df):
    target_list=[]
    for c in arg_df.columns:
        if arg_df[c].count()==0:
            target_list.append(c)
    if len(target_list)>0:
        arg_df.drop(columns=target_list,axis=1,inplace=True)
        print('Delete {} Empty Column : \n{}'.format(len(target_list),target_list))
    else:
        print('No Empty Column')
'==================================================================='        
def drop_columns(arg_df,column_names):
    arg_df.drop(columns=column_names,axis='columns',inplace=True)
    print('Drop {} columns : \n{}'.format(len(column_names),column_names))
'============================================================='        
def extract_room_row(arg_df,col_position):
    '''To return unique Room-Row from standard Room-Row-Column-Height position data'''
    roomrow=[]
    roomrow=arg_df[col_position].apply(lambda x :x.split('-')[0]+'-'+x.split('-')[1] if '-' in x else x)
    roomrow=roomrow.unique().tolist()
    roomrow=pd.DataFrame(roomrow,columns=['Unique_Row']).sort_values(by='Unique_Row')
    return roomrow['Unique_Row'].values
'============================================================='  
def generate_dif_columns(arg_df,left_column,left_sffx,right_sffx):
    common_title=[]
    for idx,c in enumerate(left_column):
        common_title.append(left_column[idx].split(left_sffx)[0])
    print('There are {} common columns : \n{}'.format(len(left_column),common_title))

    for idx,c in enumerate(common_title):
        compare_col=common_title[idx].replace(' ','')
        arg_df['dif_'+compare_col]=(arg_df[c+left_sffx]!=arg_df[c+right_sffx]) & (arg_df[c+right_sffx].notna())
    print('\nColumns Generated : {}'.format(len(common_title)))
'==================================================================='        
def find_time_dif_hour(arg_df,ref_date,proc_date):
    new_date_attribute=[c.replace(' ','_') for c in proc_date]
    for idx,c in enumerate(proc_date):
        arg_df[new_date_attribute[idx]+'_hour']=arg_df[c]-arg_df[ref_date]
        arg_df[new_date_attribute[idx]+'_hour']=arg_df[new_date_attribute[idx]+'_hour'].astype('timedelta64[h]')

def extract_room_row(arg_df,col_position):
    '''To return unique Room-Row from standard Room-Row-Column-Height position data'''
    roomrow=[]
    roomrow=arg_df[col_position].apply(lambda x :x.split('-')[0]+'-'+x.split('-')[1] if '-' in x else x)
    roomrow=roomrow.unique().tolist()
    roomrow=pd.DataFrame(roomrow,columns=['Unique_Row']).sort_values(by='Unique_Row')
    return roomrow['Unique_Row'].values

In [4]:
file_1='/home/nan/Desktop/GitHub_Data/p_transaction.csv'
file_2='/home/nan/Desktop/GitHub_Data/p_dispatched.csv'
filename_1=file_1.rstrip('.csv')
filename_2=file_2.rstrip('.csv')

sffx_transaction='_Trsc'
sffx_dispatched='_Dptch'
target_process='SPQI'

In [5]:
trans=pd.read_csv(filename_1)
disp=pd.read_csv(filename_2)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
trans.shape

(207259, 42)

In [7]:
trans.head(2)

Unnamed: 0,Transaction Date Time,Customer Label Code,Dry Matter Code,Fruit Size Code,Labelling Indicator Code,Location Mission Request Destination,Marketer Code,Material Number,Ok Until Date,Pack Code,Pack Date,Packhouse Code,Pack Label Code,Pack Make Code,Pallet Number,Pallet Rework Count,Plant Code,Previous Value,Protocol Code,Purchase Pool Code,Quality Inspection Indicator Code,Storage Method Code,Transaction Date,Transaction Sub Type Code,Username,Variety Code,Ok_Until_Date_day,Pack_Date_day,isDOINumber,Pack Style Code,Pack_Base,Stacking Configuration Code,isDayShift,Shift_Hour,Day_of_week,isWeekend,isActualMovement,isPackedFruitequalFruit,Previous_RoomRow,Previous_Column,Previous_Height,distance_to_driveway
0,2019-07-01 00:00:10,N,Y,42,LG,na,ZIL,62853,2019-07-21 00:00:00,41236,2019-06-06 00:00:00,3TPP,N,KC1,59717008,0,1103,Q02-03-3-1,N,1,A,CN,2019-07-01 00:00:10,MLA,agubs,HW,19.0,-26.0,False,MB,E,N,False,0,Sun,True,True,True,Q02-03,3,1,10
1,2019-07-01 00:00:15,N,Y,36,LG,na,ZIL,62809,2019-07-21 00:00:00,41717,2019-06-05 00:00:00,3TPP,N,KC1,59708334,0,1103,Q14-02-10-1,N,1,A,CN,2019-07-01 00:00:15,MTA,jaspsi,HW,19.0,-27.0,False,IT,E,3,False,0,Sun,True,True,True,Q14-02,10,1,3


In [8]:
disp.shape

(32251, 67)

## Data Processing so that common columns have same set of data label
    -change to string:Fruit Size Code_Trsc
    -truncate 0 at first position:Purchase Pool Code_Trsc
    -incorrect calculation as referring to dif,should remain in datetime: date columns(incl Ok_Until_Date_day,Pack_Date_day)

In [9]:
trans['Fruit Size Code']=trans['Fruit Size Code'].astype('str')
disp['Fruit Size Code']=disp['Fruit Size Code'].astype('str')

In [10]:
trans['Purchase Pool Code']=trans['Purchase Pool Code'].apply(lambda x:x.split('0')[1] if x[0]=='0' else x)
disp['Purchase Pool Code']=disp['Purchase Pool Code'].apply(lambda x:x.split('0')[1] if x[0]=='0' else x)

In [11]:
drop_columns(disp,['Pack_Date_day', 'Ok_Until_Date_day'])

Drop 2 columns : 
['Pack_Date_day', 'Ok_Until_Date_day']


## Filter Process

In [12]:
before_drop=trans.shape
df=trans[trans['Location Mission Request Destination']==target_process]
print('Total number of rows deleted :{} ({:.2f} %)'.format(
    (before_drop[0]-df.shape[0]),
    (before_drop[0]-df.shape[0])/before_drop[0]*100))

Total number of rows deleted :178492 (86.12 %)


In [13]:
df=df.merge(disp,how='left',on='Pallet Number',suffixes=(sffx_transaction, sffx_dispatched))

In [14]:
print('There are {} pallets went through {}.'.format(len(df['Pallet Number'].unique()),target_process))
print('Container order : {} '.format(len(df['Container Number'].unique())-1))
print('Envelope Number : {} '.format(len(df['Envelope Number'].unique())))
#print('Chartered order : {} '.format(len(df[df['Container Number']=='na'])))

There are 4101 pallets went through SPQI.
Container order : 230 
Envelope Number : 299 


In [15]:
df['Transaction Date']=pd.to_datetime(df['Transaction Date'],format='%Y-%m-%d %H:%M:%S')

## Extract Process Completion Time

In [16]:
process=pd.DataFrame()
process['start_day']=df.groupby('Pallet Number')['Transaction Date Time'].first()
process['end_day']=df.groupby('Pallet Number')['Transaction Date Time'].last()

In [17]:
process['start_day']=pd.to_datetime(process['start_day'],format='%Y-%m-%d %H:%M:%S')
process['end_day']=pd.to_datetime(process['end_day'],format='%Y-%m-%d %H:%M:%S')

In [18]:
process['duration_hr']=process['end_day']-process['start_day']

In [19]:
process['duration_hr']=process['duration_hr'].astype('timedelta64[s]')

In [20]:
process['duration_hr']=process['duration_hr']/(60*60)

In [21]:
#process['duration_hr'].plot(kind='hist',cumulative=True,normed=True)

In [22]:
process['First_move']=True

In [23]:
process=process.drop(columns='end_day',axis='columns').reset_index()
process.head(3)

Unnamed: 0,Pallet Number,start_day,duration_hr,First_move
0,53188620,2019-07-27 09:55:21,0.001389,True
1,53188675,2019-07-04 20:33:04,180.518056,True
2,53189382,2019-07-01 23:46:32,8.263889,True


## Retrive First Move 

In [24]:
df_pallet=df.merge(process,how='left',left_on=['Pallet Number','Transaction Date'],right_on=['Pallet Number','start_day'])

In [25]:
df_pallet['First_move'].fillna(False,inplace=True)

In [26]:
before_drop=df_pallet.shape
df_pallet=df_pallet[df_pallet['First_move']]
print('Total number of rows deleted :{} ({:.2f} %)'.format(
    (before_drop[0]-df_pallet.shape[0]),
    (before_drop[0]-df_pallet.shape[0])/before_drop[0]*100))

Total number of rows deleted :24666 (85.74 %)


In [27]:
df_pallet.reset_index(inplace=True,drop=True)

In [28]:
# delete 'start_day' which is duplicate of Transaction Date
drop_columns(df_pallet,['start_day'])

Drop 1 columns : 
['start_day']


In [29]:
df_pallet.head(3)

Unnamed: 0,Transaction Date Time,Customer Label Code_Trsc,Dry Matter Code_Trsc,Fruit Size Code_Trsc,Labelling Indicator Code_Trsc,Location Mission Request Destination,Marketer Code_Trsc,Material Number_Trsc,Ok Until Date_Trsc,Pack Code_Trsc,Pack Date_Trsc,Packhouse Code_Trsc,Pack Label Code_Trsc,Pack Make Code_Trsc,Pallet Number,Pallet Rework Count,Plant Code_Trsc,Previous Value,Protocol Code,Purchase Pool Code_Trsc,Quality Inspection Indicator Code_Trsc,Storage Method Code,Transaction Date,Transaction Sub Type Code,Username,Variety Code_Trsc,Ok_Until_Date_day,Pack_Date_day,isDOINumber,Pack Style Code_Trsc,Pack_Base_Trsc,Stacking Configuration Code_Trsc,isDayShift,Shift_Hour,Day_of_week,isWeekend,isActualMovement,isPackedFruitequalFruit,Previous_RoomRow,Previous_Column,Previous_Height,distance_to_driveway,Location Room Code,Location Row Code,Location Column,Location Height,Location Request Number,Storage Source Code,Pack Label Code_Dptch,Pack Indicator Code,Clearance Protocol Code,Customer Label Code_Dptch,Dry Matter Code_Dptch,Pack Date_Dptch,Storing Characteristic Code,CCKPassed Failed,Quality Inspection Indicator Code_Dptch,Japan Sub Brand Code,Trial Packing Indicator Code,Marketer Code_Dptch,Loadout Priority,Material Number_Dptch,Material Group Code,Brand Code,Variety Code_Dptch,Fruit Class Code,Fruit Size Code_Dptch,Labelling Indicator Code_Dptch,Purchase Pool Code_Dptch,Pack Code_Dptch,Growing Method,Maturity Indicator Code,Ok Until Date_Dptch,Plant Code_Dptch,Ignore Psr,Packhouse Code_Dptch,Pallet Card Reference,Order Number,Order Line Number,Loadout Date,Shipment Type Code,Destination Port Code,Trucking Company Code,Full,Market Holds,Stacking Configuration Code_Dptch,Pack Style Code_Dptch,Pack Make Code_Dptch,Load Start Date,Envelope Number,Container,Container Number,isDoi_Number,isInternal_Holds,isTemporary_Seal_Number,isTemperature_Recorder_Number,isLocation_Mission_Request_Destination,isRf_Id_Tag1,isPermanentSeal,Pallet Note,Repack_Date_day,Last_Spqi_Date_day,Doi_Clearance_Date_day,Tkl_Email_Date_day,Load_Start_Date_hour,Pack_Base_Dptch,duration_hr,First_move
0,2019-07-01 00:00:39,N,Y,33,GR,SPQI,ZIL,61554,2019-07-07 00:00:00,10193,2019-05-25 00:00:00,3TPP,N,KC1,59669482,0,1103,ALLO,N,1,A,CN,2019-07-01 00:00:39,MLA,teaio,HW,5.0,-38.0,False,P1,E,N,False,0,Sun,True,True,True,ALLO,na,na,na,UNKNOWN,na,na,0.0,na,CN,N,N,N,N,Y,2019-05-25 00:00:00,N,True,A,N,N,ZIL,6.0,61554,Zespri,Z,HW,1.0,33,GR,1,10193.0,CK,N,2019-07-21 23:59:00,1103.0,False,3TPP,2216NIGHT,5334502.0,70.0,2019-07-17 11:00:00,18.0,BEZEE,OMOK,True,"ID, IL, IN, JP, KR, MX, RE, TH, TW, Z7, Z9",N,P1,KC1,2019-07-17 00:00:00,199054.0,8328.0,TTNU8962992,False,True,True,True,False,True,False,False,,-13.0,,-2.0,-11.0,E,335.496667,True
1,2019-07-01 00:01:20,N,Y,39,LG,SPQI,ZIL,62846,2019-07-14 00:00:00,41716,2019-05-28 00:00:00,3TPP,N,KC1,59685451,0,1103,Q01-02-5-1,N,1,A,CN,2019-07-01 00:01:20,MTA,yasms,HW,12.0,-35.0,False,MB,E,N,False,0,Sun,True,True,True,Q01-02,5,1,8,UNKNOWN,na,na,0.0,na,CN,N,N,N,I,Y,2019-05-24 00:00:00,N,True,A,N,N,ZIL,6.0,62846,Zespri,Z,HW,1.0,39,LG,1,41716.0,CK,N,2019-08-11 23:59:00,1103.0,False,3TPP,3222NIGHT,5332701.0,30.0,2019-07-31 12:15:00,4.0,INMAA,RAYT,True,Z1,N,MB,KC1,2019-07-29 00:00:00,199154.0,529.0,TRIU8970380,False,True,True,True,False,True,False,False,,-27.0,,-3.0,-61.0,E,266.217778,True
2,2019-07-01 00:02:20,N,Y,33,GR,SPQI,ZIL,61555,2019-07-07 00:00:00,10192,2019-05-22 00:00:00,3TPP,N,KC1,59431225,0,1103,ALLO,N,1,A,CN,2019-07-01 00:02:20,MLA,teaio,HW,5.0,-41.0,False,IT,E,N,False,0,Sun,True,True,True,ALLO,na,na,na,CONTR,na,na,1.0,na,CN,N,N,N,N,Y,2019-05-22 00:00:00,N,True,A,N,N,ZIL,6.0,61556,Zespri,Z,HW,1.0,33,GR,1,10192.0,CK,N,2019-08-04 23:59:00,1103.0,False,3TPP,3213,5340902.0,40.0,2019-07-29 13:30:00,18.0,CNSHG,K&ST,True,"IL, TH, Z1",3,IT,KC1,2019-07-29 00:00:00,199236.0,8542.0,TEMU9001169,False,True,True,True,False,True,False,False,,-11.0,,-5.0,-14.0,E,443.263056,True


## Generate Differences Column from Common Columns

In [30]:
left_col=[c for c in df_pallet if sffx_transaction in c ]
right_col=[c for c in df_pallet if sffx_dispatched in c ]

In [31]:
generate_dif_columns(df_pallet,left_col,sffx_transaction,sffx_dispatched)

There are 19 common columns : 
['Customer Label Code', 'Dry Matter Code', 'Fruit Size Code', 'Labelling Indicator Code', 'Marketer Code', 'Material Number', 'Ok Until Date', 'Pack Code', 'Pack Date', 'Packhouse Code', 'Pack Label Code', 'Pack Make Code', 'Plant Code', 'Purchase Pool Code', 'Quality Inspection Indicator Code', 'Variety Code', 'Pack Style Code', 'Pack_Base', 'Stacking Configuration Code']

Columns Generated : 19


### Drop Unilabel Column

In [32]:
drop_unilable_column(df_pallet)

Drop 11 Columns with Single Label:
['Location Mission Request Destination', 'Pack Indicator Code', 'Storing Characteristic Code', 'Maturity Indicator Code', 'Full', 'isRf_Id_Tag1', 'First_move', 'dif_FruitSizeCode', 'dif_PackhouseCode', 'dif_PurchasePoolCode', 'dif_VarietyCode']


In [33]:
df_pallet.head(2)

Unnamed: 0,Transaction Date Time,Customer Label Code_Trsc,Dry Matter Code_Trsc,Fruit Size Code_Trsc,Labelling Indicator Code_Trsc,Marketer Code_Trsc,Material Number_Trsc,Ok Until Date_Trsc,Pack Code_Trsc,Pack Date_Trsc,Packhouse Code_Trsc,Pack Label Code_Trsc,Pack Make Code_Trsc,Pallet Number,Pallet Rework Count,Plant Code_Trsc,Previous Value,Protocol Code,Purchase Pool Code_Trsc,Quality Inspection Indicator Code_Trsc,Storage Method Code,Transaction Date,Transaction Sub Type Code,Username,Variety Code_Trsc,Ok_Until_Date_day,Pack_Date_day,isDOINumber,Pack Style Code_Trsc,Pack_Base_Trsc,Stacking Configuration Code_Trsc,isDayShift,Shift_Hour,Day_of_week,isWeekend,isActualMovement,isPackedFruitequalFruit,Previous_RoomRow,Previous_Column,Previous_Height,distance_to_driveway,Location Room Code,Location Row Code,Location Column,Location Height,Location Request Number,Storage Source Code,Pack Label Code_Dptch,Clearance Protocol Code,Customer Label Code_Dptch,Dry Matter Code_Dptch,Pack Date_Dptch,CCKPassed Failed,Quality Inspection Indicator Code_Dptch,Japan Sub Brand Code,Trial Packing Indicator Code,Marketer Code_Dptch,Loadout Priority,Material Number_Dptch,Material Group Code,Brand Code,Variety Code_Dptch,Fruit Class Code,Fruit Size Code_Dptch,Labelling Indicator Code_Dptch,Purchase Pool Code_Dptch,Pack Code_Dptch,Growing Method,Ok Until Date_Dptch,Plant Code_Dptch,Ignore Psr,Packhouse Code_Dptch,Pallet Card Reference,Order Number,Order Line Number,Loadout Date,Shipment Type Code,Destination Port Code,Trucking Company Code,Market Holds,Stacking Configuration Code_Dptch,Pack Style Code_Dptch,Pack Make Code_Dptch,Load Start Date,Envelope Number,Container,Container Number,isDoi_Number,isInternal_Holds,isTemporary_Seal_Number,isTemperature_Recorder_Number,isLocation_Mission_Request_Destination,isPermanentSeal,Pallet Note,Repack_Date_day,Last_Spqi_Date_day,Doi_Clearance_Date_day,Tkl_Email_Date_day,Load_Start_Date_hour,Pack_Base_Dptch,duration_hr,dif_CustomerLabelCode,dif_DryMatterCode,dif_LabellingIndicatorCode,dif_MarketerCode,dif_MaterialNumber,dif_OkUntilDate,dif_PackCode,dif_PackDate,dif_PackLabelCode,dif_PackMakeCode,dif_PlantCode,dif_QualityInspectionIndicatorCode,dif_PackStyleCode,dif_Pack_Base,dif_StackingConfigurationCode
0,2019-07-01 00:00:39,N,Y,33,GR,ZIL,61554,2019-07-07 00:00:00,10193,2019-05-25 00:00:00,3TPP,N,KC1,59669482,0,1103,ALLO,N,1,A,CN,2019-07-01 00:00:39,MLA,teaio,HW,5.0,-38.0,False,P1,E,N,False,0,Sun,True,True,True,ALLO,na,na,na,UNKNOWN,na,na,0.0,na,CN,N,N,N,Y,2019-05-25 00:00:00,True,A,N,N,ZIL,6.0,61554,Zespri,Z,HW,1.0,33,GR,1,10193.0,CK,2019-07-21 23:59:00,1103.0,False,3TPP,2216NIGHT,5334502.0,70.0,2019-07-17 11:00:00,18.0,BEZEE,OMOK,"ID, IL, IN, JP, KR, MX, RE, TH, TW, Z7, Z9",N,P1,KC1,2019-07-17 00:00:00,199054.0,8328.0,TTNU8962992,False,True,True,True,False,False,False,,-13.0,,-2.0,-11.0,E,335.496667,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False
1,2019-07-01 00:01:20,N,Y,39,LG,ZIL,62846,2019-07-14 00:00:00,41716,2019-05-28 00:00:00,3TPP,N,KC1,59685451,0,1103,Q01-02-5-1,N,1,A,CN,2019-07-01 00:01:20,MTA,yasms,HW,12.0,-35.0,False,MB,E,N,False,0,Sun,True,True,True,Q01-02,5,1,8,UNKNOWN,na,na,0.0,na,CN,N,N,I,Y,2019-05-24 00:00:00,True,A,N,N,ZIL,6.0,62846,Zespri,Z,HW,1.0,39,LG,1,41716.0,CK,2019-08-11 23:59:00,1103.0,False,3TPP,3222NIGHT,5332701.0,30.0,2019-07-31 12:15:00,4.0,INMAA,RAYT,Z1,N,MB,KC1,2019-07-29 00:00:00,199154.0,529.0,TRIU8970380,False,True,True,True,False,False,False,,-27.0,,-3.0,-61.0,E,266.217778,True,False,False,False,False,True,False,True,False,False,False,False,False,False,False


### check
common_title=[]
for idx,c in enumerate(left_col):
    common_title.append(left_col[idx].split(sffx_transaction)[0])
        
temp_list=[c for c in df_pallet.columns if common_title[8] in c]
temp='dif_PackDate'
temp_list.append(temp)
df_pallet.loc[df_pallet[temp]==True,temp_list]

## Select Important of particular process
    - dif_CustomerLabelCode : True means labelling or delabelling needed, 181 rows are True
    - dif_LabellingIndicatorCode: True means change in labelling indicator,83 rows True, cannot find description
    - dif_PackCode : cannot find description, there are 103 rows are True
    - dif_PackMakeCode : True means change in pack make,64 rows are True
    - dif_PackStyleCode : True means change in pack box, 7 rows are True
    - dif_Pack_Base : True means change in pallet base, 57 rows are True
    - dif_StackingConfigurationCode : True means change in pallet height, 1461 rows are True

In [34]:
list_process=['dif_CustomerLabelCode','dif_LabellingIndicatorCode','dif_PackCode','dif_PackMakeCode','dif_PackStyleCode','dif_Pack_Base','dif_StackingConfigurationCode']

## Drop Columns

In [35]:
temp_list=[]
for idx,c in enumerate(left_col):
    temp_list.append(left_col[idx].split(sffx_transaction)[0])

In [36]:
temp_list.extend(c for c in df_pallet if 'dif_' in c)

In [37]:
for c in list_process:
    temp_list.remove(c)

In [38]:
for c in temp_list:
    drop_columns(df_pallet,[l for l in df_pallet.columns if c in l])

Drop 2 columns : 
['Customer Label Code_Trsc', 'Customer Label Code_Dptch']
Drop 2 columns : 
['Dry Matter Code_Trsc', 'Dry Matter Code_Dptch']
Drop 2 columns : 
['Fruit Size Code_Trsc', 'Fruit Size Code_Dptch']
Drop 2 columns : 
['Labelling Indicator Code_Trsc', 'Labelling Indicator Code_Dptch']
Drop 2 columns : 
['Marketer Code_Trsc', 'Marketer Code_Dptch']
Drop 2 columns : 
['Material Number_Trsc', 'Material Number_Dptch']
Drop 2 columns : 
['Ok Until Date_Trsc', 'Ok Until Date_Dptch']
Drop 2 columns : 
['Pack Code_Trsc', 'Pack Code_Dptch']
Drop 2 columns : 
['Pack Date_Trsc', 'Pack Date_Dptch']
Drop 2 columns : 
['Packhouse Code_Trsc', 'Packhouse Code_Dptch']
Drop 2 columns : 
['Pack Label Code_Trsc', 'Pack Label Code_Dptch']
Drop 2 columns : 
['Pack Make Code_Trsc', 'Pack Make Code_Dptch']
Drop 2 columns : 
['Plant Code_Trsc', 'Plant Code_Dptch']
Drop 2 columns : 
['Purchase Pool Code_Trsc', 'Purchase Pool Code_Dptch']
Drop 2 columns : 
['Quality Inspection Indicator Code_Trsc', '

In [39]:
temp=df_pallet[df_pallet['Order Number'].isna()]['Pallet Number'].count()
print('Pallet that went through {} but are not dispatched : {} ({:.4f} %)'.format(target_process,temp,temp/len(df_pallet)))

Pallet that went through SPQI but are not dispatched : 1484 (0.3619 %)


### Drop unrelated date time columns from Dispatched

In [40]:
temp_list=['Repack_Date_day','Last_Spqi_Date_day','Doi_Clearance_Date_day','Tkl_Email_Date_day','Load_Start_Date_hour']
drop_columns(df_pallet,temp_list)

Drop 5 columns : 
['Repack_Date_day', 'Last_Spqi_Date_day', 'Doi_Clearance_Date_day', 'Tkl_Email_Date_day', 'Load_Start_Date_hour']


### Drop position columns from Dispatched

In [41]:
temp_list=['Location Room Code','Location Row Code','Location Column','Location Height']
drop_columns(df_pallet,temp_list)

Drop 4 columns : 
['Location Room Code', 'Location Row Code', 'Location Column', 'Location Height']


In [42]:
# 'Storage Method Code' and 'Storage Source Code' are same except for 11 rows, thus delete
#df_pallet[(df_pallet['Storage Method Code']!=df_pallet['Storage Source Code']) & (df_pallet['Storage Source Code'].notna())][['Storage Method Code','Storage Source Code','Loadout Date']]
drop_columns(df_pallet,['Storage Method Code','Storage Source Code'])

Drop 2 columns : 
['Storage Method Code', 'Storage Source Code']


In [43]:
# 'Protocol Code' and 'Clearance Protocol Code' are same except for 94 rows, these columns are deleted because protocol change is not associate with order
df_pallet[(df_pallet['Protocol Code']!=df_pallet['Clearance Protocol Code']) & (df_pallet['Clearance Protocol Code'].notna())][['Protocol Code','Clearance Protocol Code','Loadout Date']]
drop_columns(df_pallet,['Protocol Code','Clearance Protocol Code'])

Drop 2 columns : 
['Protocol Code', 'Clearance Protocol Code']


In [44]:
# drop unilabel columns with missing data
drop_unilable_column(df_pallet)

No Columns with Single Label/Value


In [45]:
# drop duplication : 'Material Group Code' is identical to 'Fruit Class Code' (1 order is class 2)
drop_columns(df_pallet,['Material Group Code','Previous Value'])

Drop 2 columns : 
['Material Group Code', 'Previous Value']


In [46]:
# drop columns unlikely linked to process
temp_list=['Username','Loadout Date','Order Line Number','Ignore Psr','Transaction Sub Type Code','CCKPassed Failed']
drop_columns(df_pallet,temp_list)

Drop 6 columns : 
['Username', 'Loadout Date', 'Order Line Number', 'Ignore Psr', 'Transaction Sub Type Code', 'CCKPassed Failed']


In [47]:
# 'Japan Sub Brand Code'(4),'Trial Packing Indicator Code'(3),'Fruit Class Code'(19),'Growing Method'(4)
temp_list=['Japan Sub Brand Code','Trial Packing Indicator Code','Brand Code','Fruit Class Code','Growing Method']
drop_columns(df_pallet,temp_list)

Drop 5 columns : 
['Japan Sub Brand Code', 'Trial Packing Indicator Code', 'Brand Code', 'Fruit Class Code', 'Growing Method']


### Derive Load Start hour

In [48]:
df_pallet['Load Start Date']=pd.to_datetime(df_pallet['Load Start Date'],format='%Y-%m-%d %H:%M:%S')

In [49]:
df_pallet['Load Start Date'].dtypes

dtype('<M8[ns]')

In [50]:
find_time_dif_hour(df_pallet,'Transaction Date',['Load Start Date'])

In [51]:
drop_columns(df_pallet,['Transaction Date','Load Start Date'])

Drop 2 columns : 
['Transaction Date', 'Load Start Date']


### Derive Quantity of Pallet with same Order/Envelope Number in same process

In [52]:
df_pallet['Order Number'].value_counts()

5337606.0    104
5329401.0     79
5311401.0     62
5330101.0     60
5329402.0     60
5348601.0     59
5169502.0     54
5311301.0     49
5210901.0     48
5351201.0     48
5164502.0     46
5156802.0     43
5210801.0     42
5156402.0     42
5351301.0     40
5337801.0     40
5349004.0     39
5349001.0     38
5329302.0     38
5169601.0     34
5313301.0     34
5316101.0     34
5303802.0     33
5331602.0     32
5156901.0     32
5287403.0     30
5164501.0     29
5340902.0     28
5148501.0     24
5330310.0     24
            ... 
5299901.0      4
5324001.0      4
5156702.0      3
5321202.0      3
5337802.0      3
5307202.0      3
5326701.0      3
5312401.0      3
5309601.0      3
5320502.0      3
5307203.0      2
5340402.0      2
5285803.0      2
5156401.0      1
5330402.0      1
5312901.0      1
5297301.0      1
5286002.0      1
5297201.0      1
5306803.0      1
5249401.0      1
5287204.0      1
5287404.0      1
5280303.0      1
5320501.0      1
5322401.0      1
5210701.0      1
5275904.0     

In [53]:
temp_df=df_pallet.groupby('Order Number')['Pallet Number'].count().reset_index()

In [54]:
temp_df.rename(index=str,columns={'Pallet Number':'qty_same_order_process'},inplace=True)

In [55]:
df_pallet.merge(temp_df,how='left',on='Order Number')

Unnamed: 0,Transaction Date Time,Pallet Number,Pallet Rework Count,Ok_Until_Date_day,Pack_Date_day,isDOINumber,isDayShift,Shift_Hour,Day_of_week,isWeekend,isActualMovement,isPackedFruitequalFruit,Previous_RoomRow,Previous_Column,Previous_Height,distance_to_driveway,Location Request Number,Loadout Priority,Pallet Card Reference,Order Number,Shipment Type Code,Destination Port Code,Trucking Company Code,Market Holds,Envelope Number,Container,Container Number,isDoi_Number,isInternal_Holds,isTemporary_Seal_Number,isTemperature_Recorder_Number,isLocation_Mission_Request_Destination,isPermanentSeal,Pallet Note,duration_hr,dif_CustomerLabelCode,dif_LabellingIndicatorCode,dif_PackCode,dif_PackMakeCode,dif_PackStyleCode,dif_StackingConfigurationCode,Load_Start_Date_hour,qty_same_order_process
0,2019-07-01 00:00:39,59669482,0,5.0,-38.0,False,False,0,Sun,True,True,True,ALLO,na,na,na,na,6.0,2216NIGHT,5334502.0,18.0,BEZEE,OMOK,"ID, IL, IN, JP, KR, MX, RE, TH, TW, Z7, Z9",199054.0,8328.0,TTNU8962992,False,True,True,True,False,False,False,335.496667,False,False,False,False,False,False,383.0,24.0
1,2019-07-01 00:01:20,59685451,0,12.0,-35.0,False,False,0,Sun,True,True,True,Q01-02,5,1,8,na,6.0,3222NIGHT,5332701.0,4.0,INMAA,RAYT,Z1,199154.0,529.0,TRIU8970380,False,True,True,True,False,False,False,266.217778,True,False,False,False,False,False,671.0,4.0
2,2019-07-01 00:02:20,59431225,0,5.0,-41.0,False,False,0,Sun,True,True,True,ALLO,na,na,na,na,6.0,3213,5340902.0,18.0,CNSHG,K&ST,"IL, TH, Z1",199236.0,8542.0,TEMU9001169,False,True,True,True,False,False,False,443.263056,False,False,False,False,False,True,671.0,28.0
3,2019-07-01 00:03:09,59445246,0,5.0,-39.0,False,False,0,Sun,True,True,True,Q02-03,2,1,11,,,,,,,,,,,,,,,,,,,286.965833,False,False,False,False,False,False,,
4,2019-07-01 00:04:46,59330290,0,5.0,-40.0,False,False,0,Sun,True,True,True,Q28-02,12,2,1,,,,,,,,,,,,,,,,,,,333.211944,False,False,False,False,False,False,,
5,2019-07-01 00:05:16,59442689,0,5.0,-39.0,False,False,0,Sun,True,True,True,Q01-01,10,1,3,,,,,,,,,,,,,,,,,,,266.170556,False,False,False,False,False,False,,
6,2019-07-01 00:05:37,59338296,0,5.0,-39.0,False,False,0,Sun,True,True,True,Q28-02,12,1,1,,,,,,,,,,,,,,,,,,,166.326111,False,False,False,False,False,False,,
7,2019-07-01 00:05:46,59443235,0,5.0,-39.0,False,False,0,Sun,True,True,True,Q02-03,1,1,12,na,6.0,3215,5334601.0,18.0,BEZEE,K&ST,na,199078.0,8319.0,TTNU8956773,False,True,True,True,False,False,False,217.253611,False,False,False,True,False,True,383.0,8.0
8,2019-07-01 00:09:46,59445550,0,5.0,-39.0,False,False,0,Sun,True,True,True,Q02-12,13,2,0,,,,,,,,,,,,,,,,,,,309.177778,False,False,False,False,False,False,,
9,2019-07-01 00:53:21,59620339,1,5.0,-37.0,False,False,0,Sun,True,True,True,Q30-23,12,1,1,,,,,,,,,,,,,,,,,,,555.381389,False,False,False,False,False,False,,


### Derive Quantity of Pallet with same order (incl pallet in other status)

### Derive Quantity of Pallet ald sitting in Process Preparation Room - need to define the Room

### Derive quantity of pallet tagged with process as per transaction time

### Derive forklift driver on duty in that shift (incl all rows in transaction)

### Derive pallet process per day ( as an estimation of number of process worker)

In [70]:
df_pallet.head(3)

Unnamed: 0,Transaction Date Time,Pallet Number,Pallet Rework Count,Ok_Until_Date_day,Pack_Date_day,isDOINumber,isDayShift,Shift_Hour,Day_of_week,isWeekend,isActualMovement,isPackedFruitequalFruit,Previous_RoomRow,Previous_Column,Previous_Height,distance_to_driveway,Location Request Number,CCKPassed Failed,Loadout Priority,Pallet Card Reference,Order Number,Shipment Type Code,Destination Port Code,Trucking Company Code,Market Holds,Envelope Number,Container,Container Number,isDoi_Number,isInternal_Holds,isTemporary_Seal_Number,isTemperature_Recorder_Number,isLocation_Mission_Request_Destination,isPermanentSeal,Pallet Note,duration_hr,dif_CustomerLabelCode,dif_LabellingIndicatorCode,dif_PackCode,dif_PackMakeCode,dif_PackStyleCode,dif_StackingConfigurationCode,Load_Start_Date_hour
0,2019-07-01 00:00:39,59669482,0,5.0,-38.0,False,False,0,Sun,True,True,True,ALLO,na,na,na,na,True,6.0,2216NIGHT,5334502.0,18.0,BEZEE,OMOK,"ID, IL, IN, JP, KR, MX, RE, TH, TW, Z7, Z9",199054.0,8328.0,TTNU8962992,False,True,True,True,False,False,False,335.496667,False,False,False,False,False,False,383.0
1,2019-07-01 00:01:20,59685451,0,12.0,-35.0,False,False,0,Sun,True,True,True,Q01-02,5,1,8,na,True,6.0,3222NIGHT,5332701.0,4.0,INMAA,RAYT,Z1,199154.0,529.0,TRIU8970380,False,True,True,True,False,False,False,266.217778,True,False,False,False,False,False,671.0
2,2019-07-01 00:02:20,59431225,0,5.0,-41.0,False,False,0,Sun,True,True,True,ALLO,na,na,na,na,True,6.0,3213,5340902.0,18.0,CNSHG,K&ST,"IL, TH, Z1",199236.0,8542.0,TEMU9001169,False,True,True,True,False,False,False,443.263056,False,False,False,False,False,True,671.0


In [56]:
df_pallet.shape

(4101, 42)

In [None]:
['Japan Sub Brand Code','Trial Packing Indicator Code','Brand Code','Fruit Class Code','Growing Method']

In [71]:
df_pallet['CCKPassed Failed'].value_counts(dropna=False)

True     2267
NaN      1484
False     280
na         70
Name: CCKPassed Failed, dtype: int64

In [59]:
df_pallet[df_pallet['Growing Method']=='OB']['Japan Sub Brand Code']

1504    N
2657    N
2659    N
2662    N
Name: Japan Sub Brand Code, dtype: object

In [57]:
summary_object(df_pallet)

SUMMARY OF 31 NON-NUMERICAL ATTRIBUTES:

20 Object Columns
11 Bool Columns


Unnamed: 0,Attribute,Count,Unique,Missing (%),Top,Top (%),Bottom,Bottom (%)
1,Pallet Card Reference,2617,281,36.1863,na,9.90002,2167DAY,0.0243843
2,Container,2617,231,36.1863,na,10.6559,8247.0,0.0243843
3,Container Number,2617,231,36.1863,na,10.6559,BMOU9615190,0.0243843
4,Market Holds,2617,170,36.1863,na,13.3382,"AU, ID, IN, KR, MX, RE, TW, Z1, Z7",0.0243843
5,Destination Port Code,2617,39,36.1863,CNSHG,20.3365,BRSSZ,0.0243843
6,Location Request Number,2617,26,36.1863,na,60.2536,1835.0,0.0243843
7,Trucking Company Code,2617,21,36.1863,BUKL,10.8998,FMBT,0.17069
8,isDoi_Number,2617,3,36.1863,False,60.5706,True,3.24311
9,isInternal_Holds,2617,3,36.1863,True,35.1622,False,28.6515
10,isTemporary_Seal_Number,2617,3,36.1863,True,52.6945,False,11.1192


In [77]:
summary_numerical(df_pallet)

SUMMARY OF 12 NUMERICAL ATTRIBUTES:


Unnamed: 0,Attribute,Count,Missing (%),Mean,Median,Min,Max,Skewness,Kurtosis
1,Loadout Priority,2617.0,36.186296,4.976691,5.0,1.0,30.0,,
2,Fruit Class Code,2617.0,36.186296,1.00726,1.0,1.0,2.0,,
3,Order Number,2617.0,36.186296,5268613.0,5316303.0,1905900.0,5353901.0,,
4,Order Line Number,2617.0,36.186296,40.93198,30.0,10.0,290.0,,
5,Shipment Type Code,2617.0,36.186296,13.65533,18.0,1.0,18.0,,
6,Envelope Number,2617.0,36.186296,198998.8,199152.0,193026.0,199331.0,,
7,Pallet Number,4101.0,0.0,58380870.0,57434710.0,53188620.0,80500980.0,6.863408,75.559917
8,Pallet Rework Count,4101.0,0.0,0.1663009,0.0,0.0,5.0,4.402782,19.420978
9,duration_hr,4101.0,0.0,81.74529,28.41778,0.0,707.3836,2.266328,4.567369
10,Pack_Date_day,4101.0,0.0,-65.83663,-67.0,-112.0,-2.0,0.243254,-0.70732


In [146]:
df_pallet[df_pallet['Load Start Date'].isna()]['Pallet Number'].count()

1484

In [149]:
df_pallet[df_pallet['Order Number'].isna()]['Pallet Number'].count()

1484

In [150]:
df_pallet.shape

(4101, 75)