## 1) Import Library

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc

from scipy.stats import skew,kurtosis
import datetime

In [2]:
%matplotlib inline
# to view all columns
pd.set_option('display.max_columns',500)
plt.style.use('seaborn')

## 2) Read Dataset

In [3]:
filepath='/home/nan/Desktop/GitHub_Data/Dispatched.csv'

In [4]:
filename=filepath.rstrip('.csv')
df=pd.read_csv(filepath)

  interactivity=interactivity, compiler=compiler, result=result)


## 3) User Defined Function

In [5]:
def get_datatype(arg_df):
    
    col_bool=[]
    col_object=[]
    col_number=[]
    col_catogory=[]
    
    for col in arg_df.columns:
        datatype=arg_df[col].dtypes
        if datatype==bool:
            col_bool.append(col)
        elif datatype==object :
            col_object.append(col)
        elif str(datatype)=='category':
            col_catogory.append(col)
        else:
            col_number.append(col)
    print('This dataset has {} Columns\nbool\t:{} \nobject\t:{}  \ncategory:{} \nnumeric\t:{} '
          .format(len(arg_df.columns),len(col_bool),len(col_object),len(col_catogory),len(col_number)))
    
    del arg_df
    gc.collect()
    
    return col_bool,col_object,col_catogory,col_number

'==================================================================='
def summary_object(arg_df):
    
    object_list=[]
    category_list=[]
    bool_list=[]
    unilabel_list=[]
    missing_list=[]
    
    for c in arg_df.columns:
        if arg_df[c].dtypes==object:
            object_list.append(c)
        elif str(arg_df[c].dtypes)=='category':
            category_list.append(c)
        elif arg_df[c].dtypes==bool:
            bool_list.append(c)
    if len(object_list)+len(category_list)+len(bool_list)>0:    
        index_list=['Count','Unique','Missing (%)','Top','Top (%)','Bottom','Bottom (%)']
        df_summary=pd.DataFrame(data=np.zeros((len(index_list),len(object_list))),index=index_list,columns=object_list)

        for col in object_list+category_list+bool_list:
            vc=arg_df[col].value_counts().reset_index()
            df_summary.loc['Count',col]=(arg_df[col].count())
            df_summary.loc['Unique',col]=len(arg_df[col].unique())
            df_summary.loc['Missing (%)',col]=arg_df[col].isna().mean()*100
            df_summary.loc['Top',col]=vc.iloc[0,0]
            df_summary.loc['Top (%)',col]=vc.iloc[0,1]/len(arg_df)*100
            if len(arg_df[col].unique())>1:
                df_summary.loc['Bottom',col]=vc.iloc[-1,0]
                df_summary.loc['Bottom (%)',col]=vc.iloc[-1,1]/len(arg_df)*100
            else:
                unilabel_list.append(col)
            if df_summary.loc['Missing (%)',col]==100:
                missing_list.append(col)
                
        df_summary=df_summary.T.sort_values(['Missing (%)','Unique'],ascending=False)
        df_summary=df_summary[(df_summary['Unique']>1) & (df_summary['Missing (%)']!=100)]
        df_summary.reset_index(inplace=True)
        df_summary.index=df_summary.index+1
        df_summary.columns=['Attribute']+index_list
        
        print('SUMMARY OF {} NON-NUMERICAL ATTRIBUTES:\n'.format(
            len(object_list)+len(category_list)+len(bool_list)))
        if len(object_list)>0:
            print('{} Object Columns'.format(len(object_list)))
        if len(category_list)>0:
            print('{} Categorical Columns'.format(len(category_list)))
        if len(bool_list)>0:
            print('{} Bool Columns'.format(len(bool_list)))
        if len(unilabel_list)>0:
            print('\n{} Columns with Single Label : \n{}'.format(len(unilabel_list),unilabel_list))
        if len(missing_list)>0:
            print('\n{} Empty Columns: \n{}'.format(len(missing_list),missing_list))   
            
        del arg_df,object_list,vc,index_list,unilabel_list,missing_list
        gc.collect()
        return df_summary
    else:
        print('No Non-Numerical Attributes')
'==================================================================='        
def summary_numerical(arg_df):

    target_list=[]
    missing_list=[]
    zero_skew_list=[]
    
    for c in arg_df.columns:
        datatype=arg_df[c].dtypes
        if datatype != object and datatype != bool and str(datatype) != 'category' and str(datatype)!='datetime64[ns]':
            target_list.append(c)
    if len(target_list)>0:
        from scipy.stats import skew,kurtosis
        
        index_list=['Count','Missing (%)','Mean','Median','Min','Max','Skewness','Kurtosis']
        df_summary=pd.DataFrame(data=np.zeros((len(index_list),len(target_list))),
                                index=index_list,columns=target_list)
        for col in target_list:
            df_summary.loc['Count',col]=arg_df[col].count()
            df_summary.loc['Missing (%)',col]=arg_df[col].isna().mean()*100
            if df_summary.loc['Missing (%)',col]!=100:
                df_summary.loc['Mean',col]=arg_df[col].mean()
                df_summary.loc['Median',col]=arg_df[col].median()
                df_summary.loc['Min',col]=arg_df[col].min()
                df_summary.loc['Max',col]=arg_df[col].max()
                df_summary.loc['Skewness',col]=skew(arg_df[col])
                if df_summary.loc['Skewness',col]==0:
                    zero_skew_list.append(col)
                df_summary.loc['Kurtosis',col]=kurtosis(arg_df[col])
            else:
                missing_list.append(col)
                
        df_summary=df_summary.T.sort_values(['Missing (%)','Skewness'],ascending=False)
        df_summary=df_summary[(df_summary['Skewness']!=0) & (df_summary['Missing (%)']!=100)]
        df_summary.reset_index(inplace=True)
        df_summary.index=df_summary.index+1
        df_summary.columns=['Attribute']+index_list
        
        print('SUMMARY OF {} NUMERICAL ATTRIBUTES:'.format(len(target_list)))
        if len(zero_skew_list)>0:
            print('\n{} Columns with Single Value: \n{}'.format(len(zero_skew_list),zero_skew_list))
        if len(missing_list)>0:
            print('\n{} Empty Columns: \n{}'.format(len(missing_list),missing_list))
        del arg_df,target_list,index_list
        gc.collect()

        return df_summary
    else:
        print('No Numerical Attributes')
'==================================================================='        
def export_Data_Description(arg_df,**kwarg):
    from scipy.stats import skew
    data_description=pd.DataFrame()
    for c in arg_df.columns:
        data_description.loc[c,'Datatype']=arg_df[c].dtypes
        data_description.loc[c,'Missing%']='{:.3f}'.format((len(arg_df[c])-arg_df[c].count())/len(arg_df[c])*100)
        if (len(arg_df[c])-arg_df[c].count())/len(arg_df[c])*100!=100:
            if arg_df[c].dtypes==object:
                data_description.loc[c,'Unique']=len(arg_df[c].unique())
                if len(arg_df[c].unique())==1:
                    data_description.loc[c,'Remark']='Dropped because this column has only single lable'
                else:
                    data_description.loc[c,'Remark']='Frequent: {} ({:.3f} %)'.format(
                        arg_df[c].mode()[0],arg_df[arg_df[c]==arg_df[c].mode()[0]][c].count()/len(arg_df[c])*100)
            else:
                if skew(arg_df[c])==0:
                    data_description.loc[c,'Unique']=1
                    data_description.loc[c,'Remark']='Dropped because this column has only single value'
                else:
                    data_description.loc[c,'Remark']='MAX: {:.3f} MIN: {:.3f} MEAN: {:.3f} STD: {:.3f}'.format(
                        arg_df[c].max(),arg_df[c].min(),arg_df[c].mean(),arg_df[c].std())
        else:
            data_description.loc[c,'Remark']='Dropped because this column is empty'
    data_description.reset_index(inplace=True)
    data_description.index=data_description.index+1
    data_description=data_description.rename(columns={'index':'Attribute'})
    if ('surfix' in kwarg):
        data_description.to_excel('data_description_{}.xlsx'.format(kwarg['surfix']))
    else:
        import datetime
        currentDT = datetime.datetime.now()
        time=str(currentDT.year)+'-'+str(currentDT.month)+'-'+str(currentDT.day)+' '+str(currentDT.hour)+str(currentDT.minute)+str(currentDT.second)
        data_description.to_excel('data_description_{}.xlsx'.format(time))
'==================================================================='        
def rectify_to_object(arg_df,col_list):

    if len(col_list)>0:
        for columns in col_list:
            arg_df[columns]=arg_df[columns].astype('object',inplace=True)
        print('Change Datatype of {} Column to Object : \n{}'.format(len(col_list),col_list))
'==================================================================='        
def drop_unilable_column(arg_df):
    
    target_list=[]
    object_list=[]
    number_list=[]
    for c in arg_df.columns:
        if (arg_df[c].dtypes==object) | (str(arg_df[c].dtypes)=='category') | (arg_df[c].dtypes==bool):
            object_list.append(c)
        else:
            number_list.append(c)
    if len(object_list)>0:    
        for c in object_list:
            if len(arg_df[c].unique())==1:
                target_list.append(c)
    
    if len(number_list)>0:   
        from scipy.stats import skew
        for c in number_list:
            if skew(arg_df[c])==0:
                target_list.append(c)
                
    if len(target_list)>0:
        arg_df.drop(columns=target_list,axis='columns',inplace=True)
        print('Drop {} Columns with Single Label:\n{}'.format(len(target_list),target_list))
    else: 
        print('No Columns with Single Label/Value')

    del target_list,object_list
def drop_empty_column(arg_df):
    target_list=[]
    for c in arg_df.columns:
        if arg_df[c].count()==0:
            target_list.append(c)
    if len(target_list)>0:
        arg_df.drop(columns=target_list,axis=1,inplace=True)
        print('Delete {} Empty Column : \n{}'.format(len(target_list),target_list))
    else:
        print('No Empty Column')
'==================================================================='        
def export_description(arg_df,str_1,str_2):
    arg_df.groupby(str_1)[str_2].value_counts(dropna=False,
                                              ascending=False).to_frame().to_csv('{}.csv'.format(str_2))
'==================================================================='        
def find_time_dif_day(arg_df,ref_date,proc_date):
    new_date_attribute=[c.replace(' ','_') for c in proc_date]
    for idx,c in enumerate(proc_date):
        arg_df[new_date_attribute[idx]+'_day']=arg_df[c]-arg_df[ref_date]
        arg_df[new_date_attribute[idx]+'_day']=arg_df[new_date_attribute[idx]+'_day'].astype('timedelta64[D]')
'==================================================================='        
def find_time_dif_hour(arg_df,ref_date,proc_date):
    new_date_attribute=[c.replace(' ','_') for c in proc_date]
    for idx,c in enumerate(proc_date):
        arg_df[new_date_attribute[idx]+'_hour']=arg_df[c]-arg_df[ref_date]
        arg_df[new_date_attribute[idx]+'_hour']=arg_df[new_date_attribute[idx]+'_hour'].astype('timedelta64[h]')
'==================================================================='        
def drop_columns(arg_df,column_names):
    arg_df.drop(columns=column_names,axis='columns',inplace=True)
    print('Drop {} columns : \n{}'.format(len(column_names),column_names))
'===================================================================' 
def extract_room_row(arg_df,col_position):
    '''To return unique Room-Row from standard Room-Row-Column-Height position data'''
    roomrow=[]
    roomrow=arg_df[col_position].apply(lambda x :x.split('-')[0]+'-'+x.split('-')[1] if '-' in x else x)
    #roomrow=roomrow.unique().tolist()
    #roomrow=pd.DataFrame(roomrow,columns=['Unique_Row']).sort_values(by='Unique_Row')
    return roomrow

## 4) Basic Checks

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32257 entries, 0 to 32256
Columns: 133 entries, Pallet Number to Permanent Seal Number
dtypes: bool(3), float64(15), int64(22), object(93)
memory usage: 32.1+ MB


In [7]:
bool_list,object_list,cat_list,num_list=get_datatype(df)

This dataset has 133 Columns
bool	:3 
object	:93  
category:0 
numeric	:37 


In [8]:
df.head(3)

Unnamed: 0,Pallet Number,Coolstore Code,Coolstore Description,Location Room Code,Location Room Description,Location Row Code,Location Column,Location Height,Location Request Number,Location Mission Request Destination,Storage Source Code,Storage Source Description,Storage Method Code,Storage Method Description,Pack Label Code,Pack Label Description,Pack Indicator Code,Pack Indicator Description,Clearance Protocol Code,Clearance Protocol Description,Customer Label Code,Customer Label Description,Last Ppqi Date,Ppqi Passed,Repack Date,Last Spqi Date,Dry Matter Code,Dry Matter Description,Pack Date,Pest Indicator Code,Pest Indicator Description,Storing Characteristic Code,Storing Characteristic Description,Conditioning Indicator Code,Conditioning Indicator Description,CCKPassed Failed,Fruit Indicator Code,Fruit Indicator Description,Quality Inspection Indicator Code,Quality Inspection Indicator Description,Disorder Indicator Code,Disorder Indicator Description,Global Gap Brc Indicator Code,Global Gap Brc Indicator Description,Japan Sub Brand Code,Japan Sub Brand Description,Trial Packing Indicator Code,Trial Packing Indicator Description,Marketer Description,Marketer Code,Loadout Priority,Online Sequence Number,Batch Sequence Number,Material Number,Material Mnemonic,Material Group Code,Nir Indicator Code,Origin Code,Brand Code,Origin Description,Brand Description,Variety Code,Variety Description,Fruit Class Code,Fruit Class Description,Fruit Size Code,Fruit Size Description,Pack Category Description,Pack Category Code,Labelling Indicator Code,Purchase Pool Code,Purchase Pool Description,Pack Code,Growing Method,Packs Per Pallet,Ean Indicator Code,Ean Indicator Description,Maturity Indicator Code,Maturity Indicator Description,Ok Until Date,Supplier Code,Supplier Description,Plant Code,Plant Description,Ignore Psr,Pack Type Code,Packed Fruit,Packed Trays,Pallet Rework Count,Rework Description,Packhouse Code,Zespri Li Number,Pallet Card Reference,Pallet Status Description,Pallet Card Note1,Pallet Card Note2,Pallet Note Type,Pallet Note Text,Fruit,Order Number,Order Line Number,Loadout Date,Shipment Type Code,Destination Port Code,Trucking Company Code,Packs,Tray Equivalent,Kg,Full,Market Holds,Blocked Holds,Internal Holds,Stacking Configuration Code,Pack Style Code,Pack Make Code,Pack Iso,OKUntil ISODate,Pack Week,Rf Id Tag1,Pre Cooled,Load Start Date,Australian Inspection Reference,Doi Number,Doi Clearance Date,Envelope Number,Tkl Email Date,Pallets,Pallet Equivalents,Container,Container Number,Temperature Recorder Number,Temporary Seal Number,Permanent Seal Number
0,54510208,3TPK,EastPack Quarry Road,Q27,Coolstore 27,A,11.0,1.0,,,CN,Conventional,CN,Conventional,N,,N,,A,Protocol A,N,,,,,,Y,Extra Taste,18/03/2019 0:00,N,Not Applicable,P,KiwiStart Submit any Wk ? Ship by Wk 21 Mainpa...,N,Not Applicable,,N,Not Applicable,A,Zespri Australia\Argentina Inspected,,,1,GlobalGAP Compliant,N,Not applicable,N,,Zespri International,ZIL,20,0,0,61600,ENIJ22CK1GAZNZJB0,Zespri,N,NZ,Z,New Zealand,Zespri?,GA,ZESY002 SunGold,1,Class 1,22,Size 22,EN International Jumbo,ENIJ,JB,16,NZ Class 1 Sungold,10168,CK,200,E,EAN Compliant,C,Protocol C,5/5/2019 0:00,434,Southlink,1103,Tauranga,False,ENIJKC3,4400,200.0,0,,3TPP,LI1015327,3121DAY,Despatched,,,,,4400,5146501,20,28/03/2019 13:00,1,JPTYO,PRHA,200,200.0,715.0,True,"KR, RE, Z7",,,N,IJ,KC3,121,187,12,,False,27/03/2019 0:00,,,,193017,,1,1.0,,,,,
1,54510260,3TPK,EastPack Quarry Road,UNKNOWN,Unknown Location,,,0.0,,,CN,Conventional,CN,Conventional,N,,N,,A,Protocol A,N,,,,,,Y,Extra Taste,18/03/2019 0:00,N,Not Applicable,P,KiwiStart Submit any Wk ? Ship by Wk 21 Mainpa...,N,Not Applicable,,N,Not Applicable,A,Zespri Australia\Argentina Inspected,,,1,GlobalGAP Compliant,N,Not applicable,N,,Zespri International,ZIL,20,0,0,61577,ENML25CK1GAZNZGR0,Zespri,N,NZ,Z,New Zealand,Zespri?,GA,ZESY002 SunGold,1,Class 1,25,Size 25,Euro Normal Modular Loose,ENML,GR,16,NZ Class 1 Sungold,10182,CK,160,E,EAN Compliant,C,Protocol C,5/5/2019 0:00,434,Southlink,1103,Tauranga,False,ENMLKC6,6560,262.4,0,,3TPP,LI1015327,3121DAY,Despatched,,,,,6560,5166002,20,3/4/2019 16:00,18,SGSIN,TOLL,160,262.4,931.52,True,"KR, RE, Z7",,,N,ML,KC6,121,187,12,,False,2/4/2019 0:00,,,,193074,,1,1.0,7839.0,PCIU6069215,FH1812C2K0,T164297,
2,54510253,3TPK,EastPack Quarry Road,ALLO,Allocations,,,1.0,,,CN,Conventional,CN,Conventional,N,,N,,A,Protocol A,N,,,,,,Y,Extra Taste,18/03/2019 0:00,N,Not Applicable,P,KiwiStart Submit any Wk ? Ship by Wk 21 Mainpa...,N,Not Applicable,,N,Not Applicable,A,Zespri Australia\Argentina Inspected,,,1,GlobalGAP Compliant,N,Not applicable,N,,Zespri International,ZIL,20,0,0,61579,ENML27CK1GAZNZGR0,Zespri,N,NZ,Z,New Zealand,Zespri?,GA,ZESY002 SunGold,1,Class 1,27,Size 27,Euro Normal Modular Loose,ENML,GR,16,NZ Class 1 Sungold,10181,CK,160,E,EAN Compliant,A,Protocol A,5/5/2019 0:00,434,Southlink,1103,Tauranga,False,ENMLKC6,7040,260.740741,0,,3TPP,LI1015327,3121DAY,Despatched,,,,,7040,5147801,130,30/03/2019 8:45,1,BEZEE,OMOK,160,260.740741,922.24,True,"KR, RE, Z7",,,N,ML,KC6,121,187,12,,False,29/03/2019 0:00,,,,193042,,1,1.0,,,,,


In [9]:
summary_object(df)

SUMMARY OF 96 NON-NUMERICAL ATTRIBUTES:

93 Object Columns
3 Bool Columns

19 Columns with Single Label : 
['Coolstore Code', 'Coolstore Description', 'Storage Method Code', 'Storage Method Description', 'Pest Indicator Code', 'Pest Indicator Description', 'Conditioning Indicator Code', 'Conditioning Indicator Description', 'Fruit Indicator Code', 'Fruit Indicator Description', 'Global Gap Brc Indicator Description', 'Nir Indicator Code', 'Origin Code', 'Origin Description', 'Ean Indicator Code', 'Ean Indicator Description', 'Supplier Description', 'Pallet Status Description', 'Pre Cooled']


Unnamed: 0,Attribute,Count,Unique,Missing (%),Top,Top (%),Bottom,Bottom (%)
1,Last Ppqi Date,2,3,99.9938,7/5/2019 15:34,0.0031001,7/5/2019 15:37,0.0031001
2,Ppqi Passed,2,2,99.9938,False,0.0062002,False,0.0062002
3,Permanent Seal Number,176,14,99.4544,Z2810704,0.062002,z2810709,0.0155005
4,Pallet Note Text,192,52,99.4048,mast damage,0.089903,Base\n,0.0031001
5,Pallet Note Type,192,3,99.4048,GENERAL,0.589019,IEHOLD,0.0062002
6,Location Mission Request Destination,257,4,99.2033,CONTR,0.771925,SPQI,0.0031001
7,Australian Inspection Reference,488,7,98.4872,20-Jun,0.368912,15-Jul,0.114704
8,Doi Number,488,7,98.4872,MF030,0.368912,RT034,0.114704
9,Doi Clearance Date,488,7,98.4872,20/06/2019 12:40,0.368912,25/06/2019 13:19,0.114704
10,Repack Date,717,694,97.7772,4/7/2019 9:56,0.00930031,8/7/2019 9:18,0.0031001


In [10]:
summary_numerical(df)

SUMMARY OF 37 NUMERICAL ATTRIBUTES:

6 Columns with Single Value: 
['Global Gap Brc Indicator Code', 'Online Sequence Number', 'Batch Sequence Number', 'Supplier Code', 'Pallet Rework Count', 'Pallets']

6 Empty Columns: 
['Disorder Indicator Code', 'Disorder Indicator Description', 'Rework Description', 'Pallet Card Note1', 'Pallet Card Note2', 'Blocked Holds']


Unnamed: 0,Attribute,Count,Missing (%),Mean,Median,Min,Max,Skewness,Kurtosis
1,Location Request Number,257.0,99.203274,1925.529,1922.0,1815.0,2065.0,,
2,Location Column,1157.0,96.413182,6.375108,6.0,1.0,13.0,,
3,Rf Id Tag1,4073.0,87.373283,5366427.0,5385446.0,507972.0,5417305.0,,
4,Container,19853.0,38.453669,6624.575,7985.0,100.0,9979.0,,
5,Location Height,32231.0,0.080603,0.4809966,0.0,0.0,2.0,,
6,Fruit Class Code,32257.0,0.0,1.025793,1.0,1.0,3.0,7.645052,64.203954
7,Pallet Number,32257.0,0.0,57738580.0,57133590.0,53170460.0,98500130.0,6.468416,49.068727
8,Order Line Number,32257.0,0.0,60.62321,50.0,10.0,416.0,1.868933,5.133527
9,Loadout Priority,32257.0,0.0,13.86986,7.0,1.0,50.0,1.493256,2.049571
10,Pack Code,32257.0,0.0,21223.41,10309.0,0.0,76332.0,1.370846,1.101559


## 5) Export Data Description

In [11]:
#export_Data_Description(transaction,surfix='transaction')

## 6) Data Cleaning

### Delete Columns

In [12]:
# drop empty columns
drop_empty_column(df)

Delete 6 Empty Column : 
['Disorder Indicator Code', 'Disorder Indicator Description', 'Rework Description', 'Pallet Card Note1', 'Pallet Card Note2', 'Blocked Holds']


In [13]:
# drop unilabel/univalue columns
drop_unilable_column(df)

Drop 25 Columns with Single Label:
['Coolstore Code', 'Coolstore Description', 'Storage Method Code', 'Storage Method Description', 'Pest Indicator Code', 'Pest Indicator Description', 'Conditioning Indicator Code', 'Conditioning Indicator Description', 'Fruit Indicator Code', 'Fruit Indicator Description', 'Global Gap Brc Indicator Description', 'Nir Indicator Code', 'Origin Code', 'Origin Description', 'Ean Indicator Code', 'Ean Indicator Description', 'Supplier Description', 'Pallet Status Description', 'Pre Cooled', 'Global Gap Brc Indicator Code', 'Online Sequence Number', 'Batch Sequence Number', 'Supplier Code', 'Pallet Rework Count', 'Pallets']


### Drop Duplicate Labels

In [14]:
before_drop=df.shape
df.drop_duplicates(inplace=True)
print('Total number of rows deleted :{} ({:.2f} %)'.format(
    (before_drop[0]-df.shape[0]),
    (before_drop[0]-df.shape[0])/before_drop[0]*100))

Total number of rows deleted :6 (0.02 %)


### Export and Delete Descriptions

In [15]:
code=['Location Room Code','Storage Source Code','Pack Label Code','Pack Indicator Code','Clearance Protocol Code',
     'Customer Label Code','Dry Matter Code','Quality Inspection Indicator Code','Japan Sub Brand Code',
     'Trial Packing Indicator Code','Marketer Code','Brand Code','Variety Code','Fruit Size Code',
      'Pack Category Code','Purchase Pool Code','Maturity Indicator Code','Plant Code']

descrip=[x.replace('Code','Description') for x in code]

In [16]:
#for c,d in zip(code,descrip):
#    export_description(dispatched,c,d)

In [17]:
drop_columns(df,descrip)

Drop 18 columns : 
['Location Room Description', 'Storage Source Description', 'Pack Label Description', 'Pack Indicator Description', 'Clearance Protocol Description', 'Customer Label Description', 'Dry Matter Description', 'Quality Inspection Indicator Description', 'Japan Sub Brand Description', 'Trial Packing Indicator Description', 'Marketer Description', 'Brand Description', 'Variety Description', 'Fruit Size Description', 'Pack Category Description', 'Purchase Pool Description', 'Maturity Indicator Description', 'Plant Description']


### Drop 'OKUntil ISODate' because it is duplication of 'Ok Until Date'

In [18]:
drop_columns(df,['OKUntil ISODate'])

Drop 1 columns : 
['OKUntil ISODate']


### Drop 'Australian Inspection Reference' because it is duplication of 'DOI Number'

In [19]:
drop_columns(df,['Australian Inspection Reference'])

Drop 1 columns : 
['Australian Inspection Reference']


### Drop columns that have missing rate > 99.99%

In [20]:
drop_columns(df,['Last Ppqi Date','Ppqi Passed'])

Drop 2 columns : 
['Last Ppqi Date', 'Ppqi Passed']


### Drop columns that can be represented by Pack Type Code : 'Packs','Tray Equivalent','Fruit','Kg'

In [21]:
drop_columns(df,['Packs','Tray Equivalent','Fruit','Kg'])

Drop 4 columns : 
['Packs', 'Tray Equivalent', 'Fruit', 'Kg']


### Drop columns that can be represented by 'Full' : 'Packs Per Pallet','Packed Fruit','Packed Trays','Pallet Equivalents'
    - Found 75 pallets that is not full

In [22]:
drop_columns(df,['Packs Per Pallet','Packed Fruit','Packed Trays','Pallet Equivalents'])

Drop 4 columns : 
['Packs Per Pallet', 'Packed Fruit', 'Packed Trays', 'Pallet Equivalents']


### Drop columns that can be represented by 'Pack Date' :'Pack Week','Pack ISO'

In [23]:
drop_columns(df,['Pack Week','Pack Iso'])

Drop 2 columns : 
['Pack Week', 'Pack Iso']


    -'Zespri Li Number' duplicate of 'Packhouse Code'
    -'Storing Characteristic Description' duplicate of 'Storing Characteristic Code'
    -'Material Mnemonic' duplicate of 'Material Number'
    -'Fruit Class Description' duplicates of 'Fruit Class Code'

In [24]:
drop_columns(df,['Zespri Li Number','Storing Characteristic Description','Material Mnemonic','Fruit Class Description'])

Drop 4 columns : 
['Zespri Li Number', 'Storing Characteristic Description', 'Material Mnemonic', 'Fruit Class Description']


## 7) Data Imputation

### Replace Missing Data in Location Row Code, Column and Height,Location Mission Request Destination with 'na'

In [25]:
temp_list=['Location Row Code','Location Column','Location Height']

In [26]:
for c in temp_list:
    df.loc[df[c].isna(),c]='na'

### Replace Missing Data in Location Room Code with 'UNKNOWN'

In [27]:
df.loc[df['Location Room Code'].isna(),'Location Room Code']='UNKNOWN'

### Replace Missing Data with na

In [28]:
temp_list=['CCKPassed Failed','Container Number','Market Holds','Pallet Card Reference','Location Request Number','Rf Id Tag1','Container']

In [29]:
for c in temp_list:
    df.loc[df[c].isna(),c]='na'

### Change to Boolean : Doi Number, Internal Holds,Temporary Seal Number, Temperature Recorder Number

In [30]:
temp_list=['Doi Number','Internal Holds','Temporary Seal Number','Temperature Recorder Number','Location Mission Request Destination','Rf Id Tag1']

In [31]:
for c in temp_list:
    new_c=c.replace(' ','_')
    new_c='is'+new_c
    df[new_c]=df[c].notnull()

In [32]:
drop_columns(df,temp_list)

Drop 6 columns : 
['Doi Number', 'Internal Holds', 'Temporary Seal Number', 'Temperature Recorder Number', 'Location Mission Request Destination', 'Rf Id Tag1']


### Permanent Seal Number: replace missing data with False, data with True since missing rate is 99.45%

In [33]:
df['isPermanentSeal']=df['Permanent Seal Number'].notnull()

In [34]:
drop_columns(df,['Permanent Seal Number'])

Drop 1 columns : 
['Permanent Seal Number']


### Pallet Note: combine Pallet Note Text and Type into Pallet Note (True/False)

In [35]:
df['Pallet Note']=(df['Pallet Note Text'].notnull()) & (df['Pallet Note Type'].notnull())

In [36]:
drop_columns(df,['Pallet Note Type','Pallet Note Text'])

Drop 2 columns : 
['Pallet Note Type', 'Pallet Note Text']


## 8) Derive Attributes

### Derive Time Difference from Date Columns

In [37]:
date_attribute=[c for c in df.columns if 'Date' in c]
date_attribute

['Repack Date',
 'Last Spqi Date',
 'Pack Date',
 'Ok Until Date',
 'Loadout Date',
 'Load Start Date',
 'Doi Clearance Date',
 'Tkl Email Date']

In [38]:
for c in date_attribute:
    df[c]=pd.to_datetime(df[c],format='%d/%m/%Y %H:%M')

In [39]:
date_attribute.remove('Loadout Date')
date_attribute.remove('Load Start Date')

In [40]:
find_time_dif_day(df,'Loadout Date',date_attribute)

In [41]:
find_time_dif_hour(df,'Loadout Date',['Load Start Date'])

In [42]:
drop_columns(df,date_attribute)

Drop 6 columns : 
['Repack Date', 'Last Spqi Date', 'Pack Date', 'Ok Until Date', 'Doi Clearance Date', 'Tkl Email Date']


### Convert Numeric Category Attributes back to object datatype

In [43]:
temp_list=['Fruit Class Code','Pallet Number','Order Line Number','Loadout Priority','Pack Code','Envelope Number','Shipment Type Code','Plant Code','Order Number']

In [44]:
rectify_to_object(df,temp_list)

Change Datatype of 9 Column to Object : 
['Fruit Class Code', 'Pallet Number', 'Order Line Number', 'Loadout Priority', 'Pack Code', 'Envelope Number', 'Shipment Type Code', 'Plant Code', 'Order Number']


In [45]:
summary_numerical(df)

SUMMARY OF 7 NUMERICAL ATTRIBUTES:


Unnamed: 0,Attribute,Count,Missing (%),Mean,Median,Min,Max,Skewness,Kurtosis
1,Doi_Clearance_Date_day,488.0,98.486869,-21.639344,-20.0,-62.0,-3.0,,
2,Repack_Date_day,717.0,97.776813,-5.856346,-5.0,-38.0,-1.0,,
3,Last_Spqi_Date_day,4874.0,84.88729,-9.365819,-9.0,-66.0,-1.0,,
4,Tkl_Email_Date_day,7052.0,78.134011,-3.370959,-3.0,-9.0,-1.0,,
5,Ok_Until_Date_day,32251.0,0.0,21.564727,23.0,-2.0,46.0,-0.006213,-1.317638
6,Pack_Date_day,32251.0,0.0,-30.462993,-20.0,-116.0,-1.0,-1.198677,0.381977
7,Load_Start_Date_hour,32251.0,0.0,-27.58423,-31.0,-456.0,87.0,-2.078956,23.74502


In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32251 entries, 0 to 32256
Data columns (total 66 columns):
Pallet Number                             32251 non-null object
Location Room Code                        32251 non-null object
Location Row Code                         32251 non-null object
Location Column                           32251 non-null object
Location Height                           32251 non-null object
Location Request Number                   32251 non-null object
Storage Source Code                       32251 non-null object
Pack Label Code                           32251 non-null object
Pack Indicator Code                       32251 non-null object
Clearance Protocol Code                   32251 non-null object
Customer Label Code                       32251 non-null object
Dry Matter Code                           32251 non-null object
Storing Characteristic Code               32251 non-null object
CCKPassed Failed                          32251 non-null ob

## Export

In [49]:
df.to_csv('p_dispatched',index=False)