Updates:
11 oct : comment out dropping 'Packed Fruit','Fruit','Packs','Trays','Fruit Per Pack' to check for changes in FW 

## 1) Import library

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc

from scipy.stats import skew,kurtosis
import datetime

In [2]:
%matplotlib inline
# to view all columns
pd.set_option('display.max_columns',500)
plt.style.use('seaborn')

## 2) Read Dataset

In [3]:
filepath_1='/home/nan/Desktop/GitHub_Data/TransactionDetails.csv'

In [4]:
filename_1=filepath_1.rstrip('.csv')
transaction=pd.read_csv(filepath_1)

  interactivity=interactivity, compiler=compiler, result=result)


## 3) User Defined Function

In [5]:
def get_datatype(arg_df):
    
    col_bool=[]
    col_object=[]
    col_number=[]
    col_catogory=[]
    
    for col in arg_df.columns:
        datatype=arg_df[col].dtypes
        if datatype==bool:
            col_bool.append(col)
        elif datatype==object :
            col_object.append(col)
        elif str(datatype)=='category':
            col_catogory.append(col)
        else:
            col_number.append(col)
    print('This dataset has {} Columns\nbool\t:{} \nobject\t:{}  \ncategory:{} \nnumeric\t:{} '
          .format(len(arg_df.columns),len(col_bool),len(col_object),len(col_catogory),len(col_number)))
    
    del arg_df
    gc.collect()
    
    return col_bool,col_object,col_catogory,col_number

'==================================================================='
def summary_object(arg_df):
    
    object_list=[]
    category_list=[]
    bool_list=[]
    unilabel_list=[]
    missing_list=[]
    
    for c in arg_df.columns:
        if arg_df[c].dtypes==object:
            object_list.append(c)
        elif str(arg_df[c].dtypes)=='category':
            category_list.append(c)
        elif arg_df[c].dtypes==bool:
            bool_list.append(c)
    if len(object_list)+len(category_list)+len(bool_list)>0:    
        index_list=['Count','Unique','Missing (%)','Top','Top (%)','Bottom','Bottom (%)']
        df_summary=pd.DataFrame(data=np.zeros((len(index_list),len(object_list))),index=index_list,columns=object_list)

        for col in object_list+category_list+bool_list:
            vc=arg_df[col].value_counts().reset_index()
            df_summary.loc['Count',col]=(arg_df[col].count())
            df_summary.loc['Unique',col]=len(arg_df[col].unique())
            df_summary.loc['Missing (%)',col]=arg_df[col].isna().mean()*100
            df_summary.loc['Top',col]=vc.iloc[0,0]
            df_summary.loc['Top (%)',col]=vc.iloc[0,1]/len(arg_df)*100
            if len(arg_df[col].unique())>1:
                df_summary.loc['Bottom',col]=vc.iloc[-1,0]
                df_summary.loc['Bottom (%)',col]=vc.iloc[-1,1]/len(arg_df)*100
            else:
                unilabel_list.append(col)
            if df_summary.loc['Missing (%)',col]==100:
                missing_list.append(col)
                
        df_summary=df_summary.T.sort_values(['Missing (%)','Unique'],ascending=False)
        df_summary=df_summary[(df_summary['Unique']>1) & (df_summary['Missing (%)']!=100)]
        df_summary.reset_index(inplace=True)
        df_summary.index=df_summary.index+1
        df_summary.columns=['Attribute']+index_list
        
        print('SUMMARY OF {} NON-NUMERICAL ATTRIBUTES:\n'.format(
            len(object_list)+len(category_list)+len(bool_list)))
        if len(object_list)>0:
            print('{} Object Columns'.format(len(object_list)))
        if len(category_list)>0:
            print('{} Categorical Columns'.format(len(category_list)))
        if len(bool_list)>0:
            print('{} Bool Columns'.format(len(bool_list)))
        if len(unilabel_list)>0:
            print('\n{} Columns with Single Label : \n{}'.format(len(unilabel_list),unilabel_list))
        if len(missing_list)>0:
            print('\n{} Empty Columns: \n{}'.format(len(missing_list),missing_list))   
            
        del arg_df,object_list,vc,index_list,unilabel_list,missing_list
        gc.collect()
        return df_summary
    else:
        print('No Non-Numerical Attributes')
'==================================================================='        
def summary_numerical(arg_df):

    target_list=[]
    missing_list=[]
    zero_skew_list=[]
    
    for c in arg_df.columns:
        datatype=arg_df[c].dtypes
        if datatype != object and datatype != bool and str(datatype) != 'category' and str(datatype)!='datetime64[ns]':
            target_list.append(c)
    if len(target_list)>0:
        from scipy.stats import skew,kurtosis
        
        index_list=['Count','Missing (%)','Mean','Median','Min','Max','Skewness','Kurtosis']
        df_summary=pd.DataFrame(data=np.zeros((len(index_list),len(target_list))),
                                index=index_list,columns=target_list)
        for col in target_list:
            df_summary.loc['Count',col]=arg_df[col].count()
            df_summary.loc['Missing (%)',col]=arg_df[col].isna().mean()*100
            if df_summary.loc['Missing (%)',col]!=100:
                df_summary.loc['Mean',col]=arg_df[col].mean()
                df_summary.loc['Median',col]=arg_df[col].median()
                df_summary.loc['Min',col]=arg_df[col].min()
                df_summary.loc['Max',col]=arg_df[col].max()
                df_summary.loc['Skewness',col]=skew(arg_df[col])
                if df_summary.loc['Skewness',col]==0:
                    zero_skew_list.append(col)
                df_summary.loc['Kurtosis',col]=kurtosis(arg_df[col])
            else:
                missing_list.append(col)
                
        df_summary=df_summary.T.sort_values(['Missing (%)','Skewness'],ascending=False)
        df_summary=df_summary[(df_summary['Skewness']!=0) & (df_summary['Missing (%)']!=100)]
        df_summary.reset_index(inplace=True)
        df_summary.index=df_summary.index+1
        df_summary.columns=['Attribute']+index_list
        
        print('SUMMARY OF {} NUMERICAL ATTRIBUTES:'.format(len(target_list)))
        if len(zero_skew_list)>0:
            print('\n{} Columns with Single Value: \n{}'.format(len(zero_skew_list),zero_skew_list))
        if len(missing_list)>0:
            print('\n{} Empty Columns: \n{}'.format(len(missing_list),missing_list))
        del arg_df,target_list,index_list
        gc.collect()

        return df_summary
    else:
        print('No Numerical Attributes')
'==================================================================='        
def export_Data_Description(arg_df,**kwarg):
    from scipy.stats import skew
    data_description=pd.DataFrame()
    for c in arg_df.columns:
        data_description.loc[c,'Datatype']=arg_df[c].dtypes
        data_description.loc[c,'Missing%']='{:.3f}'.format((len(arg_df[c])-arg_df[c].count())/len(arg_df[c])*100)
        if (len(arg_df[c])-arg_df[c].count())/len(arg_df[c])*100!=100:
            if arg_df[c].dtypes==object:
                data_description.loc[c,'Unique']=len(arg_df[c].unique())
                if len(arg_df[c].unique())==1:
                    data_description.loc[c,'Remark']='Dropped because this column has only single lable'
                else:
                    data_description.loc[c,'Remark']='Frequent: {} ({:.3f} %)'.format(
                        arg_df[c].mode()[0],arg_df[arg_df[c]==arg_df[c].mode()[0]][c].count()/len(arg_df[c])*100)
            else:
                if skew(arg_df[c])==0:
                    data_description.loc[c,'Unique']=1
                    data_description.loc[c,'Remark']='Dropped because this column has only single value'
                else:
                    data_description.loc[c,'Remark']='MAX: {:.3f} MIN: {:.3f} MEAN: {:.3f} STD: {:.3f}'.format(
                        arg_df[c].max(),arg_df[c].min(),arg_df[c].mean(),arg_df[c].std())
        else:
            data_description.loc[c,'Remark']='Dropped because this column is empty'
    data_description.reset_index(inplace=True)
    data_description.index=data_description.index+1
    data_description=data_description.rename(columns={'index':'Attribute'})
    if ('surfix' in kwarg):
        data_description.to_excel('data_description_{}.xlsx'.format(kwarg['surfix']))
    else:
        import datetime
        currentDT = datetime.datetime.now()
        time=str(currentDT.year)+'-'+str(currentDT.month)+'-'+str(currentDT.day)+' '+str(currentDT.hour)+str(currentDT.minute)+str(currentDT.second)
        data_description.to_excel('data_description_{}.xlsx'.format(time))
'==================================================================='        
def rectify_to_object(arg_df,col_list):

    if len(col_list)>0:
        for columns in col_list:
            arg_df[columns]=arg_df[columns].astype('object',inplace=True)
        print('Change Datatype of {} Column to Object : \n{}'.format(len(col_list),col_list))
'==================================================================='        
def drop_unilable_column(arg_df):
    
    target_list=[]
    object_list=[]
    number_list=[]
    for c in arg_df.columns:
        if (arg_df[c].dtypes==object) | (str(arg_df[c].dtypes)=='category') | (arg_df[c].dtypes==bool):
            object_list.append(c)
        elif str(arg_df[c].dtypes)!='datetime64[ns]':
            number_list.append(c)
    if len(object_list)>0:    
        for c in object_list:
            if len(arg_df[c].unique())==1:
                target_list.append(c)
    
    if len(number_list)>0:   
        from scipy.stats import skew
        for c in number_list:
            if skew(arg_df[c])==0:
                target_list.append(c)
                
    if len(target_list)>0:
        arg_df.drop(columns=target_list,axis='columns',inplace=True)
        print('Drop {} Columns with Single Label:\n{}'.format(len(target_list),target_list))
    else: 
        print('No Columns with Single Label/Value')

    del target_list,object_list
'===================================================================' 
def drop_empty_column(arg_df):
    target_list=[]
    for c in arg_df.columns:
        if arg_df[c].count()==0:
            target_list.append(c)
    if len(target_list)>0:
        arg_df.drop(columns=target_list,axis=1,inplace=True)
        print('Delete {} Empty Column : \n{}'.format(len(target_list),target_list))
    else:
        print('No Empty Column')
'==================================================================='        
def export_description(arg_df,str_1,str_2):
    arg_df.groupby(str_1)[str_2].value_counts(dropna=False,
                                              ascending=False).to_frame().to_csv('{}.csv'.format(str_2))
'==================================================================='        
def find_time_dif_day(arg_df,ref_date,proc_date):
    new_date_attribute=[c.replace(' ','_') for c in proc_date]
    for idx,c in enumerate(proc_date):
        arg_df[new_date_attribute[idx]+'_day']=arg_df[c]-arg_df[ref_date]
        arg_df[new_date_attribute[idx]+'_day']=arg_df[new_date_attribute[idx]+'_day'].astype('timedelta64[D]')
'==================================================================='        
def find_time_dif_hour(arg_df,ref_date,proc_date):
    new_date_attribute=[c.replace(' ','_') for c in proc_date]
    for idx,c in enumerate(proc_date):
        arg_df[new_date_attribute[idx]+'_hour']=arg_df[c]-arg_df[ref_date]
        arg_df[new_date_attribute[idx]+'_hour']=arg_df[new_date_attribute[idx]+'_hour'].astype('timedelta64[h]')
'==================================================================='        
def drop_columns(arg_df,column_names):
    arg_df.drop(columns=column_names,axis='columns',inplace=True)
    print('Drop {} columns : \n{}'.format(len(column_names),column_names))
'===================================================================' 
def extract_room_row(arg_df,col_position):
    '''To return unique Room-Row from standard Room-Row-Column-Height position data'''
    roomrow=[]
    roomrow=arg_df[col_position].apply(lambda x :x.split('-')[0]+'-'+x.split('-')[1] if '-' in x else x)
    #roomrow=roomrow.unique().tolist()
    #roomrow=pd.DataFrame(roomrow,columns=['Unique_Row']).sort_values(by='Unique_Row')
    return roomrow

## 4) Basic Checks

In [6]:
transaction.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277075 entries, 0 to 277074
Columns: 103 entries, Activity Team Code to Variety Code
dtypes: bool(2), float64(44), int64(11), object(46)
memory usage: 214.0+ MB


In [7]:
bool_list,object_list,cat_list,num_list=get_datatype(transaction)

This dataset has 103 Columns
bool	:2 
object	:46  
category:0 
numeric	:55 


In [8]:
transaction.head(3)

Unnamed: 0,Activity Team Code,Colour Card,Conditioning Indicator Code,Coolstore Code,Customer Label Code,Disorder Indicator Code,Document Reference,Doi Number,Dry Matter Code,EDISent Date Time,EDIProcessed Date Time,Event Type Code,Firmness,Fruit,Fruit Indicator Code,Fruit Per Pack,Fruit Size Code,Global Gap Brc Indicator Code,Growing Method,Grower Number,Is MVQDestination Pallet,Is Zespri,Japan Sub Brand Code,Labelling Indicator Code,Location Column,Location Height,Location Room Code,Location Row Code,Location Request Number,Location Mission Request Destination,Location Mission Request Reason,Loss Code,Marketer,Material Number,Maturity Area Code,MVQDestination Material Number,MVQDestination Pallet Number,MVQSource Pallet Number,New Value,Ok Until Date,Pack Code,Pack Date,Packed Fruit,Packhouse Code,Pack Indicator Code,Pack Label Code,Pack Make,Packrun Batch Code,Packrun Description,Packs,Pack Type,Pallet Adjustment Reason Code,Pallet Adjustment Reason Description,Pallet Number,Pallet Rework Count,Pallet Note Type,Pallet Note Text,Passed Or Failed,Pest Indicator Code,Plant Code,Previous Value,Product Restriction Action,Product Restriction Code,Protocol Code,Purchase Pool Code,Quality Inspection Indicator Code,Quantity B,Quantity D,Quantity F,Quantity M,Quantity P,Quantity R,Quantity S,Quantity X,Quantity Y,Quantity Z,Reason Code,Reason Description,Record Type,Record Type Code,Record Type Description,Reference Note,Rf Id Tag,Room,Sample Quantity,Scope,Source,Source Pallet Number,Storage Method Code,Storing Characteristic Code,Supplier Code,Temperature Indicator Code,Transaction Date,Transaction Date Time,Transaction Sub Type Code,Transfer To Coolstore,Transfer To Supplier,Trays,Trial Packing Indicator Code,Username,Value Type,Zil Message Number,Variety Code
0,,,N,3TPK,N,,,,Y,,,SET,,13000,N,130,42,1,CK,,False,False,N,LG,,0.0,UNKNOWN,,0,,,,ZIL,62853,,,,,Q02-03-4-1,21/07/2019 12:00:00 AM,41236,6/06/2019 12:00:00 AM,13000,3TPP,N,N,KC1,,,100,ENMBKC1,,,59717008,0,,,N,N,1103,Q02-03-3-1,,,N,1,A,,,,,,,,,,,,,LOC,LOC,Locate Pallets,,,UNKNOWN---0,,Internal,,,CN,N,434,N,1/07/2019 12:00:10 AM,1/07/2019 12:00:10 AM,MLA,,,309.52381,N,agubs,Location,,HW
1,,,N,3TPK,N,,,,Y,,,SET,,9216,N,36,36,1,CK,,False,False,N,LG,1.0,1.0,Q52,RACK,0,,,,ZIL,62809,,,,,Q14-15-9-2,21/07/2019 12:00:00 AM,41717,5/06/2019 12:00:00 AM,9216,3TPP,N,N,KC1,,,256,E3ITKC1,,,59708334,0,,,N,N,1103,Q14-02-10-1,,,N,1,A,,,,,,,,,,,,,LOC,LOC,Locate Pallets,,,Q52-RACK-1-1,,Internal,,,CN,N,434,N,1/07/2019 12:00:15 AM,1/07/2019 12:00:15 AM,MTA,,,256.0,N,jaspsi,Location,,HW
2,,,N,3TPK,N,,,,Y,,,SET,,12650,N,115,39,1,CK,,False,False,N,LG,,0.0,UNKNOWN,,0,,,,ZIL,65735,,,,,Q14-02-10-1,14/07/2019 12:00:00 AM,76332,1/06/2019 12:00:00 AM,12650,3TPP,N,N,KCT,,,110,E1BMKCT,,,59699489,0,,,N,N,1103,Q14-02-7-1,,,N,1,A,,,,,,,,,,,,,LOC,LOC,Locate Pallets,,,UNKNOWN---0,,Internal,,,CN,N,434,N,1/07/2019 12:00:29 AM,1/07/2019 12:00:29 AM,MLA,,,324.358974,N,jaspsi,Location,,HW


In [9]:
summary_object(transaction)

SUMMARY OF 48 NON-NUMERICAL ATTRIBUTES:

46 Object Columns
2 Bool Columns

13 Columns with Single Label : 
['Conditioning Indicator Code', 'Coolstore Code', 'Event Type Code', 'Fruit Indicator Code', 'Passed Or Failed', 'Pest Indicator Code', 'Record Type', 'Record Type Code', 'Record Type Description', 'Scope', 'Temperature Indicator Code', 'Is MVQDestination Pallet', 'Is Zespri']


Unnamed: 0,Attribute,Count,Unique,Missing (%),Top,Top (%),Bottom,Bottom (%)
1,Transfer To Coolstore,49,3,99.9823,2DNM,0.0173238,5AFK,0.000360913
2,Doi Number,5715,7,97.9374,RT030,0.925742,RT034,0.137869
3,Location Mission Request Destination,90225,10,67.4366,SPQI,11.738,CSOF,0.0923938
4,Location Row Code,97003,55,64.9903,RACK,6.68231,27,0.00397004
5,Previous Value,207141,20358,25.2401,UNKNOWN---0,15.642,ZE07-02-4-2,0.000360913
6,Value Type,208971,2,24.5796,Location,75.4204,Location,75.4204
7,New Value,208977,16794,24.5775,UNKNOWN,17.1001,Q01-05-13-2,0.000360913
8,Location Room Code,276921,59,0.0555806,UNKNOWN,37.2069,LAB,0.000721826
9,Transaction Date,277075,205494,0.0,24/07/2019 3:34:27 PM,4.92322,3/07/2019 12:09:32 PM,0.000360913
10,Transaction Date Time,277075,205494,0.0,24/07/2019 3:34:27 PM,4.92322,3/07/2019 12:09:32 PM,0.000360913


In [10]:
summary_numerical(transaction)

SUMMARY OF 55 NUMERICAL ATTRIBUTES:

2 Columns with Single Value: 
['Global Gap Brc Indicator Code', 'Supplier Code']

40 Empty Columns: 
['Activity Team Code', 'Colour Card', 'Disorder Indicator Code', 'Document Reference', 'EDISent Date Time', 'EDIProcessed Date Time', 'Firmness', 'Grower Number', 'Location Mission Request Reason', 'Loss Code', 'Maturity Area Code', 'MVQDestination Material Number', 'MVQDestination Pallet Number', 'MVQSource Pallet Number', 'Packrun Batch Code', 'Packrun Description', 'Pallet Adjustment Reason Code', 'Pallet Adjustment Reason Description', 'Pallet Note Type', 'Pallet Note Text', 'Product Restriction Action', 'Product Restriction Code', 'Quantity B', 'Quantity D', 'Quantity F', 'Quantity M', 'Quantity P', 'Quantity R', 'Quantity S', 'Quantity X', 'Quantity Y', 'Quantity Z', 'Reason Code', 'Reason Description', 'Reference Note', 'Rf Id Tag', 'Sample Quantity', 'Source', 'Source Pallet Number', 'Zil Message Number']


Unnamed: 0,Attribute,Count,Missing (%),Mean,Median,Min,Max,Skewness,Kurtosis
1,Transfer To Supplier,49.0,99.982315,433.9184,434.0,430.0,434.0,,
2,Location Column,97003.0,64.990346,4.47813,4.0,1.0,13.0,,
3,Location Height,276373.0,0.253361,0.7586016,1.0,0.0,3.0,,
4,Pallet Number,277075.0,0.0,58889470.0,58510273.0,53179215.0,98002550.0,6.623155,49.287252
5,Pallet Rework Count,277075.0,0.0,0.161339,0.0,0.0,5.0,4.469062,20.379457
6,Fruit Per Pack,277075.0,0.0,56.67018,48.0,18.0,497.0,2.040215,15.442817
7,Pack Code,277075.0,0.0,18709.89,10308.0,0.0,76332.0,1.512128,1.633322
8,Location Request Number,277075.0,0.0,521.1967,0.0,0.0,1935.0,0.802085,-1.278149
9,Packed Fruit,277075.0,0.0,8009.309,7680.0,-5916.0,20355.0,0.062545,0.727772
10,Fruit,277075.0,0.0,8122.945,7680.0,0.0,15620.0,0.021848,0.57581


## 5) Export Data Description

In [11]:
#export_Data_Description(transaction,surfix='transaction')

## 6) Data Cleaning

### Delete Columns

In [12]:
# drop empty columns
drop_empty_column(transaction)

Delete 40 Empty Column : 
['Activity Team Code', 'Colour Card', 'Disorder Indicator Code', 'Document Reference', 'EDISent Date Time', 'EDIProcessed Date Time', 'Firmness', 'Grower Number', 'Location Mission Request Reason', 'Loss Code', 'Maturity Area Code', 'MVQDestination Material Number', 'MVQDestination Pallet Number', 'MVQSource Pallet Number', 'Packrun Batch Code', 'Packrun Description', 'Pallet Adjustment Reason Code', 'Pallet Adjustment Reason Description', 'Pallet Note Type', 'Pallet Note Text', 'Product Restriction Action', 'Product Restriction Code', 'Quantity B', 'Quantity D', 'Quantity F', 'Quantity M', 'Quantity P', 'Quantity R', 'Quantity S', 'Quantity X', 'Quantity Y', 'Quantity Z', 'Reason Code', 'Reason Description', 'Reference Note', 'Rf Id Tag', 'Sample Quantity', 'Source', 'Source Pallet Number', 'Zil Message Number']


In [13]:
# drop unilabel/univalue columns
drop_unilable_column(transaction)

Drop 15 Columns with Single Label:
['Conditioning Indicator Code', 'Coolstore Code', 'Event Type Code', 'Fruit Indicator Code', 'Is MVQDestination Pallet', 'Is Zespri', 'Passed Or Failed', 'Pest Indicator Code', 'Record Type', 'Record Type Code', 'Record Type Description', 'Scope', 'Temperature Indicator Code', 'Global Gap Brc Indicator Code', 'Supplier Code']


In [14]:
# drop columns that have missing rate > 99%
temp_list=['Pack Indicator Code','Japan Sub Brand Code','Growing Method','Trial Packing Indicator Code','Storing Characteristic Code','Transfer To Coolstore','Transfer To Supplier']
drop_columns(transaction,temp_list)

Drop 7 columns : 
['Pack Indicator Code', 'Japan Sub Brand Code', 'Growing Method', 'Trial Packing Indicator Code', 'Storing Characteristic Code', 'Transfer To Coolstore', 'Transfer To Supplier']


### Fix Duplicate Labels 

In [15]:
# Fix duplicate fruit size in 'Fruit Size Code'
# some labels defined in string, some labels defined in integer
transaction['Fruit Size Code']=transaction['Fruit Size Code'].astype('str')

In [16]:
# checking
#print(len(transaction['Fruit Size Code'].unique()))
#transaction['Fruit Size Code'].unique().tolist()

### Derive Time Difference from Date Columns

In [17]:
date_attribute=[c for c in transaction.columns if 'Date' in c]
date_attribute

['Ok Until Date', 'Pack Date', 'Transaction Date', 'Transaction Date Time']

In [18]:
# convert 'Ok Until Date', 'Pack Date' into differences in days
date_attribute=['Ok Until Date', 'Pack Date','Transaction Date']
for c in date_attribute:
    transaction[c]=pd.to_datetime(transaction[c],format='%d/%m/%Y %I:%M:%S %p')

In [19]:
find_time_dif_day(transaction,'Transaction Date',['Ok Until Date', 'Pack Date'])

In [20]:
# checking
#transaction[['Transaction Date','Ok Until Date','Pack Date','Ok_Until_Date_day','Pack_Date_day']]

### Drop column : 'Ok Until Date', 'Pack Date'

In [21]:
# keep these columns for merging process later
#drop_columns(transaction,['Ok Until Date', 'Pack Date'])

### Drop 'Fruit Per Pack' and 'Packs' because multiplication of these two equal to 'Fruit'

In [22]:
# 11 Oct : comment out to identify changes in FW
# drop_columns(transaction,['Packs','Fruit Per Pack'])
## ['Packed Fruit','Fruit','Packs','Trays','Fruit Per Pack']

In [23]:
## delete trays because there are up to 1283 different number of trays
# 11 Oct : comment out to identify changes in FW
# drop_columns(transaction,['Trays'])

### Drop duplicated transactions

In [24]:
# convert transaction to Time-Series data,set 'Transaction Date Time' as index
transaction['Transaction Date Time']=pd.to_datetime(transaction['Transaction Date Time'],format='%d/%m/%Y %I:%M:%S %p')
transaction=transaction.set_index(keys='Transaction Date Time')

In [25]:
before_drop_duplicate=transaction.shape
transaction.drop_duplicates(inplace=True)
print('Total number of transaction deleted :{} ({:.2f} %)'.format(
    (before_drop_duplicate[0]-transaction.shape[0]),
    (before_drop_duplicate[0]-transaction.shape[0])/before_drop_duplicate[0]*100))

Total number of transaction deleted :50401 (18.19 %)


### Drop transactions with 'Value Type'= NaN, then drop column 'Value Type'

In [26]:
before_drop=transaction.shape
transaction=transaction.loc[transaction['Value Type'].notnull(),:]

In [27]:
print('Total number of transaction deleted :{} ({:.2f} %)'.format(
    (before_drop[0]-transaction.shape[0]),
    (before_drop[0]-transaction.shape[0])/before_drop[0]*100))

Total number of transaction deleted :17753 (7.83 %)


In [28]:
drop_columns(transaction,['Value Type'])

Drop 1 columns : 
['Value Type']


### Drop transactions with duplicated information in subset ['Pallet Number','Transaction Date']

In [29]:
before_drop=transaction.shape
transaction.drop_duplicates(subset=['Pallet Number','Transaction Date'],keep='last',inplace=True)

In [30]:
print('Total number of transaction deleted :{} ({:.2f} %)'.format(
    (before_drop[0]-transaction.shape[0]),
    (before_drop[0]-transaction.shape[0])/before_drop[0]*100))

Total number of transaction deleted :1662 (0.80 %)


### Discard symbol '---' in Previous Value and Room

In [31]:
transaction['Previous Value']=transaction['Previous Value'].apply(lambda x:str(x).split('---')[0] if '---' in str(x) else x)

In [32]:
transaction['Room']=transaction['Room'].apply(lambda x:str(x).split('---')[0] if '---' in str(x) else x)

## 7) Data Imputation

### Replace Missing Data in Previous Value with 'UNKNOWN'

In [33]:
before_drop=transaction.shape
transaction.loc[transaction['Previous Value'].isna(),'Previous Value']='UNKNOWN'

In [34]:
print('Total number of transaction deleted :{} ({:.2f} %)'.format(
    (before_drop[0]-transaction.shape[0]),
    (before_drop[0]-transaction.shape[0])/before_drop[0]*100))

Total number of transaction deleted :0 (0.00 %)


### Replace Missing Data in Location Row Code, Column and Height,Location Mission Request Destination with 'na'

In [35]:
# checking
#transaction[transaction['Location Row Code']=='na'][['Location Room Code','Location Row Code','Location Column','Location Row Code']]

In [36]:
transaction.loc[transaction['Location Row Code'].isna(),'Location Row Code']='na'

In [37]:
transaction.loc[transaction['Location Column'].isna(),'Location Column']='na'

In [38]:
transaction.loc[transaction['Location Height'].isna(),'Location Height']='na'

In [39]:
transaction.loc[transaction['Location Mission Request Destination'].isna(),'Location Mission Request Destination']='na'

### Replace Missing Data in 'Room' and 'Location Room Code' with UNKNOWN

In [40]:
transaction.loc[transaction['Room'].isna(),'Room']='UNKNOWN'

In [41]:
transaction.loc[transaction['Location Room Code'].isna(),'Location Room Code']='UNKNOWN'

### Replace Doi Number with True/ False (for row with missing data)

In [42]:
transaction['isDOINumber']=transaction['Doi Number'].notna()

In [43]:
drop_columns(transaction,['Doi Number'])

Drop 1 columns : 
['Doi Number']


## 8) Derive Attributes

### Derive 'Pack Style Code',Pack_Base,Stacking Configuration Code from Pack Type

In [44]:
transaction['Pack Style Code']=transaction['Pack Type'].apply(lambda x:x[2:4])

In [45]:
transaction['Pack_Base']=transaction['Pack Type'].apply(lambda x:x[0])
transaction['Stacking Configuration Code']=transaction['Pack Type'].apply(lambda x:x[1])

### Drop columns : 'Pack Type'

In [46]:
drop_columns(transaction,['Pack Type'])

Drop 1 columns : 
['Pack Type']


### Derive isDayShift, Shift_Date

In [47]:
transaction['isDayShift']=(transaction.index.hour>=7) & (transaction.index.hour<19)

In [48]:
transaction['Shift_Hour']=transaction.index.hour

In [49]:
transaction['Shift_Date']=transaction.index.date

In [50]:
transaction['Shift_Date']=pd.to_datetime(transaction['Shift_Date'],format='%Y-%m-%d')

In [51]:
# handle shift date of night shift with transaction time over 12am
transaction.loc[transaction['Shift_Hour']<7,'Shift_Date']=transaction.loc[transaction['Shift_Hour']<7,'Shift_Date']+datetime.timedelta(days=-1)

### Derive Day_of_week,isWeekend form Shift_Date

In [52]:
transaction['Day_of_week']=transaction['Shift_Date'].dt.dayofweek+1

In [53]:
transaction['isWeekend']=(transaction['Day_of_week']==6) | (transaction['Day_of_week']==7)

In [54]:
temp_list={1:'Mon',2:'Tue',3:'Wed',4:'Thu',5:'Fri',6:'Sat',7:'Sun'}
transaction['Day_of_week']=transaction['Day_of_week'].map(temp_list)

### Derive 'isActualMovement' by asigning False to transaction made by 'QuarryRdCoolstoreAdministration' and 'allocations'

In [55]:
transaction['isActualMovement']=(transaction['Username']!='QuarryRdCoolstoreAdministration') & (transaction['Username']!='allocations')

### Derive isPackedFruitequalFruit

In [56]:
transaction['isPackedFruitequalFruit']=transaction['Packed Fruit']==transaction['Fruit']

In [57]:
# comment out on 11 Oct to identify changes in FW
# drop_columns(transaction,['Packed Fruit','Fruit'])

### Derive Room_row,column,height of Previous and New Value

In [58]:
transaction['Previous_RoomRow']=extract_room_row(transaction,'Previous Value')

In [59]:
transaction['New_RoomRow']=extract_room_row(transaction,'New Value')

In [60]:
transaction['Previous_Column']=transaction['Previous Value'].apply(lambda x :x.split('-')[2]+'-' if '-' in x else 'na')
transaction['Previous_Height']=transaction['Previous Value'].apply(lambda x :x.split('-')[3]+'-' if '-' in x else 'na')

In [61]:
transaction['New_Column']=transaction['New Value'].apply(lambda x :x.split('-')[2]+'-' if '-' in x else 'na')
transaction['New_Height']=transaction['New Value'].apply(lambda x :x.split('-')[3]+'-' if '-' in x else 'na')

In [62]:
temp_list=['Previous_Column','Previous_Height','New_Column','New_Height']
for c in temp_list:
    transaction[c]=transaction[c].apply(lambda x:x.split('-')[0] if '-' in x else x)

### Derive distance to driveway of Previous Value

In [63]:
transaction.loc[transaction['Previous_Column']=='na','distance_to_driveway']='na'

In [64]:
transaction['distance_to_driveway']=transaction['Previous_Column'].apply(lambda x:13-int(x) if x!='na' else 'na')

### Change Columns Name to match with Dispatched Data

In [65]:
transaction.rename(index=str,columns={'Marketer':'Marketer Code','Pack Make':'Pack Make Code'},inplace=True)

### Convert Numeric Category Attributes back to object datatype

In [66]:
temp_list=['Pallet Number','Pack Code','Location Request Number','Plant Code','Pallet Rework Count']

In [67]:
rectify_to_object(transaction,temp_list)

Change Datatype of 5 Column to Object : 
['Pallet Number', 'Pack Code', 'Location Request Number', 'Plant Code', 'Pallet Rework Count']


## Optional Process

### Drop Shift_Date

In [68]:
drop_columns(transaction,['Shift_Date'])

Drop 1 columns : 
['Shift_Date']


### Drop columns related to Location
    - Room is combination of 'Location Room Code','Location Row Code','Location Column','Location Height'
    - The value of Room is not pallet location 

In [69]:
temp_list=['Room','Location Request Number','Location Room Code','Location Row Code','Location Column','Location Height']

In [70]:
drop_columns(transaction,temp_list)

Drop 6 columns : 
['Room', 'Location Request Number', 'Location Room Code', 'Location Row Code', 'Location Column', 'Location Height']


### Drop columns related to New Value

In [71]:
#temp_list=['New Value','New_RoomRow','New_Column','New_Height']
temp_list=['New_RoomRow','New_Column','New_Height']

In [72]:
drop_columns(transaction,temp_list)

Drop 3 columns : 
['New_RoomRow', 'New_Column', 'New_Height']


In [73]:
transaction.reset_index(inplace=True)

In [74]:
summary_object(transaction)

SUMMARY OF 37 NON-NUMERICAL ATTRIBUTES:

32 Object Columns
5 Bool Columns


Unnamed: 0,Attribute,Count,Unique,Missing (%),Top,Top (%),Bottom,Bottom (%)
1,Transaction Date Time,207259,196458,0,2019-07-18 23:19:01,0.00337742,2019-07-19 01:09:44,0.000482488
2,Previous Value,207259,20331,0,UNKNOWN,21.7168,ZE07-02-4-2,0.000482488
3,Pallet Number,207259,17913,0,5.85053e+07,0.0371516,5.96658e+07,0.000482488
4,New Value,207259,16767,0,UNKNOWN,22.8178,Q17-12-8-2,0.000482488
5,Previous_RoomRow,207259,1232,0,UNKNOWN,21.7168,ZE04-01,0.000482488
6,Material Number,207259,266,0,62845,5.69143,56443,0.000482488
7,Pack Code,207259,183,0,41724,7.28412,73815,0.000482488
8,Username,207259,68,0,agubs,5.62822,juaam,0.000482488
9,Purchase Pool Code,207259,30,0,16,54.0469,32,0.000482488
10,Pack Make Code,207259,25,0,KC6,47.9646,KC7,0.000964976


In [75]:
summary_numerical(transaction)

SUMMARY OF 8 NUMERICAL ATTRIBUTES:


Unnamed: 0,Attribute,Count,Missing (%),Mean,Median,Min,Max,Skewness,Kurtosis
1,Fruit Per Pack,207259.0,0.0,55.758032,48.0,18.0,497.0,2.406976,18.49138
2,Pack_Date_day,207259.0,0.0,-62.921769,-67.0,-129.0,-1.0,0.707396,0.007082
3,Packed Fruit,207259.0,0.0,7895.28709,7656.0,-5916.0,20355.0,0.120598,0.936007
4,Fruit,207259.0,0.0,7969.989733,7680.0,0.0,15620.0,0.048472,0.731638
5,Shift_Hour,207259.0,0.0,12.073102,12.0,0.0,23.0,-0.067395,-1.060507
6,Packs,207259.0,0.0,169.182178,160.0,0.0,256.0,-0.152523,-0.888059
7,Trays,207259.0,0.0,255.674655,256.0,0.0,340.47619,-1.859026,7.992295
8,Ok_Until_Date_day,207259.0,0.0,-295.418447,5.0,-10803.0,47.0,-5.833387,32.034261


In [76]:
transaction.head(3)

Unnamed: 0,Transaction Date Time,Customer Label Code,Dry Matter Code,Fruit,Fruit Per Pack,Fruit Size Code,Labelling Indicator Code,Location Mission Request Destination,Marketer Code,Material Number,New Value,Ok Until Date,Pack Code,Pack Date,Packed Fruit,Packhouse Code,Pack Label Code,Pack Make Code,Packs,Pallet Number,Pallet Rework Count,Plant Code,Previous Value,Protocol Code,Purchase Pool Code,Quality Inspection Indicator Code,Storage Method Code,Transaction Date,Transaction Sub Type Code,Trays,Username,Variety Code,Ok_Until_Date_day,Pack_Date_day,isDOINumber,Pack Style Code,Pack_Base,Stacking Configuration Code,isDayShift,Shift_Hour,Day_of_week,isWeekend,isActualMovement,isPackedFruitequalFruit,Previous_RoomRow,Previous_Column,Previous_Height,distance_to_driveway
0,2019-07-01 00:00:10,N,Y,13000,130,42,LG,na,ZIL,62853,Q02-03-4-1,2019-07-21,41236,2019-06-06,13000,3TPP,N,KC1,100,59717008,0,1103,Q02-03-3-1,N,1,A,CN,2019-07-01 00:00:10,MLA,309.52381,agubs,HW,19.0,-26.0,False,MB,E,N,False,0,Sun,True,True,True,Q02-03,3,1,10
1,2019-07-01 00:00:15,N,Y,9216,36,36,LG,na,ZIL,62809,Q14-15-9-2,2019-07-21,41717,2019-06-05,9216,3TPP,N,KC1,256,59708334,0,1103,Q14-02-10-1,N,1,A,CN,2019-07-01 00:00:15,MTA,256.0,jaspsi,HW,19.0,-27.0,False,IT,E,3,False,0,Sun,True,True,True,Q14-02,10,1,3
2,2019-07-01 00:00:29,N,Y,12650,115,39,LG,na,ZIL,65735,Q14-02-10-1,2019-07-14,76332,2019-06-01,12650,3TPP,N,KCT,110,59699489,0,1103,Q14-02-7-1,N,1,A,CN,2019-07-01 00:00:29,MLA,324.358974,jaspsi,HW,12.0,-31.0,False,BM,E,1,False,0,Sun,True,True,True,Q14-02,7,1,6


## Export

In [77]:
#transaction.to_csv('p_transaction',index=False)

In [78]:
## 11 oct :  export p_transaction_2 that include 'Packed Fruit','Fruit','Packs','Trays','Fruit Per Pack'
#transaction.to_csv('p_transaction_2',index=False)