In [None]:
import gc
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
%matplotlib inline
pd.set_option('display.max_columns',500)

# Functions

In [None]:
def get_datatype(arg_df):
    """Group the columns of ``arg_df`` by dtype family and print the tally.

    A column is classified, in this order, as bool, object or category;
    every remaining dtype is placed in the "numeric" group.

    Parameters
    ----------
    arg_df : pandas.DataFrame
        Frame whose columns are inspected. It is not modified.

    Returns
    -------
    tuple of list
        ``(bool_columns, object_columns, category_columns, other_columns)``,
        each list holding column labels in their original order.
    """
    groups = {'bool': [], 'object': [], 'category': [], 'number': []}

    for name in arg_df.columns:
        kind = arg_df[name].dtypes
        if kind == bool:
            bucket = 'bool'
        elif kind == object:
            bucket = 'object'
        elif str(kind) == 'category':
            bucket = 'category'
        else:
            # Anything not matched above is counted as numeric.
            bucket = 'number'
        groups[bucket].append(name)

    report = 'This dataset has {} Columns\nbool\t:{} \nobject\t:{}  \ncategory:{} \nnumeric\t:{} '
    print(report.format(
        len(arg_df.columns),
        len(groups['bool']),
        len(groups['object']),
        len(groups['category']),
        len(groups['number']),
    ))

    # Only the local reference is released; the caller's frame is untouched.
    del arg_df
    gc.collect()

    return groups['bool'], groups['object'], groups['category'], groups['number']

In [None]:
def summary_object(arg_df):
    """Summarise the object, category and bool columns of ``arg_df``.

    For every non-numerical column the count, number of distinct values
    (missing counts as one value), share of missing rows and the most / least
    frequent label with their share of all rows are collected. Columns with a
    single distinct value and completely empty columns are reported on stdout
    and left out of the returned table.

    Parameters
    ----------
    arg_df : pandas.DataFrame
        Frame to describe. It is not modified.

    Returns
    -------
    pandas.DataFrame or None
        One row per remaining column, indexed from 1, with the columns
        ``Attribute, Count, Unique, Missing (%), Top, Top (%), Bottom,
        Bottom (%)``, sorted by ``Missing (%)`` then ``Unique`` (descending).
        ``None`` when the frame has no non-numerical column.
    """
    object_list = [c for c in arg_df.columns if arg_df[c].dtypes == object]
    category_list = [c for c in arg_df.columns if str(arg_df[c].dtypes) == 'category']
    bool_list = [c for c in arg_df.columns if arg_df[c].dtypes == bool]
    target_list = object_list + category_list + bool_list

    if not target_list:
        print('No Non-Numerical Attributes')
        return None

    index_list = ['Count', 'Unique', 'Missing (%)', 'Top', 'Top (%)', 'Bottom', 'Bottom (%)']
    n_rows = len(arg_df)
    records = []
    unilabel_list = []
    missing_list = []

    for col in target_list:
        series = arg_df[col]
        counts = series.value_counts()
        n_unique = len(series.unique())
        missing_pct = series.isna().mean() * 100

        record = {
            'Attribute': col,
            'Count': series.count(),
            'Unique': n_unique,
            'Missing (%)': missing_pct,
            'Top': np.nan,
            'Top (%)': 0.0,
            'Bottom': np.nan,
            'Bottom (%)': 0.0,
        }
        # An all-missing column has no label to report; indexing the empty
        # value_counts() result used to raise IndexError here.
        if series.count() > 0:
            record['Top'] = counts.index[0]
            record['Top (%)'] = counts.iloc[0] / n_rows * 100
        if n_unique > 1:
            record['Bottom'] = counts.index[-1]
            record['Bottom (%)'] = counts.iloc[-1] / n_rows * 100
        else:
            unilabel_list.append(col)
        if missing_pct == 100:
            missing_list.append(col)
        records.append(record)

    # Building the table from records keeps label columns as object dtype
    # instead of writing strings into a float-initialised frame.
    df_summary = pd.DataFrame(records, columns=['Attribute'] + index_list)
    df_summary = df_summary.sort_values(['Missing (%)', 'Unique'], ascending=False)
    df_summary = df_summary[(df_summary['Unique'] > 1) & (df_summary['Missing (%)'] != 100)]
    df_summary = df_summary.reset_index(drop=True)
    df_summary.index = df_summary.index + 1

    print('SUMMARY OF {} NON-NUMERICAL ATTRIBUTES:\n'.format(len(target_list)))
    if len(object_list) > 0:
        print('{} Object Columns'.format(len(object_list)))
    if len(category_list) > 0:
        print('{} Categorical Columns'.format(len(category_list)))
    if len(bool_list) > 0:
        print('{} Bool Columns'.format(len(bool_list)))
    if len(unilabel_list) > 0:
        print('\n{} Columns with Single Label : \n{}'.format(len(unilabel_list), unilabel_list))
    if len(missing_list) > 0:
        print('\n{} Empty Columns: \n{}'.format(len(missing_list), missing_list))

    return df_summary
    
    

In [None]:
def summary_numerical(arg_df):
    """Summarise every column that is not object, bool or category.

    Statistics are computed on the non-missing values of each column.
    Columns holding a single distinct non-missing value and completely empty
    columns are reported on stdout and left out of the returned table.

    Single-value columns are detected by counting distinct values. The
    previous ``skewness == 0`` test also matched every symmetric column and
    never matched a column containing missing values (its skewness was NaN).

    Parameters
    ----------
    arg_df : pandas.DataFrame
        Frame to describe. It is not modified.

    Returns
    -------
    pandas.DataFrame or None
        One row per remaining column, indexed from 1, with the columns
        ``Attribute, Count, Missing (%), Mean, Median, Min, Max, Skewness,
        Kurtosis``, sorted by ``Missing (%)`` then ``Skewness`` (descending).
        ``None`` when the frame has no such column.
    """
    from scipy.stats import skew, kurtosis

    target_list = []
    for c in arg_df.columns:
        datatype = arg_df[c].dtypes
        if datatype != object and datatype != bool and str(datatype) != 'category':
            target_list.append(c)

    if not target_list:
        print('No Numerical Attributes')
        return None

    index_list = ['Count', 'Missing (%)', 'Mean', 'Median', 'Min', 'Max', 'Skewness', 'Kurtosis']
    records = []
    missing_list = []
    single_value_list = []

    for col in target_list:
        series = arg_df[col]
        valid = series.dropna()

        record = dict.fromkeys(index_list, 0.0)
        record['Attribute'] = col
        record['Count'] = series.count()
        record['Missing (%)'] = series.isna().mean() * 100

        if len(valid) == 0:
            missing_list.append(col)
        else:
            record['Mean'] = valid.mean()
            record['Median'] = valid.median()
            record['Min'] = valid.min()
            record['Max'] = valid.max()
            if valid.nunique() == 1:
                # Skewness / kurtosis are undefined for a constant column.
                single_value_list.append(col)
            else:
                values = valid.to_numpy(dtype=float)
                record['Skewness'] = skew(values)
                record['Kurtosis'] = kurtosis(values)
        records.append(record)

    df_summary = pd.DataFrame(records, columns=['Attribute'] + index_list)
    df_summary = df_summary.sort_values(['Missing (%)', 'Skewness'], ascending=False)
    df_summary = df_summary[~df_summary['Attribute'].isin(single_value_list + missing_list)]
    df_summary = df_summary.reset_index(drop=True)
    df_summary.index = df_summary.index + 1

    print('SUMMARY OF {} NUMERICAL ATTRIBUTES:'.format(len(target_list)))
    if len(single_value_list) > 0:
        print('\n{} Columns with Single Value: \n{}'.format(len(single_value_list), single_value_list))
    if len(missing_list) > 0:
        print('\n{} Empty Columns: \n{}'.format(len(missing_list), missing_list))

    return df_summary

In [None]:
def export_Data_Description(arg_df,**kwarg):
    """Write a one-row-per-column description of ``arg_df`` to an Excel file.

    Each row holds the column name, its dtype (as text), the share of missing
    rows formatted with three decimals, the number of distinct values where it
    is determined, and a remark:

    * empty columns, and columns with a single distinct value, are flagged as
      dropped;
    * object, category and bool columns report their most frequent label;
    * every other column reports max / min / mean / std.

    Single-value columns are found by counting distinct values. The previous
    ``skew(...) == 0`` test flagged every symmetric column, missed columns
    containing missing values, and raised on category columns.

    Parameters
    ----------
    arg_df : pandas.DataFrame
        Frame to describe. It is not modified.
    **kwarg
        ``surfix`` (sic): text appended to the output file name. When absent
        a zero-padded timestamp ``YYYY-MM-DD HHMMSS`` is used instead.

    Returns
    -------
    None
        The table is written to ``data_description_<surfix>.xlsx`` in the
        current working directory.
    """
    import datetime

    records = []
    for c in arg_df.columns:
        series = arg_df[c]
        n_rows = len(series)
        n_valid = series.count()
        missing_pct = (n_rows - n_valid) / n_rows * 100 if n_rows else 100.0
        datatype = series.dtypes
        is_label = datatype == object or str(datatype) == 'category' or datatype == bool

        record = {
            'Attribute': c,
            'Datatype': str(datatype),
            'Missing%': '{:.3f}'.format(missing_pct),
            'Unique': np.nan,
            'Remark': np.nan,
        }
        if n_valid == 0:
            record['Remark'] = 'Dropped because this column is empty'
        elif is_label:
            n_unique = len(series.unique())
            record['Unique'] = n_unique
            if n_unique == 1:
                record['Remark'] = 'Dropped because this column has only single lable'
            else:
                top = series.mode()[0]
                share = (series == top).sum() / n_rows * 100
                record['Remark'] = 'Frequent: {} ({:.3f} %)'.format(top, share)
        elif len(series.unique()) == 1:
            record['Unique'] = 1
            record['Remark'] = 'Dropped because this column has only single value'
        else:
            record['Remark'] = 'MAX: {:.3f} MIN: {:.3f} MEAN: {:.3f} STD: {:.3f}'.format(
                series.max(), series.min(), series.mean(), series.std())
        records.append(record)

    data_description = pd.DataFrame(
        records, columns=['Attribute', 'Datatype', 'Missing%', 'Unique', 'Remark'])
    data_description.index = data_description.index + 1

    if 'surfix' in kwarg:
        suffix = kwarg['surfix']
    else:
        # Zero-padded fields keep the name unambiguous: 1:23:04 and 12:03:04
        # both rendered as "1234" with the unpadded concatenation.
        suffix = datetime.datetime.now().strftime('%Y-%m-%d %H%M%S')
    data_description.to_excel('data_description_{}.xlsx'.format(suffix))

In [None]:
def rectify_to_category(arg_df,actual_col_list):
    """Convert, in place, every non-bool column to ``category`` dtype.

    Parameters
    ----------
    arg_df : pandas.DataFrame
        Frame whose columns are converted; it is modified in place.
    actual_col_list : collection
        Column labels to leave untouched.

    Returns
    -------
    None
        The labels that were actually converted are printed.
    """
    converted = [c for c in arg_df.columns
                 if arg_df[c].dtype != bool and c not in actual_col_list]

    for column in converted:
        # astype returns a new Series and has no `inplace` argument.
        arg_df[column] = arg_df[column].astype('category')

    if converted:
        # Report only what was converted, not the columns that were skipped.
        print('Change Datatype of {} Column to Category : \n{}'.format(len(converted), converted))

In [None]:
def drop_unilable_column(arg_df):
    """Drop, in place, every column that holds a single distinct value.

    A missing value counts as a distinct value, so a column mixing one label
    with missing entries is kept. Completely empty columns that are not
    object / category / bool are also kept, as the previous implementation
    did (``drop_empty_column`` removes those).

    Distinct values are counted directly. The previous ``skew(...) == 0``
    test dropped every symmetric numeric column and could not evaluate
    columns containing missing values.

    Parameters
    ----------
    arg_df : pandas.DataFrame
        Frame to clean; it is modified in place.

    Returns
    -------
    None
        The dropped labels are printed (label columns first, then the rest).
    """
    label_targets = []
    number_targets = []

    for c in arg_df.columns:
        series = arg_df[c]
        datatype = series.dtypes
        is_label = datatype == object or str(datatype) == 'category' or datatype == bool
        if len(series.unique()) != 1:
            continue
        if is_label:
            label_targets.append(c)
        elif series.notna().any():
            # A constant, fully populated numeric column.
            number_targets.append(c)

    target_list = label_targets + number_targets

    if len(target_list) > 0:
        arg_df.drop(columns=target_list, inplace=True)
        print('Drop {} Columns with Single Label:\n{}'.format(len(target_list), target_list))
    else:
        print('No Columns with Single Label/Value')

In [None]:
def drop_empty_column(arg_df):
    """Drop, in place, every column of ``arg_df`` with no non-missing value.

    Parameters
    ----------
    arg_df : pandas.DataFrame
        Frame to clean; it is modified in place.

    Returns
    -------
    None
        The dropped labels (or a notice that none were found) are printed.
    """
    empty_columns = [name for name in arg_df.columns if arg_df[name].count() == 0]

    if not empty_columns:
        print('No Empty Column')
        return

    arg_df.drop(columns=empty_columns, inplace=True)
    print('Delete {} Empty Column : \n{}'.format(len(empty_columns), empty_columns))

# Configuration

In [None]:
# NOTE(review): absolute, machine-specific paths; adjust them before running elsewhere.
filepath_1='C:/Users/Nan/Documents/GitHub_Data/TransactionDetails.csv'
filepath_2='C:/Users/Nan/Documents/GitHub_Data/Dispatched.csv'
# str.rstrip('.csv') strips any trailing '.', 'c', 's' or 'v' characters, so it
# also removed the final 's' of 'TransactionDetails'. splitext removes only the
# extension.
filename_1=os.path.splitext(filepath_1)[0]
filename_2=os.path.splitext(filepath_2)[0]

# 1) Import Data

In [None]:
transaction=pd.read_csv(filepath_1)
dispatched=pd.read_csv(filepath_2)

In [None]:
print('Shape of {} \t: {} Rows {} Columns'.format(filename_1,transaction.shape[0],transaction.shape[1]))
print('Shape of {} \t\t: {} Rows {} Columns'.format(filename_2,dispatched.shape[0],dispatched.shape[1]))

In [None]:
transaction.head(2)

In [None]:
dispatched.head(2)

## 1.1) Export Data Description

In [None]:
#export_Data_Description(transaction,surfix='transaction')
#export_Data_Description(dispatched,surfix='dispatched')

# 2) Describe Data

## 2.1) Transaction Detail

In [None]:
transaction.info()

In [None]:
transaction.dtypes.value_counts()

In [None]:
bool_list,object_list,cat_list,num_list=get_datatype(transaction)

In [None]:
#summary_object(transaction)

In [None]:
#summary_numerical(transaction)

In [None]:
#summary_object(transaction,col_bool)

In [None]:
# Export Data Description
# export_Data_Description(transaction,surfix='Transaction Details')

## Drop Unilabel Column

In [None]:
drop_unilable_column(transaction)

In [None]:
transaction.shape

## Drop Empty Columns

In [None]:
drop_empty_column(transaction)

In [None]:
bool_list,object_list,cat_list,num_list=get_datatype(transaction)

## Rectify Feature Datatypes

### Numerical to Object

In [None]:
actual_int_col=['Fruit Per Pack','Packed Fruit','Fruit']

In [None]:
#rectify_to_category(transaction,actual_int_col)

In [None]:
bool_list,object_list,cat_list,num_list=get_datatype(transaction)

In [None]:
# temporary stop converting to category, suspect get error after convert dtypes to category
list_num_object=[c for c in num_list if c not in actual_int_col]
transaction[list_num_object]=transaction[list_num_object].astype('object')

In [None]:
#Day 2

In [None]:
transaction.head(2)

In [None]:
#summary_object(transaction)

# Column 'Transaction Date Time'
- convert to datetime format
- set as index

In [None]:
# Parse the timestamp column once. Under %timeit this mutating assignment was
# executed repeatedly, and every pass after the first re-parsed an already
# converted column, so the reported timing did not measure the conversion.
transaction['Transaction Date Time']=pd.to_datetime(transaction['Transaction Date Time'],format='%d/%m/%Y %I:%M:%S %p')

In [None]:
# Use the transaction timestamp as the index (in place); the daily resample below relies on it.
# Re-running this cell fails because the column no longer exists after the first run.
transaction.set_index(keys='Transaction Date Time',inplace=True)

In [None]:
transaction.head(2)

In [None]:
df_pallet=transaction['Pallet Number']

In [None]:
# Number of non-missing 'Pallet Number' entries per calendar day.
daily_pallet_movement=transaction['Pallet Number'].resample('D').count()

In [None]:
daily_pallet_movement.plot()
plt.title('Daily Pallet Movement')
plt.xlabel('Date')
plt.ylabel('Pallet Quantity')

In [None]:
#daily_pallet_movement=transaction.groupby()['Pallet Number'].resample('D').count()

In [None]:
#No Relationship between Room and Location Mission Request Destination
# Room = Location Room Code-Location Row Code-Location Column-Location Height
transaction[transaction['Location Mission Request Destination']=='SPQI'][['Room','Location Room Code','Location Row Code','Location Column','Location Height','Location Request Number',
 'Location Mission Request Destination']].sample(10)

In [None]:
transaction['Location Request Number'].value_counts().head()

In [None]:
transaction[transaction['Location Request Number']==1815]['Pallet Number'].unique()

In [None]:
# Count rows per (destination, request number) pair and spread the request numbers into columns.
test=transaction.groupby('Location Mission Request Destination')['Location Request Number'].value_counts().unstack()
# Transpose: one row per request number, one column per destination.
test=test.T
#test.columns=test.columns.add_categories('Total Mission')
# Number of destinations with a non-missing count for each request number.
test['Total Mission']=test.count(axis='columns')
test.sort_values(by='Total Mission',ascending=False).head(2)

In [None]:
# Request Number with Transaction over 1000
# Sum only the per-destination counts: the derived total columns would
# otherwise be added to the number of transactions being compared with 1000.
test[test.drop(columns=['Total Mission','Total Transaction'],errors='ignore').sum(axis='columns')>1000]

In [None]:
# Drop any previously derived totals first so the cell can be re-run: the
# in-place drop raised KeyError on a second run and the new total would have
# included the old 'Total Transaction' column.
test=test.drop(columns=['Total Mission','Total Transaction'],errors='ignore')
#test.columns=test.columns.add_categories('Total Transaction')
test['Total Transaction']=test.sum(axis='columns')

In [None]:
test.head(2)

In [None]:
# Left: total transactions against the request number (the frame index); right: their distribution.
f, axes = plt.subplots(1, 2, figsize=(12, 4))
sns.lineplot(x=test.index,y='Total Transaction',data=test,ax=axes[0])
sns.boxplot(y='Total Transaction',data=test,ax=axes[1])

## Plot Total Transactions of each mission

In [None]:
# Rows: request number; columns: destination; cells: count of non-missing 'Pallet Number' entries.
total_transc_destination=transaction.pivot_table(index='Location Request Number',columns='Location Mission Request Destination',
                        values='Pallet Number',aggfunc='count')
#total_transc_destination.columns=total_transc_destination.columns.add_categories('Location_Request_Number')
#total_transc_destination['Location_Request_Number']=total_transc_destination.index
#pd.melt(total_transc_destination,id_vars='Location_Request_Number')

In [None]:
# Keep only the request numbers whose counts, summed over all destinations, are below 1000.
total_transc_destination=total_transc_destination[total_transc_destination.sum(axis='columns')<1000]

In [None]:
'''colors=['skyblue','yellow','red','green','blue','orange','violet','grey','olive']
f, axes = plt.subplots(1, len(total_transc_destination.columns), figsize=(16, 4))
for ind,c in enumerate(total_transc_destination.columns):
    sns.boxplot(data=total_transc_destination[total_transc_destination[c]>0][c],ax=axes[ind],color=colors[ind])
    axes[ind].set_title('Total:{}'.format(total_transc_destination[total_transc_destination[c]>0][c].sum()))
    #axes[ind].set_ylim([0,1000])
    axes[ind].set_xlabel(c)
    if ind==0:
        axes[ind].set_ylabel('Pallet Transaction')

plt.tight_layout()'''

In [None]:
colors=['skyblue','cyan','red','green','blue','orange','violet','grey','olive']
destinations=total_transc_destination.columns
# squeeze=False keeps `axes` two-dimensional even when there is a single destination.
f, axes=plt.subplots(1, len(destinations), figsize=(16, 4), squeeze=False)
for ind,c in enumerate(destinations):
    counts=total_transc_destination[c].dropna()
    # Draw on the axes created above instead of re-selecting them through
    # pyplot state, and cycle the palette so more than nine destinations do
    # not raise IndexError.
    counts.plot.box(ax=axes[0,ind],
                    color=colors[ind%len(colors)],
                    title='Total:{}'.format(counts[counts>0].sum()))

plt.tight_layout()


In [None]:
# Value Type isnull() but New Value notnull()
transaction[(transaction['Value Type'].isna()) & (transaction['New Value'].notnull())][['Room','Previous Value','New Value','Value Type']]

In [None]:
transaction[(transaction['Value Type'].isna()) & (transaction['Previous Value'].notnull())][['Room','Previous Value','New Value','Value Type']]

In [None]:
# Transactions per 'Trays' value, most frequent first; labels are rendered with
# three decimals and only the 30 most frequent values are kept.
tray_top=(
    pd.pivot_table(data=transaction,index='Trays',values='Pallet Number',aggfunc='count')
    .sort_values(by='Pallet Number',ascending=False)
    .reset_index()
)
tray_top['Trays']=tray_top['Trays'].map('{:.3f}'.format)
tray_top=tray_top.set_index(keys='Trays').head(30)

In [None]:
ax=tray_top.plot(kind='bar',figsize=(15,5))
# Label through the Axes. Assigning to plt.ylabel / plt.title replaced the
# pyplot functions with strings, so no label was drawn and later calls to
# plt.title(...) or plt.ylabel(...) in the same kernel failed.
ax.set_ylabel('Transaction')
ax.set_title('Transaction')
plt.xticks(rotation=295)


In [None]:
#temp=pd.crosstab(transaction['Fruit Size Code'],transaction['Pack Type'])
#temp.plot(kind='bar',stacked=True,figsize=(15,5))
#plt.tight_layout
#del temp

## Found Data Quality Issue in 'Fruit Size Code'

In [None]:
transaction['Fruit Size Code'].unique().tolist()

In [None]:
transaction['Fruit Size Code'].dtypes

In [None]:
# Cast every entry to text so the codes share one type.
# NOTE(review): astype('str') also turns missing values into the literal text 'nan' - confirm that is intended.
transaction['Fruit Size Code']=transaction['Fruit Size Code'].astype('str')

In [None]:
transaction['Fruit Size Code'].unique().tolist()

In [None]:
pd.crosstab(transaction['Fruit Size Code'],transaction['Pack Type']).head()

In [None]:
# Per pallet: count non-missing 'Username' entries for each destination, then add the
# destinations together and sort with the largest total first.
pallet_request_destination=transaction.pivot_table(index='Pallet Number',columns='Location Mission Request Destination',
                        values='Username',aggfunc='count').sum(axis='columns').sort_values(ascending=False)

In [None]:
pallet_request_destination.head()

In [None]:
# Histogram of the per-pallet totals; the x axis is cut at 100.
plt.figure(figsize=(12,6))
pallet_request_destination.plot.hist(bins=3000,title='Total Pallet vs Mission Request')
plt.xlim([0,100])
plt.xlabel('Mission Request Quantity')
#plt.ylabel('Total Pallet')


## Fraction of Pallets Assigned to a Mission Request

In [None]:
# Pallets present in pallet_request_destination divided by the number of distinct
# 'Pallet Number' values. The result is a ratio; it is not multiplied by 100.
pallet_request_destination.count()/len(transaction['Pallet Number'].unique())

In [None]:
# Per pallet: count non-missing 'Username' entries for each 'New Value', then add them together.
pallet_transaction=pd.pivot_table(data=transaction,index='Pallet Number',columns='New Value',
               values='Username',aggfunc='count').sum(axis='columns')

In [None]:
plt.figure(figsize=(12,6))
pallet_transaction.plot.hist(bins=80,title='Pallet Quantity vs Pallet Transaction Quantity')
plt.xlabel('Transaction Quantity')

In [None]:
pallet_transaction.sort_values(ascending=False).head()

In [None]:
summary_object(transaction)