# Summary

# Content:
    1) Import library
    2) Configuration
    3) Define Functions
    4) Basic Checks
    5) Export Data Description
    6) Data Cleaning
        6.1) Drop columns
            6.1.1) Drop empty columns
            6.1.2) Drop unilabel/univalue columns
        6.2) Transaction
            6.2.1) Fix Data Quality Issue in 'Fruit Size Code'
        6.3) Dispatched
    7) Convert to Time Series Data
        7.1) Transaction
            7.1.1) Drop duplicated transactions
        7.2) Dispatched

# Coding

## 1) Import library

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc

from scipy.stats import skew,kurtosis
import datetime

In [None]:
%matplotlib inline
# to view all columns
pd.set_option('display.max_columns',500)

## 2) Configuration

In [None]:
# Locations of the two raw CSV exports used throughout the notebook.
filepath_1='C:/Users/Nan/Documents/GitHub_Data/TransactionDetails.csv'
filepath_2='C:/Users/Nan/Documents/GitHub_Data/Dispatched.csv'
# Path without the '.csv' extension. str.rstrip('.csv') is not used because it
# strips any trailing '.', 'c', 's' or 'v' characters (a character set, not a
# suffix) and would turn '...TransactionDetails.csv' into '...TransactionDetail'.
filename_1=filepath_1[:-len('.csv')] if filepath_1.endswith('.csv') else filepath_1
filename_2=filepath_2[:-len('.csv')] if filepath_2.endswith('.csv') else filepath_2
transaction=pd.read_csv(filepath_1)
dispatched=pd.read_csv(filepath_2)

## 3) Define Functions

In [None]:
def get_datatype(arg_df):
    """Group the column names of ``arg_df`` by dtype family and print a tally.

    A column is classed as bool, object or category when its dtype matches
    one of those; every other dtype is counted as numeric.

    Returns:
        Four lists of column names, in the order
        ``(bool, object, category, numeric)``.
    """
    groups = {'bool': [], 'object': [], 'category': [], 'numeric': []}

    for name in arg_df.columns:
        kind = arg_df[name].dtypes
        if kind == bool:
            family = 'bool'
        elif kind == object:
            family = 'object'
        elif str(kind) == 'category':
            family = 'category'
        else:
            family = 'numeric'
        groups[family].append(name)

    report = 'This dataset has {} Columns\nbool\t:{} \nobject\t:{}  \ncategory:{} \nnumeric\t:{} '
    print(report.format(
        len(arg_df.columns),
        len(groups['bool']),
        len(groups['object']),
        len(groups['category']),
        len(groups['numeric']),
    ))

    gc.collect()

    return groups['bool'], groups['object'], groups['category'], groups['numeric']

In [None]:
def summary_object(arg_df):
    """Summarise the non-numerical (object, category and bool) columns.

    For each such column the summary holds the non-null count, the number of
    distinct values (a missing value counts as one value), the percentage of
    missing rows, and the most/least frequent label with its share of all
    rows. Columns holding a single value, or that are entirely missing, are
    listed in the printed report and left out of the returned table.

    Args:
        arg_df: DataFrame to inspect.

    Returns:
        A DataFrame indexed from 1 with the columns ``Attribute, Count,
        Unique, Missing (%), Top, Top (%), Bottom, Bottom (%)``, sorted by
        missing percentage and then distinct count (both descending), or
        ``None`` when the frame has no non-numerical column.
    """
    object_list = []
    category_list = []
    bool_list = []
    for c in arg_df.columns:
        dtype = arg_df[c].dtypes
        if dtype == object:
            object_list.append(c)
        elif str(dtype) == 'category':
            category_list.append(c)
        elif dtype == bool:
            bool_list.append(c)

    target_list = object_list + category_list + bool_list
    if not target_list:
        print('No Non-Numerical Attributes')
        return None

    index_list = ['Count', 'Unique', 'Missing (%)', 'Top', 'Top (%)', 'Bottom', 'Bottom (%)']
    unilabel_list = []
    missing_list = []
    rows = []
    total_rows = len(arg_df)

    # One record per column; building the table from records keeps label
    # strings out of float-typed cells and covers category/bool columns too.
    for col in target_list:
        series = arg_df[col]
        counts = series.value_counts()
        n_unique = len(series.unique())
        missing_pct = series.isna().mean() * 100

        row = {
            'Attribute': col,
            'Count': series.count(),
            'Unique': n_unique,
            'Missing (%)': missing_pct,
            'Top': None,
            'Top (%)': np.nan,
            'Bottom': None,
            'Bottom (%)': np.nan,
        }
        # An entirely missing column has no value counts at all.
        if len(counts) > 0:
            row['Top'] = counts.index[0]
            row['Top (%)'] = counts.iloc[0] / total_rows * 100
            if n_unique > 1:
                row['Bottom'] = counts.index[-1]
                row['Bottom (%)'] = counts.iloc[-1] / total_rows * 100
        if n_unique <= 1:
            unilabel_list.append(col)
        if missing_pct == 100:
            missing_list.append(col)
        rows.append(row)

    df_summary = pd.DataFrame(rows, columns=['Attribute'] + index_list)
    df_summary = df_summary.sort_values(['Missing (%)', 'Unique'], ascending=False)
    df_summary = df_summary[(df_summary['Unique'] > 1) & (df_summary['Missing (%)'] != 100)]
    df_summary = df_summary.reset_index(drop=True)
    df_summary.index = df_summary.index + 1

    print('SUMMARY OF {} NON-NUMERICAL ATTRIBUTES:\n'.format(len(target_list)))
    if len(object_list) > 0:
        print('{} Object Columns'.format(len(object_list)))
    if len(category_list) > 0:
        print('{} Categorical Columns'.format(len(category_list)))
    if len(bool_list) > 0:
        print('{} Bool Columns'.format(len(bool_list)))
    if len(unilabel_list) > 0:
        print('\n{} Columns with Single Label : \n{}'.format(len(unilabel_list), unilabel_list))
    if len(missing_list) > 0:
        print('\n{} Empty Columns: \n{}'.format(len(missing_list), missing_list))

    return df_summary

In [None]:
def summary_numerical(arg_df):
    """Summarise the numerical columns (every dtype except object/bool/category).

    For each such column the summary holds the non-null count, percentage of
    missing rows, mean, median, min, max, skewness and kurtosis. Skewness and
    kurtosis are computed on the non-missing values only, like the other
    statistics. Columns that hold a single value, or are entirely missing,
    are listed in the printed report and left out of the returned table.

    Single-valued columns are detected by counting distinct values rather
    than by testing ``skew == 0``: a symmetric column such as 1..5 also has
    zero skewness and must not be reported as constant.

    Args:
        arg_df: DataFrame to inspect.

    Returns:
        A DataFrame indexed from 1 with the columns ``Attribute, Count,
        Missing (%), Mean, Median, Min, Max, Skewness, Kurtosis``, sorted by
        missing percentage and then skewness (both descending), or ``None``
        when the frame has no numerical column.
    """
    target_list = []
    for c in arg_df.columns:
        datatype = arg_df[c].dtypes
        if datatype != object and datatype != bool and str(datatype) != 'category':
            target_list.append(c)

    if not target_list:
        print('No Numerical Attributes')
        return None

    from scipy.stats import skew, kurtosis

    missing_list = []
    single_value_list = []
    index_list = ['Count', 'Missing (%)', 'Mean', 'Median', 'Min', 'Max', 'Skewness', 'Kurtosis']
    df_summary = pd.DataFrame(data=np.zeros((len(index_list), len(target_list))),
                              index=index_list, columns=target_list)

    for col in target_list:
        series = arg_df[col]
        df_summary.loc['Count', col] = series.count()
        df_summary.loc['Missing (%)', col] = series.isna().mean() * 100
        if series.count() == 0:
            missing_list.append(col)
            continue

        df_summary.loc['Mean', col] = series.mean()
        df_summary.loc['Median', col] = series.median()
        df_summary.loc['Min', col] = series.min()
        df_summary.loc['Max', col] = series.max()

        # A missing value counts as a distinct value here, so only columns
        # that are fully populated with one value are treated as constant.
        if series.nunique(dropna=False) == 1:
            single_value_list.append(col)
            continue

        # Drop NaNs first; otherwise skew/kurtosis propagate NaN.
        observed = np.asarray(series.dropna(), dtype=float)
        df_summary.loc['Skewness', col] = skew(observed)
        df_summary.loc['Kurtosis', col] = kurtosis(observed)

    df_summary = df_summary.T.sort_values(['Missing (%)', 'Skewness'], ascending=False)
    df_summary = df_summary[~df_summary.index.isin(single_value_list + missing_list)]
    df_summary.reset_index(inplace=True)
    df_summary.index = df_summary.index + 1
    df_summary.columns = ['Attribute'] + index_list

    print('SUMMARY OF {} NUMERICAL ATTRIBUTES:'.format(len(target_list)))
    if len(single_value_list) > 0:
        print('\n{} Columns with Single Value: \n{}'.format(len(single_value_list), single_value_list))
    if len(missing_list) > 0:
        print('\n{} Empty Columns: \n{}'.format(len(missing_list), missing_list))

    return df_summary

In [None]:
def _describe_column(name, series):
    """Return the data-description record (one table row) for a single column."""
    total = len(series)
    # A column with no rows is treated as empty instead of dividing by zero.
    missing_pct = 100.0 if total == 0 else (total - series.count()) / total * 100
    record = {
        'Attribute': name,
        # Stored as text so the cell holds a plain string, not a dtype object.
        'Datatype': str(series.dtypes),
        'Missing%': '{:.3f}'.format(missing_pct),
        'Unique': np.nan,
        'Remark': None,
    }

    if missing_pct == 100:
        record['Remark'] = 'Dropped because this column is empty'
        return record

    dtype = series.dtypes
    is_label = dtype == object or dtype == bool or str(dtype) == 'category'
    if is_label:
        n_unique = len(series.unique())
        record['Unique'] = n_unique
        if n_unique == 1:
            record['Remark'] = 'Dropped because this column has only single lable'
        else:
            top = series.mode()[0]
            share = (series == top).sum() / total * 100
            record['Remark'] = 'Frequent: {} ({:.3f} %)'.format(top, share)
    elif series.nunique(dropna=False) == 1:
        # Distinct-value count, not skew == 0: symmetric data also has zero skew.
        record['Unique'] = 1
        record['Remark'] = 'Dropped because this column has only single value'
    else:
        record['Remark'] = 'MAX: {:.3f} MIN: {:.3f} MEAN: {:.3f} STD: {:.3f}'.format(
            series.max(), series.min(), series.mean(), series.std())
    return record


def export_Data_Description(arg_df,**kwarg):
    """Write a per-column data description of ``arg_df`` to an Excel file.

    Each column gets its dtype, percentage of missing rows, distinct-value
    count (label columns and single-valued numeric columns only) and a
    remark: the most frequent label and its share for object/category/bool
    columns, max/min/mean/std for numeric columns, or the reason the column
    is considered droppable (empty or single-valued).

    Args:
        arg_df: DataFrame to describe.
        **kwarg: optional ``surfix`` giving the file-name suffix. When it is
            absent the current local time, formatted ``YYYY-MM-DD HHMMSS``,
            is used so consecutive exports get distinct, unambiguous names.

    Returns:
        None. The table is written to ``data_description_<suffix>.xlsx`` in
        the current working directory.
    """
    records = [_describe_column(c, arg_df[c]) for c in arg_df.columns]
    data_description = pd.DataFrame(
        records, columns=['Attribute', 'Datatype', 'Missing%', 'Unique', 'Remark'])
    data_description.index = data_description.index + 1

    if 'surfix' in kwarg:
        suffix = kwarg['surfix']
    else:
        import datetime
        # Zero-padded fields: un-padded hour/minute/second concatenation is
        # ambiguous (9:31:02 and 9:03:12 would both become '9312').
        suffix = datetime.datetime.now().strftime('%Y-%m-%d %H%M%S')
    data_description.to_excel('data_description_{}.xlsx'.format(suffix))

In [None]:
def rectify_to_category(arg_df,actual_col_list):
    """Cast columns of ``arg_df`` to the ``category`` dtype, in place.

    Every column whose dtype is not bool is converted, except the columns
    named in ``actual_col_list``, which are left untouched.

    Args:
        arg_df: DataFrame to modify in place.
        actual_col_list: names of the columns to leave as they are.

    Returns:
        None. The names of the converted columns are printed.
    """
    keep = set(actual_col_list)
    converted = [c for c in arg_df.columns
                 if arg_df[c].dtype != bool and c not in keep]

    for column in converted:
        # astype returns a new Series and takes no ``inplace`` argument, so
        # the result is assigned back to the column.
        arg_df[column] = arg_df[column].astype('category')

    if converted:
        print('Change Datatype of {} Column to Category : \n{}'.format(len(converted), converted))

In [None]:
def drop_unilable_column(arg_df):
    """Drop, in place, every column of ``arg_df`` that holds a single value.

    A column is single-valued when it has exactly one distinct value, with a
    missing value counting as a value (so a column that mixes one label with
    missing rows is kept). The same test is used for every dtype. Numeric
    columns are not tested with ``skew == 0``, because symmetric data such as
    1..5 also has zero skewness and would be dropped by mistake.

    Args:
        arg_df: DataFrame to modify in place.

    Returns:
        None. The names of the dropped columns are printed.
    """
    target_list = [c for c in arg_df.columns
                   if arg_df[c].nunique(dropna=False) == 1]

    if target_list:
        arg_df.drop(columns=target_list, inplace=True)
        print('Drop {} Columns with Single Label:\n{}'.format(len(target_list), target_list))
    else:
        print('No Columns with Single Label/Value')

In [None]:
def drop_empty_column(arg_df):
    """Remove, in place, every column of ``arg_df`` that has no non-null value.

    Prints the names of the removed columns, or a notice when there are none.
    """
    empties = [name for name in arg_df.columns if arg_df[name].count() == 0]

    if not empties:
        print('No Empty Column')
        return

    arg_df.drop(columns=empties, inplace=True)
    print('Delete {} Empty Column : \n{}'.format(len(empties), empties))

## 4) Basic Checks

## 4.1) Dataset - Transaction

In [None]:
transaction.info()

In [None]:
bool_list,object_list,cat_list,num_list=get_datatype(transaction)

In [None]:
transaction.head(3)

In [None]:
summary_object(transaction)

In [None]:
summary_numerical(transaction)

## 4.2) Dataset - Dispatched

In [None]:
dispatched.info()

In [None]:
bool_list,object_list,cat_list,num_list=get_datatype(dispatched)

In [None]:
dispatched.head(3)

In [None]:
summary_object(dispatched)

In [None]:
summary_numerical(dispatched)

## 5) Export Data Description

In [None]:
#export_Data_Description(transaction,surfix='transaction')
#export_Data_Description(dispatched,surfix='dispatched')

## 6) Data Cleaning

### 6.1) Drop columns
    6.1.1) Drop empty columns

In [None]:
drop_empty_column(transaction)

    6.1.2) Drop unilabel/univalue columns

In [None]:
drop_unilable_column(transaction)

## 6.2) Dataset - Transaction
    6.2.1) Fix Data Quality Issue in 'Fruit Size Code' 

In [None]:
# Number of distinct 'Fruit Size Code' values before the cast to string,
# followed by the values themselves (a bare `print` statement does nothing).
print(len(transaction['Fruit Size Code'].unique()))
transaction['Fruit Size Code'].unique().tolist()

In [None]:
transaction['Fruit Size Code']=transaction['Fruit Size Code'].astype('str')

In [None]:
print(len(transaction['Fruit Size Code'].unique()))
transaction['Fruit Size Code'].unique().tolist()

## 6.3) Dataset - Dispatched

In [None]:
drop_empty_column(dispatched)

In [None]:
drop_unilable_column(dispatched)

# 7) Convert to Time Series Data
Some of the attributes in this dataset contain datetime information, i.e.:
- Ok Until Date
- Pack Date
- Transaction Date
- Transaction Date Time

Since the focus of this project is on pallet transaction, this dataset will be converted into time series data based on Transaction Date Time.

In [None]:
transaction['Transaction Date Time']=pd.to_datetime(transaction['Transaction Date Time'],format='%d/%m/%Y %I:%M:%S %p')

In [None]:
transaction.set_index(keys='Transaction Date Time',inplace=True)

In [None]:
transaction.head(3)

## 7.1) Transaction
    7.1.1) Drop duplicated transactions

In [None]:
# Remove duplicated transaction rows and report how many rows were lost.
# NOTE(review): drop_duplicates compares column values only and ignores the
# index, which at this point is 'Transaction Date Time' - confirm that rows
# differing only in their timestamp are really meant to be dropped.
before_drop_duplicate=transaction.shape
transaction.drop_duplicates(inplace=True)
rows_before=before_drop_duplicate[0]
rows_after=transaction.shape[0]
print(f'Total number of transaction before dropping duplicate : {rows_before}')
print(f'Total number of transaction after dropping duplicate : {rows_after}')
print(f'Total number of transaction decrease by {(rows_before-rows_after)/rows_before*100:.2f} %')

In [None]:
# Remove duplicated rows from the dispatched dataset and report how many rows
# were lost. The messages name the dispatched dataset so this output cannot be
# mistaken for the transaction figures printed by the previous cell.
before_drop_duplicate2=dispatched.shape
dispatched.drop_duplicates(inplace=True)
print('Total number of dispatched records before dropping duplicate : {}'.format(before_drop_duplicate2[0]))
print('Total number of dispatched records after dropping duplicate : {}'.format(dispatched.shape[0]))
print('Total number of dispatched records decrease by {:.2f} %'.format(
    (before_drop_duplicate2[0]-dispatched.shape[0])/before_drop_duplicate2[0]*100))

## 7.2) Dispatched

Some of the attributes in this dataset contain datetime information, i.e.:

    - Ok Until Date
    - Pack Date
    - Loadout Date
    - Load Start Date

Since the focus of this project is on the loadout bay, this dataset will be converted into time series data based on Loadout Date.

In [None]:
dispatched['Loadout Date']=pd.to_datetime(dispatched['Loadout Date'],format='%d/%m/%Y %H:%M')

In [None]:
dispatched.sort_values(by=['Loadout Date','Order Number','Envelope Number'],inplace=True)

In [None]:
dispatched.set_index(keys='Loadout Date',inplace=True)

## 8) Exploratory Data Analysis

In [None]:
plt.style.available

In [None]:
# Newer matplotlib releases ship this style under the name 'seaborn-v0_8';
# use whichever of the two names this installation provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')

In [None]:
transaction.head(3)

In [None]:
Daily_Transaction=transaction['Pallet Number'].resample('D').count()

In [None]:
fig=plt.figure()
axes=fig.add_axes([0,0,6,4])
Daily_Transaction.plot(marker='o',ax=axes,fontsize=50,markersize=25)
axes.set_title('Daily Pallet Transaction',fontsize=60)

## Derive Attributes - Day_Shift, Shift_Hour, Shift_Date

In [None]:
transaction['Day_Shift']=(transaction.index.hour>=7) & (transaction.index.hour<19)

In [None]:
transaction['Shift_Hour']=transaction.index.hour

In [None]:
transaction['Shift_Date']=transaction.index.date

In [None]:
transaction['Shift_Date']=pd.to_datetime(transaction['Shift_Date'],format='%Y-%m-%d')

In [None]:
# Transactions made before 07:00 belong to the night shift that started the previous day, so move their shift date back by one day
transaction.loc[transaction['Shift_Hour']<7,'Shift_Date']=transaction.loc[transaction['Shift_Hour']<7,'Shift_Date']+datetime.timedelta(days=-1)

In [None]:
transaction.loc[:,'Shift_Date'].sample(5)

## Plot Daily Shift Transaction

In [None]:
daily_shift_transaction=transaction.groupby(['Shift_Date','Day_Shift'])['Pallet Number'].count().reset_index()

In [None]:
daily_shift_transaction['Day_Shift']=daily_shift_transaction['Day_Shift'].map({True:'Dayshift',False:'Nightshift'})

In [None]:
pv_daily_shift_transaction=pd.pivot(data=daily_shift_transaction,index='Shift_Date',columns='Day_Shift',values='Pallet Number')

In [None]:
pv_daily_shift_transaction['Allshift']=pv_daily_shift_transaction.sum(axis='columns')

In [None]:
# Daily transaction counts per shift, with the all-shift total as a shaded area.
plt.figure(figsize=(15,6))
plt.plot(pv_daily_shift_transaction['Dayshift'],label='Dayshift',marker='o',color='blue')
plt.plot(pv_daily_shift_transaction['Nightshift'],label='Nightshift',marker='X',markersize=10,color='black')
pv_daily_shift_transaction['Allshift'].plot.area(alpha=0.3,color='skyblue')
plt.legend(loc='upper right')
plt.xlabel('Date')
plt.ylabel('Transaction')
plt.title('Daily Shift Transaction')
# Called last so the layout accounts for the axis labels and the title.
plt.tight_layout()

## Destination

In [None]:
fig=plt.figure(figsize=(15,6))
axes=fig.add_axes([0,0,6,4])
transaction['New Value'].value_counts(dropna=False)[:15].plot(kind='bar',ax=axes,fontsize=60)
axes.set_title('Pallet New Location',fontsize=80)
axes.set_xlabel('New Location',fontsize=80)
axes.set_ylabel('Transaction',fontsize=80)
axes.set_ylim([0,7000])

In [None]:
# ['New Value']=='CONTR'
t_CONTR=transaction.loc[transaction['New Value']=='CONTR',:]

In [None]:
t_CONTR.groupby(t_CONTR.index.day)['Pallet Number'].count().plot()

In [None]:
daily_loadout=dispatched['Pallet Number'].resample('D').count()

In [None]:
daily_loadout['July 2019'].plot(kind='bar')

In [None]:
dispatched['Location Room Code'].value_counts(normalize=True,dropna=False).head().plot(kind='bar')

In [None]:
daily_loadout['July 2019'].sum()

## Pallet in Both Datasets

In [None]:
pallet_in_transaction=transaction['Pallet Number'].unique()
len(pallet_in_transaction)

In [None]:
pallet_in_dispatched=dispatched['Pallet Number'].unique()
len(pallet_in_dispatched)

In [None]:
# Pallets present in both datasets, in the order they appear in `dispatched`.
# The lookup set is built once: `in` on a NumPy array rescans the whole array
# for every dispatched pallet.
transaction_pallets=set(pallet_in_transaction)
pallet_in_both=[c for c in pallet_in_dispatched if c in transaction_pallets]

In [None]:
len(pallet_in_both)

## Handle Row with UNKNOWN value

In [None]:
transaction[transaction['Pallet Number']==59699489][['Pallet Number','Transaction Date','Username','Previous Value','New Value']]