# Summary

# Content:
    1) Import library
    2) Configuration
    3) Define Functions
    4) Basic Checks
    5) Export Data Description
    6) Data Cleaning
        6.1) Drop columns
            6.1.1) Drop empty columns
            6.1.2) Drop unilabel/univalue columns
        6.2) Transaction
            6.2.1) Fix Data Quality Issue in 'Fruit Size Code'
        6.3) Dispatched
    7) Convert to Time Series Data
        7.1) Transaction
            7.1.1) Drop duplicated transactions
        7.2) Dispatched

# Coding

## 1) Import library

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc

from scipy.stats import skew,kurtosis
import datetime

In [None]:
%matplotlib inline
# to view all columns
pd.set_option('display.max_columns',500)
plt.style.use('seaborn')

## 2) Configuration

In [None]:
# Locations of the two raw CSV exports.
filepath_1='C:/Users/Nan/Documents/GitHub_Data/TransactionDetails.csv'
filepath_2='C:/Users/Nan/Documents/GitHub_Data/Dispatched.csv'
# Path without the '.csv' extension. str.rstrip('.csv') is not used because it
# strips any trailing '.', 'c', 's' or 'v' characters rather than the suffix,
# which would turn 'TransactionDetails.csv' into 'TransactionDetail'.
filename_1=filepath_1[:-len('.csv')] if filepath_1.endswith('.csv') else filepath_1
filename_2=filepath_2[:-len('.csv')] if filepath_2.endswith('.csv') else filepath_2
transaction=pd.read_csv(filepath_1)
dispatched=pd.read_csv(filepath_2)

## 3) Define Functions

In [None]:
def get_datatype(arg_df):
    """Group the column names of ``arg_df`` by broad datatype and print the counts.

    Parameters
    ----------
    arg_df : pandas.DataFrame
        Dataset to inspect; it is not modified.

    Returns
    -------
    tuple of list
        ``(col_bool, col_object, col_category, col_number)``. Every column whose
        dtype is not bool, object or category is placed in ``col_number``.
    """
    col_bool=[]
    col_object=[]
    col_category=[]
    col_number=[]

    for col,datatype in arg_df.dtypes.items():
        if datatype==bool:
            col_bool.append(col)
        elif datatype==object:
            col_object.append(col)
        elif str(datatype)=='category':
            col_category.append(col)
        else:
            col_number.append(col)
    print('This dataset has {} Columns\nbool\t:{} \nobject\t:{}  \ncategory:{} \nnumeric\t:{} '
          .format(len(arg_df.columns),len(col_bool),len(col_object),len(col_category),len(col_number)))

    # No `del arg_df` / gc.collect() here: deleting the local name frees nothing
    # (the caller still holds the frame) and a forced collection only costs time.
    return col_bool,col_object,col_category,col_number

In [None]:
def summary_object(arg_df):
    """Summarise the object, category and bool columns of ``arg_df``.

    Prints how many columns of each kind were found, plus the names of the
    columns that hold a single label and of the columns that are entirely
    missing.

    Parameters
    ----------
    arg_df : pandas.DataFrame
        Dataset to describe; it is not modified.

    Returns
    -------
    pandas.DataFrame or None
        One row per non-numerical column that has more than one unique value
        (NaN counts as a value) and is not entirely missing, with the columns
        ``Attribute``, ``Count``, ``Unique``, ``Missing (%)``, ``Top``,
        ``Top (%)``, ``Bottom`` and ``Bottom (%)``. Percentages are relative to
        the total number of rows. Rows are sorted by ``Missing (%)`` then
        ``Unique`` in descending order and numbered from 1.
        ``None`` when ``arg_df`` has no object, category or bool column.
    """
    object_list=[c for c,t in arg_df.dtypes.items() if t==object]
    category_list=[c for c,t in arg_df.dtypes.items() if str(t)=='category']
    bool_list=[c for c,t in arg_df.dtypes.items() if t==bool]
    target_list=object_list+category_list+bool_list

    if not target_list:
        print('No Non-Numerical Attributes')
        return None

    index_list=['Count','Unique','Missing (%)','Top','Top (%)','Bottom','Bottom (%)']
    unilabel_list=[]
    missing_list=[]
    records=[]
    n_rows=len(arg_df)

    for col in target_list:
        series=arg_df[col]
        n_unique=len(series.unique())
        missing_pct=series.isna().mean()*100

        if n_unique<=1:
            unilabel_list.append(col)
        if missing_pct==100:
            missing_list.append(col)
        # Single-label and empty columns are only reported by name; skipping
        # them here also avoids indexing an empty value_counts() result for a
        # column that is entirely NaN.
        if n_unique<=1 or missing_pct==100:
            continue

        vc=series.value_counts()  # most frequent label first, NaN excluded
        records.append({
            'Attribute':col,
            'Count':series.count(),
            'Unique':n_unique,
            'Missing (%)':missing_pct,
            'Top':vc.index[0],
            'Top (%)':vc.iloc[0]/n_rows*100,
            'Bottom':vc.index[-1],
            'Bottom (%)':vc.iloc[-1]/n_rows*100,
        })

    # Building the frame from records keeps label and numeric cells in columns
    # of a suitable dtype instead of writing strings into a float-filled frame.
    df_summary=pd.DataFrame(records,columns=['Attribute']+index_list)
    df_summary=df_summary.sort_values(['Missing (%)','Unique'],ascending=False)
    df_summary=df_summary.reset_index(drop=True)
    df_summary.index=df_summary.index+1

    print('SUMMARY OF {} NON-NUMERICAL ATTRIBUTES:\n'.format(len(target_list)))
    if len(object_list)>0:
        print('{} Object Columns'.format(len(object_list)))
    if len(category_list)>0:
        print('{} Categorical Columns'.format(len(category_list)))
    if len(bool_list)>0:
        print('{} Bool Columns'.format(len(bool_list)))
    if len(unilabel_list)>0:
        print('\n{} Columns with Single Label : \n{}'.format(len(unilabel_list),unilabel_list))
    if len(missing_list)>0:
        print('\n{} Empty Columns: \n{}'.format(len(missing_list),missing_list))

    return df_summary
        
def summary_numerical(arg_df):
    """Summarise the numerical columns of ``arg_df``.

    Every column whose dtype is not object, bool or category is treated as
    numerical. Prints the number of such columns, plus the names of the
    columns that hold a single value and of the columns that are entirely
    missing.

    Parameters
    ----------
    arg_df : pandas.DataFrame
        Dataset to describe; it is not modified.

    Returns
    -------
    pandas.DataFrame or None
        One row per numerical column that is neither empty nor single-valued,
        with the columns ``Attribute``, ``Count``, ``Missing (%)``, ``Mean``,
        ``Median``, ``Min``, ``Max``, ``Skewness`` and ``Kurtosis``. Rows are
        sorted by ``Missing (%)`` then ``Skewness`` in descending order and
        numbered from 1. ``None`` when there is no numerical column.
    """
    from scipy.stats import skew,kurtosis

    target_list=[c for c,t in arg_df.dtypes.items()
                 if t!=object and t!=bool and str(t)!='category']
    if not target_list:
        print('No Numerical Attributes')
        return None

    index_list=['Count','Missing (%)','Mean','Median','Min','Max','Skewness','Kurtosis']
    missing_list=[]
    single_value_list=[]
    records=[]

    for col in target_list:
        series=arg_df[col]
        observed=series.dropna()
        if observed.empty:
            missing_list.append(col)
            continue
        # A constant column is detected by counting distinct values. Testing
        # skewness == 0 would also flag every symmetric column (e.g. 1, 2, 3).
        if series.nunique(dropna=False)==1:
            single_value_list.append(col)
            continue
        records.append({
            'Attribute':col,
            'Count':series.count(),
            'Missing (%)':series.isna().mean()*100,
            'Mean':series.mean(),
            'Median':series.median(),
            'Min':series.min(),
            'Max':series.max(),
            # scipy propagates NaN by default, so the shape statistics are
            # computed on the observed values only.
            'Skewness':float(skew(observed.to_numpy())),
            'Kurtosis':float(kurtosis(observed.to_numpy())),
        })

    df_summary=pd.DataFrame(records,columns=['Attribute']+index_list)
    df_summary=df_summary.sort_values(['Missing (%)','Skewness'],ascending=False)
    df_summary=df_summary.reset_index(drop=True)
    df_summary.index=df_summary.index+1

    print('SUMMARY OF {} NUMERICAL ATTRIBUTES:'.format(len(target_list)))
    if len(single_value_list)>0:
        print('\n{} Columns with Single Value: \n{}'.format(len(single_value_list),single_value_list))
    if len(missing_list)>0:
        print('\n{} Empty Columns: \n{}'.format(len(missing_list),missing_list))

    return df_summary

In [None]:
def export_Data_Description(arg_df,**kwarg):
    """Write a per-column description of ``arg_df`` to an Excel workbook.

    The workbook has one row per column of ``arg_df`` with the fields
    ``Attribute``, ``Datatype``, ``Missing%``, ``Unique`` and ``Remark``.
    The remark states why the column would be dropped (empty, single label,
    single value) or otherwise gives its most frequent label (non-numeric
    columns) or its max/min/mean/std (numeric columns).

    Parameters
    ----------
    arg_df : pandas.DataFrame
        Dataset to describe; it is not modified.
    **kwarg
        ``surfix`` (str, optional): suffix of the output file name
        ``data_description_<surfix>.xlsx``. When it is omitted the current
        timestamp is used instead.

    Returns
    -------
    None
        The result is written to the current working directory.
    """
    import datetime
    from pandas.api.types import is_bool_dtype,is_numeric_dtype

    n_rows=len(arg_df)
    records=[]
    for c in arg_df.columns:
        series=arg_df[c]
        # A frame without rows is reported as 100 % missing rather than
        # dividing by zero.
        missing_pct=series.isna().mean()*100 if n_rows else 100.0
        record={'Attribute':c,
                # stored as text so the value is always writable to Excel
                'Datatype':str(series.dtype),
                'Missing%':'{:.3f}'.format(missing_pct)}

        if series.count()==0:
            record['Remark']='Dropped because this column is empty'
        elif is_numeric_dtype(series) and not is_bool_dtype(series):
            # Count distinct values instead of testing skewness == 0, which is
            # also true for any symmetric column and NaN when values are missing.
            if series.nunique(dropna=False)==1:
                record['Unique']=1
                record['Remark']='Dropped because this column has only single value'
            else:
                record['Remark']='MAX: {:.3f} MIN: {:.3f} MEAN: {:.3f} STD: {:.3f}'.format(
                    series.max(),series.min(),series.mean(),series.std())
        else:
            # object, category, bool and datetime-like columns are described
            # by their labels
            n_unique=len(series.unique())
            record['Unique']=n_unique
            if n_unique==1:
                record['Remark']='Dropped because this column has only single lable'
            else:
                top=series.mode()[0]
                record['Remark']='Frequent: {} ({:.3f} %)'.format(
                    top,(series==top).sum()/n_rows*100)
        records.append(record)

    data_description=pd.DataFrame(
        records,columns=['Attribute','Datatype','Missing%','Unique','Remark'])
    data_description.index=data_description.index+1

    if 'surfix' in kwarg:
        label=kwarg['surfix']
    else:
        # zero-padded so that e.g. 1:23:04 and 12:30:04 cannot collide
        label=datetime.datetime.now().strftime('%Y-%m-%d %H%M%S')
    data_description.to_excel('data_description_{}.xlsx'.format(label))

In [None]:
def rectify_to_category(arg_df,actual_col_list):
    """Convert the non-bool columns of ``arg_df`` to ``category`` dtype in place.

    Parameters
    ----------
    arg_df : pandas.DataFrame
        Dataset to modify.
    actual_col_list : list
        Column names that must keep their current dtype.

    Returns
    -------
    None
        The converted columns are printed.
    """
    converted=[c for c in arg_df.columns
               if arg_df[c].dtype!=bool and c not in actual_col_list]
    for column in converted:
        # Series.astype returns a new Series and has no `inplace` argument,
        # so the result is assigned back to the frame.
        arg_df[column]=arg_df[column].astype('category')
    if converted:
        # report only the columns that were actually converted
        print('Change Datatype of {} Column to Category : \n{}'.format(len(converted),converted))

In [None]:
def drop_unilable_column(arg_df):
    """Drop, in place, every column of ``arg_df`` that holds one distinct value.

    NaN counts as a value, so a column with one label plus missing entries is
    kept, while a column that is entirely NaN is dropped.

    Parameters
    ----------
    arg_df : pandas.DataFrame
        Dataset to modify.

    Returns
    -------
    None
        The dropped columns are printed.
    """
    # Distinct values are counted for every dtype. Testing skewness == 0 for
    # numeric columns would also drop any symmetric column (e.g. 1, 2, 3) and
    # depends on how the installed SciPy treats constant input.
    target_list=[c for c in arg_df.columns if arg_df[c].nunique(dropna=False)==1]

    if target_list:
        arg_df.drop(columns=target_list,inplace=True)
        print('Drop {} Columns with Single Label:\n{}'.format(len(target_list),target_list))
    else:
        print('No Columns with Single Label/Value')

In [None]:
def drop_empty_column(arg_df):
    """Remove, in place, the columns of ``arg_df`` that contain no values.

    A column is empty when its non-missing count is zero. The outcome is
    printed and nothing is returned.
    """
    empty_cols=[name for name in arg_df.columns if arg_df[name].count()==0]
    if not empty_cols:
        print('No Empty Column')
        return
    arg_df.drop(columns=empty_cols,axis=1,inplace=True)
    print('Delete {} Empty Column : \n{}'.format(len(empty_cols),empty_cols))

In [None]:
def export_description(arg_df,str_1,str_2):
    """Write the counts of ``str_2`` values within each ``str_1`` group to CSV.

    Missing values are counted too and the counts are sorted in descending
    order. The output file is ``<str_2>.csv`` in the current working directory.
    """
    grouped=arg_df.groupby(str_1)[str_2]
    counts=grouped.value_counts(dropna=False,ascending=False)
    counts.to_frame().to_csv('{}.csv'.format(str_2))

## 4) Basic Checks

## 4.1) Dataset - Transaction

In [None]:
transaction.info()

In [None]:
bool_list,object_list,cat_list,num_list=get_datatype(transaction)

In [None]:
transaction.head(3)

In [None]:
summary_object(transaction)

In [None]:
summary_numerical(transaction)

## 4.2) Dataset - Dispatched

In [None]:
dispatched.info()

In [None]:
bool_list,object_list,cat_list,num_list=get_datatype(dispatched)

In [None]:
dispatched.head(3)

In [None]:
summary_object(dispatched)

In [None]:
summary_numerical(dispatched)

## 5) Export Data Description

In [None]:
#export_Data_Description(transaction,surfix='transaction')
#export_Data_Description(dispatched,surfix='dispatched')

## 6) Data Cleaning

### 6.1) Drop columns
    6.1.1) Drop empty columns

In [None]:
drop_empty_column(transaction)

    6.1.2) Drop unilabel/univalue columns

In [None]:
drop_unilable_column(transaction)

    6.1.3) Drop Column 'Transfer To Coolstore' and 'Transfer To Supplier' with 99% data missing

In [None]:
transaction['Transfer To Coolstore'].value_counts()

In [None]:
transaction.drop(columns='Transfer To Coolstore',axis='columns',inplace=True)

In [None]:
transaction.drop(columns='Transfer To Supplier',axis='columns',inplace=True)

## 6.2) Dataset - Transaction
    6.2.1) Fix Data Quality Issue in 'Fruit Size Code' 

In [None]:
print(len(transaction['Fruit Size Code'].unique()))
transaction['Fruit Size Code'].unique().tolist()

In [None]:
transaction['Fruit Size Code']=transaction['Fruit Size Code'].astype('str')

In [None]:
print(len(transaction['Fruit Size Code'].unique()))
transaction['Fruit Size Code'].unique().tolist()

## 6.3) Dataset - Dispatched

In [None]:
drop_empty_column(dispatched)

In [None]:
drop_unilable_column(dispatched)

In [None]:
dispatched.drop(columns='Last Ppqi Date',axis='columns',inplace=True)

## Export and delete Description

In [None]:
code=['Location Room Code','Storage Source Code','Pack Label Code','Pack Indicator Code','Clearance Protocol Code',
     'Customer Label Code','Dry Matter Code','Quality Inspection Indicator Code','Japan Sub Brand Code',
     'Trial Packing Indicator Code','Marketer Code','Brand Code','Variety Code','Fruit Size Code',
      'Pack Category Code','Purchase Pool Code','Maturity Indicator Code','Plant Code']

descrip=[x.replace('Code','Description') for x in code]

In [None]:
#for c,d in zip(code,descrip):
#    export_description(dispatched,c,d)

In [None]:
dispatched.drop(columns=descrip,axis='columns',inplace=True)

In [None]:
#export_description(dispatched,'Packhouse Code','Zespri Li Number')

In [None]:
#dispatched.groupby(['Doi Number','Doi Clearance Date'])['Australian Inspection Reference'].value_counts().to_frame().to_csv('DOI.csv')

In [None]:
dispatched.drop(columns=['Doi Clearance Date','Australian Inspection Reference','Zespri Li Number'],axis='columns',inplace=True)

## Combine Pallet Note Type and Pallet Note Text into a single flag, then drop them
- as both refer to a damaged pallet

In [None]:
# Boolean flag that is True when a pallet has both a note type and a note text.
# NOTE(review): `&` requires BOTH fields to be present; if a pallet with only
# one of them should also be flagged, `|` is needed — confirm the intent.
dispatched['Pallet Note']=dispatched['Pallet Note Type'].notnull() & dispatched['Pallet Note Text'].notnull()

In [None]:
dispatched.drop(columns=['Pallet Note Type','Pallet Note Text'],axis='columns',inplace=True)

## Drop 'OKUntil ISODate' because it is duplicate of 'Ok Until Date'

In [None]:
dispatched.drop(columns='OKUntil ISODate',axis='columns',inplace=True)

## Find Days difference between Loadout Date and other Date Attribute in dispatched

In [None]:
date_attribute=[c for c in dispatched.columns if 'Date' in c]

In [None]:
for c in date_attribute:
    dispatched[c]=pd.to_datetime(dispatched[c],format='%d/%m/%Y %H:%M')

In [None]:
date_attribute.remove('Loadout Date')
new_date_attribute=[c.replace(' ','_') for c in date_attribute]

In [None]:
# Express every remaining date column relative to 'Loadout Date'.
# The .dt accessors are used because casting with astype('timedelta64[D]') or
# astype('timedelta64[h]') is rejected by pandas >= 2.0. Both results are
# floored to whole units and are NaN where either date is missing.
for c,new_c in zip(date_attribute,new_date_attribute):
    if c!='Load Start Date':
        # whole days from loadout to the event
        dispatched[new_c+'_day']=(dispatched[c]-dispatched['Loadout Date']).dt.days
    else:
        # whole hours between load start and loadout
        dispatched[new_c+'_hour']=(dispatched['Loadout Date']-dispatched[c]).dt.total_seconds()//3600

In [None]:
dispatched.drop(columns=date_attribute,axis='columns',inplace=True)

In [None]:
dispatched['Load_Start_Date_hour']=-dispatched['Load_Start_Date_hour']

# 7) Convert to Time Series Data
Some of the attributes in this dataset contain datetime information, i.e.
- Ok Until Date
- Pack Date
- Transaction Date
- Transaction Date Time

Since the focus of this project is on pallet transaction, this dataset will be converted into time series data based on Transaction Date Time.

In [None]:
transaction['Transaction Date Time']=pd.to_datetime(transaction['Transaction Date Time'],format='%d/%m/%Y %I:%M:%S %p')

In [None]:
transaction_ori=transaction.copy()

In [None]:
transaction=transaction.set_index(keys='Transaction Date Time')

In [None]:
transaction.head(3)

## 7.1) Transaction
    7.1.1) Drop duplicated transactions

In [None]:
# Remove rows that are exact duplicates and report how many were removed.
# NOTE(review): drop_duplicates() compares column values only and ignores the
# index, which holds 'Transaction Date Time' at this point. Rows that differ
# only in that timestamp are therefore treated as duplicates — confirm that
# this is intended.
before_drop_duplicate=transaction.shape
transaction.drop_duplicates(inplace=True)
print('Total number of transaction before dropping duplicate : {}'.format(before_drop_duplicate[0]))
print('Total number of transaction after dropping duplicate : {}'.format(transaction.shape[0]))
print('Total number of transaction decrease by {:.2f} %'.format(
    (before_drop_duplicate[0]-transaction.shape[0])/before_drop_duplicate[0]*100))

In [None]:
before_drop_duplicate2=dispatched.shape
dispatched.drop_duplicates(inplace=True)
print('Total number of instances in Dispatched before dropping duplicate : {}'.format(before_drop_duplicate2[0]))
print('Total number of instances in Dispatched after dropping duplicate : {}'.format(dispatched.shape[0]))
print('Total number of instances in Dispatched decrease by {:.2f} %'.format(
    (before_drop_duplicate2[0]-dispatched.shape[0])/before_drop_duplicate2[0]*100))

    7.1.2) Drop row with 'Value Type'= NaN, then drop column 'Value Type'

In [None]:
transaction['Value Type'].value_counts(dropna=False)

In [None]:
transaction[transaction['Value Type'].isna()]['New Value'].value_counts(dropna=False)

In [None]:
transaction[transaction['Value Type'].isna()]['Previous Value'].value_counts(dropna=False)

In [None]:
transaction[transaction['Value Type'].isna()][['Pallet Number','Location Mission Request Destination','Room']]

In [None]:
transaction[transaction['Pallet Number']==57430039][['Pallet Number','Previous Value','New Value','Username','Value Type']]

In [None]:
transaction[transaction['Pallet Number']==56996581][['Previous Value','New Value','Value Type']]

In [None]:
transaction['drop_value_type']=(transaction['Value Type'].isna()) & (transaction['New Value'].isna())

In [None]:
transaction=transaction[transaction['drop_value_type']==False]

In [None]:
t_time=transaction[transaction['Value Type'].isna()]['Transaction Date']
t_pallet=transaction[transaction['Value Type'].isna()]['Pallet Number']
t_value=transaction[transaction['Value Type'].isna()]['New Value']

In [None]:
# For each row that has no 'Value Type', copy its 'New Value' to every row of
# the same pallet and transaction date.
for t,p,v in zip(t_time,t_pallet,t_value):
    # Assign the scalar v belonging to this (pallet, date) pair. Assigning the
    # whole t_value Series would be aligned on the index instead of using the
    # value of the current iteration.
    transaction.loc[(transaction['Pallet Number']==p) & (transaction['Transaction Date']==t),'New Value']=v

In [None]:
shape_ori=transaction.shape

In [None]:
transaction=transaction[transaction['Value Type'].notnull()]

In [None]:
transaction.drop(['Value Type','drop_value_type'],axis='columns',inplace=True)

In [None]:
print('This section delete {} row, the shape currently is {}.'.format(shape_ori[0]-transaction.shape[0],transaction.shape))

    7.1.2 (cont.) Drop rows duplicated on 'Pallet Number' and 'Transaction Date', keeping the last one

In [None]:
transaction.groupby('Pallet Number')['Username'].count().sort_values(ascending=False).head()

In [None]:
transaction[transaction.duplicated(subset=['Pallet Number','Transaction Date'])]['Pallet Number'].value_counts().head()

In [None]:
#transaction[transaction['Pallet Number']==59434813][['Previous Value','New Value','Location Mission Request Destination','Username']]

In [None]:
shape_ori=transaction.shape
transaction.drop_duplicates(subset=['Pallet Number','Transaction Date'],keep='last',inplace=True)
print('This section delete {} row, the shape currently is {}.'.format(shape_ori[0]-transaction.shape[0],transaction.shape))

    7.1.3) Replace Missing Data in Previous Value with UNKNOWN

In [None]:
#transaction[transaction['Previous Value'].isna()]['Pallet Number'].value_counts().head()
#transaction[transaction['Pallet Number']==60919422][['Previous Value','New Value']]
transaction.loc[transaction['Previous Value'].isna(),'Previous Value']='UNKNOWN'

## 7.2) Dispatched

Some of the attributes in this dataset contain datetime information,i.e.

    - Ok Until Date
    - Pack Date
    - Loadout Date
    - Load Start Date

Since the focus of this project is on loadout bay, this dataset will be converted into time series data based on Loadout Date

In [None]:
dispatched['Loadout Date']=pd.to_datetime(dispatched['Loadout Date'],format='%d/%m/%Y %H:%M')

In [None]:
dispatched.sort_values(by=['Loadout Date','Order Number','Envelope Number'],inplace=True)

In [None]:
dispatched_ori=dispatched.copy()

In [None]:
dispatched.set_index(keys='Loadout Date',inplace=True)

## 8) Exploratory Data Analysis

In [None]:
transaction.tail(3)

In [None]:
Daily_Transaction=transaction['Pallet Number'].resample('D').count()

In [None]:
fig=plt.figure()
axes=fig.add_axes([0,0,6,4])
Daily_Transaction.plot(marker='o',ax=axes,fontsize=50,markersize=25)
axes.set_title('Daily Pallet Transaction',fontsize=60)

## Derive Attribute - Shift, Shift_Date

In [None]:
transaction['Day_Shift']=(transaction.index.hour>=7) & (transaction.index.hour<19)

In [None]:
transaction['Shift_Hour']=transaction.index.hour

In [None]:
transaction['Shift_Date']=transaction.index.date

In [None]:
transaction['Shift_Date']=pd.to_datetime(transaction['Shift_Date'],format='%Y-%m-%d')

In [None]:
# handle shift date of night shift with transaction time over 12am
transaction.loc[transaction['Shift_Hour']<7,'Shift_Date']=transaction.loc[transaction['Shift_Hour']<7,'Shift_Date']+datetime.timedelta(days=-1)

In [None]:
transaction.loc[:,'Shift_Date'].sample(5)

## Plot Daily Shift Transaction

In [None]:
daily_shift_transaction=transaction.groupby(['Shift_Date','Day_Shift'])['Pallet Number'].count().reset_index()

In [None]:
daily_shift_transaction['Day_Shift']=daily_shift_transaction['Day_Shift'].map({True:'Dayshift',False:'Nightshift'})

In [None]:
pv_daily_shift_transaction=pd.pivot(data=daily_shift_transaction,index='Shift_Date',columns='Day_Shift',values='Pallet Number')

In [None]:
pv_daily_shift_transaction['Allshift']=pv_daily_shift_transaction.sum(axis='columns')

In [None]:
pv_daily_shift_transaction.head()

In [None]:
plt.figure(figsize=(15,6))
plt.plot(pv_daily_shift_transaction['Dayshift'],label='Dayshift',marker='o',color='blue')
plt.plot(pv_daily_shift_transaction['Nightshift'],label='Nightshift',marker='X',markersize=10,color='black')
pv_daily_shift_transaction['Allshift'].plot.area(alpha=0.3,color='skyblue')
plt.legend(loc='upper right')
plt.tight_layout()
plt.xlabel('Date')
plt.ylabel('Transaction')
plt.title('Daily Shift Transaction')

In [None]:
transaction['Shift_Date']=transaction['Shift_Date'].astype('object')

## Plot Distribution of Total Transactions per Pallet

In [None]:
total_transaction_pallet=pd.pivot_table(data=transaction,index='Pallet Number'
                                        ,columns='Username',values='Transaction Date'
                                       ,aggfunc='count').sum(axis='columns')

In [None]:
total_transaction_pallet.sort_values(ascending=False).head(5)

In [None]:
# Histogram of the number of transactions per pallet.
plt.figure(figsize=(12,6))
total_transaction_pallet.plot.hist(bins=40,title='Distribution of Total Transactions per Pallet',alpha=0.6)
plt.xlabel('Transactions per Pallet')
plt.xlim([0,80])

# Overlay the cumulative distribution on a secondary y-axis.
plt.twinx()
# `normed` was removed from Matplotlib's hist(); `density=True` is its replacement.
total_transaction_pallet.plot.hist(bins=40,title='Distribution of Total Transactions per Pallet',cumulative=True,color='red',alpha=0.3,density=True)

## New Location

In [None]:
fig=plt.figure(figsize=(15,6))
axes=fig.add_axes([0,0,6,4])
transaction['New Value'].value_counts(dropna=False)[:15].plot(kind='bar',ax=axes,fontsize=60)
axes.set_title('Pallet New Location',fontsize=80)
axes.set_xlabel('New Location',fontsize=80)
axes.set_ylabel('Transaction',fontsize=80)
axes.set_ylim([0,7000])

In [None]:
# ['New Value']=='CONTR'
t_CONTR=transaction.loc[transaction['New Value']=='CONTR',:]

In [None]:
plt.figure(figsize=(15,6))
t_CONTR.groupby(t_CONTR.index.day)['Pallet Number'].count().plot(kind='bar')
plt.ylabel('Transaction with New Value = CONTR')
plt.title('Daily Transaction with New Value = CONTR')

## Location Request Number

In [None]:
#No Relationship between Room and Location Mission Request Destination
# Room = Location Room Code-Location Row Code-Location Column-Location Height
transaction[transaction['Location Mission Request Destination']=='SPQI'][['Room','Location Room Code','Location Row Code','Location Column','Location Height','Location Request Number',
 'Location Mission Request Destination']].sample(10)

In [None]:
transaction['Location Request Number'].value_counts().head()

In [None]:
# total pallet associate with this rqt number is 20, is it possible request number is linked to specified order number?
transaction[transaction['Location Request Number']==1815]['Pallet Number'].unique()

In [None]:
rqt_number_destination=transaction.groupby('Location Mission Request Destination')['Location Request Number'].value_counts().unstack()
rqt_number_destination=rqt_number_destination.T
rqt_number_destination['Total Transaction']=rqt_number_destination.sum(axis='columns')
rqt_number_destination.head(3)

In [None]:
f, axes = plt.subplots(1, 2, figsize=(12, 4))
sns.lineplot(x=rqt_number_destination.index,y='Total Transaction',data=rqt_number_destination,ax=axes[0])
sns.boxplot(y='Total Transaction',data=rqt_number_destination,ax=axes[1])
plt.ylim([0,1000])

In [None]:
# upon checking, request number with exceptional high transaction are likely valid because there are hundreds of pallet linked to these request number
rqt_number_destination.sort_values(by='Total Transaction',ascending=False).head()

In [None]:
len(transaction[transaction['Location Request Number']==1388]['Pallet Number'].unique())

## Plot Total Transactions of each Mission

In [None]:
total_transc_destination=transaction.pivot_table(index='Location Request Number',columns='Location Mission Request Destination',
                        values='Pallet Number',aggfunc='count')

In [None]:
colors=['skyblue','cyan','red','green','blue','orange','violet','grey','olive']

# One box plot per request destination, side by side.
# squeeze=False + ravel() keeps `axes` a 1-D array even for a single destination.
f, axes=plt.subplots(1, len(total_transc_destination.columns), figsize=(16, 4), squeeze=False)
axes=axes.ravel()

for ind,c in enumerate(total_transc_destination.columns):
    counts=total_transc_destination[c].dropna()
    # The palette is cycled so that more than len(colors) destinations do not
    # raise an IndexError; each plot is drawn on the axes created above.
    counts.plot.box(color=colors[ind%len(colors)],
                    title='Total:{}'.format(counts[counts>0].sum()),
                    ax=axes[ind])

plt.tight_layout()

In [None]:
# there is only single line in DA1 because there is only one request number attached to DA1
total_transc_destination[total_transc_destination['DA1'].notnull()]

In [None]:
summary_object(transaction)

## Location ended with symbol '---' in Previous Value and Room

In [None]:
t_p_value=transaction['Previous Value'].value_counts().sort_values(ascending=False)
t_n_value=transaction['New Value'].value_counts().sort_values(ascending=False)

In [None]:
t_p_value=t_p_value.to_frame()
t_n_value=t_n_value.to_frame()

In [None]:
t_p_value.reset_index(inplace=True)
t_n_value.reset_index(inplace=True)

In [None]:
pd.merge(left=t_p_value,right=t_n_value,left_on=t_p_value.index,right_on=t_n_value.index).head()

In [None]:
del t_p_value,t_n_value

In [None]:
transaction['Previous Value']=transaction['Previous Value'].apply(lambda x:str(x).split('---')[0] if '---' in str(x) else x)

## Replace Missing Data in Location Row Code, Column and Height with na

In [None]:
# Location Row Code and Column are NaN because Location Room Code is UNKNOWN
# the other reason these data is NaN because the Room Code is FW.CONTR etc.
# transaction[['Pallet Number','Location Room Code','Location Column','Location Row Code','Location Height']].sample(5)

In [None]:
transaction.loc[transaction['Location Row Code'].isna(),'Location Row Code']='na'

In [None]:
transaction.loc[transaction['Location Column'].isna(),'Location Column']='na'

In [None]:
transaction.loc[transaction['Location Height'].isna(),'Location Height']='na'

In [None]:
transaction[transaction['Location Height'].isna()][['Location Room Code','Location Row Code',
                                                    'Location Column','Location Height']].head()

## Replace Doi Number with True/ False

In [None]:
transaction['DOI_Number']=transaction['Doi Number'].notna()

In [None]:
transaction.drop(columns='Doi Number',axis='columns',inplace=True)

## Replace missing data in 'Location Mission Request Destination' with 'na'

In [None]:
transaction.loc[transaction['Location Mission Request Destination'].isna(),'Location Mission Request Destination']='na'

In [None]:
transaction['Location Mission Request Destination'].value_counts()

## Replace missing data in Location Room Code with UNKNOWN

In [None]:
transaction[transaction['Location Room Code'].isna()][['Room','Location Room Code',
                                                       'Location Column','Location Row Code',
                                                       'Previous Value','New Value']].head()

In [None]:
transaction[transaction['Location Room Code']=='UNKNOWN'][['Room','Location Room Code',
                                                       'Location Column','Location Row Code',
                                                       'Previous Value','New Value']].head()

In [None]:
transaction.loc[transaction['Location Room Code'].isna(),'Location Room Code']='UNKNOWN'

## Handle '---' in transaction['Room']

In [None]:
transaction['Room'].value_counts().head()

In [None]:
transaction.loc[transaction['Room']=='---','Room']='UNKNOWN'

In [None]:
# Keep only the part of 'Room' that precedes '---'. The membership test is done
# on str(x) so that NaN or other non-string values pass through unchanged
# instead of raising a TypeError.
transaction['Room']=transaction['Room'].apply(lambda x : str(x).split('---')[0] if  '---' in str(x) else x)

## =========No more missing data in transactional dataset=========

In [None]:
# Dispatched

In [None]:
dispatched.head(3)

In [None]:
summary_object(dispatched)

In [None]:
summary_numerical(dispatched)

## Replace Missing data in Row Code & Column with na

In [None]:
#[['Location Room Code','Location Row Code','Location Column','Location Height','Location Request Number']]

In [None]:
dispatched[dispatched['Location Row Code'].isna()]['Location Room Code'].value_counts().head()

In [None]:
dispatched.loc[dispatched['Location Row Code'].isna(),['Location Row Code','Location Column','Location Height']]='na'

In [None]:
dispatched.loc[dispatched['Location Room Code'].isna(),'Location Room Code']='UNKNOWN'

## Replace missing data in 'Container' and 'Container Number' with na, represent chartered

In [None]:
dispatched.loc[dispatched['Container'].isna(),'Shipment Type Code'].value_counts()

In [None]:
dispatched.loc[dispatched['Container'].notnull(),'Shipment Type Code'].value_counts()

In [None]:
dispatched.loc[dispatched['Container'].isna(),['Container','Container Number']]='na'

## Replace missing data in 'Location Request Number' and 'Location Mission Request Destination' with na

In [None]:
dispatched[dispatched['Location Request Number'].isna()]['Location Mission Request Destination'].unique()

In [None]:
dispatched.loc[dispatched['Location Request Number'].isna(),['Location Request Number','Location Mission Request Destination']]='na'

## Replace missing data in Repack_Date_day,Tkl_Email_Date_day,Last_Spqi_Date_day with 0

In [None]:
dispatched.head(2)

In [None]:
dispatched.loc[dispatched['Repack_Date_day'].isna(),'Repack_Date_day']=0

In [None]:
dispatched.loc[dispatched['Tkl_Email_Date_day'].isna(),'Tkl_Email_Date_day']=0
dispatched.loc[dispatched['Last_Spqi_Date_day'].isna(),'Last_Spqi_Date_day']=0

## dispatched : filter out a subset of important attributes

In [None]:
dispatched_short=['Pallet Number',
'Location Room Code', 'Location Row Code','Location Column', 'Location Height',
'Loadout Priority',
'Purchase Pool Code',
'Pack Code','Pack Type Code', 'Stacking Configuration Code',
'Order Number', 'Order Line Number','Envelope Number',
'Container', 'Container Number',
'Shipment Type Code', 'Destination Port Code', 'Trucking Company Code',
'Location Request Number','Location Mission Request Destination','Pallet Note',
 'Repack_Date_day',
 'Last_Spqi_Date_day',
 'Pack_Date_day',
 'Ok_Until_Date_day',
 'Load_Start_Date_hour',
 'Tkl_Email_Date_day']

In [None]:
dispatched_s=dispatched[dispatched_short]

In [None]:
summary_numerical(dispatched_s)

In [None]:
dispatched_s.shape

## Merge transaction with dispatched_s

In [None]:
dispatched_s.reset_index(inplace=True)

In [None]:
transaction.reset_index(inplace=True)

In [None]:
merge_dispatch=transaction.merge(dispatched_s,on='Pallet Number')
merge=transaction.merge(dispatched_s,on='Pallet Number',how='left')

In [None]:
merge_dispatch.to_csv('merge_dispatch.csv',index=False)
merge.to_csv('merge.csv',index=False)

In [None]:
# Number of distinct pallets in the inner join of transaction and dispatched_s.
# `merge_dispatch_only` is never defined in this notebook; `merge_dispatch` is
# the inner-join frame created above.
len(merge_dispatch['Pallet Number'].unique())

In [None]:
merge_dispatch['Location Mission Request Destination_x'].value_counts(dropna=False)

In [None]:
summary_object(transaction)

In [None]:
daily_loadout=dispatched[['Pallet Number']].resample('D').count()

In [None]:
daily_loadout_july=daily_loadout.loc['July 2019',:]

In [None]:
daily_loadout_july.plot(kind='bar')

In [None]:
[c for c in dispatched.columns if 'Date' in c]

In [None]:
# Line plot of the number of pallets loaded out per day in July 2019.
plt.figure(figsize=(15,6))
plt.plot(daily_loadout_july['Pallet Number'],marker='o',markerfacecolor='r')
plt.title('Daily Loadout in July 2019',fontsize=20)
plt.ylabel('Pallet Quantity')
# The July slice is reused here: selecting rows with frame['July 2019'] is no
# longer supported by pandas 2.x (row selection by date string needs .loc).
x_tick_value=daily_loadout_july.index.tolist()
x_tick_label=daily_loadout_july.index.date.tolist()

In [None]:
plt.figure(figsize=(15,6))
dispatched['Location Room Code'].value_counts(normalize=False,dropna=False).head(10).plot(kind='bar')
plt.xlabel('Location Room Code')
plt.ylabel('Transaction')
plt.title('Top 10 Location Room Code of Dispatched Pallet')

In [None]:
# Total pallets loaded out in July 2019. Rows are selected through .loc because
# frame['July 2019'] is treated as a column lookup by pandas 2.x.
daily_loadout.loc['July 2019'].sum()

## Pallet in Both Datasets

In [None]:
pallet_in_transaction=transaction['Pallet Number'].unique()
len(pallet_in_transaction)

In [None]:
pallet_in_dispatched=dispatched['Pallet Number'].unique()
len(pallet_in_dispatched)

In [None]:
# Pallets present in both datasets, in the order they appear in dispatched.
# The set is built once so each membership test is O(1) instead of a scan of
# the whole transaction array.
_transaction_pallets=set(pallet_in_transaction)
pallet_in_both=[c for c in pallet_in_dispatched if c in _transaction_pallets]

In [None]:
len(pallet_in_both)

In [None]:
transaction['Location Height'].value_counts()