In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc

from scipy.stats import skew,kurtosis
import datetime

In [None]:
%matplotlib inline
# to view all columns
pd.set_option('display.max_columns',500)
plt.style.use('seaborn')

In [None]:
def summary_object(arg_df):
    """Summarise the object, category and bool columns of a DataFrame.

    Prints how many columns of each kind were found, plus the names of columns
    holding a single label and of columns that are entirely missing.

    Parameters
    ----------
    arg_df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame or None
        One row per non-numerical column that has more than one distinct value
        (a missing value counts as a value) and is not 100% missing, sorted by
        'Missing (%)' then 'Unique' in descending order and indexed from 1.
        Columns: 'Attribute', 'Count', 'Unique', 'Missing (%)', 'Top',
        'Top (%)', 'Bottom', 'Bottom (%)'. The percentages are relative to the
        total number of rows. None is returned (after printing a message) when
        the frame has no object/category/bool column.
    """
    object_list=[]
    category_list=[]
    bool_list=[]
    unilabel_list=[]
    missing_list=[]

    for c in arg_df.columns:
        if arg_df[c].dtypes==object:
            object_list.append(c)
        elif str(arg_df[c].dtypes)=='category':
            category_list.append(c)
        elif arg_df[c].dtypes==bool:
            bool_list.append(c)

    target_list=object_list+category_list+bool_list
    if len(target_list)==0:
        print('No Non-Numerical Attributes')
        return None

    index_list=['Count','Unique','Missing (%)','Top','Top (%)','Bottom','Bottom (%)']
    n_rows=len(arg_df)

    # Collect one record per column and build the frame once. This avoids writing
    # strings into a float-initialised frame, which newer pandas rejects.
    records=[]
    for col in target_list:
        series=arg_df[col]
        vc=series.value_counts()
        n_unique=len(series.unique())
        missing_pct=series.isna().mean()*100
        record={'Attribute':col,'Count':series.count(),'Unique':n_unique,
                'Missing (%)':missing_pct,'Top':np.nan,'Top (%)':np.nan,
                'Bottom':np.nan,'Bottom (%)':np.nan}
        # value_counts() is empty for an all-missing column, so guard the lookup
        # instead of failing before the column can be reported as empty.
        if len(vc)>0:
            record['Top']=vc.index[0]
            record['Top (%)']=vc.iloc[0]/n_rows*100
        if n_unique>1:
            record['Bottom']=vc.index[-1]
            record['Bottom (%)']=vc.iloc[-1]/n_rows*100
        else:
            unilabel_list.append(col)
        if missing_pct==100:
            missing_list.append(col)
        records.append(record)

    df_summary=pd.DataFrame(records,columns=['Attribute']+index_list)
    df_summary=df_summary.sort_values(['Missing (%)','Unique'],ascending=False)
    # Single-label and fully missing columns are only listed in the printed report.
    df_summary=df_summary[(df_summary['Unique']>1) & (df_summary['Missing (%)']!=100)]
    df_summary=df_summary.reset_index(drop=True)
    df_summary.index=df_summary.index+1

    print('SUMMARY OF {} NON-NUMERICAL ATTRIBUTES:\n'.format(len(target_list)))
    if len(object_list)>0:
        print('{} Object Columns'.format(len(object_list)))
    if len(category_list)>0:
        print('{} Categorical Columns'.format(len(category_list)))
    if len(bool_list)>0:
        print('{} Bool Columns'.format(len(bool_list)))
    if len(unilabel_list)>0:
        print('\n{} Columns with Single Label : \n{}'.format(len(unilabel_list),unilabel_list))
    if len(missing_list)>0:
        print('\n{} Empty Columns: \n{}'.format(len(missing_list),missing_list))

    return df_summary
        
def summary_numerical(arg_df):
    """Summarise the numerical columns of a DataFrame.

    Prints the number of numerical columns, plus the names of columns holding a
    single distinct value and of columns that are entirely missing.

    Parameters
    ----------
    arg_df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame or None
        One row per numerical column that has at least two distinct non-missing
        values, sorted by 'Missing (%)' then 'Skewness' in descending order and
        indexed from 1. Columns: 'Attribute', 'Count', 'Missing (%)', 'Mean',
        'Median', 'Min', 'Max', 'Skewness', 'Kurtosis'. All statistics are
        computed on the non-missing values. None is returned (after printing a
        message) when the frame has no numerical column.
    """
    target_list=[]
    missing_list=[]
    single_value_list=[]

    for c in arg_df.columns:
        datatype=arg_df[c].dtypes
        # Only true numeric dtypes qualify: datetime/timedelta columns are not
        # object/bool/category either, but skew() cannot be computed on them.
        if pd.api.types.is_numeric_dtype(datatype) and not pd.api.types.is_bool_dtype(datatype):
            target_list.append(c)

    if len(target_list)==0:
        print('No Numerical Attributes')
        return None

    index_list=['Count','Missing (%)','Mean','Median','Min','Max','Skewness','Kurtosis']
    records=[]
    for col in target_list:
        series=arg_df[col]
        record={'Attribute':col,'Count':float(series.count()),
                'Missing (%)':series.isna().mean()*100,
                'Mean':0.0,'Median':0.0,'Min':0.0,'Max':0.0,'Skewness':0.0,'Kurtosis':0.0}
        # skew()/kurtosis() return NaN as soon as one value is missing, so they
        # are fed the non-missing values only.
        values=series.dropna().to_numpy(dtype=float)
        if values.size==0:
            missing_list.append(col)
        else:
            record['Mean']=series.mean()
            record['Median']=series.median()
            record['Min']=series.min()
            record['Max']=series.max()
            if series.nunique()==1:
                # Detect constant columns directly: a skewness of zero also occurs
                # for any symmetric distribution, and SciPy's result for constant
                # input differs between versions.
                single_value_list.append(col)
            else:
                record['Skewness']=skew(values)
                record['Kurtosis']=kurtosis(values)
        records.append(record)

    df_summary=pd.DataFrame(records,columns=['Attribute']+index_list)
    df_summary=df_summary.sort_values(['Missing (%)','Skewness'],ascending=False)
    # Constant and fully missing columns are only listed in the printed report.
    df_summary=df_summary[~df_summary['Attribute'].isin(single_value_list+missing_list)]
    df_summary=df_summary.reset_index(drop=True)
    df_summary.index=df_summary.index+1

    print('SUMMARY OF {} NUMERICAL ATTRIBUTES:'.format(len(target_list)))
    if len(single_value_list)>0:
        print('\n{} Columns with Single Value: \n{}'.format(len(single_value_list),single_value_list))
    if len(missing_list)>0:
        print('\n{} Empty Columns: \n{}'.format(len(missing_list),missing_list))

    return df_summary

In [None]:
#filepath_1='C:/Users/Nan/Documents/GitHub_Data/merge.csv'
filepath_2='C:/Users/Nan/Documents/GitHub_Data/merge_dispatch.csv'
#filename_1=filepath_1.rstrip('.csv')
# str.rstrip('.csv') removes any trailing '.', 'c', 's' or 'v' characters rather
# than the literal suffix (e.g. 'docs.csv' would become 'do'), so cut the
# extension off explicitly.
filename_2=filepath_2[:-len('.csv')] if filepath_2.endswith('.csv') else filepath_2
#merge=pd.read_csv(filepath_1)
merge_dispatch=pd.read_csv(filepath_2)

In [None]:
# Histogram of how many 'Pallet Number' entries are counted for each 'Order Number'.
pallets_per_order=merge_dispatch.groupby('Order Number')['Pallet Number'].count()
plt.figure(figsize=(15,6))
ax=pallets_per_order.plot(kind='hist',bins=50)
ax.set_title('Distribution of Total Pallet of Order Number',fontsize=20)
ax.set_xlabel('Quantity of Order Number',fontsize=12)

In [None]:
# Parse the transaction timestamps (expected as 'YYYY-MM-DD HH:MM:SS') into datetimes.
merge_dispatch['Transaction Date Time']=pd.to_datetime(merge_dispatch['Transaction Date Time'],format='%Y-%m-%d %H:%M:%S')

In [None]:
# Move the timestamp column into the index, in place. After this cell the column no
# longer exists, so re-running this cell or the parsing cell raises KeyError.
merge_dispatch.set_index(keys='Transaction Date Time',inplace=True)

In [None]:
# Take the last 'New Value' of every pallet (in current row order) and show the
# most frequent of those final values.
merge_dispatch.groupby('Pallet Number')['New Value'].last().value_counts().head()

In [None]:
# How many rows repeat an earlier ('Transaction Date', 'Pallet Number') pair (True) versus not (False).
merge_dispatch[['Transaction Date','Pallet Number']].duplicated().value_counts()

In [None]:
#pd.pivot_table(data=merge_dispatch,index='Transaction Date',
#               columns='Pallet Number',values='New Value',aggfunc='max',fill_value=)

In [None]:
# Frequency of each destination; missing values are counted as their own bucket.
merge_dispatch['Location Mission Request Destination_x'].value_counts(dropna=False)

In [None]:
# Summary table of the object/category/bool columns.
summary_object(merge_dispatch)

In [None]:
# Summary table of the numerical columns.
summary_numerical(merge_dispatch)

In [None]:
# Four random pallet numbers; no random_state is given, so the sample changes on every run.
merge_dispatch['Pallet Number'].sample(4)

In [None]:
# Number of non-missing destination values for each (order, pallet) pair.
merge_dispatch.groupby(['Order Number','Pallet Number'])['Location Mission Request Destination_x'].count()

In [None]:
# Spot check: selected columns of every row recorded for pallet 59717008.
merge_dispatch[merge_dispatch['Pallet Number']==59717008][['Location Mission Request Destination_x','Previous Value','New Value','Loadout Date','Location Room Code_y']]

In [None]:
# Spot check: the same view for pallet 59756595, plus its container and envelope.
merge_dispatch[merge_dispatch['Pallet Number']==59756595][['Location Mission Request Destination_x','Previous Value','New Value','Loadout Date','Location Room Code_y','Container','Envelope Number']]

In [None]:
# All rows whose 'Container' equals 8548.0.
merge_dispatch[merge_dispatch['Container']==8548.0]

In [None]:
# Pack type frequencies per container number, for rows whose 'Container' is not the string 'na'.
# NOTE(review): 'Container' is compared with the number 8548.0 in the previous cell but with
# the string 'na' here; if the column is numeric this filter keeps every row - confirm which
# placeholder marks a missing container.
merge_dispatch[merge_dispatch['Container']!='na'].groupby('Container Number')['Pack Type Code'].value_counts()

## What is the Order Structure?
    - An order with a container number goes on a container vessel; an order without a container number goes on a chartered vessel.
    - A single order number can have several containers.
    - All pallets with the same envelope number are loaded out at the same loadout time, except for 4 envelope numbers that have 2 loadout dates. These 4 envelopes are loaded out together at the second loadout under no container number, together with envelope number 193023, which has 3 pallets. This makes up 56 pallets in total.

In [None]:
# Report the number of distinct values (a missing value counts as one value) of
# each key that describes the order structure.
for template,column in (('Unique element in Order Number :{}','Order Number'),
                        ('Unique element in Container :{}','Container'),
                        ('Unique element in Container Number :{}','Container Number'),
                        ('Unique element in Envelope Number :{}','Envelope Number')):
    print(template.format(len(merge_dispatch[column].unique())))

In [None]:
# Row count per loadout date within each (container, container number, envelope) group.
merge_dispatch.groupby(['Container','Container Number','Envelope Number'])['Loadout Date'].value_counts()

In [None]:
# Every envelope number maps to a single loadout date, except four envelopes that
# have two loadout dates: 193024, 193025, 193026 and 193027.
merge_dispatch.groupby(['Envelope Number'])['Loadout Date'].value_counts()

In [None]:
'''these envelope with 2 loadout date has no container id, and all shipped out at same time, suggest it go to chartered?'''

# Print the loadout-date breakdown of each two-date envelope, separated by blank lines.
envelope_numbers=[193024,193025,193026,193027]
for position,envelope_number in enumerate(envelope_numbers):
    if position>0:
        print('')
    envelope_rows=merge_dispatch[merge_dispatch['Envelope Number']==envelope_number]
    print(envelope_rows.groupby(['Container','Container Number','Envelope Number'])['Loadout Date'].value_counts())

In [None]:
# Pallet count per (container, envelope) among rows whose 'Loadout Date' equals the
# string '2019-08-02 14:45:00' (the column is compared as text here).
merge_dispatch[merge_dispatch['Loadout Date']=='2019-08-02 14:45:00'].groupby(['Container','Envelope Number'])['Pallet Number'].count()

In [None]:
# Loadout dates recorded for envelope 193023.
merge_dispatch[merge_dispatch['Envelope Number']==193023]['Loadout Date'].value_counts()

## Which envelopes are loaded out each day?

In [None]:
# Row count per envelope number within each (loadout date, container, container number) group.
merge_dispatch.groupby(['Loadout Date','Container','Container Number'])['Envelope Number'].value_counts()

## Why are there pallets with a loadout date before July 2019, although the dispatch data was merged onto transaction data that starts from July 2019?

In [None]:
old_loadout=[193256,193189,193243,193342,197087,197157,193017,197251,197230,197268,197336,197367]

In [None]:
merge_dispatch[merge_dispatch['Envelope Number'].apply(lambda x: x in old_loadout)]

# --- These rows cannot be deleted, as that would reduce the total number of pallets moved and processed ------
# --- Only delete them when analysing the data from the loadout perspective

## Delete transactions with a loadout date before or after July 2019

#merge_dispatch['Loadout Date']=pd.to_datetime(merge_dispatch['Loadout Date'],format='%Y-%m-%d %H:%M:%S')
#merge_dispatch[merge_dispatch['Loadout Date'].dt.month!=7]
#merge_dispatch=merge_dispatch[merge_dispatch['Loadout Date'].dt.month==7]

## Delete transactions with a loadout date before 15 July 2019
    - this is because there is currently no transaction data for June, so pallets shipped out before 15 July will not have full transaction-data visibility
    
#merge_dispatch=merge_dispatch[merge_dispatch['Loadout Date'].dt.day>=15]

## For every pallet, how many transactions are needed for each destination?
    - Not the same as --> How many transactions happen in each process every day?

In [None]:
#test=merge_dispatch.loc[merge_dispatch['Location Mission Request Destination_x']!='na'].groupby('Location Mission Request Destination_x')['Pallet Number'].value_counts()
# Number of rows per pallet within each destination (Series indexed by destination, then pallet).
test=merge_dispatch.groupby('Location Mission Request Destination_x')['Pallet Number'].value_counts()
test.head()

In [None]:
# Reshape to one row per pallet and one column per destination; (pallet, destination)
# pairs that never occur become NaN.
f_test=test.unstack(level=1).T
f_test.sample(5)

In [None]:
# Pallets that have at least one row with the 'LAB' destination.
f_test[f_test['LAB'].notnull()]

## Distribution of Total Transition in each Process

In [None]:
# Overlaid histograms of the per-pallet row counts, one translucent series per destination column.
f_test.plot(kind='hist',alpha=0.3,bins=50,figsize=(16,7))

In [None]:
# One histogram panel per destination column of f_test, stacked vertically.
destinations=list(f_test.columns)
plt.figure(figsize=(16,36))
for panel_no,dest in enumerate(destinations,start=1):
    dest_ax=plt.subplot(len(destinations),1,panel_no)
    f_test[dest].plot(kind='hist',alpha=0.3,bins=40,ax=dest_ax)
    dest_ax.set_title(dest,fontsize=20)
    dest_ax.set_xlabel('Transition Quantity')
    dest_ax.set_ylabel('Pallet Quantity')
    # pin the left edge at zero and keep the automatically chosen right edge
    dest_ax.set_xlim(0,dest_ax.get_xlim()[1])
plt.tight_layout()

## What are the Mission Request Destinations of the transactions each day?
- not the same as --> How many transactions are there in each department each day?

In [None]:
# Daily count of non-missing 'Pallet Number' values per destination. resample('D')
# works on the 'Transaction Date Time' index set earlier; reset_index turns the
# result into a long table.
test=merge_dispatch.groupby('Location Mission Request Destination_x')['Pallet Number'].resample('D').count().reset_index()

In [None]:
# Number of daily rows produced for each destination.
test['Location Mission Request Destination_x'].value_counts()

In [None]:
#test[test['Location Mission Request Destination_x']=='RPK']
#test[test['Location Mission Request Destination_x']=='FW']
#test[test['Location Mission Request Destination_x']=='SPQI']

In [None]:
# Wide layout: one row per day, one column per destination.
formatted_test=test.pivot(index='Transaction Date Time',columns='Location Mission Request Destination_x',values='Pallet Number')

In [None]:
formatted_test.head()

In [None]:
# Stacked bar chart of the daily counts, one colour per destination.
formatted_test.plot(kind='bar',stacked=True,figsize=(17,6))

## For each pallet, when is the last transaction in each Process

In [None]:
#checking following result
#merge_dispatch[merge_dispatch['Pallet Number']==53188675][['Previous Value','New Value','Loadout Date','Location Mission Request Destination_x']]

In [None]:
# Last recorded row of every (pallet, destination) pair, in current row order.
test=merge_dispatch.reset_index()
# The columns are selected with a list: selecting several columns from a GroupBy with
# a bare tuple ([... 'a','b' ...]) was deprecated in pandas 1.0 and fails on pandas 2.x.
test=test.groupby(['Pallet Number','Location Mission Request Destination_x'])[['Transaction Date Time','Previous Value','New Value','Loadout Date']].last()
test.head(10)

## For each pallet, what is the first and last transaction in each Process

In [None]:
# First and last recorded row of every (pallet, destination) pair, in current row order.
test=merge_dispatch.reset_index()
# The columns are selected with a list: selecting several columns from a GroupBy with
# a bare tuple was deprecated in pandas 1.0 and fails on pandas 2.x.
test_first=test.groupby(['Pallet Number','Location Mission Request Destination_x'])[['Transaction Date Time','Previous Value','New Value']].first()
test_last=test.groupby(['Pallet Number','Location Mission Request Destination_x'])[['Transaction Date Time','Previous Value','New Value']].last()

In [None]:
test_first.head()

In [None]:
#test_first=test_first.reset_index(level='Location Mission Request Destination_x',col_level=1)
#test_last=test_last.reset_index(level='Location Mission Request Destination_x')

In [None]:
test_first.head()

In [None]:
test_last.head()

In [None]:
# Put the first and last rows of each (pallet, destination) pair side by side. The join
# key is passed as an array (the index of test_first), so the merged frame carries it
# in a 'key_0' column, which later cells read.
test_merge=test_first.merge(test_last,on=test_first.index,suffixes=('_first', '_last'))

In [None]:
test_merge.head()

## For each pallet, what is the duration in each process

In [None]:
test_merge['Transaction_duration_per_process']=test_merge['Transaction Date Time_last']-test_merge['Transaction Date Time_first']

In [None]:
test_merge['duration_per_process_sec']=test_merge['Transaction_duration_per_process'].astype('timedelta64[s]')

In [None]:
test_merge.sample(10)

## What is the distribution of duration_per_process

In [None]:
test_duration=test_merge[['key_0','duration_per_process_sec']]
test_duration.head()

In [None]:
for idx,col in enumerate(['Pallet Number','Location Mission Request Destination_x']):
    test_duration[col]=test_duration['key_0'].apply(lambda x:x[idx])

In [None]:
test_duration.drop(columns='key_0',axis='columns',inplace=True)
test_duration.head()

In [None]:
test_duration['duration_per_process_sec']=test_duration['duration_per_process_sec']/(60*60*24)

In [None]:
# Box plot of the duration per destination. After the conversion cell above the y
# column holds days, despite its '_sec' name.
sns.boxplot(data=test_duration,x='Location Mission Request Destination_x',y='duration_per_process_sec')

In [None]:
# Wide layout with one row per pallet. No values= is given, so the column labels are
# (value column, destination) pairs; the plotting cell below uses the second element.
pv_test_duration=pd.pivot(data=test_duration,index='Pallet Number',columns='Location Mission Request Destination_x')

In [None]:
# One horizontal box plot per column of pv_test_duration, stacked vertically.
plt.figure(figsize=(16,20))
for idx,c in enumerate(pv_test_duration.columns):
    panel_ax=plt.subplot(len(pv_test_duration.columns),1,idx+1)
    pv_test_duration[c].plot(kind='box',vert=False,color='red',ax=panel_ax)
    # hide the single category tick and label the panel with the destination instead
    # (c is a (value column, destination) pair)
    panel_ax.set_yticks([])
    panel_ax.set_ylabel(c[1])
    panel_ax.set_xlabel('Duration (days)')
plt.tight_layout()

## What is the transaction history of the pallet sorted according to pallet number

In [None]:
pallet=merge_dispatch['Pallet Number']

In [None]:
pallet.last('M')

## What is the last position of each pallet in transaction data

In [None]:
# Last recorded row of every pallet, in current row order.
test=merge_dispatch.reset_index()
# The columns are selected with a list: selecting several columns from a GroupBy with
# a bare tuple was deprecated in pandas 1.0 and fails on pandas 2.x.
test=test.groupby('Pallet Number')[['Transaction Date Time','Previous Value','New Value','Loadout Date']].last()
test.head()

In [None]:
# Last recorded row of pallet 53188620 (test is indexed by pallet number at this point).
test.loc[53188620]

In [None]:
# Full destination / value history of pallet 53191163.
merge_dispatch.loc[merge_dispatch['Pallet Number']==53191163,['Pallet Number','Location Mission Request Destination_x','Previous Value','New Value']]

In [None]:
# Row count for every (timestamp, previous value, pallet) combination.
test=merge_dispatch.reset_index()
test=test.groupby(['Transaction Date Time','Previous Value','Pallet Number'])['Pallet Number'].count()

In [None]:
test.head()

In [None]:
# Spread the 'Previous Value' level (level 1) into columns.
test.unstack(level=1)

In [None]:
# Distinct 'Previous Value' entries.
merge_dispatch['Previous Value'].unique()

In [None]:
df_position=pd.DataFrame(data=merge_dispatch['Previous Value'].unique(),columns=['Position'])

In [None]:
df_position['Position'].apply(lambda c:c.split('-')[2] if '-' in c else c)