Updates:

- 20 Oct 8pm-10pm : started to code for sitemap generation
- 21 Oct 1pm-3pm : generate sitemap


In [None]:
# Reference query time, written as a 'YYYY-MM-DD HH:MM:SS' string.
q_time='2019-07-10 07:00:00'

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc

from scipy.stats import skew,kurtosis
import datetime
import re
import fnmatch

In [None]:
%matplotlib inline
# Show up to 500 columns when a DataFrame is displayed.
pd.set_option('display.max_columns',500)
# NOTE(review): the 'seaborn' style name is deprecated in recent matplotlib
# releases (renamed 'seaborn-v0_8') - confirm against the installed version.
plt.style.use('seaborn')

In [None]:
def summary_object(arg_df):
    """Summarise the non-numerical (object, category and bool) columns of ``arg_df``.

    For each such column the summary records the non-null count, the number of
    distinct values (NaN counts as one value), the percentage of missing rows,
    and the most / least frequent label together with their share of all rows.

    Columns that carry a single label (optionally alongside NaN) and columns
    that are entirely missing are listed on stdout. Columns with only one
    distinct value, or with 100 % missing rows, are left out of the returned
    table. For single-label columns that stay in the table, ``Bottom`` and
    ``Bottom (%)`` keep their initial value of 0.

    Parameters
    ----------
    arg_df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame or None
        One row per retained column (index starting at 1) with the columns
        ``Attribute, Count, Unique, Missing (%), Top, Top (%), Bottom,
        Bottom (%)``, sorted by ``Missing (%)`` then ``Unique``, descending.
        ``None`` when the frame has no non-numerical column.
    """
    object_list = []
    category_list = []
    bool_list = []
    for c in arg_df.columns:
        dtype = arg_df[c].dtypes
        if dtype == object:
            object_list.append(c)
        elif str(dtype) == 'category':
            category_list.append(c)
        elif dtype == bool:
            bool_list.append(c)

    target_list = object_list + category_list + bool_list
    if not target_list:
        print('No Non-Numerical Attributes')
        return None

    index_list = ['Count', 'Unique', 'Missing (%)', 'Top', 'Top (%)', 'Bottom', 'Bottom (%)']
    # Allocate one column per summarised attribute (not only the object ones)
    # and use object dtype so that label strings can be stored next to numbers.
    df_summary = pd.DataFrame(
        data=np.zeros((len(index_list), len(target_list))),
        index=index_list,
        columns=target_list,
    ).astype(object)

    unilabel_list = []
    missing_list = []
    n_rows = len(arg_df)

    for col in target_list:
        vc = arg_df[col].value_counts().reset_index()
        n_unique = len(arg_df[col].unique())
        missing_pct = arg_df[col].isna().mean() * 100

        df_summary.loc['Count', col] = arg_df[col].count()
        df_summary.loc['Unique', col] = n_unique
        df_summary.loc['Missing (%)', col] = missing_pct

        # value_counts() is empty for a fully missing column; there is no
        # top/bottom label to report in that case.
        has_labels = len(vc) > 0
        if has_labels:
            df_summary.loc['Top', col] = vc.iloc[0, 0]
            df_summary.loc['Top (%)', col] = vc.iloc[0, 1] / n_rows * 100

        # A second real label exists when there are more than two distinct
        # values, or exactly two and none of them is NaN.
        if has_labels and (n_unique > 2 or (n_unique == 2 and missing_pct == 0)):
            df_summary.loc['Bottom', col] = vc.iloc[-1, 0]
            df_summary.loc['Bottom (%)', col] = vc.iloc[-1, 1] / n_rows * 100
        else:
            unilabel_list.append(col)

        if missing_pct == 100:
            missing_list.append(col)

    df_summary = df_summary.T.sort_values(['Missing (%)', 'Unique'], ascending=False)
    df_summary = df_summary[(df_summary['Unique'] > 1) & (df_summary['Missing (%)'] != 100)]
    df_summary.reset_index(inplace=True)
    df_summary.index = df_summary.index + 1
    df_summary.columns = ['Attribute'] + index_list

    print('SUMMARY OF {} NON-NUMERICAL ATTRIBUTES:\n'.format(len(target_list)))
    if len(object_list) > 0:
        print('{} Object Columns'.format(len(object_list)))
    if len(category_list) > 0:
        print('{} Categorical Columns'.format(len(category_list)))
    if len(bool_list) > 0:
        print('{} Bool Columns'.format(len(bool_list)))
    if len(unilabel_list) > 0:
        print('\n{} Columns with Single Label : \n{}'.format(len(unilabel_list), unilabel_list))
    if len(missing_list) > 0:
        print('\n{} Empty Columns: \n{}'.format(len(missing_list), missing_list))

    return df_summary
'============================================================='        
def summary_numerical(arg_df):
    """Summarise the numerical columns of ``arg_df``.

    A column is treated as numerical when its dtype is not object, bool,
    category or ``datetime64[ns]``. For each one the summary holds the non-null
    count, the percentage of missing rows, mean, median, min, max, skewness
    and kurtosis. Skewness and kurtosis are computed on the non-null values
    only, so a column with gaps still gets a finite figure.

    Columns with a single distinct non-null value and columns that are
    entirely missing are listed on stdout and left out of the returned table.
    Single-valued columns are detected by counting distinct values, not by a
    zero skewness, so symmetric columns are kept.

    Parameters
    ----------
    arg_df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame or None
        One row per retained column (index starting at 1) with the columns
        ``Attribute, Count, Missing (%), Mean, Median, Min, Max, Skewness,
        Kurtosis``, sorted by ``Missing (%)`` then ``Skewness``, descending.
        ``None`` when the frame has no numerical column.
    """
    from scipy.stats import skew, kurtosis

    target_list = []
    for c in arg_df.columns:
        datatype = arg_df[c].dtypes
        if (datatype != object and datatype != bool
                and str(datatype) != 'category'
                and str(datatype) != 'datetime64[ns]'):
            target_list.append(c)

    if not target_list:
        print('No Numerical Attributes')
        return None

    index_list = ['Count', 'Missing (%)', 'Mean', 'Median', 'Min', 'Max', 'Skewness', 'Kurtosis']
    df_summary = pd.DataFrame(data=np.zeros((len(index_list), len(target_list))),
                              index=index_list, columns=target_list)

    missing_list = []
    single_value_list = []

    for col in target_list:
        column = arg_df[col]
        df_summary.loc['Count', col] = column.count()
        df_summary.loc['Missing (%)', col] = column.isna().mean() * 100

        if column.count() == 0:
            missing_list.append(col)
            continue

        df_summary.loc['Mean', col] = column.mean()
        df_summary.loc['Median', col] = column.median()
        df_summary.loc['Min', col] = column.min()
        df_summary.loc['Max', col] = column.max()

        observed = column.dropna()
        if observed.nunique() == 1:
            # Shape statistics are undefined for a constant column; the column
            # is reported below and excluded from the table.
            single_value_list.append(col)
            continue

        values = observed.to_numpy(dtype=float)
        df_summary.loc['Skewness', col] = skew(values)
        df_summary.loc['Kurtosis', col] = kurtosis(values)

    df_summary = df_summary.T.sort_values(['Missing (%)', 'Skewness'], ascending=False)
    excluded = set(single_value_list) | set(missing_list)
    df_summary = df_summary[~df_summary.index.isin(excluded)]
    df_summary.reset_index(inplace=True)
    df_summary.index = df_summary.index + 1
    df_summary.columns = ['Attribute'] + index_list

    print('SUMMARY OF {} NUMERICAL ATTRIBUTES:'.format(len(target_list)))
    if len(single_value_list) > 0:
        print('\n{} Columns with Single Value: \n{}'.format(len(single_value_list), single_value_list))
    if len(missing_list) > 0:
        print('\n{} Empty Columns: \n{}'.format(len(missing_list), missing_list))

    return df_summary
'==================================================================='        
def drop_unilable_column(arg_df):
    """Drop, in place, the columns of ``arg_df`` that carry a single label or value.

    * object / category / bool columns are dropped when they have exactly one
      distinct value, or two distinct values of which one is NaN;
    * other non-``datetime64[ns]`` columns are dropped when they hold exactly
      one distinct non-null value.

    Numeric columns are tested by counting distinct values. A zero skewness is
    not used as the test because symmetric data also has zero skewness and
    would be dropped by mistake. Entirely missing numeric columns are left
    alone here.

    Parameters
    ----------
    arg_df : pandas.DataFrame
        Frame to clean; modified in place.

    Returns
    -------
    None
        The names of the dropped columns are printed.
    """
    object_list = []
    number_list = []
    for c in arg_df.columns:
        dtype = arg_df[c].dtypes
        if dtype == object or str(dtype) == 'category' or dtype == bool:
            object_list.append(c)
        elif str(dtype) != 'datetime64[ns]':
            number_list.append(c)

    target_list = []
    for c in object_list:
        n_unique = len(arg_df[c].unique())
        if n_unique == 1:
            target_list.append(c)
        elif n_unique == 2 and arg_df[c].isna().mean() > 0:
            target_list.append(c)

    for c in number_list:
        if arg_df[c].nunique(dropna=True) == 1:
            target_list.append(c)

    if target_list:
        arg_df.drop(columns=target_list, inplace=True)
        print('Drop {} Columns with Single Label:\n{}'.format(len(target_list), target_list))
    else:
        print('No Columns with Single Label/Value')
'===================================================================' 
def drop_empty_column(arg_df):
    """Remove, in place, every column of ``arg_df`` that has no non-null value.

    Prints the names of the removed columns, or a notice when none is empty.
    """
    empty_cols=[name for name in arg_df.columns if arg_df[name].count()==0]
    if not empty_cols:
        print('No Empty Column')
        return
    arg_df.drop(columns=empty_cols,axis=1,inplace=True)
    print('Delete {} Empty Column : \n{}'.format(len(empty_cols),empty_cols))
'==================================================================='        
def drop_columns(arg_df,column_names):
    """Drop ``column_names`` from ``arg_df`` in place and print what was removed."""
    arg_df.drop(columns=column_names,axis='columns',inplace=True)
    report='Drop {} columns : \n{}'.format(len(column_names),column_names)
    print(report)
'============================================================='        
def extract_room_row(arg_df,col_position):
    '''Return the sorted unique Room-Row keys of a Room-Row-Column-Height position column.

    Each value is cut down to its first two hyphen-separated parts; values
    without a hyphen are kept unchanged. The result is a NumPy array.
    '''
    def _room_row(position):
        if '-' not in position:
            return position
        pieces=position.split('-')
        return pieces[0]+'-'+pieces[1]

    keys=arg_df[col_position].apply(_room_row)
    unique_keys=pd.DataFrame(keys.unique().tolist(),columns=['Unique_Row'])
    return unique_keys.sort_values(by='Unique_Row')['Unique_Row'].values
'============================================================='  
def generate_dif_columns(arg_df,left_column,left_sffx,right_sffx):
    """Add one boolean ``dif_<name>`` column per pair of suffixed columns.

    The common name is the part of each entry of ``left_column`` before
    ``left_sffx``. A flag is True when the left and right values differ and the
    right value is not missing. The new column name has the spaces of the
    common name removed. ``arg_df`` is modified in place.
    """
    common_title=[name.split(left_sffx)[0] for name in left_column]
    print('There are {} common columns : \n{}'.format(len(left_column),common_title))

    for title in common_title:
        left_values=arg_df[title+left_sffx]
        right_values=arg_df[title+right_sffx]
        flag_name='dif_'+title.replace(' ','')
        arg_df[flag_name]=(left_values!=right_values) & (right_values.notna())
    print('\nColumns Generated : {}'.format(len(common_title)))
'==================================================================='        
def find_time_dif_hour(arg_df,ref_date,proc_date):
    """Add, for each date column, the whole hours elapsed since ``ref_date``.

    For every name in ``proc_date`` a new column ``<name>_hour`` is written to
    ``arg_df`` (spaces in the name replaced by underscores). It holds
    ``arg_df[name] - arg_df[ref_date]`` expressed in hours, rounded down, as
    floats; missing dates give NaN.

    The hours are obtained by dividing by a one-hour ``Timedelta`` and flooring.
    Casting with ``astype('timedelta64[h]')`` is not used because pandas 2.x
    rejects that resolution.

    Parameters
    ----------
    arg_df : pandas.DataFrame
        Frame holding the datetime columns; modified in place.
    ref_date : str
        Name of the reference datetime column.
    proc_date : list of str
        Names of the datetime columns to compare with ``ref_date``.
    """
    one_hour=pd.Timedelta(hours=1)
    for c in proc_date:
        new_name=c.replace(' ','_')+'_hour'
        elapsed=arg_df[c]-arg_df[ref_date]
        arg_df[new_name]=np.floor(elapsed/one_hour)

# 1) Import 2 datasets : trans and disp

In [None]:
# Folder holding the two input CSV files.
path='C:/Users/Nan/Documents/GitHub_Data/'
file_1=path+'p_transaction_2.csv'
file_2=path+'p_dispatched_2.csv'

# Aliases used by the read_csv calls further down.
filename_1=file_1
filename_2=file_2

# Suffix and process constants; not referenced elsewhere in the visible cells.
sffx_transaction='_Trsc'
sffx_dispatched='_Dptch'
target_process='FW'

In [None]:
filename_1

In [None]:
# Load the transaction file into `trans` and the dispatched file into `disp`.
trans=pd.read_csv(filename_1)
disp=pd.read_csv(filename_2)

In [None]:
trans.shape

In [None]:
trans.head(2)

In [None]:
disp.shape

In [None]:
disp.head(2)

### Data processing so that common columns share the same set of data labels
    - Fruit Size Code_Trsc: change to string
    - Purchase Pool Code_Trsc: truncate the 0 at the first position
    - Date columns (incl. Ok_Until_Date_day, Pack_Date_day): incorrect calculation because they refer to a difference (dif); they should remain in datetime

In [None]:
# Cast 'Fruit Size Code' to string in both datasets so the labels compare equal.
trans['Fruit Size Code']=trans['Fruit Size Code'].astype('str')
disp['Fruit Size Code']=disp['Fruit Size Code'].astype('str')

In [None]:
# Remove the '0' at the first position of each pool code so both datasets use
# the same label. Slicing off the first character keeps any later zeros intact
# ('0102' -> '102'); splitting on '0' would return only '1'.
trans['Purchase Pool Code']=trans['Purchase Pool Code'].apply(lambda x:x[1:] if x.startswith('0') else x)
disp['Purchase Pool Code']=disp['Purchase Pool Code'].apply(lambda x:x[1:] if x.startswith('0') else x)

In [None]:
# Earlier variant, kept for reference, also removed 'Ok_Until_Date_day':
#drop_columns(disp,['Pack_Date_day', 'Ok_Until_Date_day'])
# Remove 'Pack_Date_day' from the dispatched data (in place).
drop_columns(disp,['Pack_Date_day'])

In [None]:
# Parse 'Ok Until Date' as datetimes in both datasets. The next cell keeps only
# the date part, so values on the same day but at different times compare equal.
trans['Ok Until Date']=pd.to_datetime(trans['Ok Until Date'],format='%Y-%m-%d %H:%M:%S')
disp['Ok Until Date']=pd.to_datetime(disp['Ok Until Date'],format='%Y-%m-%d %H:%M:%S')

In [None]:
# Keep only the calendar date of 'Ok Until Date', discarding the time of day.
trans['Ok Until Date']=trans['Ok Until Date'].dt.date
disp['Ok Until Date']=disp['Ok Until Date'].dt.date

In [None]:
## 12 Oct : comment out this bcs found out Packed Trays is 0 in disp
# disp.rename(index=str,columns={'Packed Trays':'Trays'},inplace=True)

In [None]:
# Parse the transaction timestamps so they can be compared and aggregated as datetimes.
trans['Transaction Date Time']=pd.to_datetime(trans['Transaction Date Time'],format='%Y-%m-%d %H:%M:%S')

## 2) Preprocess transactional data

In [None]:
trans.head(10)

### Add Query Time
df=trans.copy()
df['query_Time']=q_time
list_time=[c for c in list(df.columns) if 'Time' in c]
list_time
for c in list_time:
    df[c]=pd.to_datetime(df[c],format='%Y-%m-%d %H:%M:%S')
trans_prev=df.loc[df['Transaction Date Time']<df['query_Time']]
trans_prev.shape
trans_prev.head()

In [None]:
# First 'Transaction Date Time' and 'Pallet Number' recorded for each 'Previous Value'.
# Several columns must be selected with a list (double brackets); the bare
# tuple form is no longer accepted by pandas.
temp_df=trans.groupby('Previous Value')[['Transaction Date Time','Pallet Number']].first()

In [None]:
# First 'Transaction Date Time' and 'Pallet Number' recorded for each 'New Value'.
# Several columns must be selected with a list (double brackets); the bare
# tuple form is no longer accepted by pandas.
temp_df_2=trans.groupby('New Value')[['Transaction Date Time','Pallet Number']].first()

In [None]:
# Turn the group keys ('Previous Value' / 'New Value') back into ordinary
# columns so they can be used as merge keys.
temp_df.reset_index(inplace=True)
temp_df_2.reset_index(inplace=True)

In [None]:
# Outer-join the per-'Previous Value' rows with the per-'New Value' rows on the
# value itself. Columns present on both sides get the '_From' (left) and
# '_To' (right) suffixes.
df_merge=temp_df.merge(temp_df_2,how='outer',left_on='Previous Value',right_on='New Value',suffixes=('_From', '_To'))
df_merge.sample(5)

In [None]:
# Keep rows whose '_From' transaction time is earlier than the '_To' time, plus
# rows that have no '_To' time at all; every other row is excluded.
df=df_merge.loc[(df_merge['Transaction Date Time_From']<df_merge['Transaction Date Time_To']) |
             df_merge['Transaction Date Time_To'].isna(),:]

In [None]:
df.shape

### Check Duplicates

In [None]:
df[df['Pallet Number_From'].duplicated()]

In [None]:
# Earliest 'Transaction Date Time' of each pallet (Series indexed by 'Pallet Number').
df_pallet_first_trsn=trans[['Pallet Number','Transaction Date Time']].groupby('Pallet Number')['Transaction Date Time'].min()

In [None]:
# Convert the Series to a two-column frame ('Pallet Number', 'Transaction Date Time').
df_pallet_first_trsn=df_pallet_first_trsn.to_frame().reset_index()

In [None]:
df_pallet_first_trsn.head()

In [None]:
# Remember the shape before the merge so the number of removed rows can be reported.
temp=df.shape
# Inner join: keep only the rows whose '_From' time and pallet match that
# pallet's earliest transaction.
df=df.merge(df_pallet_first_trsn,how='inner',
         left_on=['Transaction Date Time_From','Pallet Number_From'],
        right_on=['Transaction Date Time','Pallet Number'])
print('Total Row Rectified : {}'.format(temp[0]-df.shape[0]))

In [None]:
df.sample(10)

In [None]:
# Keep the position, the pallet and its transaction time, then rename
# 'Previous Value' to 'Location'. `index=str` also converts the row labels to strings.
df=df[['Previous Value','Pallet Number_From','Transaction Date Time_From']]
df.rename(index=str,columns={'Previous Value':'Location'},inplace=True)

In [None]:
# Mark every row currently in df as occupied. Locations added later by the
# outer merge with df_Location have no value here and are filled with 0.
df['isOccupied']=1

### Cross-check against the complete set of positions

In [None]:
# Distinct positions seen as the previous value and as the new value of a transaction.
temp_list=trans['Previous Value'].unique()
temp_list_2=trans['New Value'].unique()

In [None]:
len(temp_list)

In [None]:
# Concatenate both arrays; duplicates are removed in the next step.
temp_list=np.append(temp_list,temp_list_2)

In [None]:
# De-duplicate the positions seen on either side of a transaction.
unique_position=set(temp_list)

In [None]:
# Report how many distinct positions appear in the transaction data.
print('Total Unique Position : {}'.format(len(unique_position)))

In [None]:
# Convert the set to a list so it can be filtered and passed to a DataFrame.
unique_position=list(unique_position)

In [None]:
# Keep only the positions that contain a hyphen. Non-string entries (e.g. NaN
# from a missing value) are skipped, because `'-' in c` raises TypeError on them.
unique_position_hyphen=[c for c in unique_position if isinstance(c,str) and '-' in c]

In [None]:
# One-column frame listing every hyphenated position.
df_Location=pd.DataFrame(columns=['Location'],data=unique_position_hyphen)

In [None]:
# Outer join so that positions without a row in df are added with missing values.
df=df.merge(df_Location,how='outer',on='Location')

In [None]:
# Positions added by the outer merge have no pallet: flag them as unoccupied.
# The result is assigned back to the column; an in-place fillna on a column
# selection does not reliably update df.
df['isOccupied']=df['isOccupied'].fillna(0)
df['Pallet Number_From']=df['Pallet Number_From'].fillna(0)

### Keep Only Locations Containing a Hyphen

In [None]:
# Keep only the rows whose 'Location' contains a hyphen, then show the new shape.
df=df.loc[df['Location'].str.contains('-'),:]
df.shape

### Derive Room Code

In [None]:
# Split each location on '-' into a list of parts (temporary helper column).
df['Location_List']=df['Location'].str.split('-')

In [None]:
# Spread the first four parts of the location into separate columns.
# NOTE(review): a location with fewer than four hyphen-separated parts raises
# IndexError here - confirm all locations follow Room-Row-Column-Height.
df['Room']=df['Location_List'].apply(lambda x:x[0])
df['Row']=df['Location_List'].apply(lambda x:x[1])
df['Column']=df['Location_List'].apply(lambda x:x[2])
df['Height']=df['Location_List'].apply(lambda x:x[3])

In [None]:
# The helper column is no longer needed once the parts are extracted.
drop_columns(df,['Location_List'])

In [None]:
df.sample(5)

In [None]:
df['Room'].value_counts(dropna=False).sort_index()

In [None]:
# Order the rows by Room, Row, Column, Height and renumber the index from 0.
df=df.sort_values(by=['Room','Row','Column','Height']).reset_index(drop=True)

In [None]:
# Drop the '_From' suffix from the pallet and time columns. `index=str` also
# converts the row labels to strings.
df.rename(index=str,columns={'Pallet Number_From':'Pallet Number',
                            'Transaction Date Time_From':'Transaction Date Time'},inplace=True)

In [None]:
df.head()

### Summary

In [None]:
# Number of positions flagged as empty (0) and as occupied (1).
d_empty=len(df[df['isOccupied']==0])
d_occupied=len(df[df['isOccupied']==1])
# Earlier variant based on an 'isEmpty' column, kept for reference:
#d_empty=len(df[df['isEmpty']==0])
#d_occupied=len(df[df['isEmpty']==1])

In [None]:
# Report the position totals and the occupied share as a percentage.
# NOTE(review): the rate divides by d_empty+d_occupied, so an empty df raises
# ZeroDivisionError - confirm df always has rows at this point.
print('Total Pallet Position : {}'.format(len(df['Location'])))
print('Occupied Position: {:} '.format(d_occupied))
print('Occupancy Rate : {:.3f} %'.format(d_occupied/(d_empty+d_occupied)*100))

In [None]:
df.groupby('Room')['isOccupied'].sum()

In [None]:
df['isOccupied'].value_counts()

### Export

In [None]:
# Write the sitemap to a CSV file in the working directory, without the index column.
df.to_csv('Sitemap_July2019.csv',index=False)