Updates:

- 20 Oct 8pm-10pm : started to code for sitemap generation
- 21 Oct 1pm-3pm : generate sitemap


In [1]:
q_time='2019-07-10 07:00:00'

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gc

from scipy.stats import skew,kurtosis
import datetime
import re
import fnmatch

In [3]:
%matplotlib inline
# Show up to 500 columns when a wide frame is displayed
pd.set_option('display.max_columns',500)
# Matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8' and later
# removed the old name; use whichever one this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [4]:
def summary_object(arg_df):
    '''Build a summary table for the non-numerical columns of ``arg_df``.

    Columns whose dtype is object, category or bool are profiled; every other
    column is ignored. A short textual report (column counts per dtype,
    single-label columns, empty columns) is printed as a side effect.

    Parameters
    ----------
    arg_df : pandas.DataFrame
        Frame to profile. It is not modified.

    Returns
    -------
    pandas.DataFrame or None
        One row per retained column (index starting at 1) with the columns
        'Attribute', 'Count', 'Unique', 'Missing (%)', 'Top', 'Top (%)',
        'Bottom' and 'Bottom (%)', sorted by 'Missing (%)' then 'Unique',
        both descending. Percentages are relative to ``len(arg_df)``.
        Rows with 'Unique' <= 1 or 'Missing (%)' == 100 are left out of the
        table. ``None`` is returned, after printing a notice, when the frame
        has no object, category or bool column.
    '''
    object_list=[]
    category_list=[]
    bool_list=[]
    unilabel_list=[]
    missing_list=[]

    for c in arg_df.columns:
        if arg_df[c].dtypes==object:
            object_list.append(c)
        elif str(arg_df[c].dtypes)=='category':
            category_list.append(c)
        elif arg_df[c].dtypes==bool:
            bool_list.append(c)

    target_list=object_list+category_list+bool_list
    if len(target_list)==0:
        print('No Non-Numerical Attributes')
        return None

    index_list=['Count','Unique','Missing (%)','Top','Top (%)','Bottom','Bottom (%)']
    # One summary column per profiled attribute (not only the object ones), held as
    # object dtype because 'Top'/'Bottom' store labels next to the numeric rows.
    df_summary=pd.DataFrame(0.0,index=index_list,columns=target_list,dtype=object)

    n_rows=len(arg_df)
    for col in target_list:
        vc=arg_df[col].value_counts().reset_index()
        n_unique=len(arg_df[col].unique())
        missing_pct=arg_df[col].isna().mean()*100
        has_labels=not vc.empty   # False when the column holds no value at all

        df_summary.loc['Count',col]=arg_df[col].count()
        df_summary.loc['Unique',col]=n_unique
        df_summary.loc['Missing (%)',col]=missing_pct
        if has_labels:
            df_summary.loc['Top',col]=vc.iloc[0,0]
            df_summary.loc['Top (%)',col]=vc.iloc[0,1]/n_rows*100

        # A second label only exists with more than two distinct entries, or with
        # exactly two when neither of them is the missing marker.
        if n_unique>2 or (n_unique==2 and missing_pct==0):
            if has_labels:
                df_summary.loc['Bottom',col]=vc.iloc[-1,0]
                df_summary.loc['Bottom (%)',col]=vc.iloc[-1,1]/n_rows*100
        else:
            unilabel_list.append(col)
        if missing_pct==100:
            missing_list.append(col)

    df_summary=df_summary.T.sort_values(['Missing (%)','Unique'],ascending=False)
    df_summary=df_summary[(df_summary['Unique']>1) & (df_summary['Missing (%)']!=100)]
    df_summary.reset_index(inplace=True)
    df_summary.index=df_summary.index+1
    df_summary.columns=['Attribute']+index_list

    print('SUMMARY OF {} NON-NUMERICAL ATTRIBUTES:\n'.format(len(target_list)))
    if len(object_list)>0:
        print('{} Object Columns'.format(len(object_list)))
    if len(category_list)>0:
        print('{} Categorical Columns'.format(len(category_list)))
    if len(bool_list)>0:
        print('{} Bool Columns'.format(len(bool_list)))
    if len(unilabel_list)>0:
        print('\n{} Columns with Single Label : \n{}'.format(len(unilabel_list),unilabel_list))
    if len(missing_list)>0:
        print('\n{} Empty Columns: \n{}'.format(len(missing_list),missing_list))

    return df_summary
'============================================================='        
def summary_numerical(arg_df):
    '''Build a summary table for the numerical columns of ``arg_df``.

    Every column whose dtype is not object, bool, category or datetime64[ns]
    is profiled. A short textual report (number of profiled columns,
    single-value columns, empty columns) is printed as a side effect.

    Parameters
    ----------
    arg_df : pandas.DataFrame
        Frame to profile. It is not modified.

    Returns
    -------
    pandas.DataFrame or None
        One row per retained column (index starting at 1) with the columns
        'Attribute', 'Count', 'Missing (%)', 'Mean', 'Median', 'Min', 'Max',
        'Skewness' and 'Kurtosis', sorted by 'Missing (%)' then 'Skewness',
        both descending. Columns with no observed value or with a single
        distinct observed value are left out of the table and listed in the
        printed report. ``None`` is returned, after printing a notice, when no
        column qualifies.
    '''
    target_list=[]
    missing_list=[]
    single_value_list=[]

    for c in arg_df.columns:
        datatype=arg_df[c].dtypes
        if datatype != object and datatype != bool and str(datatype) != 'category' and str(datatype) !='datetime64[ns]':
            target_list.append(c)

    if len(target_list)==0:
        print('No Numerical Attributes')
        return None

    index_list=['Count','Missing (%)','Mean','Median','Min','Max','Skewness','Kurtosis']
    df_summary=pd.DataFrame(data=np.zeros((len(index_list),len(target_list))),
                            index=index_list,columns=target_list)
    for col in target_list:
        observed=arg_df[col].dropna()
        df_summary.loc['Count',col]=len(observed)
        df_summary.loc['Missing (%)',col]=arg_df[col].isna().mean()*100
        if observed.empty:
            missing_list.append(col)
            continue
        df_summary.loc['Mean',col]=observed.mean()
        df_summary.loc['Median',col]=observed.median()
        df_summary.loc['Min',col]=observed.min()
        df_summary.loc['Max',col]=observed.max()
        # A constant column is identified by its distinct-value count. Zero skewness
        # is not a valid test: every symmetric column has it as well.
        if observed.nunique()==1:
            single_value_list.append(col)
            continue
        # Shape statistics use the observed values only, like mean/median above;
        # passing NaNs through would turn both results into NaN.
        df_summary.loc['Skewness',col]=skew(observed.values)
        df_summary.loc['Kurtosis',col]=kurtosis(observed.values)

    df_summary=df_summary.T.sort_values(['Missing (%)','Skewness'],ascending=False)
    df_summary=df_summary[~df_summary.index.isin(single_value_list+missing_list)]
    df_summary.reset_index(inplace=True)
    df_summary.index=df_summary.index+1
    df_summary.columns=['Attribute']+index_list

    print('SUMMARY OF {} NUMERICAL ATTRIBUTES:'.format(len(target_list)))
    if len(single_value_list)>0:
        print('\n{} Columns with Single Value: \n{}'.format(len(single_value_list),single_value_list))
    if len(missing_list)>0:
        print('\n{} Empty Columns: \n{}'.format(len(missing_list),missing_list))

    return df_summary
'==================================================================='        
def drop_unilable_column(arg_df):
    '''Drop, in place, the columns of ``arg_df`` that carry a single label or value.

    Object, category and bool columns are dropped when they have one unique
    entry, or two unique entries of which one is the missing marker. Other
    columns, except datetime64[ns] ones, are dropped when their non-missing
    values are all identical. The dropped column names are printed.

    Parameters
    ----------
    arg_df : pandas.DataFrame
        Frame to clean; it is modified in place.

    Returns
    -------
    None
    '''
    target_list=[]
    object_list=[]
    number_list=[]
    for c in arg_df.columns:
        dtype=arg_df[c].dtypes
        if (dtype==object) or (str(dtype)=='category') or (dtype==bool):
            object_list.append(c)
        elif str(dtype)!='datetime64[ns]':
            number_list.append(c)

    for c in object_list:
        n_unique=len(arg_df[c].unique())
        if n_unique==1:
            target_list.append(c)
        elif n_unique==2 and arg_df[c].isna().any():
            target_list.append(c)

    for c in number_list:
        # Count distinct observed values. Testing skew == 0 would also delete
        # every symmetric column (e.g. 1, 2, 3) and can miss constant ones.
        if arg_df[c].nunique(dropna=True)==1:
            target_list.append(c)

    if len(target_list)>0:
        arg_df.drop(columns=target_list,inplace=True)
        print('Drop {} Columns with Single Label:\n{}'.format(len(target_list),target_list))
    else:
        print('No Columns with Single Label/Value')
'===================================================================' 
def drop_empty_column(arg_df):
    '''Remove, in place, every column of ``arg_df`` that has no non-missing value.

    The names of the removed columns (or a notice that there were none) are printed.
    '''
    empty_cols=[name for name in arg_df.columns if arg_df[name].count()==0]
    if not empty_cols:
        print('No Empty Column')
        return
    arg_df.drop(columns=empty_cols,axis=1,inplace=True)
    print('Delete {} Empty Column : \n{}'.format(len(empty_cols),empty_cols))
'==================================================================='        
def drop_columns(arg_df,column_names):
    '''Drop the columns named in ``column_names`` from ``arg_df`` in place and print what was dropped.'''
    arg_df.drop(columns=column_names,axis='columns',inplace=True)
    report='Drop {} columns : \n{}'.format(len(column_names),column_names)
    print(report)
'============================================================='        
def extract_room_row(arg_df,col_position):
    '''Return the sorted unique Room-Row prefixes of a Room-Row-Column-Height position column.

    Values without a hyphen are kept unchanged. The result is the array of
    distinct prefixes in ascending order.
    '''
    def _room_row(position):
        # Keep only the first two hyphen-separated parts
        if '-' not in position:
            return position
        parts=position.split('-')
        return parts[0]+'-'+parts[1]

    prefixes=arg_df[col_position].apply(_room_row).unique().tolist()
    ordered=pd.DataFrame({'Unique_Row':prefixes}).sort_values(by='Unique_Row')
    return ordered['Unique_Row'].values
'============================================================='  
def generate_dif_columns(arg_df,left_column,left_sffx,right_sffx):
    '''Add one boolean ``dif_<name>`` column per suffixed column pair, in place.

    For every entry of ``left_column`` the common title is the text before
    ``left_sffx``. The new column is True where the left-suffixed and
    right-suffixed values differ and the right-suffixed value is not missing.
    Spaces are removed from the title when naming the new column.
    '''
    common_title=[name.split(left_sffx)[0] for name in left_column]
    print('There are {} common columns : \n{}'.format(len(left_column),common_title))

    for title in common_title:
        left_values=arg_df[title+left_sffx]
        right_values=arg_df[title+right_sffx]
        flag_name='dif_'+title.replace(' ','')
        arg_df[flag_name]=(left_values!=right_values) & (right_values.notna())
    print('\nColumns Generated : {}'.format(len(common_title)))
'==================================================================='        
def find_time_dif_hour(arg_df,ref_date,proc_date):
    '''Add, in place, the whole-hour difference between each date column and a reference column.

    Parameters
    ----------
    arg_df : pandas.DataFrame
        Frame holding the datetime columns; it is modified in place.
    ref_date : str
        Name of the reference datetime column that is subtracted.
    proc_date : list of str
        Names of the datetime columns to compare with ``ref_date``.

    For every name in ``proc_date`` a float column ``<name>_hour`` is added
    (spaces in the name replaced by underscores) holding
    ``arg_df[name] - arg_df[ref_date]`` in hours, rounded down. Missing dates
    give NaN.
    '''
    one_hour=pd.Timedelta(hours=1)
    for c in proc_date:
        hour_column=c.replace(' ','_')+'_hour'
        # astype('timedelta64[h]') raises on pandas >= 2.0, so floor the true
        # division explicitly (the flooring the old cast applied).
        arg_df[hour_column]=np.floor((arg_df[c]-arg_df[ref_date])/one_hour)

# 1) Import the two datasets: trans (transactions) and disp (dispatched pallets)

In [5]:
# Folder holding the two CSV exports
path='C:/Users/Nan/Documents/GitHub_Data/'
file_1='{}p_transaction_2.csv'.format(path)
file_2='{}p_dispatched_2.csv'.format(path)

# Names actually passed to the CSV reader
filename_1,filename_2=file_1,file_2

# Column suffixes for the transaction / dispatched sides, and the process of interest
sffx_transaction,sffx_dispatched='_Trsc','_Dptch'
target_process='FW'

In [6]:
filename_1

'C:/Users/Nan/Documents/GitHub_Data/p_transaction_2.csv'

In [7]:
# Load the transaction and dispatched-pallet CSV files named by filename_1 / filename_2.
# NOTE(review): the text printed under this cell looks like the tail of a warning
# raised while loading — confirm whether explicit dtype= arguments are needed.
trans=pd.read_csv(filename_1)
disp=pd.read_csv(filename_2)

  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
trans.shape

(207259, 48)

In [9]:
trans.head(2)

Unnamed: 0,Transaction Date Time,Customer Label Code,Dry Matter Code,Fruit,Fruit Per Pack,Fruit Size Code,Labelling Indicator Code,Location Mission Request Destination,Marketer Code,Material Number,New Value,Ok Until Date,Pack Code,Pack Date,Packed Fruit,Packhouse Code,Pack Label Code,Pack Make Code,Packs,Pallet Number,Pallet Rework Count,Plant Code,Previous Value,Protocol Code,Purchase Pool Code,Quality Inspection Indicator Code,Storage Method Code,Transaction Date,Transaction Sub Type Code,Trays,Username,Variety Code,Ok_Until_Date_day,Pack_Date_day,isDOINumber,Pack Style Code,Pack_Base,Stacking Configuration Code,isDayShift,Shift_Hour,Day_of_week,isWeekend,isActualMovement,isPackedFruitequalFruit,Previous_RoomRow,Previous_Column,Previous_Height,distance_to_driveway
0,2019-07-01 00:00:10,N,Y,13000,130,42,LG,na,ZIL,62853,Q02-03-4-1,2019-07-21 00:00:00,41236,2019-06-06 00:00:00,13000,3TPP,N,KC1,100,59717008,0,1103,Q02-03-3-1,N,1,A,CN,2019-07-01 00:00:10,MLA,309.52381,agubs,HW,19.0,-26.0,False,MB,E,N,False,0,Sun,True,True,True,Q02-03,3,1,10
1,2019-07-01 00:00:15,N,Y,9216,36,36,LG,na,ZIL,62809,Q14-15-9-2,2019-07-21 00:00:00,41717,2019-06-05 00:00:00,9216,3TPP,N,KC1,256,59708334,0,1103,Q14-02-10-1,N,1,A,CN,2019-07-01 00:00:15,MTA,256.0,jaspsi,HW,19.0,-27.0,False,IT,E,3,False,0,Sun,True,True,True,Q14-02,10,1,3


In [10]:
disp.shape

(32251, 72)

In [11]:
disp.head(2)

Unnamed: 0,Pallet Number,Location Room Code,Location Row Code,Location Column,Location Height,Location Request Number,Storage Source Code,Pack Label Code,Pack Indicator Code,Clearance Protocol Code,Customer Label Code,Dry Matter Code,Storing Characteristic Code,CCKPassed Failed,Quality Inspection Indicator Code,Japan Sub Brand Code,Trial Packing Indicator Code,Marketer Code,Loadout Priority,Material Number,Material Group Code,Brand Code,Variety Code,Fruit Class Code,Fruit Size Code,Pack Category Code,Labelling Indicator Code,Purchase Pool Code,Pack Code,Growing Method,Packs Per Pallet,Maturity Indicator Code,Ok Until Date,Plant Code,Ignore Psr,Pack Type Code,Packed Fruit,Packed Trays,Packhouse Code,Pallet Card Reference,Fruit,Order Number,Order Line Number,Loadout Date,Shipment Type Code,Destination Port Code,Trucking Company Code,Packs,Full,Market Holds,Stacking Configuration Code,Pack Style Code,Pack Make Code,Load Start Date,Envelope Number,Container,Container Number,isDoi_Number,isInternal_Holds,isTemporary_Seal_Number,isTemperature_Recorder_Number,isLocation_Mission_Request_Destination,isRf_Id_Tag1,isPermanentSeal,Pallet Note,Pack_Base,Repack_Date_day,Last_Spqi_Date_day,Pack_Date_day,Doi_Clearance_Date_day,Tkl_Email_Date_day,Load_Start_Date_hour
0,54510208,Q27,A,11.0,1.0,na,CN,N,N,A,N,Y,P,na,A,N,N,ZIL,20,61600,Zespri,Z,GA,1,22,ENIJ,JB,16,10168,CK,200,C,2019-05-05 00:00:00,1103,False,ENIJKC3,4400,200.0,3TPP,3121DAY,4400,5146501,20,2019-03-28 13:00:00,1,JPTYO,PRHA,200,True,"KR, RE, Z7",N,IJ,KC3,2019-03-27 00:00:00,193017,na,na,False,False,False,False,False,True,False,False,E,,,-11.0,,,-37.0
1,54510260,UNKNOWN,na,na,0.0,na,CN,N,N,A,N,Y,P,na,A,N,N,ZIL,20,61577,Zespri,Z,GA,1,25,ENML,GR,16,10182,CK,160,C,2019-05-05 00:00:00,1103,False,ENMLKC6,6560,262.4,3TPP,3121DAY,6560,5166002,20,2019-04-03 16:00:00,18,SGSIN,TOLL,160,True,"KR, RE, Z7",N,ML,KC6,2019-04-02 00:00:00,193074,7839.0,PCIU6069215,False,False,True,True,False,True,False,False,E,,,-17.0,,,-40.0


### Data processing so that columns common to both datasets use the same labels
    - Fruit Size Code: cast to string
    - Purchase Pool Code: strip the 0 in the first position
    - Date columns (incl. Ok_Until_Date_day, Pack_Date_day): the derived day-difference values are calculated incorrectly; the columns should remain as datetimes

In [12]:
trans['Fruit Size Code']=trans['Fruit Size Code'].astype('str')
disp['Fruit Size Code']=disp['Fruit Size Code'].astype('str')

In [13]:
# Strip one leading '0' so both datasets use the same pool labels.
# Slicing keeps the rest of the code intact: split('0')[1] cut at every zero
# ('010' became '1') and x[0] failed on empty strings.
trans['Purchase Pool Code']=trans['Purchase Pool Code'].apply(lambda x:x[1:] if x.startswith('0') else x)
disp['Purchase Pool Code']=disp['Purchase Pool Code'].apply(lambda x:x[1:] if x.startswith('0') else x)

In [14]:
#drop_columns(disp,['Pack_Date_day', 'Ok_Until_Date_day'])
drop_columns(disp,['Pack_Date_day'])

Drop 1 columns : 
['Pack_Date_day']


In [15]:
# Parse 'Ok Until Date' from text into datetimes in both frames. The next cell
# reduces them to calendar dates, so that values on the same day but with a
# different time of day compare equal.
trans['Ok Until Date']=pd.to_datetime(trans['Ok Until Date'],format='%Y-%m-%d %H:%M:%S')
disp['Ok Until Date']=pd.to_datetime(disp['Ok Until Date'],format='%Y-%m-%d %H:%M:%S')

In [16]:
trans['Ok Until Date']=trans['Ok Until Date'].dt.date
disp['Ok Until Date']=disp['Ok Until Date'].dt.date

In [17]:
## 12 Oct : comment out this bcs found out Packed Trays is 0 in disp
# disp.rename(index=str,columns={'Packed Trays':'Trays'},inplace=True)

In [18]:
trans['Transaction Date Time']=pd.to_datetime(trans['Transaction Date Time'],format='%Y-%m-%d %H:%M:%S')

## 2) Preprocess transactional data

In [19]:
trans.head(10)

Unnamed: 0,Transaction Date Time,Customer Label Code,Dry Matter Code,Fruit,Fruit Per Pack,Fruit Size Code,Labelling Indicator Code,Location Mission Request Destination,Marketer Code,Material Number,New Value,Ok Until Date,Pack Code,Pack Date,Packed Fruit,Packhouse Code,Pack Label Code,Pack Make Code,Packs,Pallet Number,Pallet Rework Count,Plant Code,Previous Value,Protocol Code,Purchase Pool Code,Quality Inspection Indicator Code,Storage Method Code,Transaction Date,Transaction Sub Type Code,Trays,Username,Variety Code,Ok_Until_Date_day,Pack_Date_day,isDOINumber,Pack Style Code,Pack_Base,Stacking Configuration Code,isDayShift,Shift_Hour,Day_of_week,isWeekend,isActualMovement,isPackedFruitequalFruit,Previous_RoomRow,Previous_Column,Previous_Height,distance_to_driveway
0,2019-07-01 00:00:10,N,Y,13000,130,42,LG,na,ZIL,62853,Q02-03-4-1,2019-07-21,41236,2019-06-06 00:00:00,13000,3TPP,N,KC1,100,59717008,0,1103,Q02-03-3-1,N,1,A,CN,2019-07-01 00:00:10,MLA,309.52381,agubs,HW,19.0,-26.0,False,MB,E,N,False,0,Sun,True,True,True,Q02-03,3,1,10
1,2019-07-01 00:00:15,N,Y,9216,36,36,LG,na,ZIL,62809,Q14-15-9-2,2019-07-21,41717,2019-06-05 00:00:00,9216,3TPP,N,KC1,256,59708334,0,1103,Q14-02-10-1,N,1,A,CN,2019-07-01 00:00:15,MTA,256.0,jaspsi,HW,19.0,-27.0,False,IT,E,3,False,0,Sun,True,True,True,Q14-02,10,1,3
2,2019-07-01 00:00:29,N,Y,12650,115,39,LG,na,ZIL,65735,Q14-02-10-1,2019-07-14,76332,2019-06-01 00:00:00,12650,3TPP,N,KCT,110,59699489,0,1103,Q14-02-7-1,N,1,A,CN,2019-07-01 00:00:29,MLA,324.358974,jaspsi,HW,12.0,-31.0,False,BM,E,1,False,0,Sun,True,True,True,Q14-02,7,1,6
3,2019-07-01 00:00:30,N,Y,7800,78,27,GR,na,ZIL,61591,Q26-01-4-2,2019-07-07,10175,2019-05-26 00:00:00,7800,3TPP,N,KC1,100,59672352,0,1103,Q27-02-12-1,N,1,A,CN,2019-07-01 00:00:30,MTA,288.888889,mansot,HW,5.0,-37.0,False,MB,E,N,False,0,Sun,True,True,True,Q27-02,12,1,1
4,2019-07-01 00:00:37,N,Y,10200,102,36,LG,na,ZIL,62845,Q01-03-8-2,2019-07-21,41724,2019-06-05 00:00:00,10200,3TPP,N,KC1,100,59708617,2,1103,Q01-02-5-2,N,1,A,CN,2019-07-01 00:00:37,MTA,283.333333,yasms,HW,19.0,-27.0,False,MB,E,N,False,0,Sun,True,True,True,Q01-02,5,2,8
5,2019-07-01 00:00:39,N,Y,7540,52,33,GR,SPQI,ZIL,61554,UNKNOWN,2019-07-07,10193,2019-05-25 00:00:00,7540,3TPP,N,KC1,145,59669482,0,1103,ALLO,N,1,A,CN,2019-07-01 00:00:39,MLA,228.484848,teaio,HW,5.0,-38.0,False,P1,E,N,False,0,Sun,True,True,True,ALLO,na,na,na
6,2019-07-01 00:00:44,N,Y,13000,130,42,LG,na,ZIL,62853,Q02-16-6-2,2019-07-21,41236,2019-06-06 00:00:00,13000,3TPP,N,KC1,100,59717008,0,1103,Q02-03-4-1,N,1,A,CN,2019-07-01 00:00:44,MTA,309.52381,agubs,HW,19.0,-26.0,False,MB,E,N,False,0,Sun,True,True,True,Q02-03,4,1,9
7,2019-07-01 00:01:03,N,Y,6264,27,27,GR,na,ZIL,51182,Q02-03-3-2,2019-07-21,12494,2019-06-06 00:00:00,6264,3TPP,N,KC1,232,59718630,0,1103,Q02-03-2-2,N,1,A,CN,2019-07-01 00:01:03,MLA,232.0,agubs,HW,19.0,-26.0,False,IT,E,N,False,0,Sun,True,True,True,Q02-03,2,2,11
8,2019-07-01 00:01:10,N,Y,12650,115,39,LG,na,ZIL,65735,UNKNOWN,2019-07-14,76332,2019-06-01 00:00:00,12650,3TPP,N,KCT,110,59699489,0,1103,Q14-02-10-1,N,1,A,CN,2019-07-01 00:01:10,MTA,324.358974,jaspsi,HW,12.0,-31.0,False,BM,E,1,False,0,Sun,True,True,True,Q14-02,10,1,3
9,2019-07-01 00:01:15,N,Y,8640,48,30,GR,na,ZIL,61541,Q03-16-7-1,2019-07-07,10202,2019-05-24 00:00:00,8640,3TPP,T,KC1,180,59444423,0,1103,UNKNOWN,N,1,A,CN,2019-07-01 00:01:15,MTA,288.0,palgr,HW,5.0,-39.0,False,M2,E,2,False,0,Sun,True,True,True,UNKNOWN,na,na,na


### Add Query Time
df=trans.copy()
df['query_Time']=q_time
list_time=[c for c in list(df.columns) if 'Time' in c]
list_time
for c in list_time:
    df[c]=pd.to_datetime(df[c],format='%Y-%m-%d %H:%M:%S')
trans_prev=df.loc[df['Transaction Date Time']<df['query_Time']]
trans_prev.shape
trans_prev.head()

In [20]:
# First recorded time and pallet for every 'Previous Value' (origin location).
# The columns are selected with a list: tuple selection raises on pandas >= 2.0.
temp_df=trans.groupby('Previous Value')[['Transaction Date Time','Pallet Number']].first()

In [21]:
# First recorded time and pallet for every 'New Value' (destination location).
# The columns are selected with a list: tuple selection raises on pandas >= 2.0.
temp_df_2=trans.groupby('New Value')[['Transaction Date Time','Pallet Number']].first()

In [22]:
temp_df.reset_index(inplace=True)
temp_df_2.reset_index(inplace=True)

In [23]:
df_merge=temp_df.merge(temp_df_2,how='outer',left_on='Previous Value',right_on='New Value',suffixes=('_From', '_To'))
df_merge.sample(5)

Unnamed: 0,Previous Value,Transaction Date Time_From,Pallet Number_From,New Value,Transaction Date Time_To,Pallet Number_To
7490,Q12-20-1-1,2019-07-21 23:55:57,57421297.0,Q12-20-1-1,2019-07-21 23:53:54,57421297.0
16991,Q30-23-3-1,2019-07-08 02:08:44,59640832.0,Q30-23-3-1,2019-07-08 02:25:52,59642560.0
19818,ZT03-20-5-1,2019-07-19 10:41:21,57399572.0,,NaT,
14852,Q27-09-11-2,2019-07-02 16:12:47,56603380.0,Q27-09-11-2,2019-07-02 02:12:15,56603380.0
975,Q01-17-1-1,2019-07-09 14:03:11,59700185.0,Q01-17-1-1,2019-07-09 20:42:25,57435782.0


In [24]:
# Keep a location only when its first outbound move (Transaction Date Time_From)
# is earlier than its first inbound move (Transaction Date Time_To), or when it
# has no inbound move at all (Transaction Date Time_To is missing).
# Rows where the inbound move came first are excluded.
df=df_merge.loc[(df_merge['Transaction Date Time_From']<df_merge['Transaction Date Time_To']) |
             df_merge['Transaction Date Time_To'].isna(),:]

In [25]:
df.shape

(14359, 6)

### Check Duplicates

In [26]:
df[df['Pallet Number_From'].duplicated()]

Unnamed: 0,Previous Value,Transaction Date Time_From,Pallet Number_From,New Value,Transaction Date Time_To,Pallet Number_To
578,P04-A-3-1,2019-07-13 20:48:50,57349683.0,,NaT,
1025,Q01-19-1-1,2019-07-03 15:41:50,59699984.0,Q01-19-1-1,2019-07-03 19:48:23,59170858.0
1393,Q02-09-12-1,2019-07-03 01:51:00,59435094.0,Q02-09-12-1,2019-07-11 01:38:45,59442320.0
1505,Q02-13-6-2,2019-07-02 22:48:32,59685406.0,Q02-13-6-2,2019-07-03 22:15:36,59682962.0
1544,Q02-15-13-2,2019-07-02 22:48:52,59685406.0,,NaT,
2144,Q03-15-13-2,2019-07-29 11:15:27,59663350.0,,NaT,
2391,Q03-A-8-1,2019-07-01 00:01:22,59699489.0,Q03-A-8-1,2019-07-05 13:33:55,57121722.0
2694,Q04-17-3-2,2019-07-02 15:17:48,59694958.0,Q04-17-3-2,2019-07-03 08:03:54,90017685.0
2701,Q04-17-7-2,2019-07-02 14:46:03,59712515.0,Q04-17-7-2,2019-07-12 09:15:24,59436022.0
3190,Q05-14-10-1,2019-07-01 12:13:32,56999445.0,Q05-14-10-1,2019-07-05 20:42:33,59668287.0


In [27]:
df_pallet_first_trsn=trans[['Pallet Number','Transaction Date Time']].groupby('Pallet Number')['Transaction Date Time'].min()

In [28]:
df_pallet_first_trsn=df_pallet_first_trsn.to_frame().reset_index()

In [29]:
df_pallet_first_trsn.head()

Unnamed: 0,Pallet Number,Transaction Date Time
0,53179215,2019-07-02 09:12:52
1,53179598,2019-07-03 09:05:03
2,53180556,2019-07-01 01:11:34
3,53181362,2019-07-03 09:05:40
4,53184912,2019-07-01 11:35:09


In [30]:
# Remember the shape before filtering so the number of removed rows can be reported
temp=df.shape
# Inner merge: a row survives only if its (From time, From pallet) pair equals
# that pallet's earliest transaction time held in df_pallet_first_trsn.
df=df.merge(df_pallet_first_trsn,how='inner',
         left_on=['Transaction Date Time_From','Pallet Number_From'],
        right_on=['Transaction Date Time','Pallet Number'])
# Rows removed by the merge
print('Total Row Rectified : {}'.format(temp[0]-df.shape[0]))

Total Row Rectified : 95


In [31]:
df.sample(10)

Unnamed: 0,Previous Value,Transaction Date Time_From,Pallet Number_From,New Value,Transaction Date Time_To,Pallet Number_To,Pallet Number,Transaction Date Time
2481,Q07-13-9-1,2019-07-01 09:14:01,57372841.0,Q07-13-9-1,2019-07-01 16:39:14,57433580.0,57372841,2019-07-01 09:14:01
13492,ZT02-57-4-2,2019-07-19 17:12:12,57399909.0,,NaT,,57399909,2019-07-19 17:12:12
3366,Q11-01-3-2,2019-07-06 03:06:52,57395949.0,Q11-01-3-2,2019-07-10 22:47:14,57394676.0,57395949,2019-07-06 03:06:52
1643,Q04-03-3-1,2019-07-02 08:16:39,80501080.0,Q04-03-3-1,2019-07-02 12:14:34,59444546.0,80501080,2019-07-02 08:16:39
10647,Q30-04-8-2,2019-07-05 01:54:05,59726352.0,Q30-04-8-2,2019-07-05 01:54:52,59728295.0,59726352,2019-07-05 01:54:05
11614,ZD05-09-3-2,2019-07-10 11:02:36,57370366.0,,NaT,,57370366,2019-07-10 11:02:36
689,Q01-15-7-1,2019-07-03 23:36:32,59699939.0,Q01-15-7-1,2019-07-03 23:56:10,59701922.0,59699939,2019-07-03 23:36:32
6580,Q20-21-8-1,2019-07-06 22:52:42,59723306.0,Q20-21-8-1,2019-07-06 22:56:38,59430754.0,59723306,2019-07-06 22:52:42
13704,ZT03-16-4-1,2019-07-02 09:54:21,57378621.0,,NaT,,57378621,2019-07-02 09:54:21
4538,Q14-22-6-2,2019-07-23 14:10:26,59722828.0,,NaT,,59722828,2019-07-23 14:10:26


In [32]:
# Keep the origin location with its pallet and time, naming the location column 'Location'.
# rename() returns a new frame here: renaming a column subset in place leaves it
# flagged as a copy, which warns on later column assignment. index=str was dropped
# because it only turned the row labels into strings.
df=df[['Previous Value','Pallet Number_From','Transaction Date Time_From']].rename(
    columns={'Previous Value':'Location'})

In [33]:
df['isOccupied']=1

### Cross-check against the complete set of positions

In [34]:
temp_list=trans['Previous Value'].unique()
temp_list_2=trans['New Value'].unique()

In [35]:
len(temp_list)

20331

In [36]:
temp_list=np.append(temp_list,temp_list_2)

In [37]:
unique_position=set(temp_list)

In [38]:
print('Total Unique Position : {}'.format(len(unique_position)))

Total Unique Position : 20380


In [39]:
unique_position=list(unique_position)

In [40]:
unique_position_hyphen=list(c for c in unique_position if '-' in c)

In [41]:
df_Location=pd.DataFrame(columns=['Location'],data=unique_position_hyphen)

In [42]:
df=df.merge(df_Location,how='outer',on='Location')

In [43]:
# Positions that only came from the full location list have no pallet: mark them
# as unoccupied. Assign the result back instead of calling fillna(inplace=True) on
# the selected column, which leaves df unchanged when pandas Copy-on-Write is active.
df['isOccupied']=df['isOccupied'].fillna(0)
df['Pallet Number_From']=df['Pallet Number_From'].fillna(0)

### Keep only locations that contain a hyphen

In [44]:
df=df.loc[df['Location'].str.contains('-'),:]
df.shape

(20342, 4)

### Derive Room Code

In [45]:
df['Location_List']=df['Location'].str.split('-')

In [46]:
# Spread the four hyphen-separated parts of each location into their own columns
df['Room']=df['Location_List'].map(lambda parts:parts[0])
df['Row']=df['Location_List'].map(lambda parts:parts[1])
df['Column']=df['Location_List'].map(lambda parts:parts[2])
df['Height']=df['Location_List'].map(lambda parts:parts[3])

In [47]:
drop_columns(df,['Location_List'])

Drop 1 columns : 
['Location_List']


In [48]:
df.sample(5)

Unnamed: 0,Location,Pallet Number_From,Transaction Date Time_From,isOccupied,Room,Row,Column,Height
12027,ZE04-10-4-1,57422621.0,2019-07-17 11:59:49,1.0,ZE04,10,4,1
20260,Q23-A-3-1,0.0,NaT,0.0,Q23,A,3,1
15104,Q28-16-1-1,0.0,NaT,0.0,Q28,16,1,1
1504,Q03-16-3-2,59675056.0,2019-07-04 14:30:49,1.0,Q03,16,3,2
8738,Q25-17-2-2,57417924.0,2019-07-18 03:29:38,1.0,Q25,17,2,2


In [49]:
df['Room'].value_counts(dropna=False).sort_index()

P01        133
P02        109
P03        114
P04        207
Q01        605
Q02        607
Q03        602
Q04        475
Q05        620
Q06        615
Q07        570
Q08        601
Q09        604
Q10        576
Q11        576
Q12        609
Q13        240
Q14        237
Q15        495
Q16        555
Q17        522
Q18        507
Q19        518
Q20        485
Q21        621
Q22        567
Q23        557
Q24        552
Q25        611
Q26        571
Q27        612
Q28        552
Q29        622
Q30        614
Q51          4
Q52          4
UNKNOWN      4
ZD02       118
ZD03       148
ZD04       159
ZD05       166
ZE02       260
ZE03        69
ZE04       223
ZE05       286
ZE06       182
ZE07       145
ZT01        91
ZT02       690
ZT03       732
Name: Room, dtype: int64

In [50]:
# Column and Height are stored as text, so a plain sort puts '10' before '2'.
# Sort on their numeric value instead (values that are not numbers go last);
# Room and Row are still compared as text.
sort_keys=pd.DataFrame({'Room':df['Room'].values,
                        'Row':df['Row'].values,
                        'Column':pd.to_numeric(df['Column'],errors='coerce').values,
                        'Height':pd.to_numeric(df['Height'],errors='coerce').values})
# sort_keys has a fresh 0..n-1 index, so its sorted index is the row order to take
df=df.iloc[sort_keys.sort_values(by=['Room','Row','Column','Height']).index].reset_index(drop=True)

In [51]:
df.rename(index=str,columns={'Pallet Number_From':'Pallet Number',
                            'Transaction Date Time_From':'Transaction Date Time'},inplace=True)

In [52]:
df.head()

Unnamed: 0,Location,Pallet Number,Transaction Date Time,isOccupied,Room,Row,Column,Height
0,P01-01-7-2,57428906.0,2019-07-20 20:18:30,1.0,P01,1,7,2
1,P01-02-7-1,0.0,NaT,0.0,P01,2,7,1
2,P01-02-7-2,0.0,NaT,0.0,P01,2,7,2
3,P01-02-8-2,0.0,NaT,0.0,P01,2,8,2
4,P01-03-5-1,58506139.0,2019-07-26 19:53:49,1.0,P01,3,5,1


### Summary

In [53]:
# Number of positions flagged free (0) and taken (1)
d_empty=int((df['isOccupied']==0).sum())
d_occupied=int((df['isOccupied']==1).sum())

In [54]:
print('Total Pallet Position : {}'.format(len(df['Location'])))
print('Occupied Position: {:} '.format(d_occupied))
print('Occupancy Rate : {:.3f} %'.format(d_occupied/(d_empty+d_occupied)*100))

Total Pallet Position : 20342
Occupied Position: 14259 
Occupancy Rate : 70.096 %


In [55]:
df.groupby('Room')['isOccupied'].sum()

Room
P01         96.0
P02         79.0
P03         87.0
P04        148.0
Q01        481.0
Q02        447.0
Q03        285.0
Q04        163.0
Q05        261.0
Q06        366.0
Q07        141.0
Q08        129.0
Q09        335.0
Q10        339.0
Q11        396.0
Q12        360.0
Q13        222.0
Q14        225.0
Q15        394.0
Q16        242.0
Q17        291.0
Q18        421.0
Q19        365.0
Q20        347.0
Q21        466.0
Q22        407.0
Q23        443.0
Q24        465.0
Q25        487.0
Q26        466.0
Q27        324.0
Q28        443.0
Q29        447.0
Q30        486.0
Q51          0.0
Q52          1.0
UNKNOWN      0.0
ZD02       118.0
ZD03       148.0
ZD04       159.0
ZD05       166.0
ZE02       260.0
ZE03        69.0
ZE04       223.0
ZE05       286.0
ZE06       182.0
ZE07       145.0
ZT01        91.0
ZT02       625.0
ZT03       732.0
Name: isOccupied, dtype: float64

In [56]:
df['isOccupied'].value_counts()

1.0    14259
0.0     6083
Name: isOccupied, dtype: int64

### Export

In [57]:
df.to_csv('Sitemap_July2019.csv',index=False)