# Positions Prep

This file contains the process performed on the new `positions.csv` file. The process is the same as the one performed in the `data_cleanup.ipynb` file, but with a few differences.

In [2]:
import pandas as pd

# read the 'positions.csv' into a DataFrame from the 'data/raw' directory, and make sure that the 'location' and 'aisle',  are 'string's:
positions = pd.read_csv('../data/raw/positions.csv', dtype={'location': 'string', 'aisle': 'string'})
# Treat every column with 'date' in the column name in it as datetime:
date_columns = [name for name in positions.columns if 'date' in name]
positions[date_columns] = pd.to_datetime(positions[date_columns].stack(), dayfirst=True).unstack()  # Stack and unstack are needed because to_datetime works with Series but not with DataFrame.
# lower case all the column names:
positions.columns = map(str.lower, positions.columns)
# uppercase all 'string' values:
positions = positions.applymap(lambda x: x.upper() if isinstance(x, str) else x)
# remove all rows where 'uuid' is null:
positions = positions[positions['uuid'].notnull()]
positions

Unnamed: 0,uuid,location,last count date,quantity,receive date,last status change date,last movement date,putaway zone,fetch zone,warehouse zone,aisle
0,A024200,01010105,2023-04-17 11:59:00,28.0,2023-05-10 12:32:00,2022-01-20 13:42:00,2022-01-20 13:42:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01
1,A017062,01010109,2023-04-17 12:00:00,2.0,2023-01-23 14:51:00,2023-01-18 12:03:00,2023-01-19 10:43:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01
2,A026194,01010109,2023-04-17 12:13:00,6.0,2023-01-08 12:12:00,2022-01-20 14:15:00,2023-04-17 12:14:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01
3,A012596,01010113,2023-04-17 12:07:00,6.0,2022-01-20 14:06:00,2022-01-20 14:06:00,2023-04-17 12:08:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01
4,A017601,01010113,2023-04-17 12:04:00,3.0,2023-04-13 06:34:00,2022-01-20 14:02:00,2022-01-20 14:02:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01
...,...,...,...,...,...,...,...,...,...,...,...
24164,A003724,SORT05,NaT,2.0,2023-05-30 09:48:00,2023-05-30 09:48:00,2023-05-30 09:48:00,SORT,SORT,SORT,SORT
24165,A029187,SORT05,NaT,1.0,2023-05-25 10:44:00,2023-05-25 10:44:00,2023-05-25 10:44:00,SORT,SORT,SORT,SORT
24166,A029804,SORT05,NaT,1.0,2023-05-30 08:50:00,2023-05-30 08:50:00,2023-05-30 08:50:00,SORT,SORT,SORT,SORT
24167,A029876,SORT05,NaT,1.0,2023-05-24 14:01:00,2023-05-24 14:01:00,2023-05-24 14:01:00,SORT,SORT,SORT,SORT


In [3]:
problematic_location_names = ['sort', 'cd', 'kir', 'flawed', 'rl', 'cl']
for name in problematic_location_names: # Remove all problematic places from the DF so we can work with items we can manage
  positions = positions[~positions.location.str.contains(name, case=False)]
positions

Unnamed: 0,uuid,location,last count date,quantity,receive date,last status change date,last movement date,putaway zone,fetch zone,warehouse zone,aisle
0,A024200,01010105,2023-04-17 11:59:00,28.0,2023-05-10 12:32:00,2022-01-20 13:42:00,2022-01-20 13:42:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01
1,A017062,01010109,2023-04-17 12:00:00,2.0,2023-01-23 14:51:00,2023-01-18 12:03:00,2023-01-19 10:43:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01
2,A026194,01010109,2023-04-17 12:13:00,6.0,2023-01-08 12:12:00,2022-01-20 14:15:00,2023-04-17 12:14:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01
3,A012596,01010113,2023-04-17 12:07:00,6.0,2022-01-20 14:06:00,2022-01-20 14:06:00,2023-04-17 12:08:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01
4,A017601,01010113,2023-04-17 12:04:00,3.0,2023-04-13 06:34:00,2022-01-20 14:02:00,2022-01-20 14:02:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01
...,...,...,...,...,...,...,...,...,...,...,...
23958,A005873,FLOOR04,NaT,1.0,2023-02-08 10:36:00,2023-02-08 10:36:00,2023-02-09 11:15:00,PALLET JACK\n,DEPOSIT,PALLET JACK\n,FLOOR
23959,A005529,FLOOR04,NaT,1.0,2022-11-02 09:43:00,2022-11-02 09:43:00,2022-11-03 13:03:00,PALLET JACK\n,DEPOSIT,PALLET JACK\n,FLOOR
23960,A011879,FLOOR04,NaT,1.0,2023-02-08 10:37:00,2023-02-08 10:37:00,2023-02-09 10:55:00,PALLET JACK\n,DEPOSIT,PALLET JACK\n,FLOOR
23961,A011867,FLOOR04,2023-03-30 13:25:00,2.0,2023-02-20 14:41:00,2023-02-20 14:41:00,2023-02-21 09:50:00,PALLET JACK\n,DEPOSIT,PALLET JACK\n,FLOOR


In [4]:
# replace all instances which contain '\n' with '':
positions = positions.replace('\n', '', regex=True)
positions

Unnamed: 0,uuid,location,last count date,quantity,receive date,last status change date,last movement date,putaway zone,fetch zone,warehouse zone,aisle
0,A024200,01010105,2023-04-17 11:59:00,28.0,2023-05-10 12:32:00,2022-01-20 13:42:00,2022-01-20 13:42:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01
1,A017062,01010109,2023-04-17 12:00:00,2.0,2023-01-23 14:51:00,2023-01-18 12:03:00,2023-01-19 10:43:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01
2,A026194,01010109,2023-04-17 12:13:00,6.0,2023-01-08 12:12:00,2022-01-20 14:15:00,2023-04-17 12:14:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01
3,A012596,01010113,2023-04-17 12:07:00,6.0,2022-01-20 14:06:00,2022-01-20 14:06:00,2023-04-17 12:08:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01
4,A017601,01010113,2023-04-17 12:04:00,3.0,2023-04-13 06:34:00,2022-01-20 14:02:00,2022-01-20 14:02:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01
...,...,...,...,...,...,...,...,...,...,...,...
23958,A005873,FLOOR04,NaT,1.0,2023-02-08 10:36:00,2023-02-08 10:36:00,2023-02-09 11:15:00,PALLET JACK,DEPOSIT,PALLET JACK,FLOOR
23959,A005529,FLOOR04,NaT,1.0,2022-11-02 09:43:00,2022-11-02 09:43:00,2022-11-03 13:03:00,PALLET JACK,DEPOSIT,PALLET JACK,FLOOR
23960,A011879,FLOOR04,NaT,1.0,2023-02-08 10:37:00,2023-02-08 10:37:00,2023-02-09 10:55:00,PALLET JACK,DEPOSIT,PALLET JACK,FLOOR
23961,A011867,FLOOR04,2023-03-30 13:25:00,2.0,2023-02-20 14:41:00,2023-02-20 14:41:00,2023-02-21 09:50:00,PALLET JACK,DEPOSIT,PALLET JACK,FLOOR


In [5]:
# get the the 'location type' column from the 'old_positions_data.xlsc' file:
location_types = pd.read_excel('../data/raw/old_positions_data.xlsx', sheet_name="position", usecols=['location', 'location type'])
# uppercase all 'string' values:
location_types = location_types.applymap(lambda x: x.upper() if isinstance(x, str) else x)
location_types

Unnamed: 0,location,location type
0,09040304,FLOOR
1,09040102,FLOOR
2,09040204,FLOOR
3,09060201,FLOOR
4,09080104,FLOOR
...,...,...
25842,02090719,ORDER PICKER
25843,04110104,FLOOR
25844,40260901,REACH FORK
25845,40260901,REACH FORK


In [6]:
# check how many values are missing in the 'location type' column:
location_types['location type'].isnull().sum()

0

In [7]:
# merge both DataFrames on the 'location' column:
positions = positions.merge(location_types, on='location', how='left')
positions

Unnamed: 0,uuid,location,last count date,quantity,receive date,last status change date,last movement date,putaway zone,fetch zone,warehouse zone,aisle,location type
0,A024200,01010105,2023-04-17 11:59:00,28.0,2023-05-10 12:32:00,2022-01-20 13:42:00,2022-01-20 13:42:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,FLOOR
1,A024200,01010105,2023-04-17 11:59:00,28.0,2023-05-10 12:32:00,2022-01-20 13:42:00,2022-01-20 13:42:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,FLOOR
2,A017062,01010109,2023-04-17 12:00:00,2.0,2023-01-23 14:51:00,2023-01-18 12:03:00,2023-01-19 10:43:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,FLOOR
3,A026194,01010109,2023-04-17 12:13:00,6.0,2023-01-08 12:12:00,2022-01-20 14:15:00,2023-04-17 12:14:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,FLOOR
4,A012596,01010113,2023-04-17 12:07:00,6.0,2022-01-20 14:06:00,2022-01-20 14:06:00,2023-04-17 12:08:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,FLOOR
...,...,...,...,...,...,...,...,...,...,...,...,...
139384,A011868,FLOOR04,NaT,1.0,2022-09-28 13:47:00,2022-09-28 13:47:00,2022-10-24 13:10:00,PALLET JACK,DEPOSIT,PALLET JACK,FLOOR,PALLET JACK
139385,A011868,FLOOR04,NaT,1.0,2022-09-28 13:47:00,2022-09-28 13:47:00,2022-10-24 13:10:00,PALLET JACK,DEPOSIT,PALLET JACK,FLOOR,PALLET JACK
139386,A011868,FLOOR04,NaT,1.0,2022-09-28 13:47:00,2022-09-28 13:47:00,2022-10-24 13:10:00,PALLET JACK,DEPOSIT,PALLET JACK,FLOOR,PALLET JACK
139387,A011868,FLOOR04,NaT,1.0,2022-09-28 13:47:00,2022-09-28 13:47:00,2022-10-24 13:10:00,PALLET JACK,DEPOSIT,PALLET JACK,FLOOR,PALLET JACK


In [8]:
# show all duplicated rows
positions[positions.duplicated(keep=False)]

Unnamed: 0,uuid,location,last count date,quantity,receive date,last status change date,last movement date,putaway zone,fetch zone,warehouse zone,aisle,location type
0,A024200,01010105,2023-04-17 11:59:00,28.0,2023-05-10 12:32:00,2022-01-20 13:42:00,2022-01-20 13:42:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,FLOOR
1,A024200,01010105,2023-04-17 11:59:00,28.0,2023-05-10 12:32:00,2022-01-20 13:42:00,2022-01-20 13:42:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,FLOOR
4,A012596,01010113,2023-04-17 12:07:00,6.0,2022-01-20 14:06:00,2022-01-20 14:06:00,2023-04-17 12:08:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,FLOOR
5,A012596,01010113,2023-04-17 12:07:00,6.0,2022-01-20 14:06:00,2022-01-20 14:06:00,2023-04-17 12:08:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,FLOOR
6,A012596,01010113,2023-04-17 12:07:00,6.0,2022-01-20 14:06:00,2022-01-20 14:06:00,2023-04-17 12:08:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,FLOOR
...,...,...,...,...,...,...,...,...,...,...,...,...
139384,A011868,FLOOR04,NaT,1.0,2022-09-28 13:47:00,2022-09-28 13:47:00,2022-10-24 13:10:00,PALLET JACK,DEPOSIT,PALLET JACK,FLOOR,PALLET JACK
139385,A011868,FLOOR04,NaT,1.0,2022-09-28 13:47:00,2022-09-28 13:47:00,2022-10-24 13:10:00,PALLET JACK,DEPOSIT,PALLET JACK,FLOOR,PALLET JACK
139386,A011868,FLOOR04,NaT,1.0,2022-09-28 13:47:00,2022-09-28 13:47:00,2022-10-24 13:10:00,PALLET JACK,DEPOSIT,PALLET JACK,FLOOR,PALLET JACK
139387,A011868,FLOOR04,NaT,1.0,2022-09-28 13:47:00,2022-09-28 13:47:00,2022-10-24 13:10:00,PALLET JACK,DEPOSIT,PALLET JACK,FLOOR,PALLET JACK


In [9]:
# drop all duplicated rows:
positions = positions.drop_duplicates().reset_index(drop=True)
positions

Unnamed: 0,uuid,location,last count date,quantity,receive date,last status change date,last movement date,putaway zone,fetch zone,warehouse zone,aisle,location type
0,A024200,01010105,2023-04-17 11:59:00,28.0,2023-05-10 12:32:00,2022-01-20 13:42:00,2022-01-20 13:42:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,FLOOR
1,A017062,01010109,2023-04-17 12:00:00,2.0,2023-01-23 14:51:00,2023-01-18 12:03:00,2023-01-19 10:43:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,FLOOR
2,A026194,01010109,2023-04-17 12:13:00,6.0,2023-01-08 12:12:00,2022-01-20 14:15:00,2023-04-17 12:14:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,FLOOR
3,A012596,01010113,2023-04-17 12:07:00,6.0,2022-01-20 14:06:00,2022-01-20 14:06:00,2023-04-17 12:08:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,FLOOR
4,A017601,01010113,2023-04-17 12:04:00,3.0,2023-04-13 06:34:00,2022-01-20 14:02:00,2022-01-20 14:02:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,FLOOR
...,...,...,...,...,...,...,...,...,...,...,...,...
23812,A005873,FLOOR04,NaT,1.0,2023-02-08 10:36:00,2023-02-08 10:36:00,2023-02-09 11:15:00,PALLET JACK,DEPOSIT,PALLET JACK,FLOOR,PALLET JACK
23813,A005529,FLOOR04,NaT,1.0,2022-11-02 09:43:00,2022-11-02 09:43:00,2022-11-03 13:03:00,PALLET JACK,DEPOSIT,PALLET JACK,FLOOR,PALLET JACK
23814,A011879,FLOOR04,NaT,1.0,2023-02-08 10:37:00,2023-02-08 10:37:00,2023-02-09 10:55:00,PALLET JACK,DEPOSIT,PALLET JACK,FLOOR,PALLET JACK
23815,A011867,FLOOR04,2023-03-30 13:25:00,2.0,2023-02-20 14:41:00,2023-02-20 14:41:00,2023-02-21 09:50:00,PALLET JACK,DEPOSIT,PALLET JACK,FLOOR,PALLET JACK


In [10]:
# how many missing values are in each column:
positions.isnull().sum()

uuid                          0
location                      0
last count date            2363
quantity                      0
receive date                  0
last status change date       0
last movement date            0
putaway zone                  0
fetch zone                    0
warehouse zone                0
aisle                         0
location type               763
dtype: int64

In [11]:
# show which rows have missing values in the 'location type' column:
positions[positions['location type'].isnull()]

Unnamed: 0,uuid,location,last count date,quantity,receive date,last status change date,last movement date,putaway zone,fetch zone,warehouse zone,aisle,location type
97,A016421,01010719,NaT,2.0,2023-04-09 09:18:00,2023-04-09 09:18:00,2023-04-16 10:54:00,OPT-B-01-04,OPT01,PASSAGE01-04,01,
426,A004630,01020501,NaT,2.0,2023-04-18 11:19:00,2023-04-18 11:19:00,2023-04-20 11:16:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,
432,A022504,01020525,2023-04-18 11:49:00,1.0,2023-04-18 11:49:00,2023-04-18 11:49:00,2023-04-18 11:49:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,
435,A022424,01020605,2023-04-18 13:01:00,5.0,2023-05-17 09:58:00,2023-04-09 09:17:00,2023-04-16 10:55:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,
439,A005189,01020621,NaT,8.0,2023-05-01 16:01:00,2023-05-01 16:01:00,2023-05-03 11:49:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,
...,...,...,...,...,...,...,...,...,...,...,...,...
23505,A005110,60130303,NaT,2.0,2023-04-18 10:34:00,2023-04-19 12:29:00,2023-04-19 12:29:00,PALLET-B,PALLET,PASSAGE60,60,
23579,A005514,60200303,NaT,1.0,2023-04-18 09:42:00,2023-04-18 09:42:00,2023-04-19 14:29:00,PALLET-B,PALLET,RL-STORGE,60,
23580,A010697,60210102,NaT,7.0,2022-11-08 15:01:00,2022-11-08 15:01:00,2023-04-27 14:24:00,PALLET JACK,PALLET JACK,PASSAGE60,60,
23597,A010729,60220303,NaT,1.0,2023-04-18 10:07:00,2023-04-18 10:07:00,2023-04-19 12:12:00,PALLET-B,PALLET,RL-STORGE,60,


In [12]:
# fill all NaN values in the 'location type' column with 'unknown':
positions['location type'] = positions['location type'].fillna('unknown')

# show all different groupings of 'putaway zone', 'fetch zone' and 'location type', save as a variable:
zone_groupings = positions.groupby(['putaway zone', 'fetch zone', 'location type']).size().reset_index().rename(columns={0:'count'})
zone_groupings

Unnamed: 0,putaway zone,fetch zone,location type,count
0,BITNOT,BITNOT,FLOOR,53
1,BITNOT,OPT07-LOW,FLOOR,6
2,BITNOT,OPT09,ORDER PICKER,2
3,CANTI-HEAVY,OPT05,ORDER PICKER,136
4,CANTI-HEAVY,OPT06-LOW,FLOOR,46
...,...,...,...,...
95,PALLET-A,PALLET JACK,unknown,6
96,PALLET-B,PALLET,REACH FORK,362
97,PALLET-B,PALLET,unknown,19
98,PALLET-C,PALLET,REACH FORK,829


In [13]:
# filter out from zone_groupings all 'putaway zone's which don't have at least one row with 'location type' == 'unknown':
zones_with_unkown_values = zone_groupings[zone_groupings['location type'] == 'unknown']['putaway zone'].unique()
# from the zone_groupings DataFrame, leave only the rows where the 'putaway zone' is in the zones_with_unkown_values list:
zone_groupings = zone_groupings[zone_groupings['putaway zone'].isin(zones_with_unkown_values)]
# sort by 'putaway zone' and 'fetch zone':
zone_groupings = zone_groupings.sort_values(by=['putaway zone', 'fetch zone'])
zone_groupings

Unnamed: 0,putaway zone,fetch zone,location type,count
5,CANTI-LIGHT,OPT05,ORDER PICKER,59
6,CANTI-LIGHT,OPT05,unknown,1
7,CANTI-LIGHT,OPT06-LOW,FLOOR,36
8,CANTI-LIGHT,OPT06-LOW,unknown,1
10,HIVE,OPT07,FLOOR,1
...,...,...,...,...
95,PALLET-A,PALLET JACK,unknown,6
96,PALLET-B,PALLET,REACH FORK,362
97,PALLET-B,PALLET,unknown,19
98,PALLET-C,PALLET,REACH FORK,829


In [14]:
# in zone_groupings, for every group of 'putaway zone' and 'fetch zone', apply the following function:
def fill_unknowns(group):
    # if there are two rows in the group, and one of them has 'location type' == 'unknown':
    if len(group) == 2 and 'unknown' in group['location type'].values:
        # remove the row with 'location type' == 'unknown':
        group = group[group['location type'] != 'unknown']
    # return the group:
    return group

# apply the fill_unknowns function to the zone_groupings DataFrame:
zone_groupings = zone_groupings.groupby(['putaway zone', 'fetch zone']).apply(fill_unknowns)[['location type']]
zone_groupings.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,location type
putaway zone,fetch zone,Unnamed: 2_level_1,Unnamed: 3_level_1
CANTI-LIGHT,OPT05,5,ORDER PICKER
CANTI-LIGHT,OPT06-LOW,7,FLOOR
HIVE,OPT07,10,FLOOR
HIVE,OPT07-LOW,11,FLOOR
HIVE,OPT09-LOW,13,FLOOR


In [15]:
# take zone_groupings and take the index columns and make them into columns:
zone_groupings = zone_groupings.reset_index()
zone_groupings.head()

Unnamed: 0,putaway zone,fetch zone,level_2,location type
0,CANTI-LIGHT,OPT05,5,ORDER PICKER
1,CANTI-LIGHT,OPT06-LOW,7,FLOOR
2,HIVE,OPT07,10,FLOOR
3,HIVE,OPT07-LOW,11,FLOOR
4,HIVE,OPT09-LOW,13,FLOOR


In [16]:
# drop the 'level_2' column:
zone_groupings = zone_groupings.drop(columns=['level_2'])
zone_groupings

Unnamed: 0,putaway zone,fetch zone,location type
0,CANTI-LIGHT,OPT05,ORDER PICKER
1,CANTI-LIGHT,OPT06-LOW,FLOOR
2,HIVE,OPT07,FLOOR
3,HIVE,OPT07-LOW,FLOOR
4,HIVE,OPT09-LOW,FLOOR
5,MERAKEZET,OPT06-LOW,FLOOR
6,OPT-A-01-04,OPT01-04-LOW,FLOOR
7,OPT-A-05-07,OPT05,ORDER PICKER
8,OPT-A-05-07,OPT06,ORDER PICKER
9,OPT-A-05-07,OPT06-LOW,FLOOR


In [17]:
# from zone_groupings, take only the rows where 'location type' == 'unknown':
unknown_locations = zone_groupings[zone_groupings['location type'] == 'unknown'][['putaway zone', 'fetch zone']]
unknown_locations

Unnamed: 0,putaway zone,fetch zone
11,OPT-A-05-07,OPT06-LOW
14,OPT-A-07-11,OPT07


In [18]:
# take the values from both columns of unknown_locations and unite them into one string:
unknown_locations['zones'] = unknown_locations.apply(lambda x: '_'.join(x), axis=1)
unknown_locations

Unnamed: 0,putaway zone,fetch zone,zones
11,OPT-A-05-07,OPT06-LOW,OPT-A-05-07_OPT06-LOW
14,OPT-A-07-11,OPT07,OPT-A-07-11_OPT07


In [19]:
zone_groupings['zones'] = zone_groupings['putaway zone'] + '_' + zone_groupings['fetch zone']
# from zone_groupings, filter out rows which have 'zones' values which can be found in unknown_locations:
zone_groupings = zone_groupings[~zone_groupings['zones'].isin(unknown_locations['zones'])]
zone_groupings

Unnamed: 0,putaway zone,fetch zone,location type,zones
0,CANTI-LIGHT,OPT05,ORDER PICKER,CANTI-LIGHT_OPT05
1,CANTI-LIGHT,OPT06-LOW,FLOOR,CANTI-LIGHT_OPT06-LOW
2,HIVE,OPT07,FLOOR,HIVE_OPT07
3,HIVE,OPT07-LOW,FLOOR,HIVE_OPT07-LOW
4,HIVE,OPT09-LOW,FLOOR,HIVE_OPT09-LOW
5,MERAKEZET,OPT06-LOW,FLOOR,MERAKEZET_OPT06-LOW
6,OPT-A-01-04,OPT01-04-LOW,FLOOR,OPT-A-01-04_OPT01-04-LOW
7,OPT-A-05-07,OPT05,ORDER PICKER,OPT-A-05-07_OPT05
8,OPT-A-05-07,OPT06,ORDER PICKER,OPT-A-05-07_OPT06
15,OPT-A-07-11,OPT07-LOW,FLOOR,OPT-A-07-11_OPT07-LOW


In [20]:
# in positions DataFrame, for every row, if the 'location type' is 'unknown', replace it with NaN:
import numpy as np
positions['location type'] = positions['location type'].replace('unknown', np.nan)

In [21]:
# merge the zone_groupings DataFrame with the positions DataFrame on the 'putaway zone' and 'fetch zone' columns:
# where location type is NaN, replace it with the value from the zone_groupings DataFrame:
positions = positions.merge(zone_groupings.drop(labels='zones', axis=1), on=['putaway zone', 'fetch zone'], how='left')
positions['location type_x'] = positions['location type_x'].fillna(positions['location type_y'])
# drop the 'location type_y' column:
positions = positions.drop(columns=['location type_y'])
# rename the 'location type_x' column to 'location type':
positions = positions.rename(columns={'location type_x': 'location type'})
positions

Unnamed: 0,uuid,location,last count date,quantity,receive date,last status change date,last movement date,putaway zone,fetch zone,warehouse zone,aisle,location type
0,A024200,01010105,2023-04-17 11:59:00,28.0,2023-05-10 12:32:00,2022-01-20 13:42:00,2022-01-20 13:42:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,FLOOR
1,A017062,01010109,2023-04-17 12:00:00,2.0,2023-01-23 14:51:00,2023-01-18 12:03:00,2023-01-19 10:43:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,FLOOR
2,A026194,01010109,2023-04-17 12:13:00,6.0,2023-01-08 12:12:00,2022-01-20 14:15:00,2023-04-17 12:14:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,FLOOR
3,A012596,01010113,2023-04-17 12:07:00,6.0,2022-01-20 14:06:00,2022-01-20 14:06:00,2023-04-17 12:08:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,FLOOR
4,A017601,01010113,2023-04-17 12:04:00,3.0,2023-04-13 06:34:00,2022-01-20 14:02:00,2022-01-20 14:02:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,FLOOR
...,...,...,...,...,...,...,...,...,...,...,...,...
23812,A005873,FLOOR04,NaT,1.0,2023-02-08 10:36:00,2023-02-08 10:36:00,2023-02-09 11:15:00,PALLET JACK,DEPOSIT,PALLET JACK,FLOOR,PALLET JACK
23813,A005529,FLOOR04,NaT,1.0,2022-11-02 09:43:00,2022-11-02 09:43:00,2022-11-03 13:03:00,PALLET JACK,DEPOSIT,PALLET JACK,FLOOR,PALLET JACK
23814,A011879,FLOOR04,NaT,1.0,2023-02-08 10:37:00,2023-02-08 10:37:00,2023-02-09 10:55:00,PALLET JACK,DEPOSIT,PALLET JACK,FLOOR,PALLET JACK
23815,A011867,FLOOR04,2023-03-30 13:25:00,2.0,2023-02-20 14:41:00,2023-02-20 14:41:00,2023-02-21 09:50:00,PALLET JACK,DEPOSIT,PALLET JACK,FLOOR,PALLET JACK


In [22]:
# show which rows have missing values in the 'location type' column:
positions[positions['location type'].isnull()]

Unnamed: 0,uuid,location,last count date,quantity,receive date,last status change date,last movement date,putaway zone,fetch zone,warehouse zone,aisle,location type
16181,A016392,5130303,NaT,7.0,2023-04-18 15:06:00,2023-01-31 09:06:00,2023-04-16 13:08:00,OPT-A-05-07,OPT06-LOW,PASSAGE05,5,
16689,A028740,5210301,2022-10-26 07:43:00,84.0,2023-01-03 06:34:00,2022-10-12 09:51:00,2023-04-04 10:41:00,OPT-A-05-07,OPT06-LOW,PASSAGE05,5,
18956,A009592,7020703,2023-04-24 06:30:00,1.0,2023-04-24 06:30:00,2023-04-24 06:30:00,2023-04-24 06:30:00,OPT-A-07-11,OPT07,PASSAGE07,7,
18957,A009593,7020703,2023-04-24 06:28:00,1.0,2023-04-24 06:28:00,2023-04-24 06:28:00,2023-04-24 06:28:00,OPT-A-07-11,OPT07,PASSAGE07,7,
18958,A005096,7020703,2023-04-24 06:32:00,4.0,2023-04-24 06:32:00,2023-04-24 06:32:00,2023-04-24 06:32:00,OPT-A-07-11,OPT07,PASSAGE07,7,
19042,A005076,7040501,2023-04-24 07:06:00,1.0,2023-04-24 07:06:00,2023-04-24 07:06:00,2023-04-24 07:06:00,OPT-A-07-11,OPT07,PASSAGE07,7,
19220,A028205,7080302,NaT,4.0,2023-05-23 07:36:00,2023-05-16 14:56:00,2023-05-17 06:33:00,OPT-A-07-11,OPT07,PASSAGE07,7,
19221,A005686,7080303,2023-04-27 06:25:00,3.0,2023-01-24 11:05:00,2022-08-23 14:45:00,2023-04-04 14:26:00,OPT-A-07-11,OPT07,PASSAGE07,7,
19229,A019377,7080703,2023-04-27 06:18:00,3.0,2023-05-15 13:51:00,2023-04-27 06:18:00,2023-04-27 06:18:00,OPT-A-07-11,OPT07,PASSAGE07,7,
19563,A028374,7150501,2023-05-08 06:29:00,3.0,2023-03-29 09:53:00,2022-12-22 10:37:00,2023-04-04 10:31:00,OPT-A-07-11,OPT07,PASSAGE07,7,


In [23]:
# for every row in positions, if the 'location type' is NaN, check the previous and next rows:
# if the 'location type' in the previous row is the same as in the next row, replace the NaN with the value from the previous row:
for index, row in positions.iterrows():
    if pd.isnull(row['location type']):
        if positions.iloc[index-1]['location type'] == positions.iloc[index+1]['location type']:
            positions.at[index, 'location type'] = positions.iloc[index-1]['location type']
positions

Unnamed: 0,uuid,location,last count date,quantity,receive date,last status change date,last movement date,putaway zone,fetch zone,warehouse zone,aisle,location type
0,A024200,01010105,2023-04-17 11:59:00,28.0,2023-05-10 12:32:00,2022-01-20 13:42:00,2022-01-20 13:42:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,FLOOR
1,A017062,01010109,2023-04-17 12:00:00,2.0,2023-01-23 14:51:00,2023-01-18 12:03:00,2023-01-19 10:43:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,FLOOR
2,A026194,01010109,2023-04-17 12:13:00,6.0,2023-01-08 12:12:00,2022-01-20 14:15:00,2023-04-17 12:14:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,FLOOR
3,A012596,01010113,2023-04-17 12:07:00,6.0,2022-01-20 14:06:00,2022-01-20 14:06:00,2023-04-17 12:08:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,FLOOR
4,A017601,01010113,2023-04-17 12:04:00,3.0,2023-04-13 06:34:00,2022-01-20 14:02:00,2022-01-20 14:02:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,FLOOR
...,...,...,...,...,...,...,...,...,...,...,...,...
23812,A005873,FLOOR04,NaT,1.0,2023-02-08 10:36:00,2023-02-08 10:36:00,2023-02-09 11:15:00,PALLET JACK,DEPOSIT,PALLET JACK,FLOOR,PALLET JACK
23813,A005529,FLOOR04,NaT,1.0,2022-11-02 09:43:00,2022-11-02 09:43:00,2022-11-03 13:03:00,PALLET JACK,DEPOSIT,PALLET JACK,FLOOR,PALLET JACK
23814,A011879,FLOOR04,NaT,1.0,2023-02-08 10:37:00,2023-02-08 10:37:00,2023-02-09 10:55:00,PALLET JACK,DEPOSIT,PALLET JACK,FLOOR,PALLET JACK
23815,A011867,FLOOR04,2023-03-30 13:25:00,2.0,2023-02-20 14:41:00,2023-02-20 14:41:00,2023-02-21 09:50:00,PALLET JACK,DEPOSIT,PALLET JACK,FLOOR,PALLET JACK


In [24]:
# show which rows have missing values in the 'location type' column:
positions[positions['location type'].isnull()]

Unnamed: 0,uuid,location,last count date,quantity,receive date,last status change date,last movement date,putaway zone,fetch zone,warehouse zone,aisle,location type
18956,A009592,7020703,2023-04-24 06:30:00,1.0,2023-04-24 06:30:00,2023-04-24 06:30:00,2023-04-24 06:30:00,OPT-A-07-11,OPT07,PASSAGE07,7,
18957,A009593,7020703,2023-04-24 06:28:00,1.0,2023-04-24 06:28:00,2023-04-24 06:28:00,2023-04-24 06:28:00,OPT-A-07-11,OPT07,PASSAGE07,7,
18958,A005096,7020703,2023-04-24 06:32:00,4.0,2023-04-24 06:32:00,2023-04-24 06:32:00,2023-04-24 06:32:00,OPT-A-07-11,OPT07,PASSAGE07,7,
19220,A028205,7080302,NaT,4.0,2023-05-23 07:36:00,2023-05-16 14:56:00,2023-05-17 06:33:00,OPT-A-07-11,OPT07,PASSAGE07,7,
19221,A005686,7080303,2023-04-27 06:25:00,3.0,2023-01-24 11:05:00,2022-08-23 14:45:00,2023-04-04 14:26:00,OPT-A-07-11,OPT07,PASSAGE07,7,
19563,A028374,7150501,2023-05-08 06:29:00,3.0,2023-03-29 09:53:00,2022-12-22 10:37:00,2023-04-04 10:31:00,OPT-A-07-11,OPT07,PASSAGE07,7,
19565,A014089,7150503,NaT,2.0,2023-05-16 13:29:00,2023-05-16 13:29:00,2023-05-16 15:40:00,OPT-A-07-11,OPT07,PASSAGE07,7,
19566,A029461,7150503,2023-05-08 06:30:00,4.0,2023-02-16 06:25:00,2023-01-01 06:24:00,2023-04-04 10:30:00,OPT-A-07-11,OPT07,PASSAGE07,7,


In [25]:
# for every row in positions, if the 'location type' is NaN, print the row, and print the 'location type' values of the previous and next rows:
for index, row in positions.iterrows():
    if pd.isnull(row['location type']):
        print(index-1, positions.iloc[index-1]['location'], positions.iloc[index-1]['location type'])
        print(index, row['location'])
        print(index+1, positions.iloc[index+1]['location'], positions.iloc[index+1]['location type'])
        print('------------------')


18955 07020702 ORDER PICKER
18956 07020703
18957 07020703 nan
------------------
18956 07020703 nan
18957 07020703
18958 07020703 nan
------------------
18957 07020703 nan
18958 07020703
18959 07020704 ORDER PICKER
------------------
19219 07080301 ORDER PICKER
19220 07080302
19221 07080303 nan
------------------
19220 07080302 nan
19221 07080303
19222 07080304 ORDER PICKER
------------------
19562 07150404 FLOOR
19563 07150501
19564 07150502 ORDER PICKER
------------------
19564 07150502 ORDER PICKER
19565 07150503
19566 07150503 nan
------------------
19565 07150503 nan
19566 07150503
19567 07150504 ORDER PICKER
------------------


We can see the all of the rows which have still have a missing value in the `position` column, are mostly consecutive rows.

We can see that all appearances of this show a phenomenon of having the same `loclevel` values (except for `83505`, which has a previous row which isn't on the same shelf). As such, we can decide that the `position` column can be filled in by using the `bfill()` method.

In [26]:
# show which rows have missing values in the 'location type' column:
positions[positions['location type'].isnull()]

Unnamed: 0,uuid,location,last count date,quantity,receive date,last status change date,last movement date,putaway zone,fetch zone,warehouse zone,aisle,location type
18956,A009592,7020703,2023-04-24 06:30:00,1.0,2023-04-24 06:30:00,2023-04-24 06:30:00,2023-04-24 06:30:00,OPT-A-07-11,OPT07,PASSAGE07,7,
18957,A009593,7020703,2023-04-24 06:28:00,1.0,2023-04-24 06:28:00,2023-04-24 06:28:00,2023-04-24 06:28:00,OPT-A-07-11,OPT07,PASSAGE07,7,
18958,A005096,7020703,2023-04-24 06:32:00,4.0,2023-04-24 06:32:00,2023-04-24 06:32:00,2023-04-24 06:32:00,OPT-A-07-11,OPT07,PASSAGE07,7,
19220,A028205,7080302,NaT,4.0,2023-05-23 07:36:00,2023-05-16 14:56:00,2023-05-17 06:33:00,OPT-A-07-11,OPT07,PASSAGE07,7,
19221,A005686,7080303,2023-04-27 06:25:00,3.0,2023-01-24 11:05:00,2022-08-23 14:45:00,2023-04-04 14:26:00,OPT-A-07-11,OPT07,PASSAGE07,7,
19563,A028374,7150501,2023-05-08 06:29:00,3.0,2023-03-29 09:53:00,2022-12-22 10:37:00,2023-04-04 10:31:00,OPT-A-07-11,OPT07,PASSAGE07,7,
19565,A014089,7150503,NaT,2.0,2023-05-16 13:29:00,2023-05-16 13:29:00,2023-05-16 15:40:00,OPT-A-07-11,OPT07,PASSAGE07,7,
19566,A029461,7150503,2023-05-08 06:30:00,4.0,2023-02-16 06:25:00,2023-01-01 06:24:00,2023-04-04 10:30:00,OPT-A-07-11,OPT07,PASSAGE07,7,


In [27]:
# for every row with 'location type' == 'unknown', bfill it:
positions['location type'] = positions['location type'].replace('unknown', np.nan).bfill()

In [28]:
# show all rows in which 'quantity' isn't a full number:
positions[positions['quantity'] % 1 != 0]

Unnamed: 0,uuid,location,last count date,quantity,receive date,last status change date,last movement date,putaway zone,fetch zone,warehouse zone,aisle,location type
13257,A000116,4051102,2023-05-02 07:19:00,37.5,2022-08-28 15:05:00,2022-08-28 15:05:00,2022-08-28 15:09:00,OPT-B-01-04,OPT04,PASSAGE01-04,4,ORDER PICKER
15387,A000057,5010901,2022-11-07 11:13:00,7.7,2023-02-19 10:45:00,2022-11-07 11:13:00,2022-11-07 11:13:00,CANTI-LIGHT,OPT05,PASSAGE05,5,ORDER PICKER
15490,A000023,5030202,NaT,3.1,2023-02-27 10:32:00,2023-02-27 10:32:00,2023-03-07 09:08:00,CANTI-HEAVY,OPT06-LOW,PASSAGE05,5,FLOOR
15515,A000036,5030601,2022-11-07 13:11:00,3.6,2023-02-01 09:34:00,2022-11-07 13:11:00,2022-11-07 13:11:00,CANTI-HEAVY,OPT05,PASSAGE05,5,ORDER PICKER
15846,A000056,5073101,2023-05-09 07:48:00,2.6,2023-02-19 10:43:00,2023-02-19 10:43:00,2023-02-21 13:07:00,OPT-C-05-07,OPT05,PASSAGE05,5,ORDER PICKER


In [29]:
# round up all values in the 'quantity' column:
positions['quantity'] = positions['quantity'].apply(np.ceil)
positions[positions['quantity'] % 1 != 0]

Unnamed: 0,uuid,location,last count date,quantity,receive date,last status change date,last movement date,putaway zone,fetch zone,warehouse zone,aisle,location type


In [30]:
# turn the 'quantity' column into integers:
positions['quantity'] = positions['quantity'].astype(int)

In [31]:
positions = positions.rename(columns={'location type': 
'fetch_tool'})
# replace whitespaces with underscores in all column names:
positions.columns = positions.columns.str.replace(' ', '_')
positions['location'] = np.where(
    positions['location'].str.contains('FLOOR'), positions['location'], positions['location'].str[:-2])
# uppercase all values within the table:
positions = positions.apply(lambda x: x.str.upper() if x.dtype == "string" else x)
# within the table, replace all whitespaces with underscores:
positions = positions.apply(lambda x: x.str.replace(' ', '_') if x.dtype == "string" else x)
positions

Unnamed: 0,uuid,location,last_count_date,quantity,receive_date,last_status_change_date,last_movement_date,putaway_zone,fetch_zone,warehouse_zone,aisle,fetch_tool
0,A024200,010101,2023-04-17 11:59:00,28,2023-05-10 12:32:00,2022-01-20 13:42:00,2022-01-20 13:42:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,FLOOR
1,A017062,010101,2023-04-17 12:00:00,2,2023-01-23 14:51:00,2023-01-18 12:03:00,2023-01-19 10:43:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,FLOOR
2,A026194,010101,2023-04-17 12:13:00,6,2023-01-08 12:12:00,2022-01-20 14:15:00,2023-04-17 12:14:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,FLOOR
3,A012596,010101,2023-04-17 12:07:00,6,2022-01-20 14:06:00,2022-01-20 14:06:00,2023-04-17 12:08:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,FLOOR
4,A017601,010101,2023-04-17 12:04:00,3,2023-04-13 06:34:00,2022-01-20 14:02:00,2022-01-20 14:02:00,OPT-A-01-04,OPT01-04-LOW,PASSAGE01-04,01,FLOOR
...,...,...,...,...,...,...,...,...,...,...,...,...
23812,A005873,FLOOR04,NaT,1,2023-02-08 10:36:00,2023-02-08 10:36:00,2023-02-09 11:15:00,PALLET JACK,DEPOSIT,PALLET JACK,FLOOR,PALLET JACK
23813,A005529,FLOOR04,NaT,1,2022-11-02 09:43:00,2022-11-02 09:43:00,2022-11-03 13:03:00,PALLET JACK,DEPOSIT,PALLET JACK,FLOOR,PALLET JACK
23814,A011879,FLOOR04,NaT,1,2023-02-08 10:37:00,2023-02-08 10:37:00,2023-02-09 10:55:00,PALLET JACK,DEPOSIT,PALLET JACK,FLOOR,PALLET JACK
23815,A011867,FLOOR04,2023-03-30 13:25:00,2,2023-02-20 14:41:00,2023-02-20 14:41:00,2023-02-21 09:50:00,PALLET JACK,DEPOSIT,PALLET JACK,FLOOR,PALLET JACK


Export the positions.csv for inner data usage:

In [32]:
# export as a csv file:
import csv
positions.to_csv('../data/processed/positions.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)