# OptimizingDelivery Import and Preparation 

## Import libraries

In [1]:
import os 
import pandas as pd

## Import data

In [2]:
# Restore the two directory paths from IPython's %store persistence.
# optimizingdelivery_source_dir is where the raw CSV files are read from and
# optimizingdelivery_data_dir is where the prepared parquet files are written.
# NOTE(review): both must have been saved with %store beforehand, presumably by
# a setup notebook; confirm, otherwise a fresh kernel cannot run this notebook.
%store -r optimizingdelivery_source_dir
%store -r optimizingdelivery_data_dir

In [3]:
# Source CSV files, listed in the same order as the names they are unpacked into below
file_names = [
    'dim_customers.csv',
    'dim_date.csv',
    'dim_products.csv',
    'dim_targets_orders.csv',
    'fact_order_lines.csv',
    'fact_orders_aggregate.csv',
]

# Load every file from the source directory, preserving the list order
og_dataframes = []
for csv_name in file_names:
    csv_path = os.path.join(optimizingdelivery_source_dir, csv_name)
    og_dataframes.append(pd.read_csv(csv_path))

# Give each loaded frame its own name (positional, so order must match file_names)
(customers, date, products,
 target_orders, order_lines, orders_aggregate) = og_dataframes

In [4]:
# Print the (rows, columns) shape of each loaded frame, in load order
for loaded_frame in (customers, date, products, target_orders, order_lines, orders_aggregate):
    print(loaded_frame.shape)

(35, 3)
(183, 3)
(18, 3)
(35, 4)
(57096, 11)
(31729, 6)


## Prepare data

### Duplicates and NaNs

In [5]:
# Parallel lists (same order) of frame names and frames, iterated together by later cells.
# NOTE: `dataframes` stores references to the objects bound at this point. In-place
# column edits made later are visible through this list, but rebinding a name
# (e.g. `date = date.sort_values(...)`) creates a new object the list does not see.
dataframe_names = [
    'customers',
    'date',
    'products',
    'target_orders',
    'order_lines',
    'orders_aggregate',
]
dataframes = [
    customers,
    date,
    products,
    target_orders,
    order_lines,
    orders_aggregate,
]

In [6]:
## Report fully duplicated rows in each dataframe

# Pair every frame name with the rows pandas flags as repeats of an earlier row
duplicate_candidates = (
    (frame_name, frame.loc[frame.duplicated()])
    for frame_name, frame in zip(dataframe_names, dataframes)
)

# Keep only the frames that actually contain duplicates
all_duplicate_rows = [
    (frame_name, repeated_rows)
    for frame_name, repeated_rows in duplicate_candidates
    if not repeated_rows.empty
]

# Print the offending rows per frame; prints nothing when every frame is clean
for frame_name, repeated_rows in all_duplicate_rows:
    print(f"Duplicate rows in {frame_name}:")
    print(repeated_rows)

In [7]:
## Report rows containing missing values in each dataframe

# Pair every frame name with the rows that have a NaN in at least one column
nan_candidates = (
    (frame_name, frame.loc[frame.isna().any(axis='columns')])
    for frame_name, frame in zip(dataframe_names, dataframes)
)

# Keep only the frames that actually contain such rows
all_nan_rows = [
    (frame_name, incomplete_rows)
    for frame_name, incomplete_rows in nan_candidates
    if not incomplete_rows.empty
]

# Print the offending rows per frame; prints nothing when no values are missing
for frame_name, incomplete_rows in all_nan_rows:
    print(f"Rows with NaN values in {frame_name}:")
    print(incomplete_rows)

### Refine datatypes

In [8]:
# List the frame names; the cells below inspect the dtypes of each one in this order
dataframe_names

['customers',
 'date',
 'products',
 'target_orders',
 'order_lines',
 'orders_aggregate']

In [9]:
# Column dtypes of customers as inferred by read_csv (no conversion is applied to this frame)
customers.dtypes

customer_id       int64
customer_name    object
city             object
dtype: object

In [10]:
# Column dtypes of date as inferred by read_csv, before the conversion in the next cell
date.dtypes

date       object
mmm_yy     object
week_no    object
dtype: object

In [11]:
# Parse the day / abbreviated-month / two-digit-year strings into datetimes
parsed_dates = pd.to_datetime(date['date'], format='%d-%b-%y')
date['date'] = parsed_dates

# Derive integer calendar fields from the parsed dates
date['month'] = parsed_dates.dt.month.astype('int64')
date['year'] = parsed_dates.dt.year.astype('int64')

# Pull the digits out of the week label and store them as an integer
week_digits = date['week_no'].str.extract(r'(\d+)')
date['week_number'] = week_digits.astype('int64')

# Order rows by date (ascending), renumber the index, then drop the text
# columns that the derived fields replace
date = (
    date
    .sort_values('date')
    .reset_index(drop=True)
    .drop(columns=['mmm_yy', 'week_no'])
)

In [12]:
# Re-check date dtypes after the conversion: a datetime column plus the derived integer fields
date.dtypes

date           datetime64[ns]
month                   int64
year                    int64
week_number             int64
dtype: object

In [13]:
# Column dtypes of products as inferred by read_csv (no conversion is applied to this frame)
products.dtypes

product_name    object
product_id       int64
category        object
dtype: object

In [14]:
# Column dtypes of target_orders as inferred by read_csv (no conversion is applied to this frame)
target_orders.dtypes

customer_id       int64
ontime_target%    int64
infull_target%    int64
otif_target%      int64
dtype: object

In [15]:
# Column dtypes of order_lines as inferred by read_csv, before the conversion in the next cell
order_lines.dtypes

order_id                object
order_placement_date    object
customer_id              int64
product_id               int64
order_qty                int64
agreed_delivery_date    object
actual_delivery_date    object
delivery_qty             int64
In Full                  int64
On Time                  int64
On Time In Full          int64
dtype: object

In [16]:
# Each date column paired with the weekday-name column derived from it
date_and_day_columns = (
    ('order_placement_date', 'order_placement_day'),
    ('agreed_delivery_date', 'agreed_delivery_day'),
    ('actual_delivery_date', 'actual_delivery_day'),
)

# Parse the long-form strings (weekday name, month name, day, four-digit year)
long_date_format = '%A, %B %d, %Y'
for date_column, _ in date_and_day_columns:
    order_lines[date_column] = pd.to_datetime(order_lines[date_column], format=long_date_format)

# Add the weekday name of every parsed date as a separate column
for date_column, day_column in date_and_day_columns:
    order_lines[day_column] = order_lines[date_column].dt.day_name()

# Order rows by placement date (ascending) and renumber the index
order_lines = order_lines.sort_values('order_placement_date').reset_index(drop=True)

# Final column layout: each weekday column sits directly after its date column
ordered_columns = [
    'order_id',
    'order_placement_date', 'order_placement_day',
    'customer_id', 'product_id', 'order_qty',
    'agreed_delivery_date', 'agreed_delivery_day',
    'actual_delivery_date', 'actual_delivery_day',
    'delivery_qty', 'In Full', 'On Time', 'On Time In Full',
]
order_lines = order_lines[ordered_columns].copy()

In [17]:
# Re-check order_lines dtypes after the conversion: datetime columns plus the added weekday columns
order_lines.dtypes

order_id                        object
order_placement_date    datetime64[ns]
order_placement_day             object
customer_id                      int64
product_id                       int64
order_qty                        int64
agreed_delivery_date    datetime64[ns]
agreed_delivery_day             object
actual_delivery_date    datetime64[ns]
actual_delivery_day             object
delivery_qty                     int64
In Full                          int64
On Time                          int64
On Time In Full                  int64
dtype: object

In [18]:
# Column dtypes of orders_aggregate as inferred by read_csv, before the conversion in the next cell
orders_aggregate.dtypes

order_id                object
customer_id              int64
order_placement_date    object
on_time                  int64
in_full                  int64
otif                     int64
dtype: object

In [19]:
# Parse the day / abbreviated-month / two-digit-year strings into datetimes (in place)
placement_dates = pd.to_datetime(orders_aggregate['order_placement_date'], format='%d-%b-%y')
orders_aggregate['order_placement_date'] = placement_dates


In [20]:
# Re-check orders_aggregate dtypes after the conversion: order_placement_date is now datetime
orders_aggregate.dtypes

order_id                        object
customer_id                      int64
order_placement_date    datetime64[ns]
on_time                          int64
in_full                          int64
otif                             int64
dtype: object

### Save prepared data

In [21]:
# Save prepared data to parquet files

# Build the name -> frame mapping from the CURRENT variables rather than reusing
# the `dataframes` list. That list was created before the datatype cells ran, and
# `date` and `order_lines` were rebound to new objects there (sort, column drop,
# column reorder). The list therefore still references the earlier objects, and
# saving from it would write an unsorted `date` that still contains `mmm_yy` and
# `week_no`, and an unsorted, unreordered `order_lines`.
prepared_dataframes = {
    'customers': customers,
    'date': date,
    'products': products,
    'target_orders': target_orders,
    'order_lines': order_lines,
    'orders_aggregate': orders_aggregate,
}

# Write one parquet file per frame into the data directory, named after the frame
for name, dataframe in prepared_dataframes.items():

    # Create file paths
    file_path = os.path.join(optimizingdelivery_data_dir, f"{name}.parquet")

    # Save the dataframes
    dataframe.to_parquet(file_path)