In [None]:
# 9/17/24

# Let me begin by exploring the data.


In [1]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import csv
import json
import os
from pandas import DataFrame, Series


In [11]:
# Load the Historical Product Demand CSV (path is relative to the notebook's working directory).
demand_csv = 'Historical Product Demand.csv'
df = pd.read_csv(demand_csv)

# Preview the first 5 rows.
df.head(5)


Unnamed: 0,Product_Code,Warehouse,Product_Category,Date,Order_Demand
0,Product_0993,Whse_J,Category_028,2012/7/27,100
1,Product_0979,Whse_J,Category_028,2012/1/19,500
2,Product_0979,Whse_J,Category_028,2012/2/3,500
3,Product_0979,Whse_J,Category_028,2012/2/9,500
4,Product_0979,Whse_J,Category_028,2012/3/2,500


In [12]:
# Report the frame's dimensions as a (row_count, column_count) tuple.
df.shape


(1048575, 5)

In [13]:
# Inspect each column's dtype as loaded, before any type conversion is applied.
df.dtypes


Product_Code        object
Warehouse           object
Product_Category    object
Date                object
Order_Demand        object
dtype: object

In [19]:
# Convert 'Date' to datetime and 'Order_Demand' to integer.

# Parse 'Date' using the year/month/day layout with '/' separators.
df['Date'] = pd.to_datetime(df['Date'], format='%Y/%m/%d')

# 'Order_Demand' is only cleaned when it is still stored as text (object dtype),
# so re-running this cell after a successful conversion skips this branch.
if df['Order_Demand'].dtype == 'object':
    # '(' and ')' are regex metacharacters and the default of `regex` differs
    # between pandas versions; regex=False forces a literal match everywhere.
    # NOTE(review): if the parentheses are accounting notation for negative
    # quantities, stripping them keeps only the magnitude -- confirm intent.
    df['Order_Demand'] = (
        df['Order_Demand']
        .str.replace('(', '', regex=False)
        .str.replace(')', '', regex=False)
        .astype(int)
    )

df.head()


Unnamed: 0,Product_Code,Warehouse,Product_Category,Date,Order_Demand
0,Product_0993,Whse_J,Category_028,2012-07-27,100
1,Product_0979,Whse_J,Category_028,2012-01-19,500
2,Product_0979,Whse_J,Category_028,2012-02-03,500
3,Product_0979,Whse_J,Category_028,2012-02-09,500
4,Product_0979,Whse_J,Category_028,2012-03-02,500


In [20]:
# Re-inspect the column dtypes to confirm that the 'Date' and 'Order_Demand'
# conversions above took effect.
df.dtypes


Product_Code                object
Warehouse                   object
Product_Category            object
Date                datetime64[ns]
Order_Demand                 int64
dtype: object

In [21]:
# Print the number of distinct values found in each column, one column per line.
for column_name in df.columns:
    distinct_count = df[column_name].nunique()
    print(column_name, distinct_count)



Product_Code 2160
Warehouse 4
Product_Category 33
Date 1729
Order_Demand 3320


In [22]:
# Add an 'Order_Month' column holding the month name of each order's 'Date'.
month_names = df['Date'].dt.month_name()
df['Order_Month'] = month_names

df.head()


Unnamed: 0,Product_Code,Warehouse,Product_Category,Date,Order_Demand,Order_Month
0,Product_0993,Whse_J,Category_028,2012-07-27,100,July
1,Product_0979,Whse_J,Category_028,2012-01-19,500,January
2,Product_0979,Whse_J,Category_028,2012-02-03,500,February
3,Product_0979,Whse_J,Category_028,2012-02-09,500,February
4,Product_0979,Whse_J,Category_028,2012-03-02,500,March


In [25]:
# Mean Order_Demand for every (Product_Code, Warehouse, Order_Month) combination,
# returned as a flat frame with the mean stored in 'Average_Order_Demand'.
group_keys = ['Product_Code', 'Warehouse', 'Order_Month']
grouped_df = (
    df.groupby(group_keys)['Order_Demand']
    .mean()
    .reset_index(name='Average_Order_Demand')
)

grouped_df.head(50)


Unnamed: 0,Product_Code,Warehouse,Order_Month,Average_Order_Demand
0,Product_0001,Whse_A,April,1836.363636
1,Product_0001,Whse_A,August,1575.0
2,Product_0001,Whse_A,December,1038.461538
3,Product_0001,Whse_A,February,1486.363636
4,Product_0001,Whse_A,January,1106.25
5,Product_0001,Whse_A,July,1700.0
6,Product_0001,Whse_A,June,1388.235294
7,Product_0001,Whse_A,March,1709.52381
8,Product_0001,Whse_A,May,1657.142857
9,Product_0001,Whse_A,November,1566.666667


In [24]:
# Number of rows in grouped_df, i.e. how many grouped combinations were produced.
len(grouped_df)

33246

In [26]:
# Show every row of df whose 'Order_Demand' is exactly 0.
df.loc[df['Order_Demand'].eq(0)]


Unnamed: 0,Product_Code,Warehouse,Product_Category,Date,Order_Demand,Order_Month
46286,Product_0981,Whse_A,Category_028,2012-06-04,0,June
46287,Product_0981,Whse_A,Category_028,2012-06-08,0,June
68611,Product_1250,Whse_A,Category_019,2012-03-09,0,March
68613,Product_1250,Whse_A,Category_019,2012-05-01,0,May
68615,Product_1250,Whse_A,Category_019,2012-05-25,0,May
...,...,...,...,...,...,...
1047529,Product_1378,Whse_J,Category_019,2016-08-31,0,August
1047683,Product_0955,Whse_J,Category_028,2016-03-29,0,March
1047684,Product_1030,Whse_J,Category_028,2016-03-29,0,March
1047685,Product_0955,Whse_J,Category_028,2016-09-26,0,September


In [None]:
# Here is my conclusion for this dataset:

# To build an ML model, I would have to fit it for one specific Product_Code,
# Warehouse, and Order_Month combination, and there are only about 30-50 rows for each combination.

# If I want to build ML models, I should use a dataset that has more rows for each combination.

# But this is a good dataset to practice data cleaning and manipulation.
# I can use it to practice visualizing how data might look in the real world,
# where I don't have large amounts of records all the time.

# Something like the wine dataset would be better for practicing ML model making.