# 01. Importing libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

# 02. Import data

In [2]:
# define project folder path
path = r'C:\Users\nsmith\OneDrive - Georgia Poultry Laboratory Network\CareerFoundry\02 - Data Immersion\Achievement 4\12-2024 Instacart Basket Analysis'

In [3]:
# Step 3: Import the exported merged data as df with path variable
ords_prods_merge = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merge.pkl'))

# 03. Check the dimensions of the imported data

In [4]:
# check output of new df
ords_prods_merge.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,merge_flag
0,2539329,1,1,2,8,,True,196,1,0,Soda,77,7,9.0,both
1,2398795,1,2,3,7,15.0,False,196,1,1,Soda,77,7,9.0,both
2,473747,1,3,3,12,21.0,False,196,1,1,Soda,77,7,9.0,both
3,2254736,1,4,4,7,29.0,False,196,1,1,Soda,77,7,9.0,both
4,431534,1,5,4,15,28.0,False,196,1,1,Soda,77,7,9.0,both


In [5]:
# check the shape of the dataframe
ords_prods_merge.shape

(32404859, 15)

In [6]:
ords_prods_merge.dtypes

order_id                     int64
user_id                      int64
order_number                 int64
order_day_of_week            int64
order_hour_of_day            int64
days_since_prior_order     float64
first_order                   bool
product_id                   int64
add_to_cart_order            int64
reordered                    int64
product_name                object
aisle_id                     int64
department_id                int64
prices                     float64
merge_flag                category
dtype: object

# 04. Appply criteria with if-statements

#### If the item’s price is lower than or equal to 5, it will be labeled a “low-range product.”
#### If the item’s price is above 5 but lower than or equal to 15, it will be labeled a “mid-range product.”
#### If the item’s price is above 15, it will be labeled a “high-range product.”

In [7]:
# Create a subset of first million rows to prevent memory bog down
df = ords_prods_merge[:1000000]

In [8]:
# create a function to derive a new price label column based on criteria
def price_label(row):

  if row['prices'] <= 5:
    return 'Low-range product'
  elif (row['prices'] > 5) and (row['prices'] <= 15):
    return 'Mid-range product'
  elif row['prices'] > 15:
    return 'High range'
  else: return 'Not enough data'

In [9]:
# Apply function to df
df['price_range'] = df.apply(price_label, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price_range'] = df.apply(price_label, axis=1)


In [10]:
# check value counts
df['price_range'].value_counts(dropna = False)

Mid-range product    756450
Low-range product    243550
Name: price_range, dtype: int64

In [11]:
# check the most expensive product
df['prices'].max()

14.8

In [12]:
# try same process with .loc
df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range product'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range product'


In [13]:
# try same process with .loc
df.loc[(df['prices'] <= 15) & (df['prices'] > 5), 'price_range_loc'] = 'Mid-range product' 

In [14]:
# try same process with .loc
df.loc[df['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [15]:
# check value counts
df['price_range_loc'].value_counts(dropna = False)

Mid-range product    756450
Low-range product    243550
Name: price_range_loc, dtype: int64

In [16]:
# try same process with .loc on entire dataset
ords_prods_merge.loc[ords_prods_merge['prices'] > 15, 'price_range_loc'] = 'High-range product'

In [17]:
# try same process with .loc on entire dataset
ords_prods_merge.loc[(ords_prods_merge['prices'] <= 15) & (ords_prods_merge['prices'] > 5), 'price_range_loc'] = 'Mid-range product' 

In [18]:
# try same process with .loc on entire dataset
ords_prods_merge.loc[ords_prods_merge['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [19]:
# check value counts
ords_prods_merge['price_range_loc'].value_counts(dropna = False)

Mid-range product     21860860
Low-range product     10126321
High-range product      417678
Name: price_range_loc, dtype: int64

# 05. If-statements with for-loops

In [20]:
# check frequency distribution of day of week
ords_prods_merge['order_day_of_week'].value_counts(dropna = False)

0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: order_day_of_week, dtype: int64

### Create a new column to categorize how busy each day is

#### column = busiest_day; categories = Busiest day, Least busy, Regularly busy

In [21]:
# create new column
result = []

In [22]:
# add category values to new column
for value in ords_prods_merge["order_day_of_week"]:
  if value == 0:
    result.append("Busiest day")
  elif value == 4:
    result.append("Least busy")
  else:
    result.append("Regularly busy")

In [23]:
# add results to column
ords_prods_merge['busiest_day'] = result

In [24]:
# view frequency distribution
ords_prods_merge['busiest_day'].value_counts(dropna = False)

Regularly busy    22416875
Busiest day        6204182
Least busy         3783802
Name: busiest_day, dtype: int64

# 06. Tasks

### Step 1. completed above

### Step 2. create new busiest_days column with 2 busiest and 2 slowest days

In [25]:
# clear the previous results
result = []

In [26]:
# add category values to new column
for value in ords_prods_merge["order_day_of_week"]:
    if value == 0 or value == 1:
        result.append("Busiest days")
    elif value == 3 or value == 4:
        result.append("Slowest days")
    else:
        result.append("Regular days")

In [27]:
# add results to column
ords_prods_merge['busiest_days'] = result

In [28]:
# show frequency distribution
ords_prods_merge['busiest_days'].value_counts(dropna = False)

Regular days    12916111
Busiest days    11864412
Slowest days     7624336
Name: busiest_days, dtype: int64

### Step 3. Manually adding together days 0 and 1 gives the same value as "busiest days" in the frequency distribution. The same is true of slowest days, for 3 and 4.

### Step 4. Calculate new column "busiest_period_of_day" from order_hour_of_day column. Categorize hours into periods: Most orders, Average orders, Fewest orders.

#### Determine busiest hour of day

In [29]:
# clear previous results
result = []

In [30]:
# check frequency distribution of hour of day
ords_prods_merge['order_hour_of_day'].value_counts(dropna = False)

10    2761760
11    2736140
14    2689136
15    2662144
13    2660954
12    2618532
16    2535202
9     2454203
17    2087654
8     1718118
18    1636502
19    1258305
20     976156
7      891054
21     795637
22     634225
23     402316
6      290493
0      218769
1      115700
5       87961
2       69375
4       53242
3       51281
Name: order_hour_of_day, dtype: int64

In [31]:
# add category values to new column
for value in ords_prods_merge["order_hour_of_day"]:
    if value == 10:
        result.append("Most orders")
    elif value == 3:
        result.append("Least orders")
    else:
        result.append("Average orders")

In [32]:
# add results to column
ords_prods_merge['busiest_period_of_day'] = result

### Step 5 print the frequency of the new column

In [33]:
# show frequency distribution of new column
ords_prods_merge['busiest_period_of_day'].value_counts(dropna = False)

Average orders    29591818
Most orders        2761760
Least orders         51281
Name: busiest_period_of_day, dtype: int64

### Step 6. Ensure notebook is clean and commented - check! :)

### Step 7. Export dataframe as pickle file

In [35]:
# Export data to pkl
ords_prods_merge.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'ords_prods_derived.pkl'))