# 4.7 Deriving new variables - Part 1

## This script contains the following points:

### Step 1. Import data
### Step 2. Create subset
### Step 3. Define and apply function
### Step 4. Create loop results

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os

In [2]:
# Assign project folder path to a variable.
# The folder can be overridden with the INSTACART_PROJECT_PATH environment
# variable so the notebook is not tied to one machine; when the variable is
# not set, the original local folder is used.

path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    r'C:\Users\jomok\Documents\Career Foundry\Achievement 4\07-2023 Instacart Basket Analysis',
)

## 1. Import data

In [3]:
# Load the prepared 'orders_products_merged.pkl' pickle from the project's data folder

df_ords_prods_merged = pd.read_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged.pkl')
)

## 2. Create subset

In [4]:
# Take the first 1,000,000 rows (by position) as a working subset

df = df_ords_prods_merged.iloc[:1000000]

In [5]:
# Check shape of the subset as (rows, columns)

df.shape

(1000000, 15)

In [6]:
# Check output: preview the first rows of the subset

df.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge
0,2539329,1,prior,1,2,8,0.0,196,1,0,Soda,77,7,9.0,both
1,2398795,1,prior,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both
2,473747,1,prior,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both
3,2254736,1,prior,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both
4,431534,1,prior,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both


In [7]:
# Drop the extra '_merge' column from the subset.
# errors = 'ignore' keeps this cell safe to re-run: once '_merge' is gone,
# running the cell again is a no-op instead of raising a KeyError.

df = df.drop(columns = ['_merge'], errors = 'ignore')

In [8]:
# Check shape again; the column count should be one lower now that '_merge' is dropped

df.shape

(1000000, 14)

## 3. Define and apply function

In [9]:
# Define a function
def price_label(row):
  """Return the price-range label for one row.

  Reads row['prices'] and returns:
    - 'Low range product'  when the price is <= 5
    - 'Mid range product'  when the price is > 5 and <= 15
    - 'High range product' when the price is > 15
    - np.nan               when none of the comparisons hold (e.g. a NaN price)
  """
  price = row['prices']

  # Check from the top threshold downwards; each guard returns as soon as it matches.
  if price > 15:
    return 'High range product'
  if price > 5:
    return 'Mid range product'
  if price <= 5:
    return 'Low range product'

  # No comparison was true (NaN compares False against everything).
  return np.nan

In [10]:
# Run price_label on every row of the subset and store the label in "price_range"

df['price_range'] = df.apply(price_label, axis='columns')

In [11]:
# Get frequency of each "price_range" label; dropna = False also counts rows left unlabeled (NaN)

df['price_range'].value_counts(dropna = False)

Mid range product    756450
Low range product    243550
Name: price_range, dtype: int64

In [12]:
# Find most expensive product in subset (shows whether any row can exceed the 15 threshold used for the high range)

df['prices'].max()

14.8

## 4. Create loop results

In [13]:
# Price range conditions, part 1: prices above 15 are high-range

df.loc[df['prices'].gt(15), 'price_range_loc'] = 'High-range product'

In [14]:
# Prices above 5 and up to (and including) 15 are mid-range
df.loc[df['prices'].gt(5) & df['prices'].le(15), 'price_range_loc'] = 'Mid-range product'

In [15]:
# Prices of 5 or less are low-range
df.loc[df['prices'].le(5), 'price_range_loc'] = 'Low-range product'

In [16]:
# Get frequency of each "price_range_loc" label in the subset; dropna = False also counts unlabeled (NaN) rows

df['price_range_loc'].value_counts(dropna = False)

Mid-range product    756450
Low-range product    243550
Name: price_range_loc, dtype: int64

In [17]:
# Price range conditions for the entire dataframe, part 1: prices above 15 are high-range

df_ords_prods_merged.loc[df_ords_prods_merged['prices'].gt(15), 'price_range_loc'] = 'High-range product'

In [18]:
# Prices above 5 and up to (and including) 15 are mid-range
df_ords_prods_merged.loc[
    df_ords_prods_merged['prices'].gt(5) & df_ords_prods_merged['prices'].le(15),
    'price_range_loc',
] = 'Mid-range product'

In [19]:
# Prices of 5 or less are low-range
df_ords_prods_merged.loc[df_ords_prods_merged['prices'].le(5), 'price_range_loc'] = 'Low-range product'

In [20]:
# Get frequency of each "price_range_loc" label in the entire dataframe; dropna = False also counts unlabeled (NaN) rows

df_ords_prods_merged['price_range_loc'].value_counts(dropna = False)

Mid-range product     21860860
Low-range product     10126321
High-range product      417678
Name: price_range_loc, dtype: int64

In [21]:
# Get frequency of each value in the "order_day_of_week" column; dropna = False also counts missing values

df_ords_prods_merged['order_day_of_week'].value_counts(dropna = False)

0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: order_day_of_week, dtype: int64

In [22]:
# Build the list of busyness labels, one per row, in row order:
# day 0 -> "Busiest day", day 4 -> "Least busy", any other value -> "Regularly busy"

result = [
  "Busiest day" if value == 0
  else "Least busy" if value == 4
  else "Regularly busy"
  for value in df_ords_prods_merged["order_day_of_week"]
]

In [23]:
# Print the result
# NOTE(review): this displays the entire list (one label per dataframe row);
# a slice such as result[:10] would keep the cell output short.

result

['Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Busiest day',
 'Regularly busy',
 'Regularly busy',
 ...]

In [24]:
# Create new column holding the busyness label for each row's day of the week
# (the list has one entry per row, in row order)

df_ords_prods_merged['busiest_day'] = result

In [25]:
# Check frequency of each "busiest_day" label; dropna = False also counts missing values

df_ords_prods_merged['busiest_day'].value_counts(dropna = False)

Regularly busy    22416875
Busiest day        6204182
Least busy         3783802
Name: busiest_day, dtype: int64

# 4.7 Deriving New Variables - Part 2

## This script contains:

### 1. Create new column for busiest days and slowest days
### 2. Create new column for busiest hours of the day
### 3. Export dataframe

## 1. Create new column for busiest and slowest days

In [26]:
# Step 2 - Build the list of busyness labels, one per row, in row order:
# days 0 and 1 -> "Busiest days", days 3 and 4 -> "Slowest days", any other value -> "Regularly busy"

result = [
  "Busiest days" if value in (0, 1)
  else "Slowest days" if value in (3, 4)
  else "Regularly busy"
  for value in df_ords_prods_merged["order_day_of_week"]
]

In [27]:
# Print the result
# NOTE(review): this displays the entire list (one label per dataframe row);
# a slice such as result[:10] would keep the cell output short.

result

['Regularly busy',
 'Slowest days',
 'Slowest days',
 'Slowest days',
 'Slowest days',
 'Regularly busy',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Slowest days',
 'Busiest days',
 'Regularly busy',
 'Regularly busy',
 'Busiest days',
 'Busiest days',
 'Regularly busy',
 'Regularly busy',
 'Slowest days',
 'Slowest days',
 'Slowest days',
 'Slowest days',
 'Slowest days',
 'Slowest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Regularly busy',
 'Regularly busy',
 'Busiest days',
 'Regularly busy',
 'Regularly busy',
 'Busiest days',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Slowest days',
 'Regularly busy',
 'Slowest days',
 'Busiest days',
 'Busiest days',
 'Regularly busy',
 'Slowest days',
 'Slowest days',
 'Regularly busy',
 'Regularly busy',
 'Busiest days',
 'Busiest days',
 'Regularly busy',
 'Busiest days',
 'Busiest days',
 'Slowest days',
 'Regularly busy',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 ...]

In [28]:
# Store the two-day busyness labels in "busiest_day", one per row
# (assigning to this name replaces any existing column called "busiest_day")

df_ords_prods_merged['busiest_day'] = result

In [29]:
# Step 3 - Check frequency of each "busiest_day" label; dropna = False also counts missing values

df_ords_prods_merged['busiest_day'].value_counts(dropna = False)

Regularly busy    12916111
Busiest days      11864412
Slowest days       7624336
Name: busiest_day, dtype: int64

With two days in each group, the "Busiest days" count (11,864,412) is now much closer to the "Regularly busy" count (12,916,111).

## 2. Create new column for busiest hours of the day

In [30]:
# Get frequency of each value in the "order_hour_of_day" column; dropna = False also counts missing values

df_ords_prods_merged['order_hour_of_day'].value_counts(dropna = False)

10    2761760
11    2736140
14    2689136
15    2662144
13    2660954
12    2618532
16    2535202
9     2454203
17    2087654
8     1718118
18    1636502
19    1258305
20     976156
7      891054
21     795637
22     634225
23     402316
6      290493
0      218769
1      115700
5       87961
2       69375
4       53242
3       51281
Name: order_hour_of_day, dtype: int64

Using the hourly frequencies above, the 24-hour day can be divided into three buckets of eight hours each, one bucket per period:
"Most orders" = 10, 11, 14, 15, 13, 12, 16, 9
"Average orders" = 17, 8, 18, 19, 20, 7, 21, 22
"Fewest orders" = 23, 6, 0, 1, 5, 2, 4, 3

In [31]:
# Step 4 - Build the list of period-of-day labels, one per row, in row order:
# hours 9-16 -> "Most orders", hours 23 and 0-6 -> "Fewest orders", any other value -> "Average orders"

hours = [
  "Most orders" if value in (9, 10, 11, 12, 13, 14, 15, 16)
  else "Fewest orders" if value in (0, 1, 2, 3, 4, 5, 6, 23)
  else "Average orders"
  for value in df_ords_prods_merged["order_hour_of_day"]
]

In [32]:
# Print the result
# NOTE(review): this displays the entire list (one label per dataframe row);
# a slice such as hours[:10] would keep the cell output short.

hours

['Average orders',
 'Average orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Most orders',
 ...]

In [33]:
# Create new column holding the busyness label for each row's hour of the day
# (the list has one entry per row, in row order)

df_ords_prods_merged['busiest_period_of_day'] = hours

In [34]:
# Step 5 - Check frequency of each "busiest_period_of_day" label; dropna = False also counts missing values

df_ords_prods_merged['busiest_period_of_day'].value_counts(dropna = False)

Most orders       21118071
Average orders     9997651
Fewest orders      1289137
Name: busiest_period_of_day, dtype: int64

There is a huge difference between the number of orders placed during the "Most orders" hours and the number placed in the other two periods, which would explain why the app freezes during those times.

In [35]:
# Check output: preview the first rows to see the newly added columns

df_ords_prods_merged.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range_loc,busiest_day,busiest_period_of_day
0,2539329,1,prior,1,2,8,0.0,196,1,0,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Average orders
1,2398795,1,prior,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Slowest days,Average orders
2,473747,1,prior,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Slowest days,Most orders
3,2254736,1,prior,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Slowest days,Average orders
4,431534,1,prior,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Slowest days,Most orders


In [36]:
# Drop merge results column.
# errors = 'ignore' keeps this cell safe to re-run: once '_merge' is gone,
# running the cell again is a no-op instead of raising a KeyError.

df_ords_prods_merged = df_ords_prods_merged.drop(columns = ['_merge'], errors = 'ignore')

In [37]:
# Check output: preview the first rows to confirm '_merge' is no longer among the columns

df_ords_prods_merged.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,price_range_loc,busiest_day,busiest_period_of_day
0,2539329,1,prior,1,2,8,0.0,196,1,0,Soda,77,7,9.0,Mid-range product,Regularly busy,Average orders
1,2398795,1,prior,2,3,7,15.0,196,1,1,Soda,77,7,9.0,Mid-range product,Slowest days,Average orders
2,473747,1,prior,3,3,12,21.0,196,1,1,Soda,77,7,9.0,Mid-range product,Slowest days,Most orders
3,2254736,1,prior,4,4,7,29.0,196,1,1,Soda,77,7,9.0,Mid-range product,Slowest days,Average orders
4,431534,1,prior,5,4,15,28.0,196,1,1,Soda,77,7,9.0,Mid-range product,Slowest days,Most orders


Confirmed that the 3 new columns (`price_range_loc`, `busiest_day`, `busiest_period_of_day`) are now included in the dataframe.

In [38]:
# Check shape of the full dataframe as (rows, columns) before exporting

df_ords_prods_merged.shape

(32404859, 17)

## 3. Export data

In [39]:
# Step 7 - Write the dataframe with its new columns to a pickle in the prepared-data folder

df_ords_prods_merged.to_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged_new_columns.pkl')
)