# 4.7 Deriving New Variables

## This script contains the following points:

### 01. Importing Libraries
### 02. Importing Data
### 03. Checking Dataframes
### 04. Creating a New Variable ('price_range')
### 05. Creating a New Variable ('busiest_day')
### TASK
### 06. Creating a Busiest Days Column
### 07. Creating a Busiest Hours of the Day Column
### 08. Exporting Data

## 01. Importing Libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

## 02. Importing Data

In [2]:
# Set project folder as a string
path = r'/Users/matthewjones/Documents/CareerFoundry/Data Immersion/Achievement 4/04-2024 Instacart Basket Analysis'

In [3]:
ords_prods_merge = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merge.pkl'))

## 03. Checking Dataframes

In [4]:
# Check the dataframe's shape and output
ords_prods_merge.shape

(32404859, 15)

In [5]:
ords_prods_merge.head(10)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,new_customer,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge
0,2539329,1,1,2,8,,True,196,1,0,Soda,77,7,9.0,both
1,2539329,1,1,2,8,,True,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91,16,12.5,both
2,2539329,1,1,2,8,,True,12427,3,0,Original Beef Jerky,23,19,4.4,both
3,2539329,1,1,2,8,,True,26088,4,0,Aged White Cheddar Popcorn,23,19,4.7,both
4,2539329,1,1,2,8,,True,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54,17,1.0,both
5,2398795,1,2,3,7,15.0,False,196,1,1,Soda,77,7,9.0,both
6,2398795,1,2,3,7,15.0,False,10258,2,0,Pistachios,117,19,3.0,both
7,2398795,1,2,3,7,15.0,False,12427,3,1,Original Beef Jerky,23,19,4.4,both
8,2398795,1,2,3,7,15.0,False,13176,4,0,Bag of Organic Bananas,24,4,10.3,both
9,2398795,1,2,3,7,15.0,False,26088,5,1,Aged White Cheddar Popcorn,23,19,4.7,both


In [6]:
# Drop the no longer useful merge flag column
ords_prods_merge = ords_prods_merge.drop(columns = ['_merge'])

## 04. Creating a New Variable ('price_range')

### Testing with a Subset of the Dataframe

In [7]:
# Create a subset of dataframe (first 1,000,000 rows)
df = ords_prods_merge[:1000000]

### If-Statement with User-Defined Function

In [8]:
# Define the price_label function
def price_label(row):
    
    if row['prices'] <= 5:
        return 'Low-Range Product'
    elif (row['prices'] > 5) and (row['prices'] <= 15):
        return 'Mid-Range Product'
    elif row['prices'] > 15:
        return 'High-Range Product'
    else: return 'Not enough data'

In [9]:
# Apply the price_label function to the new column 'price_range'
df['price_range'] = df.apply(price_label, axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price_range'] = df.apply(price_label, axis = 1)


In [10]:
# Check the output
df['price_range'].value_counts()

price_range
Mid-Range Product     673183
Low-Range Product     314392
High-Range Product     12268
Not enough data          157
Name: count, dtype: int64

In [11]:
# Check the maximum values of the 'prices' variable
df['prices'].max()

25.0

### If-Statement with the loc() Function

In [12]:
# Locate the High-Range prices
df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-Range Product'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-Range Product'


In [13]:
# Locate the Mid-Range prices
df.loc[(df['prices'] <= 15) & (df['prices'] > 5), 'price_range_loc'] = 'Mid-Range Product'

In [14]:
# Locate the Low-Range prices
df.loc[df['prices'] <= 5, 'price_range_loc'] = 'Low-Range Product'

In [15]:
# Check the output
df['price_range_loc'].value_counts()

price_range_loc
Mid-Range Product     673183
Low-Range Product     314392
High-Range Product     12268
Name: count, dtype: int64

##### The resulting values are the same for both methods

### Using the loc() Function on the Full Dataframe

In [16]:
# Locate the High-Range prices
ords_prods_merge.loc[ords_prods_merge['prices'] > 15, 'price_range_loc'] = 'High-Range Product'

In [17]:
# Locate the Mid-Range prices
ords_prods_merge.loc[(ords_prods_merge['prices'] <= 15) & (ords_prods_merge['prices'] > 5), 'price_range_loc'] = 'Mid-Range Product'

In [18]:
# Locate the Low-Range prices
ords_prods_merge.loc[ords_prods_merge['prices'] <= 5, 'price_range_loc'] = 'Low-Range Product'

In [19]:
# Check the output
ords_prods_merge['price_range_loc'].value_counts()

price_range_loc
Mid-Range Product     21860860
Low-Range Product     10126321
High-Range Product      412551
Name: count, dtype: int64

## 05. Creating a New Variable ('busiest_day')

In [20]:
# Check the frequency of orders per day of the week
ords_prods_merge['order_day_of_week'].value_counts(dropna = False)

order_day_of_week
0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: count, dtype: int64

In [21]:
# Define a list (result)
result = []

# Run a for-loop that will add a string to result based on the day of the week
for value in ords_prods_merge['order_day_of_week']:
    if value == 0:
        result.append('Busiest')
    elif value == 4:
        result.append('Least busy')
    else:
        result.append('Regularly busy')

In [22]:
# Print the list
result

['Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy

In [23]:
# Create a new column ('busiest_day') with the values from result
ords_prods_merge['busiest_day'] = result

In [24]:
# Check the values for accuracy
ords_prods_merge['busiest_day'].value_counts(dropna = False)

busiest_day
Regularly busy    22416875
Busiest            6204182
Least busy         3783802
Name: count, dtype: int64

In [25]:
# Renaming the 'price_range_loc' column to be consistent with the exercise
ords_prods_merge = ords_prods_merge.rename(columns = {'price_range_loc' : 'price_label'})

# TASK

## 06. Creating a Busiest Days column

In [26]:
# Define a list (new_result)
new_result = []

# Run a for-loop that will add a string to new_result based on the day of the week
for value in ords_prods_merge['order_day_of_week']:
    if (value == 0) | (value == 1):
        new_result.append('Busiest')
    elif (value == 3) | (value == 4):
        new_result.append('Least busy')
    else:
        new_result.append('Regularly busy')

In [27]:
# Create a new column ('busiest_days') with the values from new_result
ords_prods_merge['busiest_days'] = new_result

In [28]:
# Check the values for accuracy
ords_prods_merge['busiest_days'].value_counts(dropna = False)

busiest_days
Regularly busy    12916111
Busiest           11864412
Least busy         7624336
Name: count, dtype: int64

##### The busiest days are Saturday (0) and Sunday (1) // The least busy days are Wednesday (4) and Tuesday (3)
##### 6,204,182 + 5,660,230 = 11,864,412
##### 3,783,802 + 3,840,534 = 7,624,336

## 07. Creating a Busiest Hours of the Day column

In [29]:
# Check the frequency of orders per hour of day
ords_prods_merge['order_hour_of_day'].value_counts(dropna = False)

order_hour_of_day
10    2761760
11    2736140
14    2689136
15    2662144
13    2660954
12    2618532
16    2535202
9     2454203
17    2087654
8     1718118
18    1636502
19    1258305
20     976156
7      891054
21     795637
22     634225
23     402316
6      290493
0      218769
1      115700
5       87961
2       69375
4       53242
3       51281
Name: count, dtype: int64

In [30]:
# Define a list (hour_result)
hour_result = []

# Run a for-loop that will add a string to hour_result based on the hour of the day
# There are 24 hours in the day, so hours are divided into three periods
for value in ords_prods_merge['order_hour_of_day']:
    if value in [10, 11, 14, 15, 13, 12, 16, 9]:
        hour_result.append('Most Orders')
    elif value in [23, 6, 0, 1, 5, 2, 4, 3]:
        hour_result.append('Fewest Orders')
    else:
        hour_result.append('Average Orders')

In [31]:
# Create a new column ('busiest_period_of_day') with the values from hour_result
ords_prods_merge['busiest_period_of_day'] = hour_result

In [32]:
# Check the values for accuracy
ords_prods_merge['busiest_period_of_day'].value_counts(dropna = False)

busiest_period_of_day
Most Orders       21118071
Average Orders     9997651
Fewest Orders      1289137
Name: count, dtype: int64

##### The busiest hours in the day are from 9am - 4pm // The least busiest hours in the day are from 11pm - 6am
##### 2,761,760 + 2,736,140 + 2,689,136 + 2,662,144 + 2,660,954 + 2,618,532 + 2,535,202 + 2,454,203 = 21,118,071
##### 402,316 + 290,493 + 218,769 + 115,700 + 87,961 + 69,375 + 53,242 + 51,281 = 1,289,137

## 08. Exporting the Data

In [33]:
# Export ords_prods_merge to the Prepared Data folder
ords_prods_merge.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged_NewVariables.pkl'))