# 4.7 - DERIVING NEW VARIABLES


## If-Statements

Excel -> =IF()
SQL -> WHERE


In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

# Import dataframes
path = r"D:\0 - Data Analytics\4 - Python fundamentals for Data Analysts\Instacart Basket Analisys"
# import ords_prods_merge.pkl
df_ords_prods_merge = pd.read_pickle(
    os.path.join(path, "02 Data", "Prepared Data", "ords_prods_merge.pkl")
)

# create a subset of the dataframe
df_subset = df_ords_prods_merge[:1000000]

In [3]:
df_subset.shape

(1000000, 15)

### If-Statements with User-Defined Functions

User-defined functions are useful when it comes to conducting custom operations on data.


In [None]:
# Example function to sum values:


def add_num(x, y):
    sum = x + y
    return sum


add_num(9, 11)

20

In [None]:
# Write a function to sort products in different price ranges:
# low-range-product <= 5$
# mid-range-product >5 & <= 15
# high-range-product > 15
# if there's not eoough data, return "Not enough data"


def price_label(row):
    if row["prices"] <= 5:
        return "Low-range-product"
    elif (row["prices"] > 5) and (row["prices"] <= 15):
        return "Mid-range-product"
    elif row["prices"] > 15:
        return "High-range-product"
    else:
        return "Not enough data"

In [None]:
# Apply the function to the subset
df_subset["price_range"] = df_subset.apply(price_label, axis=1)

# df_subset['price_range'] instead of callinge the whole dataframe, we are creating a new column called price_range


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['price_range'] = df_subset.apply(price_label, axis=1)


In [None]:
# Check the frequency of the new column
df_subset["price_range"].value_counts()

price_range
Mid-range-product     673183
Low-range-product     314392
High-range-product     12425
Name: count, dtype: int64

In [None]:
# check max value of the prices column
df_subset["prices"].max()

99999.0

### If-Statements with the loc() Function

Using loc(), you can apply the conditional logic of an if-statement to a function without explicitly creating an if-else construct


In [None]:
# high range product
df_subset.loc[df_subset["prices"] > 15, "price_range_loc"] = "High-range product"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset.loc[df_subset['prices'] > 15, 'price_range_loc'] = 'High-range product'


In [None]:
# mid-range product
df_subset.loc[
    (df_subset["prices"] <= 15) & (df_subset["prices"] > 5), "price_range_loc"
] = "Mid-range product"

In [None]:
# low-range product
df_subset.loc[df_subset["prices"] <= 5, "price_range_loc"] = "Low-range product"

In [None]:
df_subset["price_range_loc"].value_counts(dropna=False)

price_range_loc
Mid-range product     673183
Low-range product     314392
High-range product     12425
Name: count, dtype: int64

### Using loc on entire dataframe not just a subset


In [None]:
# High Range
df_ords_prods_merge.loc[df_ords_prods_merge["prices"] > 15, "price_label"] = (
    "High-range product"
)


In [None]:
# Mid Range
df_ords_prods_merge.loc[
    (df_ords_prods_merge["prices"] <= 15) & (df_ords_prods_merge["prices"] > 5),
    "price_label",
] = "Mid-range product"

In [None]:
# High Range
df_ords_prods_merge.loc[df_ords_prods_merge["prices"] <= 5, "price_label"] = (
    "Low-range product"
)

In [None]:
# checking the values
df_ords_prods_merge["price_label"].value_counts(dropna=False)

price_label
Mid-range product     21860860
Low-range product     10126321
High-range product      417678
Name: count, dtype: int64

### If-Statements with For-Loops

For-loops are loops for running the same block of code multiple times. They’re used to perform the same function on multiple elements, for instance, by running through an entire dataframe and performing a function on each row within that dataframe.


In [None]:
# For-loop exaple
# Print “My age is X” for ages 30 to 45

for x in range(30, 45):
    print("My age is", x)

My age is 30
My age is 31
My age is 32
My age is 33
My age is 34
My age is 35
My age is 36
My age is 37
My age is 38
My age is 39
My age is 40
My age is 41
My age is 42
My age is 43
My age is 44


In [None]:
# Find which day of the weeks are busy days
# First count the values
df_ords_prods_merge["orders_day_of_week"].value_counts(dropna=False)

orders_day_of_week
0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: count, dtype: int64

In [19]:
# Create a new column called busiest_day
result = []

for value in df_ords_prods_merge["orders_day_of_week"]:
    if value == 0:
        result.append("Busiest day")
    elif value == 4:
        result.append("Least busy")
    else:
        result.append("Regularly busy")

# Create an empty list result[] to store the results

In [20]:
# Print result
result

['Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy

In [None]:
# Create a new column in the dataframe with the results
df_ords_prods_merge["busiest_day"] = result

In [None]:
# check the frequency of the values
df_ords_prods_merge["busiest_day"].value_counts(dropna=False)

busiest_day
Regularly busy    22416875
Busiest day        6204182
Least busy         3783802
Name: count, dtype: int64

In [23]:
# check columns names
df_ords_prods_merge.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_hour_of_day', 'days_since_last_order', 'first_order',
       'product_id', 'add_to_cart_order', 'reordered', 'product_name',
       'aisle_id', 'department_id', 'prices', '_merge', 'price_label',
       'busiest_day'],
      dtype='object')

# TASK 4.7


### 1

If you haven’t done so already, complete the instructions in the Exercise for creating the “price_label” and “busiest_day” columns.
Done.


### 2

Suppose your clients have changed their minds about the labels you created in your “busiest_day” column. Now, they want “Busiest day” to become “Busiest days” (plural). This label should correspond with the two busiest days of the week as opposed to the single busiest day. At the same time, they’d also like to know the two slowest days. Create a new column for this using a suitable method.


In [24]:
# Create a new column called "busiest_days" with the results
# I will use a for-loop to create the new column

result = []

for value in df_ords_prods_merge["orders_day_of_week"]:
    if value == 0 or value == 1:
        result.append("Busiest days")
    elif value == 3 or value == 4:
        result.append("Least busy")
    else:
        result.append("Regularly busy")

In [None]:
# add the column to the dataframe
df_ords_prods_merge["busiest_days"] = result

### 3

Check the values of this new column for accuracy. Note any observations in markdown format.


In [None]:
# check the frequency of the values of the new column
df_ords_prods_merge["busiest_days"].value_counts(dropna=False)

busiest_days
Regularly busy    12916111
Busiest days      11864412
Least busy         7624336
Name: count, dtype: int64

In [None]:
# check the frequency of the values of the busiest_day column
df_ords_prods_merge["busiest_day"].value_counts(dropna=False)

busiest_day
Regularly busy    22416875
Busiest day        6204182
Least busy         3783802
Name: count, dtype: int64

In [None]:
# check shape of busiest_day and busiest_days columns
df_ords_prods_merge[["busiest_day", "busiest_days"]].shape


(32404859, 2)

Total number column in the dataframe has stayed the same, so all columns gave been allocated.
It appears that on the 2 most busiest days are amde almost as many orders as on the regualarly busy days.


### 4

When too many users make Instacart orders at the same time, the app freezes. The senior technical officer at Instacart wants you to identify the busiest hours of the day. Rather than by hour, they want periods of time labeled “Most orders,” “Average orders,” and “Fewest orders.” Create a new column containing these labels called “busiest_period_of_day.”


In [None]:
# check the frequency of hours of the day
df_ords_prods_merge["order_hour_of_day"].value_counts(dropna=False)

order_hour_of_day
10    2761760
11    2736140
14    2689136
15    2662144
13    2660954
12    2618532
16    2535202
9     2454203
17    2087654
8     1718118
18    1636502
19    1258305
20     976156
7      891054
21     795637
22     634225
23     402316
6      290493
0      218769
1      115700
5       87961
2       69375
4       53242
3       51281
Name: count, dtype: int64

I will sort the orders in 3 main time frames based on the number of orders:

- Most orders = hours 9-16
- Fewest orders = 23-6
- Average orders = the other hours (17/23 & 8/9)

In this way we should get 3 8 hours time frame labeled accordingly


In [None]:
# Create a for loop that label the hours of the day for the amount of orders and store it in a new column called "busiest_period_of_day"

result = []

for value in df_ords_prods_merge["order_hour_of_day"]:
    if value >= 9 and value <= 16:
        result.append("Most orders")
    elif value >= 23 or value <= 6:
        result.append("Fewest orders")
    else:
        result.append("Average orders")

# add the column to the dataframe
df_ords_prods_merge["busiest_period_of_day"] = result


### 5

Print the frequency for this new column.


In [None]:
# Print the frequency of the values of the new column
df_ords_prods_merge["busiest_period_of_day"].value_counts(dropna=False)

busiest_period_of_day
Most orders       21118071
Average orders     9997651
Fewest orders      1289137
Name: count, dtype: int64

### 6

Ensure your notebook is clean and structured and that your code is well commented. Done.


### 7

Export your dataframe as a pickle file (since you added new columns) and store it correctly in your “Prepared Data” folder.


In [None]:
# Export the dataframe to pkl
df_ords_prods_merge.to_pickle(
    os.path.join(path, "02 Data", "Prepared Data", "ords_prods_merge_V2.pkl")
)