### Importing Libraries

In [2]:
import pandas as pd
import numpy as np
import os

### Importing Data

In [3]:
# Folder Path

path = r'C:\Users\Windows 10\Documents\04-2023 Instacart Basket Analysis'

In [4]:
path

'C:\\Users\\Windows 10\\Documents\\04-2023 Instacart Basket Analysis'

In [5]:
# Importing 'orders_products_merged.pkl' via pandas.pkl and os.path.join

df_orders_products_merged = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged.pkl'))

### Exercise 4.7

In [6]:
# Check the output

df_orders_products_merged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,both
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both


In [7]:
# Create a subset up to 100000

df = df_orders_products_merged[:1000000]

### 1. If-Statements using User-Defined Function

In [8]:
# Defining a function

# As this is a function you’re creating yourself, you need to start by defining it.
# This is done by way of the "def" syntax at the beginning of the code.
# Following this is the name you want to give your new function: price_label.
# In the parentheses is "row", which is a standard argument telling the function to look at each row within the dataframe.
# The colon before the "return" translates to “then,” 
# making the entire statement read: “If the value in the ‘prices’ column within the given row is less than or equal to 5,
# then return the string ‘Low-range product.’”
# The last "else" encompasses any other possible situations that fall outside the criteria set forth by the three if and elseif statements.
# Think of it as a bucket to catch anything that leaks all the way through
# In this case, the else is important as it will catch missing values (which haven’t been addressed anywhere in your conditions).

def price_label(row):
    
    if row['prices'] <= 5:
        return 'Low-range product'
    elif (row['prices'] > 5) and (row['prices'] <= 15):
        return 'Mid-range product'
    elif row['prices'] > 15:
        return 'High-range product'
    else: return 'Not enough data'

In [9]:
df.shape

(1000000, 14)

In [10]:
# Applying the fuction
# Remember, define first, apply second. That’s the order when working with user-defined functions! 

df['price_range'] = df.apply(price_label, axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price_range'] = df.apply(price_label, axis = 1)


In [11]:
df['price_range'].value_counts(dropna = False)

Mid-range product    756450
Low-range product    243550
Name: price_range, dtype: int64

In [12]:
# The max() function returns the max value within the “prices” column of the df dataframe.

df['prices'].max()

14.8

### 2. If-Statements with the loc() Function

In [13]:
# By using loc(), 
# you can apply the conditional logic of an if-statement to a function without explicitly creating an if-else construct.

# The loc() function locates a particular column in the dataframe it’s been assigned to.
# Now, a logical operator (smaller than, larger than, equal to, etc.) is being added into the function, as well.
# This is used to create a condition.
# The difference here is that there’s no explicit if in your if-statement. Instead, it’s all been implied.

In [14]:
df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range product'

# After the comma comes the implied “then.”
# Here, a new column called “price_range_loc” is being set equal to the string “High-range product.”

# The two implied halves of the if-statement, then, would be:
# 1. if = df.loc[df['prices'] > 15,
# 2. then = 'price_range_loc'] = 'High-range product'
# Remember—the comma is a key! It’s what separates the “if” from the “then.”

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range product'


In [15]:
df.loc[(df['prices'] <= 15) & (df['prices'] > 5), 'price_range_loc'] = 'Mid-range product'

# These two conditions are combined by way of the & sign in the middle of the “if” half.
# Additionally, the two conditions have been placed inside of parentheses.
# This simply ensures that they’re both treated as separate conditions.
# Whenever you’re working with multiple conditions within the same statement, section them off with parentheses!

In [16]:
df.loc[df['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [17]:
df['price_range_loc'].value_counts(dropna = False)

Mid-range product    756450
Low-range product    243550
Name: price_range_loc, dtype: int64

#### Notes

In [18]:
# First, using loc() won’t result in a warning message.
# While this won’t actually interfere with your work, it’s still a sign that, for whatever reason, Python thinks you should be doing something different.
# Second, the loc() method runs much faster.
# This is because the loc() function applies the conditional filters before searching through the dataframe,
# while your user-defined function searched through the entire dataframe, then determined where to set the filters (remember axis = 1?).

#### Try repeating the process again on your entire dataframe as opposed to the subset

In [19]:
df_orders_products_merged.loc[df_orders_products_merged['prices'] > 15, 'price_range_loc'] = 'High-range product'

In [20]:
df_orders_products_merged.loc[(df_orders_products_merged['prices'] <= 15) & (df_orders_products_merged['prices'] > 5), 'price_range_loc'] = 'Mid-range product'

In [21]:
df_orders_products_merged.loc[df_orders_products_merged['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [22]:
df_orders_products_merged['price_range_loc'].value_counts(dropna = False)

Mid-range product     21860860
Low-range product     10126321
High-range product      417678
Name: price_range_loc, dtype: int64

In [23]:
# Thanks to loc(), you can now conduct your filtering operation on the entire dataframe, rather than just a subset.
# If you’d tried to do the same thing with your user-defined function, you may very well have received a memory error, but not so with loc(). That’s the power of Python!

### 3. If-Statements with For-Loops

In [24]:
# To start, you need to know on which day most orders take place.
# You can find this out by printing the frequency of the “orders_day_of_the_week” column

df_orders_products_merged['orders_day_of_week'].value_counts(dropna = False)

0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: orders_day_of_week, dtype: int64

In [25]:
# The first step is to create an empty list, "result" then creating the loop
# This will act as the empty shell, into which you can place the results from your loop.

result = []

for value in df_orders_products_merged['orders_day_of_week']:
    if value == 0:
        result.append('Busiest day')
    elif value == 4:
        result.append('Least busy')
    else:
        result.append('Regularly busy')
        
# This "value" here is simply acting as a placeholder. It could stand for anything. And you could call it anything, too (oftentimes, an x is used)
# You can think of this element in the code as representing every entry that the loop will check. 

In [26]:
result

['Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Busiest day',
 'Regularly busy',
 'Reg

In [27]:
# create a new column within your df_orders_products_merged dataframe and set it equal to result

df_orders_products_merged['busiest_day'] = result

In [28]:
df_orders_products_merged['busiest_day'].value_counts(dropna = False)

Regularly busy    22416875
Busiest day        6204182
Least busy         3783802
Name: busiest_day, dtype: int64

In [29]:
df_orders_products_merged.shape

(32404859, 16)

### Task 4.7

### Step 1

#### If you haven’t done so already, complete the instructions in the Exercise for creating the “price_label” and “busiest_day” columns.

In [30]:
# Done

### Step 2

#### Suppose your clients have changed their minds about the labels you created in your “busiest_day” column. Now, they want “Busiest day” to become “Busiest days” (plural). This label should correspond with the two busiest days of the week as opposed to the single busiest day. At the same time, they’d also like to know the two slowest days. 

In [31]:
# Find out the frequency of the “orders_day_of_the_week” column

df_orders_products_merged['orders_day_of_week'].value_counts(dropna = False)

0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: orders_day_of_week, dtype: int64

In [32]:
# Result
# Two busiest days: 0 (Saturday), 1 (Sunday)
# Two slowest days: 3 (Tuesday), 4 (Wednesday)

In [33]:
# The first step is to create an empty list, "result2" then creating the loop

result2 = []

for value in df_orders_products_merged['orders_day_of_week']:
    if value == 0 or value == 1:
        result2.append('Busiest days')
    elif value == 3 or value == 4:
        result2.append('Slowest days')
    else:
        result2.append('Regularly days')

In [34]:
# Create a new column (busiest_days) within my df_orders_products_merged dataframe and set it equal to result2

df_orders_products_merged['busiest_days'] = result2

In [35]:
# Check the output

df_orders_products_merged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range_loc,busiest_day,busiest_days
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Regularly days
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Slowest days
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Slowest days
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least busy,Slowest days
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least busy,Slowest days


### Step 3

#### Check the values of this new column for accuracy.

In [36]:
# Check the frequency

df_orders_products_merged['busiest_days'].value_counts(dropna = False)

Regularly days    12916111
Busiest days      11864412
Slowest days       7624336
Name: busiest_days, dtype: int64

In [37]:
df_orders_products_merged.shape

(32404859, 17)

### Step 4

#### When too many users make Instacart orders at the same time, the app freezes. The senior technical officer at Instacart wants you to identify the busiest hours of the day. Rather than by hour, they want periods of time labeled “Most orders,” “Average orders,” and “Fewest orders.” Create a new column containing these labels called “busiest_period_of_day.”

In [38]:
# To start, I need to know on which hour most orders take place.
# Check the frequency

df_orders_products_merged['order_hour_of_day'].value_counts(dropna = False)

10    2761760
11    2736140
14    2689136
15    2662144
13    2660954
12    2618532
16    2535202
9     2454203
17    2087654
8     1718118
18    1636502
19    1258305
20     976156
7      891054
21     795637
22     634225
23     402316
6      290493
0      218769
1      115700
5       87961
2       69375
4       53242
3       51281
Name: order_hour_of_day, dtype: int64

#### Since I want to categorize the hours into three parts: “Most orders,” “Average orders,” and “Fewest orders.”, then I will divide the total count of hours into 3 parts as well  (24/3 = 8). 

#### 8 counts for the busiest hours (most orders) = 10am, 11am, 14pm, 15pm, 13pm, 12pm, 16pm, 9am
#### 8 counts for the average hours (average orders) = 17pm, 8am, 18pm, 19pm, 20pm, 7am, 21pm, 22pm
#### 8 counts for the least hours (fewest orders) = 23pm, 6am, 0am, 1am, 5am, 2am, 4am, 3am

In [39]:
# The first step is to create an empty list, "hours" then creating the loop

hours = []


for value in df_orders_products_merged['order_hour_of_day']:
    if value in [10, 11, 14, 15, 13, 12, 16, 9]:
        hours.append('Most orders')
    elif value in [23, 6, 0, 1, 5, 2, 4, 3]:
        hours.append('Fewest orders')
    else:
        hours.append('Average orders')


In [40]:
# Create a new column (busiest_period_of_day) within my df_orders_products_merged dataframe and set it equal to hours

df_orders_products_merged['busiest_period_of_day'] = hours

In [41]:
# Check the output

df_orders_products_merged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range_loc,busiest_day,busiest_days,busiest_period_of_day
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Regularly days,Average orders
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Slowest days,Average orders
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Slowest days,Most orders
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least busy,Slowest days,Average orders
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least busy,Slowest days,Most orders


In [42]:
# Check the output of the 'order_hour_of_day' and the 'busiest_period_of_day' column for accuracy

df_orders_products_merged[['order_hour_of_day', 'busiest_period_of_day']].head(10)

Unnamed: 0,order_hour_of_day,busiest_period_of_day
0,8,Average orders
1,7,Average orders
2,12,Most orders
3,7,Average orders
4,15,Most orders
5,7,Average orders
6,9,Most orders
7,14,Most orders
8,16,Most orders
9,8,Average orders


### Step 5

#### Print the frequency for this new column.

In [43]:
df_orders_products_merged['busiest_period_of_day'].value_counts(dropna = False)

Most orders       21118071
Average orders     9997651
Fewest orders      1289137
Name: busiest_period_of_day, dtype: int64

### Step 6

#### Ensure your notebook is clean and structured and that your code is well commented

### Step 7

#### Export your dataframe as a pickle file (since you added new columns) and store it correctly in your “Prepared Data” folder.

In [44]:
df_orders_products_merged.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged_update.pkl'))