# 4.4 Data Wrangling and Subsetting

## This script contains the following points:

### 01. Importing Libraries
### 02. Importing Data
### 03. Data Wrangling (in Exercise)
### 04. Data Wrangling (in Task)
### 05. Exporting Data

# 01. Importing Libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

# 02. Importing Data

In [2]:
# Set project folder as a string
path = r'/Users/matthewjones/Documents/CareerFoundry/Data Immersion/Achievement 4/04-2024 Instacart Basket Analysis'

In [3]:
df_ords = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'orders.csv'), index_col = False)

In [4]:
df_prods = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'products.csv'), index_col = False)

In [5]:
df_dep = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'departments.csv'), index_col = False)

# 03. Data Wrangling (in Exercise)

## Dropping Columns

In [6]:
# Drop the eval_set column from orders.csv
df_ords = df_ords.drop(columns = ['eval_set'])

In [7]:
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


## Renaming Columns

In [8]:
# Rename order_dow column to be more intuitive
df_ords.rename(columns = {'order_dow' : 'order_day_of_week'}, inplace = True)

In [9]:
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


## Changing Data Types

In [10]:
# Change order_id from an integer to a string/object
df_ords['order_id'] = df_ords['order_id'].astype('str')

In [11]:
df_ords['order_id'].dtype

dtype('O')

## Transposing Data

In [12]:
# Transpose df_dep
df_dep_t = df_dep.T

In [13]:
df_dep_t

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [14]:
# Preview df_dep_t with its index moved into a regular 'index' column.
# The result is only displayed, not assigned, so df_dep_t itself is unchanged
# and the following cells keep working with the original (un-reset) index.
df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [15]:
# Create a new header with the values of the first row (Index = 0)
new_header = df_dep_t.iloc[0]

In [16]:
# Remove the first row
df_dep_t_new = df_dep_t[1:]

In [17]:
df_dep_t_new

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [18]:
# Set the column header to the correct values
df_dep_t_new.columns = new_header

In [19]:
df_dep_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


## Creating a Data Dictionary

In [20]:
# Use df_dep_t_new to create a data dictionary (give meaning to department_id)
data_dict = df_dep_t_new.to_dict('index')

In [21]:
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

## Subsetting

In [22]:
# Create a subset of df_prods of only the data from the snack department (department_id = 19)
df_snacks = df_prods[df_prods['department_id'] == 19]

In [23]:
df_snacks.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
15,16,Mint Chocolate Flavored Syrup,103,19,5.2
24,25,Salted Caramel Lean Protein & Fiber Bar,3,19,1.9
31,32,Nacho Cheese White Bean Chips,107,19,4.9
40,41,Organic Sourdough Einkorn Crackers Rosemary,78,19,6.5


# 04. Data Wrangling (in Task)

## 2. Change another identifier variable that does not need to be included as a numeric variable

In [24]:
# Change user_id from an integer to a string/object
df_ords['user_id'] = df_ords['user_id'].astype('str')

In [25]:
# Check to see that user_id is now categorized as an object
df_ords['user_id'].dtype

dtype('O')

## 3. Change a variable with an unintuitive name without overwriting the dataframe

In [26]:
# Rename order_number to number_of_orders without the inplace argument:
# rename() returns a renamed copy (displayed below) and, because the result
# is not assigned, df_ords keeps its original column name
df_ords.rename(columns = {'order_number' : 'number_of_orders'})

Unnamed: 0,order_id,user_id,number_of_orders,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0
3421079,1854736,206209,11,4,10,30.0
3421080,626363,206209,12,1,12,18.0
3421081,2977660,206209,13,1,12,7.0


In [27]:
# Check to see that df_ords still has its original variable name
df_ords.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


## 4. Find the busiest hour for placing orders

In [28]:
# Find the frequency of the order_hour_of_day variable
df_ords['order_hour_of_day'].value_counts(dropna = False)

order_hour_of_day
10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: count, dtype: int64

### 10am is the busiest hour

## 5. Determine the meaning behind a '4' in the department_id column

In [29]:
# Call the data dictionary for index 4
data_dict['4']

{'department': 'produce'}

### Produce has a department_id of 4

## 6. Create a subset of only breakfast item sales

In [30]:
# Create subset of df_prods of only the data from the breakfast department (department_id = 14)
df_breakfast = df_prods[df_prods['department_id'] == 14]

In [31]:
df_breakfast.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6


## 7. Create a subset of items from the alcohol, deli, beverages, and meat/seafood departments

In [32]:
# Create a subset of df_prods of only data from the alcohol, deli, beverages, and meat/seafood departments 
# (department_id is in 5, 20, 7, or 12)
df_dinner_party = df_prods[df_prods['department_id'].isin([5, 20, 7, 12])]

In [33]:
df_dinner_party.head(10)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1
19,20,Pomegranate Cranberry & Aloe Vera Enrich Drink,98,7,6.0
22,23,Organic Turkey Burgers,49,12,8.2
34,35,Italian Herb Porcini Mushrooms Chicken Sausage,106,12,15.1
38,39,Daily Tangerine Citrus Flavored Beverage,64,7,12.5
39,40,Beef Hot Links Beef Smoked Sausage With Chile ...,106,12,22.5


## 8. Finding the total rows of df_dinner_party

In [34]:
# Use the shape attribute to get the (rows, columns) tuple of the subset
df_dinner_party.shape

(7650, 5)

### 7650 rows or items

## 9. Extract all information possible from the customer with user_id of 1

In [35]:
# Create a subset of df_ords of only the data where user_id = 1
df_user1 = df_ords[df_ords['user_id'] == '1']

In [36]:
df_user1

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


## 10. Describe this customer's behavior

In [37]:
# Find the descriptive statistics of the subset dataframe, df_user1
df_user1.describe()

Unnamed: 0,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
count,11.0,11.0,11.0,10.0
mean,6.0,2.636364,10.090909,19.0
std,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,7.0,0.0
25%,3.5,1.5,7.5,14.25
50%,6.0,3.0,8.0,19.5
75%,8.5,4.0,13.0,26.25
max,11.0,4.0,16.0,30.0


### On average, this customer places orders in the morning (and never beyond 4pm), and early in the week (never past Thursday). Note that order_number is the sequence number of each order, not the number of items in it: it runs from 1 to 11, so this customer has placed 11 orders in total. And on average, they go 19 days between orders.

# 05. Exporting Data

In [38]:
# Export df_ords to the Prepared Data folder as orders_wrangled.csv
# NOTE: no index argument is passed, so to_csv also writes the row index as an
# unnamed first column; pass index = False here if that column is not wanted
# (check any downstream notebook that reads this file before changing it)
df_ords.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'))