# Wrangling & Subsetting Data

## Lecture Steps

In [1]:
# Importing libraries
import os
import pandas as pd
import numpy as np

In [3]:
# master project path: read from the INSTACART_PROJECT_PATH environment variable
# when it is set, otherwise fall back to the original local folder, so the
# notebook can run on another machine without editing this cell
path = os.environ.get('INSTACART_PROJECT_PATH', r'/Users/Norberto/Desktop/2023-10 Instacart Basket Analysis')

# importing orders data file
df_ords = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'orders.csv'))


In [4]:
# summary statistics for the orders data as loaded
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


In [10]:
# eval_set is not needed for this analysis; drop it and rebind df_ords to the result
df_ords = df_ords.drop('eval_set', axis = 1)

In [11]:
# give the day-of-week column a more descriptive name
df_ords = df_ords.rename(columns = {'order_dow' : 'orders_day_of_week'})

In [12]:
# store both identifier columns (order_id and user_id) as strings so that
# describe() no longer reports numerical stats for them
df_ords = df_ords.astype({'order_id' : 'str', 'user_id' : 'str'})

In [13]:
# importing departments data file
dep_file = os.path.join(path, '02 Data', 'Original Data', 'departments.csv')
df_dep = pd.read_csv(dep_file, index_col = False)

In [14]:
# transpose data set and assign to new variable
df_dep_t=df_dep.T

# preview the transposed data with the index moved into a column; the result
# is only displayed, not assigned, so df_dep_t itself keeps its original index
df_dep_t.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [15]:
# grab titles for columns in transposed dataframe: its first row holds them
new_header = df_dep_t.iloc[0]

In [16]:
# create new transposed dataframe that skips the first row (the header row)
df_dep_t_new = df_dep_t.iloc[1:]

In [17]:
# assign proper column titles to new transposed df (replaces its column labels)
df_dep_t_new.columns = new_header

In [18]:
# build the data dictionary used for subsetting: one entry per row of the
# departments frame, keyed by that row's index label
data_dict = df_dep_t_new.to_dict(orient = 'index')

In [19]:
# importing products data file
prods_file = os.path.join(path, '02 Data', 'Original Data', 'products.csv')
df_prods = pd.read_csv(prods_file)

In [20]:
# new dataframe holding only snack items (department_id 19 per the data dictionary)
is_snack = df_prods['department_id'].eq(19)
df_snacks = df_prods.loc[is_snack]

## Task

### Step 2

Find another identifier variable in the df_ords dataframe that doesn’t need to be included in your analysis as a numeric variable and change it to a suitable format.

In [21]:
# view which columns describe() currently summarizes, along with their stats
df_ords.describe()

Unnamed: 0,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3214874.0
mean,17.15486,2.776219,13.45202,11.11484
std,17.73316,2.046829,4.226088,9.206737
min,1.0,0.0,0.0,0.0
25%,5.0,1.0,10.0,4.0
50%,11.0,3.0,13.0,7.0
75%,23.0,5.0,16.0,15.0
max,100.0,6.0,23.0,30.0


In [22]:
# change 'user_id' column to string since it isn't providing valuable numerical data
# (the lecture steps above already cast user_id to str, so this cast changes nothing)
df_ords['user_id'] = df_ords['user_id'].astype('str')
# confirm the resulting dtype
df_ords['user_id'].dtype

dtype('O')

### Step 3

Look for a variable in your df_ords dataframe with an unintuitive name and change its name without overwriting the dataframe.

In [23]:
# list column names before any renaming in this step
df_ords.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_hour_of_day', 'days_since_prior_order'],
      dtype='object')

In [24]:
# remove 's' from orders_day_of_week column without overwriting df_ords:
# rename() returns a new DataFrame by default, so df_ords keeps its original
# column names and no slice of it is mutated in place
df_ords_new = df_ords.rename(columns = {'orders_day_of_week' : 'order_day_of_week'})
df_ords_new.head(2)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0


In [25]:
# original dataframe: its column is still named orders_day_of_week
df_ords.head(2)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0


### Step 4

Your client wants to know what the busiest hour is for placing orders. Find the frequency of the corresponding variable and share your findings.

In [26]:
# frequency table for hour of day column, including all values
# (dropna = False also counts missing values, if any)
df_ords['order_hour_of_day'].value_counts(dropna = False)

order_hour_of_day
10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: count, dtype: int64

The busiest hour of the day for placing orders is hour 10 (10 a.m. on the 24-hour scale used in the data), with 288,418 orders.

### Step 5

Determine the meaning behind a value of 4 in the "department_id" column within the df_prods dataframe using a data dictionary.

In [27]:
# retrieve the element with key '4' from the data dictionary created in Lecture Steps
# (the key is a string; .get returns None if it is absent)
data_dict.get('4')

{'department': 'produce'}

### Step 6

The sales team in your client’s organization wants to know more about breakfast item sales. Create a subset containing only the required information.

In [28]:
# subset of products in department 14 (breakfast per the data dictionary)
is_breakfast = df_prods['department_id'].eq(14)
df_breakfast = df_prods.loc[is_breakfast]
df_breakfast

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6
...,...,...,...,...,...
49330,49326,Cereal Variety Fun Pack,121,14,9.1
49395,49391,Light and Fluffy Buttermilk Pancake Mix,130,14,2.0
49547,49543,Chocolate Cheerios Cereal,121,14,10.8
49637,49633,Shake 'N Pour Buttermilk Pancake Mix,130,14,14.2


### Step 7

They’d also like to see details about products that customers might use to throw dinner parties. Your task is to find all observations from the entire dataframe that include items from the following departments: alcohol, deli, beverages, and meat/seafood. You’ll need to present this subset to your client.

In [29]:
# list department keys and values to look up the ids of the requested departments
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [30]:
# department ids from the data dictionary: 5 alcohol, 20 deli, 7 beverages, 12 meat seafood
dinner_party_ids = [5,20,7,12]
in_dinner_party_dept = df_prods['department_id'].isin(dinner_party_ids)
df_dinner_parties = df_prods.loc[in_dinner_party_dept]
df_dinner_parties

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1
...,...,...,...,...,...
49676,49672,Cafe Mocha K-Cup Packs,26,7,6.5
49679,49675,Cinnamon Dolce Keurig Brewed K Cups,26,7,14.0
49680,49676,Ultra Red Energy Drink,64,7,14.5
49686,49682,California Limeade,98,7,4.3


### Step 8

It’s important that you keep track of total counts in your dataframes. How many rows does the last dataframe you created have?

In [31]:
# list the dimensions of the subset as (rows, columns)
df_dinner_parties.shape

(7650, 5)

The dinner-parties subset (df_dinner_parties) has 7,650 rows and 5 columns.

### Step 9

Someone from the data engineers team in Instacart thinks they’ve spotted something strange about the customer with a "user_id" of “1.” Extract all the information you can about this user.

In [32]:
# subset of orders whose user_id equals the user in question (ids are stored as strings)
is_user_1 = df_ords['user_id'].eq('1')
df_strange_user = df_ords.loc[is_user_1]
df_strange_user

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


### Step 10

You also need to provide some details about this user’s behavior. What basic stats can you provide based on the information you have?

In [33]:
# descriptive statistics computed on this user's orders only
df_strange_user.describe()

Unnamed: 0,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,11.0,11.0,11.0,10.0
mean,6.0,2.636364,10.090909,19.0
std,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,7.0,0.0
25%,3.5,1.5,7.5,14.25
50%,6.0,3.0,8.0,19.5
75%,8.5,4.0,13.0,26.25
max,11.0,4.0,16.0,30.0


### Steps 12 & 13

Export your df_ords dataframe as “orders_wrangled.csv” in your “Prepared Data” folder.

Export the df_dep_t_new dataframe as “departments_wrangled.csv” in your “Prepared Data” folder so that you have a “.csv” file of your departments data in the correct format.

In [34]:
# check the summary statistics of the frame being exported
df_ords.describe()

Unnamed: 0,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3214874.0
mean,17.15486,2.776219,13.45202,11.11484
std,17.73316,2.046829,4.226088,9.206737
min,1.0,0.0,0.0,0.0
25%,5.0,1.0,10.0,4.0
50%,11.0,3.0,13.0,7.0
75%,23.0,5.0,16.0,15.0
max,100.0,6.0,23.0,30.0


In [36]:
# check the column data types of the frame being exported
df_ords.dtypes

order_id                   object
user_id                    object
order_number                int64
orders_day_of_week          int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

In [35]:
# folder that receives the wrangled exports
prepared_dir = os.path.join(path, '02 Data', 'Prepared Data')

# export wrangled orders data to its own csv file
df_ords.to_csv(os.path.join(prepared_dir, 'orders_wrangled.csv'))

# export wrangled (transposed) departments data to its own csv file
df_dep_t_new.to_csv(os.path.join(prepared_dir, 'departments_wrangled.csv'))