# 01. Importing libraries

In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import os

# 02. Importing Data

In [2]:
# Project root folder that contains the '02 Data' directory.
# Defaults to the original local location; set the INSTACART_PROJECT_DIR
# environment variable to run this notebook on another machine without
# editing the cell.
path = os.environ.get('INSTACART_PROJECT_DIR', r'C:\Users\Logan\OneDrive\Desktop\Instacart Basket Analysis')
path

'C:\\Users\\Logan\\OneDrive\\Desktop\\Instacart Basket Analysis'

In [3]:
# Load the three raw tables from '02 Data/Original Data' under the project root.
# index_col = False tells pandas not to use the first CSV column as the index.
original_data_dir = os.path.join(path, '02 Data', 'Original Data')
df_ords = pd.read_csv(os.path.join(original_data_dir, 'orders.csv'), index_col = False)
df_prods = pd.read_csv(os.path.join(original_data_dir, 'products.csv'), index_col = False)
df_dep = pd.read_csv(os.path.join(original_data_dir, 'departments.csv'), index_col = False)

In [4]:
# Show the first row of the orders table to check the imported columns.
df_ords.head(n = 1)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,


In [5]:
# Show the first row of the products table to check the imported columns.
df_prods.head(n = 1)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8


In [6]:
# Show the departments table as imported (up to the first five rows).
df_dep.head(n = 5)

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


# 03. Data Wrangling

In [7]:
# Remove the unnecessary 'eval_set' column from df_ords.
# errors = 'ignore' keeps this cell re-runnable: df_ords is reassigned here, so
# running the cell a second time would otherwise raise a KeyError because the
# column has already been dropped.
df_ords = df_ords.drop(columns = ['eval_set'], errors = 'ignore')
df_ords.head(1)

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,


In [8]:
# Frequency of every value in 'days_since_prior_order'.
# dropna = False keeps missing values in the tally, so NaN gets its own count.
days_since_prior = df_ords['days_since_prior_order']
days_since_prior.value_counts(dropna = False)

30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: days_since_prior_order, dtype: int64

In [9]:
# Rename the abbreviated column 'order_dow' to 'order_day_of_week'.
# The rename is applied to df_ords in place.
dow_rename = {'order_dow' : 'order_day_of_week'}
df_ords.rename(columns = dow_rename, inplace = True)
df_ords.head(n = 1)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,


In [10]:
# Convert the 'order_id' column to strings.
order_id_as_text = df_ords['order_id'].astype(str)
df_ords['order_id'] = order_id_as_text

In [11]:
# Confirm the conversion by looking up the dtype now stored for 'order_id'.
df_ords.dtypes['order_id']

dtype('O')

In [12]:
# Departments are stored wide (one column per id); flip rows and columns
# to get the long form.
df_dep_t = df_dep.transpose()

In [13]:
# Preview the transposed frame with its index moved into a regular 'index' column.
# The result is only displayed, not assigned, so df_dep_t itself is unchanged
# and keeps the department ids as its index.
df_dep_t.reset_index(drop = False)

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [14]:
# Keep the first row of df_dep_t; it is used as the header further down.
new_header = df_dep_t.iloc[0, :]

In [15]:
# Skip the first row (already captured as the header) so it is not
# duplicated as a data row once the header is applied.
df_dep_t_new = df_dep_t.iloc[1:]

In [16]:
# First five rows after removing the header row (column is still labelled 0).
df_dep_t_new.head(n = 5)

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol


In [17]:
# Use new_header (the first row of df_dep_t) as the column labels of df_dep_t_new.
df_dep_t_new.columns = new_header

In [18]:
# First five rows again, now with the new header applied.
df_dep_t_new.head(n = 5)

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol


# 04. 4.4 Task Procedures

In [19]:
# Task: find another variable in df_ords that doesn't need to be analysed as a
# numeric variable and change it to a suitable format.
# 'user_id' is converted to strings; describe() then summarises the remaining
# numeric columns.
user_id_as_text = df_ords['user_id'].astype(str)
df_ords['user_id'] = user_id_as_text
df_ords.describe()

Unnamed: 0,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3214874.0
mean,17.15486,2.776219,13.45202,11.11484
std,17.73316,2.046829,4.226088,9.206737
min,1.0,0.0,0.0,0.0
25%,5.0,1.0,10.0,4.0
50%,11.0,3.0,13.0,7.0
75%,23.0,5.0,16.0,15.0
max,100.0,6.0,23.0,30.0


In [20]:
# Task: find a variable in df_ords with an unintuitive name and rename it
# without overwriting the data frame.
# 'days_since_prior_order' becomes 'last_order_days'; inplace = True modifies
# df_ords directly instead of reassigning it.
last_order_rename = {'days_since_prior_order' : 'last_order_days'}
df_ords.rename(columns = last_order_rename, inplace = True)
df_ords.head(n = 1)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,last_order_days
0,2539329,1,1,2,8,


In [21]:
# Task: the client wants to know the busiest hour for placing orders.
# Count how often each value of 'order_hour_of_day' occurs (missing values included).
# Finding (from the output below): hour 10 has the most orders (288,418),
# followed by hours 11, 15 and 14.
orders_per_hour = df_ords['order_hour_of_day']
orders_per_hour.value_counts(dropna = False)

10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: order_hour_of_day, dtype: int64

In [22]:
# Task: determine what a value of 4 in df_prods['department_id'] means, using a
# data dictionary.
# orient = 'index' builds one entry per row of df_dep_t_new, keyed by its index label.
data_dict = df_dep_t_new.to_dict(orient = 'index')
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [23]:
# Look up department id 4; the dictionary keys are strings, hence '4'.
department_4 = data_dict.get('4')
print(department_4)

{'department': 'produce'}


In [24]:
# All products whose department_id is 4.
in_department_4 = df_prods['department_id'] == 4
df_prods.loc[in_department_4]

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
30,31,White Pearl Onions,123,4,7.5
42,43,Organic Clementines,123,4,11.5
44,45,European Cucumber,83,4,14.3
65,66,European Style Spring Mix,123,4,11.7
88,89,Yogurt Fruit Dip Sliced Apples,123,4,12.6
...,...,...,...,...,...
49582,49578,Black Garlic Bulbs,123,4,8.0
49623,49619,Opo Squash,83,4,12.7
49639,49635,"Baby Food Blueberry, Parsnip & Buckwheat Stage 2",83,4,12.5
49661,49657,Cabernet Tomatoes,83,4,8.3


In [25]:
# Task: the sales team wants to know more about breakfast item sales; create a
# subset containing only the required information.
# Department id 14 is selected; the printed dictionary entry confirms its name.
is_breakfast = df_prods['department_id'] == 14
df_breakfast = df_prods[is_breakfast]
print(data_dict.get('14'))

{'department': 'breakfast'}


In [26]:
# Display the breakfast subset (products with department_id 14).
df_breakfast

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6
...,...,...,...,...,...
49330,49326,Cereal Variety Fun Pack,121,14,9.1
49395,49391,Light and Fluffy Buttermilk Pancake Mix,130,14,2.0
49547,49543,Chocolate Cheerios Cereal,121,14,10.8
49637,49633,Shake 'N Pour Buttermilk Pancake Mix,130,14,14.2


In [27]:
# Task: the client also wants details about customers who might be throwing
# dinner parties, i.e. all observations from the entire dataframe that include
# items from the alcohol, deli, beverages and meat/seafood departments.
# This subset will be presented to the client.
# First print the dictionary entries for the candidate ids to confirm their names.
for dept_key in ('5', '20', '7', '12'):
    print(data_dict.get(dept_key))

{'department': 'alcohol'}
{'department': 'deli'}
{'department': 'beverages'}
{'department': 'meat seafood'}


In [28]:
# Keep the products that belong to any of the four dinner-party departments
# (ids 5, 20, 7 and 12).
dinner_party_dept_ids = [5,20,7,12]
in_dinner_party_dept = df_prods['department_id'].isin(dinner_party_dept_ids)
df_dinner_parties = df_prods[in_dinner_party_dept]
df_dinner_parties

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1
...,...,...,...,...,...
49676,49672,Cafe Mocha K-Cup Packs,26,7,6.5
49679,49675,Cinnamon Dolce Keurig Brewed K Cups,26,7,14.0
49680,49676,Ultra Red Energy Drink,64,7,14.5
49686,49682,California Limeade,98,7,4.3


In [29]:
# Task: keep track of total row counts in your dataframes.
# How many rows does the last dataframe you created (df_dinner_parties) have?
# Answer: 7650 rows × 5 columns, as reported by info() below.
df_dinner_parties.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7650 entries, 2 to 49688
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   product_id     7650 non-null   int64  
 1   product_name   7647 non-null   object 
 2   aisle_id       7650 non-null   int64  
 3   department_id  7650 non-null   int64  
 4   prices         7650 non-null   float64
dtypes: float64(1), int64(3), object(1)
memory usage: 358.6+ KB


In [30]:
# Task: the data engineering team thinks something is strange about the customer
# with a "user_id" of "1"; extract all the information available about this user.
# The comparison uses the string '1', not the integer 1.
is_user_1 = df_ords['user_id'] == '1'
df_user_1 = df_ords.loc[is_user_1]
df_user_1.head(n = 5)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,last_order_days
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [31]:
# Task: provide some details about this user's behaviour.
# describe() gives basic statistics for the numeric columns of the user-1 subset.
user_1_stats = df_user_1.describe()
user_1_stats

Unnamed: 0,order_number,order_day_of_week,order_hour_of_day,last_order_days
count,11.0,11.0,11.0,10.0
mean,6.0,2.636364,10.090909,19.0
std,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,7.0,0.0
25%,3.5,1.5,7.5,14.25
50%,6.0,3.0,8.0,19.5
75%,8.5,4.0,13.0,26.25
max,11.0,4.0,16.0,30.0


In [37]:
# Check the dimensions of df_ords (rows, columns).
df_ords.shape

(3421083, 6)

In [38]:
# Export df_ords as a CSV file in the "Prepared Data" folder.
# NOTE(review): the task asks for "orders_wrangled.csv", but the file written
# here is 'orders_wrangled_2.csv' — confirm which name later notebooks expect.
orders_out_path = os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled_2.csv')
df_ords.to_csv(orders_out_path)

In [34]:
# Export df_dep_t_new as "departments_wrangled.csv" in the "Prepared Data"
# folder, so the departments data is available as a ".csv" file in the
# transposed (long) format.
departments_out_path = os.path.join(path, '02 Data', 'Prepared Data', 'departments_wrangled.csv')
df_dep_t_new.to_csv(departments_out_path)