# 01 Importing libraries

In [1]:
import pandas as pd
import numpy as np
import os

# 02 Importing Data

In [4]:
path = r'/Users/woodoooo/Desktop/Instacart Basket Analysis/'

#orders data
df_ords = pd.read_csv(os.path.join(path, '02 Data', 'Original data', 'orders.csv'), index_col = False)

#products data
df_prods = pd.read_csv(os.path.join(path, '02 Data', 'Original data', 'products.csv'), index_col = False)

#departments data
df_dep = pd.read_csv(os.path.join(path, '02 Data', 'Original data', 'departments.csv'), index_col = False)

# 03 Data wrangling (orders.csv)

In [16]:
# removind eval_set column

df_ords_2 = df_ords.drop(columns = ['eval_set'])

# re-naming 'order_dow' to 'order_of_the_week
df_ords_2.rename(columns = {'order_dow':'order_day_of_week'}, inplace = True)

In [17]:
df_ords_2.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [18]:
# 'order_id' and 'user_id don’t need to be included in the analysis as a numeric variable
# both have been changed to a string data type

df_ords_2[['order_id','user_id']] = df_ords_2[['order_id','user_id']].astype('str')

# 04 Data wrangling (departments.csv)

In [22]:
df_dep

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [23]:
# transposing department data
df_dep.T

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta


In [28]:
# create new dataframe for transposed department data
df_dep_t = df_dep.T

# adding index
df_dep_t.reset_index()

# make a new header (first row of df_dep_t)
new_header = df_dep_t.iloc[0]

# checking
new_header

0    department
Name: department_id, dtype: object

In [29]:
# creating a new dataframe fromm the 1st row
df_dep_t_new = df_dep_t[1:]

In [30]:
df_dep_t_new

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [32]:
# adding new header
df_dep_t_new.columns = new_header

In [33]:
df_dep_t_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


# 05 Data Dictionary for Department_id

In [35]:
data_dict = df_dep_t_new.to_dict('index')

In [36]:
data_dict

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

## 06 Business Questions

In [21]:
# The busiest hour for placing orders is from 10AM till 11AM. 
df_ords_2['order_hour_of_day'].value_counts(dropna = False)

order_hour_of_day
10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: count, dtype: int64

In [37]:
# Determine the meaning behind a value of 4 in the "department_id" column within the df_prods dataframe using a data dictionary.

print(data_dict.get('4'))

{'department': 'produce'}


In [39]:
(df_prods['department_id'] == 4).sum()

np.int64(1684)

In [40]:
df_produce = df_prods.loc[df_prods['department_id'] == 4]

In [41]:
df_produce

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
30,31,White Pearl Onions,123,4,7.5
42,43,Organic Clementines,123,4,11.5
44,45,European Cucumber,83,4,14.3
65,66,European Style Spring Mix,123,4,11.7
88,89,Yogurt Fruit Dip Sliced Apples,123,4,12.6
...,...,...,...,...,...
49582,49578,Black Garlic Bulbs,123,4,8.0
49623,49619,Opo Squash,83,4,12.7
49639,49635,"Baby Food Blueberry, Parsnip & Buckwheat Stage 2",83,4,12.5
49661,49657,Cabernet Tomatoes,83,4,8.3


In [42]:
# They’d also like to see details about products that customers might use to throw dinner parties. 
# Your task is to find all observations from the entire dataframe that include items from the following departments: 
# alcohol, deli, beverages, and meat/seafood. You’ll need to present this subset to your client.

In [51]:
# Creating a subset for dinner parties

df_dinner_parties = df_prods.loc[df_prods['department_id'].isin([5,7,19,20])]
df_dinner_parties.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8


In [60]:
# How many rows does the last dataframe you created have?
df_dinner_parties.shape

(13007, 5)

In [58]:
df_dinner_parties['prices'].describe()

count    13007.000000
mean         6.190221
std          3.609979
min          1.000000
25%          3.400000
50%          5.300000
75%          8.200000
max         15.000000
Name: prices, dtype: float64

In [59]:
df_dinner_parties['prices'].describe().round(0)

count    13007.0
mean         6.0
std          4.0
min          1.0
25%          3.0
50%          5.0
75%          8.0
max         15.0
Name: prices, dtype: float64

In [61]:
# Extract all the information about the user_id 1


In [62]:
# set a data frame for user_id 1
df_user1 = df_ords_2.loc[df_ords_2['user_id'] == '1']

In [63]:
df_user1

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


In [66]:
df_user1.describe()

Unnamed: 0,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order
count,11.0,11.0,11.0,10.0
mean,6.0,2.636364,10.090909,19.0
std,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,7.0,0.0
25%,3.5,1.5,7.5,14.25
50%,6.0,3.0,8.0,19.5
75%,8.5,4.0,13.0,26.25
max,11.0,4.0,16.0,30.0


In [67]:
# Basic stats based on the information re user_id 1

# 11 total orders (order_number from 1 to 11).

# Buys regularly — average gap between orders is 19 days, ranging from 0 to 30 days.

# Orders at various times of day, but most often around 10 AM on average, with actual times ranging from 7 AM to 4 PM.

# Orders on different days of the week : average is around day 2 or 3, ranging from day 1 to day 4.

# 03 Exporting Data

In [68]:
df_ords_2.to_csv(os.path.join(path, '02 Data','Prepared Data', 'orders_wrangled_task_4.4.csv'))

In [69]:
df_dep_t_new.to_csv(os.path.join(path, '02 Data','Prepared Data', 'departments_wrangled.csv'))