### **INDEX**
1. Setup Notebook
2. Following Exercise Instructions
3. Exercise Directions

 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .

### **1 SETUP NOTEBOOK**

In [2]:
# import libraries
import pandas as pd
import numpy as np
import os
from pathlib import Path

In [4]:
# resolve the project root: the parent folder of the current working directory
# (os.getcwd() returns a plain string, so `path` is a str, not a pathlib.Path)
path = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

In [6]:
# load the raw orders table from '02 Data/Original Data' under the project root
df_orders = pd.read_csv(
    filepath_or_buffer=os.path.join(path, '02 Data', 'Original Data', 'orders.csv')
)

In [8]:
# load the raw products table from '02 Data/Original Data' under the project root
df_products = pd.read_csv(
    filepath_or_buffer=os.path.join(path, '02 Data', 'Original Data', 'products.csv')
)

In [10]:
# load the raw departments table from '02 Data/Original Data' under the project root
df_departments = pd.read_csv(
    filepath_or_buffer=os.path.join(path, '02 Data', 'Original Data', 'departments.csv')
)

 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .

### **2 FOLLOWING EXERCISE INSTRUCTIONS**

**DATA WRANGLING**

In [12]:
# check orders.csv: preview the first rows of the raw orders table
df_orders.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [28]:
# orders.csv: build a working copy without the eval_set column;
# df_orders itself is left untouched
df_orders_dropped = df_orders.drop(columns='eval_set')

In [32]:
# orders.csv: find missing values
# dropna=False keeps NaN as its own category, so the number of missing
# entries appears in the frequency table alongside the real values
df_orders_dropped['days_since_prior_order'].value_counts(dropna =False)

days_since_prior_order
30.0    369323
7.0     320608
6.0     240013
4.0     221696
3.0     217005
5.0     214503
NaN     206209
2.0     193206
8.0     181717
1.0     145247
9.0     118188
14.0    100230
10.0     95186
13.0     83214
11.0     80970
12.0     76146
0.0      67755
15.0     66579
16.0     46941
21.0     45470
17.0     39245
20.0     38527
18.0     35881
19.0     34384
22.0     32012
28.0     26777
23.0     23885
27.0     22013
24.0     20712
25.0     19234
29.0     19191
26.0     19016
Name: count, dtype: int64

In [34]:
# orders.csv: give the day-of-week column a self-explanatory name
# (rebinding the result instead of mutating in place)
df_orders_dropped = df_orders_dropped.rename(
    columns={'order_dow': 'orders_day_of_week'}
)

In [36]:
# orders.csv: confirm that the renamed column (orders_day_of_week) is in place
df_orders_dropped.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


In [40]:
# orders.csv: order_id is an identifier, not a quantity, so store it as a string
df_orders_dropped = df_orders_dropped.astype({'order_id': 'str'})

In [42]:
# check departments.csv
# the raw table is wide: a single 'department' row with one column per department id
df_departments.head()

Unnamed: 0,department_id,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,department,frozen,other,bakery,produce,alcohol,international,beverages,pets,dry goods pasta,...,meat seafood,pantry,breakfast,canned goods,dairy eggs,household,babies,snacks,deli,missing


In [17]:
# transpose departments into a new dataframe so that each department id becomes a row
df_departments_tp = df_departments.transpose()

In [44]:
# check transposed departments.csv
# the former column labels are now the row index; the old header row
# ('department_id' / 'department') is the first data row
df_departments_tp.head()

Unnamed: 0,0
department_id,department
1,frozen
2,other
3,bakery
4,produce


In [46]:
# preview departments with a fresh positional index
# NOTE: the result is only displayed, not assigned, so df_departments_tp keeps
# the department ids as its index for the cells that follow
df_departments_tp.reset_index()

Unnamed: 0,index,0
0,department_id,department
1,1,frozen
2,2,other
3,3,bakery
4,4,produce
5,5,alcohol
6,6,international
7,7,beverages
8,8,pets
9,9,dry goods pasta


In [48]:
# take the first row of the transposed table (it holds the column title
# 'department') to use as the new header
new_header = df_departments_tp.iloc[0, :]

In [50]:
# inspect the header row: a one-element Series named after its row label
new_header

0    department
Name: department_id, dtype: object

In [52]:
# create a new table that skips row 0 (the header row) and keeps every
# department row after it
df_departments_new = df_departments_tp.iloc[1:]

In [54]:
# check the table without the header row (the column is still labelled 0)
df_departments_new

Unnamed: 0,0
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


In [60]:
# add in new header to departments
# new_header is a Series whose single value is 'department', so the lone column
# gets that label; the Series name ('department_id') appears to carry over as
# the name of the columns index -- see the rendered table below
df_departments_new.columns = new_header

In [62]:
# check the final departments table: department ids as index, one 'department' column
df_departments_new

department_id,department
1,frozen
2,other
3,bakery
4,produce
5,alcohol
6,international
7,beverages
8,pets
9,dry goods pasta
10,bulk


 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .

**DATA DICTIONARY**

In [75]:
# build a lookup keyed by the row index (department id); each value is a
# {column: value} dict for that row
data_dictionary = df_departments_new.to_dict(orient='index')

In [77]:
# inspect the lookup; note that the department ids are string keys ('1', '2', ...)
data_dictionary

{'1': {'department': 'frozen'},
 '2': {'department': 'other'},
 '3': {'department': 'bakery'},
 '4': {'department': 'produce'},
 '5': {'department': 'alcohol'},
 '6': {'department': 'international'},
 '7': {'department': 'beverages'},
 '8': {'department': 'pets'},
 '9': {'department': 'dry goods pasta'},
 '10': {'department': 'bulk'},
 '11': {'department': 'personal care'},
 '12': {'department': 'meat seafood'},
 '13': {'department': 'pantry'},
 '14': {'department': 'breakfast'},
 '15': {'department': 'canned goods'},
 '16': {'department': 'dairy eggs'},
 '17': {'department': 'household'},
 '18': {'department': 'babies'},
 '19': {'department': 'snacks'},
 '20': {'department': 'deli'},
 '21': {'department': 'missing'}}

In [71]:
# check products.csv: preview the first rows, including the department_id column
df_products.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [79]:
# look up department 19; the dictionary keys are strings, so the id is passed
# as '19' even though df_products stores department_id as a number
print(data_dictionary.get('19'))

{'department': 'snacks'}


 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .

**SUBSETTING**

In [86]:
# products.csv: keep only the rows that belong to department 19 (snacks)
df_snacks = df_products.loc[df_products['department_id'].eq(19)]

 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .

**EXPORTING**

In [108]:
# export the wrangled orders table to '02 Data/Prepared Data'
# NOTE(review): the row index is written as well (pandas default); pass
# index=False if the positional index is not wanted in the file
df_orders_dropped.to_csv(
    path_or_buf=os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv')
)

In [93]:
# export the products table to '02 Data/Prepared Data'
# NOTE(review): the row index is written as well (pandas default); pass
# index=False if the positional index is not wanted in the file
df_products.to_csv(
    path_or_buf=os.path.join(path, '02 Data', 'Prepared Data', 'products_wrangled.csv')
)

In [95]:
# export the reshaped departments table to '02 Data/Prepared Data'
# the index holds the department ids here, so it is written out with the data
df_departments_new.to_csv(
    path_or_buf=os.path.join(path, '02 Data', 'Prepared Data', 'departments_wrangled.csv')
)

 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .

### **3 EXERCISE DIRECTIONS**

In [122]:
# evaluate orders: preview the first 20 rows of the wrangled orders table
df_orders_dropped.head(20)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


In [126]:
# dir2 data types orders
# list the dtype of every column to spot identifier columns stored as numbers
# NOTE(review): the saved output already shows user_id as object although the
# cast cell sits below this one -- the cells were run out of order; re-run the
# notebook top to bottom to refresh the output
df_orders_dropped.dtypes

order_id                  object
user_id                   object
order_number               int64
orders_day_of_week         int64
order_hour_of_day          int64
days_since_last_order    float64
dtype: object

In [128]:
# dir2: user_id is an identifier, not a quantity, so store it as a string
df_orders_dropped = df_orders_dropped.astype({'user_id': 'str'})

In [118]:
# dir3 rename unintuitive column orders
# NOTE(review): preview cells placed above this one already show the new name,
# so this cell was executed before them; consider moving it up so the notebook
# reads in execution order
df_orders_dropped = df_orders_dropped.rename(
    columns={'days_since_prior_order': 'days_since_last_order'}
)

In [144]:
# dir4 most popular order time
# count orders per hour of day; the busiest hour is listed first
df_orders_dropped['order_hour_of_day'].value_counts()

order_hour_of_day
10    288418
11    284728
15    283639
14    283042
13    277999
12    272841
16    272553
9     257812
17    228795
18    182912
8     178201
19    140569
20    104292
7      91868
21     78109
22     61468
23     40043
6      30529
0      22758
1      12398
5       9569
2       7539
4       5527
3       5474
Name: count, dtype: int64

> **10 a.m. is the hour in which the most orders are placed (288,418 orders).**

In [179]:
# dir5 meaning department_id 4 in products
# the data dictionary is keyed by strings, so the id is passed as '4'
print(data_dictionary.get('4'))

{'department': 'produce'}


In [187]:
# dir6 subset breakfast item sales
# department 14 is 'breakfast' in the data dictionary
df_breakfast_sales = df_products.loc[df_products['department_id'].eq(14)]

In [189]:
# check the breakfast subset (every row should carry department_id 14)
df_breakfast_sales

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
27,28,Wheat Chex Cereal,121,14,10.1
33,34,,121,14,12.2
67,68,"Pancake Mix, Buttermilk",130,14,13.7
89,90,Smorz Cereal,121,14,3.9
210,211,Gluten Free Organic Cereal Coconut Maple Vanilla,130,14,3.6
...,...,...,...,...,...
49330,49326,Cereal Variety Fun Pack,121,14,9.1
49395,49391,Light and Fluffy Buttermilk Pancake Mix,130,14,2.0
49547,49543,Chocolate Cheerios Cereal,121,14,10.8
49637,49633,Shake 'N Pour Buttermilk Pancake Mix,130,14,14.2


In [197]:
# dir 7 subset dinner party sales
# department ids per the data dictionary:
# 5 = alcohol, 7 = beverages, 12 = meat seafood, 20 = deli
df_dinner_party = df_products[
    df_products['department_id'].isin([5, 7, 12, 20])
]

In [199]:
# check the dinner party subset (rows from departments 5, 7, 12 and 20 only)
df_dinner_party

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
6,7,Pure Coconut Water With Orange,98,7,4.4
9,10,Sparkling Orange Juice & Prickly Pear Beverage,115,7,8.4
10,11,Peach Mango Juice,31,7,2.8
16,17,Rendered Duck Fat,35,12,17.1
...,...,...,...,...,...
49676,49672,Cafe Mocha K-Cup Packs,26,7,6.5
49679,49675,Cinnamon Dolce Keurig Brewed K Cups,26,7,14.0
49680,49676,Ultra Red Energy Drink,64,7,14.5
49686,49682,California Limeade,98,7,4.3


In [201]:
# dir 8 number of rows last data frame
# compute the row count instead of quoting it by hand, so the figure stays
# reproducible whenever the notebook is re-run
df_dinner_party.shape[0]

**The dinner party data frame has 7650 rows**

In [209]:
# dir9 customer information user_id 1
# user_id has been cast to str in this notebook, so the comparison value is '1'
df_user_id_1 = df_orders_dropped.loc[df_orders_dropped['user_id'].eq('1')]

In [211]:
# check all orders placed by user 1
df_user_id_1

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


In [213]:
# dir10 gather basic stats for user 1 (computed in the next cell)

In [215]:
# summary statistics for user 1's orders; the output covers the numeric columns
# only -- order_id and user_id are strings, so they do not appear in it
df_user_id_1.describe()

Unnamed: 0,order_number,orders_day_of_week,order_hour_of_day,days_since_last_order
count,11.0,11.0,11.0,10.0
mean,6.0,2.636364,10.090909,19.0
std,3.316625,1.286291,3.477198,9.030811
min,1.0,1.0,7.0,0.0
25%,3.5,1.5,7.5,14.25
50%,6.0,3.0,8.0,19.5
75%,8.5,4.0,13.0,26.25
max,11.0,4.0,16.0,30.0


**SAVING & EXPORTING**

In [221]:
# dir 12 export orders
# re-export so the file reflects the user_id cast and the column rename above;
# this overwrites the orders_wrangled.csv written earlier in the notebook
df_orders_dropped.to_csv(
    path_or_buf=os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv')
)

In [223]:
# dir 13 export departments
# overwrites the departments_wrangled.csv written earlier in the notebook;
# the index (department ids) is written out with the data
df_departments_new.to_csv(
    path_or_buf=os.path.join(path, '02 Data', 'Prepared Data', 'departments_wrangled.csv')
)