# Table Of Contents

- Import Libraries

- Load files and look at dataframes

- Optimize dataframes for merging

- Perform different merges on data

- Compare output of merges

- Export dataframe to pkl

In [1]:
# importing libraries
import pandas as pd
import numpy as np
import os

# setting master path
# The project root can be overridden through the INSTACART_PROJECT_DIR environment
# variable so the notebook also runs on machines with a different folder layout.
# When the variable is unset, the original location is used unchanged.
path = os.environ.get(
    'INSTACART_PROJECT_DIR',
    r'/Users/Norberto/Desktop/2023-10 Instacart Basket Analysis',
)

## Step 3 & 4

3. Import the orders_products_combined dataframe from the pickle file you just saved.

4. Check the shape of the imported dataframe (it should be the same as the one you exported—always check!).

In [2]:
# Load the combined orders/products frame from the Prepared Data folder.
ords_prods_pkl = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_combined.pkl')
df_ords_prods = pd.read_pickle(ords_prods_pkl)

# Keep the (rows, columns) tuple: it is the baseline for the merge comparison further down.
shape_ords_prods = df_ords_prods.shape
print(shape_ords_prods)

(32434489, 10)


## Step 5

5. Determine a suitable way to combine the orders_products_combined dataframe with your products data set. Make sure you’re using your wrangled, cleaned, and deduped products data set stored in your “Prepared Data” folder from the previous Exercise’s task.


In [3]:
# Preview the first five rows of the combined orders/products frame to inspect its columns.
df_ords_prods.head(n=5)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge
0,2539329,1,1,2,8,,196,1,0,both
1,2539329,1,1,2,8,,14084,2,0,both
2,2539329,1,1,2,8,,12427,3,0,both
3,2539329,1,1,2,8,,26088,4,0,both
4,2539329,1,1,2,8,,26405,5,0,both


In [4]:
# Load the checked products file; the first CSV column is used as the index.
prods_csv = os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv')
df_prods = pd.read_csv(prods_csv, index_col=0)

# Record and show the (rows, columns) tuple of the products frame.
shape_prods = df_prods.shape
print(shape_prods)

(49672, 5)


In [5]:
# Preview the first five product rows to inspect the available columns.
df_prods.head(n=5)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,1,Chocolate Sandwich Cookies,61,19,5.8
1,2,All-Seasons Salt,104,13,9.3
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,5,Green Chile Anytime Sauce,5,13,4.3


In [6]:
# Remove the indicator column left over from the previous merge; merging again with
# indicator=True would otherwise fail because a `_merge` column already exists.
# drop(..., errors='ignore') keeps this cell safe to re-run: a second run is a no-op,
# whereas `del df_ords_prods['_merge']` raises KeyError once the column is gone.
df_ords_prods = df_ords_prods.drop(columns=['_merge'], errors='ignore')

#### Inner MERGE

In [7]:
# Inner join on product_id: only order lines with a matching product row are kept.
# indicator=True adds a `_merge` column recording where each row came from.
df_merged_inner = pd.merge(
    df_ords_prods,
    df_prods,
    how='inner',
    on='product_id',
    indicator=True,
)
shape_merged_inner = df_merged_inner.shape
print(shape_merged_inner)

(32404859, 14)


In [8]:
# Preview the first five rows of the inner-merge result.
df_merged_inner.head(n=5)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,both
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both


#### Left MERGE

In [9]:
# Left join on product_id: every order line is kept, with product columns left
# empty where no product row matches. `_merge` flags those rows as 'left_only'.
df_merged_left = pd.merge(
    df_ords_prods,
    df_prods,
    how='left',
    on='product_id',
    indicator=True,
)
shape_merged_left = df_merged_left.shape
print(shape_merged_left)

(32435059, 14)


In [10]:
# Preview the first five rows of the left-merge result.
df_merged_left.head(n=5)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge
0,2539329,1,1,2,8,,196,1,0,Soda,77.0,7.0,9.0,both
1,2539329,1,1,2,8,,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91.0,16.0,12.5,both
2,2539329,1,1,2,8,,12427,3,0,Original Beef Jerky,23.0,19.0,4.4,both
3,2539329,1,1,2,8,,26088,4,0,Aged White Cheddar Popcorn,23.0,19.0,4.7,both
4,2539329,1,1,2,8,,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54.0,17.0,1.0,both


#### Right MERGE

In [11]:
# Right join on product_id: every product row is kept, with order columns left
# empty where no order line matches. `_merge` flags those rows as 'right_only'.
df_merged_right = pd.merge(
    df_ords_prods,
    df_prods,
    how='right',
    on='product_id',
    indicator=True,
)
shape_merged_right = df_merged_right.shape
print(shape_merged_right)

(32404870, 14)


In [12]:
# Preview the first five rows of the right-merge result.
df_merged_right.head(n=5)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge
0,3139998.0,138.0,28.0,6.0,11.0,3.0,1,5.0,0.0,Chocolate Sandwich Cookies,61,19,5.8,both
1,1977647.0,138.0,30.0,6.0,17.0,20.0,1,1.0,1.0,Chocolate Sandwich Cookies,61,19,5.8,both
2,389851.0,709.0,2.0,0.0,21.0,6.0,1,20.0,0.0,Chocolate Sandwich Cookies,61,19,5.8,both
3,652770.0,764.0,1.0,3.0,13.0,,1,10.0,0.0,Chocolate Sandwich Cookies,61,19,5.8,both
4,1813452.0,764.0,3.0,4.0,17.0,9.0,1,11.0,1.0,Chocolate Sandwich Cookies,61,19,5.8,both


#### Outer MERGE

In [13]:
# Outer join on product_id: rows from both frames are kept whether or not they match.
# `_merge` records 'both', 'left_only' or 'right_only' for each row.
df_merged_outer = pd.merge(
    df_ords_prods,
    df_prods,
    how='outer',
    on='product_id',
    indicator=True,
)
shape_merged_outer = df_merged_outer.shape
print(shape_merged_outer)

(32435070, 14)


In [14]:
# Preview the first five rows of the outer-merge result.
df_merged_outer.head(n=5)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge
0,2539329.0,1.0,1.0,2.0,8.0,,196,1.0,0.0,Soda,77.0,7.0,9.0,both
1,2398795.0,1.0,2.0,3.0,7.0,15.0,196,1.0,1.0,Soda,77.0,7.0,9.0,both
2,473747.0,1.0,3.0,3.0,12.0,21.0,196,1.0,1.0,Soda,77.0,7.0,9.0,both
3,2254736.0,1.0,4.0,4.0,7.0,29.0,196,1.0,1.0,Soda,77.0,7.0,9.0,both
4,431534.0,1.0,5.0,4.0,15.0,28.0,196,1.0,1.0,Soda,77.0,7.0,9.0,both


## Step 6

6. Confirm the results of the merge using the merge flag.

In [15]:
# Row-count report: rows of the input frame minus rows of each merge result.
# A negative difference means the merge returned more rows than the input frame.
baseline_rows = shape_ords_prods[0]
merged_row_counts = (
    ("INNER", shape_merged_inner[0]),
    ("LEFT", shape_merged_left[0]),
    ("RIGHT", shape_merged_right[0]),
    ("OUTER", shape_merged_outer[0]),
)

print("ords_prods_rows - \t merge_rows \t\t= \t difference")
for join_label, merged_rows in merged_row_counts:
    print("{} \t- \t({}) {} \t= \t{} rows".format(baseline_rows, merged_rows, join_label, baseline_rows - merged_rows))

ords_prods_rows - 	 merge_rows 		= 	 difference
32434489 	- 	(32404859) INNER 	= 	29630 rows
32434489 	- 	(32435059) LEFT 	= 	-570 rows
32434489 	- 	(32404870) RIGHT 	= 	29619 rows
32434489 	- 	(32435070) OUTER 	= 	-581 rows


In [16]:
# Isolate the left-merge rows whose `_merge` flag is anything other than 'both',
# i.e. rows that did not find a match in both frames.
not_in_both = df_merged_left['_merge'] != 'both'
df_incomplete = df_merged_left[not_in_both]
df_incomplete

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge
2963,7099,27,63,3,10,1.0,6799,1,0,,,,,left_only
3205,1837192,27,80,2,8,6.0,6799,9,1,,,,,left_only
3253,3331846,28,3,0,16,2.0,34,6,0,,,,,left_only
3395,1385910,28,21,2,17,1.0,34,1,1,,,,,left_only
4694,2873174,38,1,0,10,,116,11,0,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32425294,443233,206139,2,1,17,17.0,34,2,1,,,,,left_only
32425300,1507543,206139,3,1,8,14.0,34,2,1,,,,,left_only
32427098,1282274,206155,4,6,9,3.0,2240,18,0,,,,,left_only
32431906,1586626,206194,9,1,20,17.0,1511,6,0,,,,,left_only


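The rows above are order lines whose product ID has no match in the products data set, so their product name, aisle, department, and price are missing. The left merge keeps these unmatched rows in the final output; they amount to 30,200 records. The left merge also returns 570 more rows than orders_products_combined, which means some product IDs appear more than once in the products data set and the matching order lines are repeated.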

In [18]:
# Export the left-merge result: it keeps every row of df_ords_prods, including order
# lines without a product match. (The outer merge has more rows because it also keeps
# product rows that match no order line.)
# NOTE: the `_merge` indicator column is exported too; a later merge with
# indicator=True on this file needs that column removed first.
merged_pkl_path = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged.pkl')
df_merged_left.to_pickle(merged_pkl_path)