# 1. Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import os

# 2. Importing CSV files

In [2]:
# Load the cleaned orders table from the Prepared Data folder.

# Project root folder. Defaults to the original author's location, but can be
# redirected with the INSTACART_PROJECT_DIR environment variable so the
# notebook runs on other machines without editing this cell.
path = os.environ.get(
    'INSTACART_PROJECT_DIR',
    r'/Users/matthewabrams/Desktop/12-28-2021 Instacart Basket Analysis',
)

# NOTE(review): the file name is 'orders_clean_csv' (underscore, no '.csv'
# extension) -- kept as-is; confirm it matches the file on disk.
# 'Unnamed: 0' is presumably the index written by an earlier to_csv call; it is
# dropped via a chained call (no inplace mutation), leaving a fresh RangeIndex.
df_orders = pd.read_csv(
    os.path.join(path, 'Data', 'Prepared Data', 'orders_clean_csv')
).drop(columns=['Unnamed: 0'])
df_orders

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order
0,2539329,1,1,2,8,,True
1,2398795,1,2,3,7,15.0,False
2,473747,1,3,3,12,21.0,False
3,2254736,1,4,4,7,29.0,False
4,431534,1,5,4,15,28.0,False
...,...,...,...,...,...,...,...
3421078,2266710,206209,10,5,18,29.0,False
3421079,1854736,206209,11,4,10,30.0,False
3421080,626363,206209,12,1,12,18.0,False
3421081,2977660,206209,13,1,12,7.0,False


In [3]:
# Load the cleaned products table and discard the leftover 'Unnamed: 0' column
products_csv = os.path.join(path, 'Data', 'Prepared Data', 'products_clean_csv')
df_products = pd.read_csv(products_csv).drop(columns=['Unnamed: 0'])
df_products

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,unknown_product_name
0,1,Chocolate Sandwich Cookies,61,19,5.8,False
1,2,All-Seasons Salt,104,13,9.3,False
2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5,False
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5,False
4,5,Green Chile Anytime Sauce,5,13,4.3,False
...,...,...,...,...,...,...
49683,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5,5.3,False
49684,49685,En Croute Roast Hazelnut Cranberry,42,1,3.1,False
49685,49686,Artisan Baguette,112,3,7.8,False
49686,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8,4.7,False


In [4]:
# Load the original prior-orders line items (order_products__prior.csv)
order_products_csv = os.path.join(path, 'Data', 'Original Data', 'order_products__prior.csv')
df_order_products = pd.read_csv(order_products_csv)
df_order_products

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0
...,...,...,...,...
32434484,3421083,39678,6,1
32434485,3421083,11352,7,0
32434486,3421083,4600,8,0
32434487,3421083,24852,9,1


# 3. Checking for data consistency of df_order_products before merging

In [5]:
# Summarise the frame (row count, column dtypes, memory usage) before merging
df_order_products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32434489 entries, 0 to 32434488
Data columns (total 4 columns):
 #   Column             Dtype
---  ------             -----
 0   order_id           int64
 1   product_id         int64
 2   add_to_cart_order  int64
 3   reordered          int64
dtypes: int64(4)
memory usage: 989.8 MB


In [6]:
# Count missing values per column -- every count is expected to be zero
df_order_products.isna().sum()

order_id             0
product_id           0
add_to_cart_order    0
reordered            0
dtype: int64

In [7]:
# List fully duplicated rows (all but the first occurrence); an empty result
# means the frame contains no duplicates
df_order_products.loc[df_order_products.duplicated(keep='first')]

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered


# 4. Merging Data Sets

In [8]:
# Attach order-level details to every prior-order line item by joining the
# orders frame to the order_products frame on their shared 'order_id' key.
# The join is an inner join (the pandas default), so only order_ids present
# in both frames are kept.
orders_products_combined = pd.merge(
    df_orders,
    df_order_products,
    how='inner',
    on='order_id',
)
orders_products_combined

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered
0,2539329,1,1,2,8,,True,196,1,0
1,2539329,1,1,2,8,,True,14084,2,0
2,2539329,1,1,2,8,,True,12427,3,0
3,2539329,1,1,2,8,,True,26088,4,0
4,2539329,1,1,2,8,,True,26405,5,0
...,...,...,...,...,...,...,...,...,...,...
32434484,2977660,206209,13,1,12,7.0,False,14197,5,1
32434485,2977660,206209,13,1,12,7.0,False,38730,6,0
32434486,2977660,206209,13,1,12,7.0,False,31477,7,0
32434487,2977660,206209,13,1,12,7.0,False,6567,8,0


In [9]:
# Add product details to the combined orders/line-items frame, using
# 'product_id' as the common key. The result is named df_merged.
# This is an inner join: line items whose product_id is not present in
# df_products are dropped from the result.
# indicator=True adds a '_merge' column; with an inner join it is 'both' on
# every row. It is kept here so the exported frame's columns stay unchanged.
df_merged = pd.merge(
    orders_products_combined,
    df_products,
    how='inner',
    on='product_id',
    indicator=True,
)
df_merged

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,unknown_product_name,_merge
0,2539329,1,1,2,8,,True,196,1,0,Soda,77,7,9.0,False,both
1,2398795,1,2,3,7,15.0,False,196,1,1,Soda,77,7,9.0,False,both
2,473747,1,3,3,12,21.0,False,196,1,1,Soda,77,7,9.0,False,both
3,2254736,1,4,4,7,29.0,False,196,1,1,Soda,77,7,9.0,False,both
4,431534,1,5,4,15,28.0,False,196,1,1,Soda,77,7,9.0,False,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32433025,1320836,202557,17,2,15,1.0,False,43553,2,1,Orange Energy Shots,64,7,3.7,False,both
32433026,31526,202557,18,5,11,3.0,False,43553,2,1,Orange Energy Shots,64,7,3.7,False,both
32433027,758936,203436,1,2,7,,True,42338,4,0,"Zucchini Chips, Pesto",50,19,6.9,False,both
32433028,2745165,203436,2,3,5,15.0,False,42338,16,1,"Zucchini Chips, Pesto",50,19,6.9,False,both


In [10]:
# Persist the merged frame as a pickle in the Prepared Data folder
merged_pickle_path = os.path.join(path, 'Data', 'Prepared Data', 'orders_products_combined.pkl')
df_merged.to_pickle(merged_pickle_path)