# 4.6 Merging and exporting data Part 2

### This script contains the following points:
#### 1. Check the dimensions of the imported dataframes
#### 2. Merge the orders_products_combined and df_prods dataframes
#### 3. Export the merged dataframe (orders_products_combined + df_prods) to pkl and csv

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Path to main project folder
# NOTE(review): absolute path on one user's Windows machine; edit it before running elsewhere.
path = r'C:\Users\Mark\_Instacart Basket Analysis'

In [3]:
# Load orders_products_combined.pkl from the 'Prepared Data' folder
combined_pkl_path = os.path.join(
    path, '03 Scripts', 'Prepared Data', 'orders_products_combined.pkl'
)
orders_products_combined = pd.read_pickle(combined_pkl_path)

In [4]:
# Load the cleaned products table (products_cleaned.csv) from the 'Prepared Data' folder.
# index_col=False forces pandas not to use the first column as the row index.
# NOTE(review): the recorded head() output of df_prods shows an "Unnamed: 0" column —
# presumably a row index saved by an earlier to_csv call; confirm whether it should be dropped.
products_csv_path = os.path.join(
    path, '03 Scripts', 'Prepared Data', 'products_cleaned.csv'
)
df_prods = pd.read_csv(products_csv_path, index_col=False)

#### 1. Check the dimensions of the imported dataframes

In [5]:
# Preview the first rows of orders_products_combined to sanity-check the import.
# NOTE(review): the recorded output includes an "Unnamed: 0" column and a "_merge"
# column — presumably leftovers from earlier export/merge steps; confirm they are still needed.
orders_products_combined.head()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge
0,1,2398795,1,prior,2,3,7,15.0,196,1,1,both
1,1,2398795,1,prior,2,3,7,15.0,10258,2,0,both
2,1,2398795,1,prior,2,3,7,15.0,12427,3,1,both
3,1,2398795,1,prior,2,3,7,15.0,13176,4,0,both
4,1,2398795,1,prior,2,3,7,15.0,26088,5,1,both


In [6]:
# Preview the first rows of df_prods to sanity-check the import.
# NOTE(review): the recorded output includes an "Unnamed: 0" column — presumably a
# saved row index from an earlier CSV export; confirm whether it should be dropped.
df_prods.head()

Unnamed: 0.1,Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
0,0,1,Chocolate Sandwich Cookies,61,19,5.8
1,1,2,All-Seasons Salt,104,13,9.3
2,2,3,Robust Golden Unsweetened Oolong Tea,94,7,4.5
3,3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1,10.5
4,4,5,Green Chile Anytime Sauce,5,13,4.3


In [7]:
# Check the dimensions (rows, columns) of orders_products_combined
# to see if the data set is large, small, wide, or long.
orders_products_combined.shape

(30356421, 12)

In [8]:
# Check the dimensions (rows, columns) of df_prods
# to see if the data set is large, small, wide, or long.
df_prods.shape

(49672, 6)

#### 2. Merge the orders_products_combined and df_prods dataframes

In [9]:
# Combine the order lines with the product details on their shared "product_id" column.
# how='inner' keeps only rows whose product_id appears in both dataframes.
# The indicator column is named "match" because orders_products_combined already
# has a "_merge" column from a previous merge, so the default indicator name is taken.
# NOTE(review): with an inner join the indicator can only ever be "both", so it cannot
# reveal unmatched rows. Compare row counts before and after instead
# (30,356,421 rows in, 30,328,763 rows out in the recorded run).
df_merged_large = pd.merge(
    orders_products_combined,
    df_prods,
    how='inner',
    on='product_id',
    indicator='match',
)

In [10]:
# Preview the first rows of the merged dataframe.
# The recorded output shows "Unnamed: 0_x" and "Unnamed: 0_y": both inputs had an
# "Unnamed: 0" column, so the merge suffixed the overlapping names.
df_merged_large.head()

Unnamed: 0,Unnamed: 0_x,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge,Unnamed: 0_y,product_name,aisle_id,department_id,prices,match
0,1,2398795,1,prior,2,3,7,15.0,196,1,1,both,195,Soda,77,7,9.0,both
1,2,473747,1,prior,3,3,12,21.0,196,1,1,both,195,Soda,77,7,9.0,both
2,3,2254736,1,prior,4,4,7,29.0,196,1,1,both,195,Soda,77,7,9.0,both
3,4,431534,1,prior,5,4,15,28.0,196,1,1,both,195,Soda,77,7,9.0,both
4,5,3367565,1,prior,6,2,7,19.0,196,1,1,both,195,Soda,77,7,9.0,both


In [11]:
# value_counts() tallies how many rows carry each value of the "match" column.
# NOTE(review): "match" was produced by an inner merge, which keeps only keys present
# in both dataframes, so every row is "both"; this tally cannot flag unmatched rows.
df_merged_large['match'].value_counts()

both          30328763
right_only           0
left_only            0
Name: match, dtype: int64

#### 3. Export the merged dataframe (orders_products_combined + df_prods) to pkl and csv

In [12]:
# Save the merged dataframe as a pickle in the 'Prepared Data' folder
merged_pkl_path = os.path.join(
    path, '03 Scripts', 'Prepared Data', 'orders_products_merged.pkl'
)
df_merged_large.to_pickle(merged_pkl_path)

In [13]:
# Export merged data to csv
# index=False leaves the default row index out of the file. Writing it would add
# another unlabeled column that re-imports as "Unnamed: 0", like the ones already
# visible in this data.
df_merged_large.to_csv(
    os.path.join(path, '03 Scripts', 'Prepared Data', 'orders_products_merged.csv'),
    index=False,
)