# 4.6 Combining & Exporting Data PART ONE

## This script contains the following points:

### 01. Importing Libraries
### 02. Importing Data
### 03. Checking Dataframes
### 04. Combining Dataframes
### 05. Exporting Data

## 01. Importing Libraries

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import os

## 02. Importing Data

In [2]:
# Set project folder as a string
# NOTE: this is an absolute path on one specific machine; update it before running elsewhere
path = r'/Users/matthewjones/Documents/CareerFoundry/Data Immersion/Achievement 4/04-2024 Instacart Basket Analysis'

In [3]:
# Import orders_products_prior.csv from the Original Data folder
# index_col = False tells pandas not to use the first column of the file as the index
df_ords_prior = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'orders_products_prior.csv'), index_col = False)

In [4]:
# Import orders_cleaned.csv from the Prepared Data folder
# index_col = False tells pandas not to use the first column of the file as the index
df_ords = pd.read_csv(os.path.join(path, '02 Data', 'Prepared Data', 'orders_cleaned.csv'), index_col = False)

## 03. Checking Dataframes

### orders_products_prior.csv

In [5]:
# Check the imported data's shape
df_ords_prior.shape

(32434489, 4)

In [6]:
# Check the first five rows of the imported data
df_ords_prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [7]:
# Check for missing values
df_ords_prior.isnull().sum()

order_id             0
product_id           0
add_to_cart_order    0
reordered            0
dtype: int64

In [8]:
# Collect every row that is an exact repeat of an earlier row
# (an empty result means the dataframe has no fully duplicated rows)
df_ords_prior_dups = df_ords_prior.loc[df_ords_prior.duplicated(keep = 'first')]

In [9]:
# Display the duplicated rows collected above (an empty dataframe means there are none)
df_ords_prior_dups

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered


##### No missing values or duplicated rows

In [10]:
# Check the data type of each column
df_ords_prior.dtypes

order_id             int64
product_id           int64
add_to_cart_order    int64
reordered            int64
dtype: object

In [11]:
# Check the descriptive statistics of each column to spot implausible values
df_ords_prior.describe()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
count,32434490.0,32434490.0,32434490.0,32434490.0
mean,1710749.0,25576.34,8.351076,0.5896975
std,987300.7,14096.69,7.126671,0.4918886
min,2.0,1.0,1.0,0.0
25%,855943.0,13530.0,3.0,0.0
50%,1711048.0,25256.0,6.0,1.0
75%,2565514.0,37935.0,11.0,1.0
max,3421083.0,49688.0,145.0,1.0


In [12]:
# Check how often each add_to_cart_order value occurs
df_ords_prior['add_to_cart_order'].value_counts()

add_to_cart_order
1      3214874
2      3058126
3      2871133
4      2664106
5      2442025
        ...   
141          1
142          1
143          1
144          1
145          1
Name: count, Length: 145, dtype: int64

##### The only suspicious thing is that one order appears to be significantly larger than the others: it contained 145 items. This looks like an anomaly rather than a recording error, but I would reach out to our data engineering team to confirm that these values are correct.
##### For now, I will leave these values unchanged.

### orders_cleaned.csv

In [13]:
# Check the imported data's shape
df_ords.shape

(3421083, 9)

In [14]:
# Check the first five rows of the imported data
df_ords.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,new_customer
0,0,0,2539329,1,1,2,8,,True
1,1,1,2398795,1,2,3,7,15.0,False
2,2,2,473747,1,3,3,12,21.0,False
3,3,3,2254736,1,4,4,7,29.0,False
4,4,4,431534,1,5,4,15,28.0,False


In [15]:
# Remove the two extra index columns; drop() returns a new dataframe,
# so df_ords itself is left unchanged
df_ords_better = df_ords.drop(columns = ['Unnamed: 0.1', 'Unnamed: 0'])

In [16]:
# Check the first five rows to confirm the extra index columns are gone
df_ords_better.head()

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,new_customer
0,2539329,1,1,2,8,,True
1,2398795,1,2,3,7,15.0,False
2,473747,1,3,3,12,21.0,False
3,2254736,1,4,4,7,29.0,False
4,431534,1,5,4,15,28.0,False


## 04. Combining Dataframes

### Merging with an inner join

In [17]:
# Inner-join df_ords_better (left) with df_ords_prior (right) on order_id;
# indicator = True adds a '_merge' column recording where each row came from
df_merged_large = pd.merge(
    df_ords_better,
    df_ords_prior,
    how = 'inner',
    on = 'order_id',
    indicator = True,
)

In [18]:
# Check the shape of the merged dataframe
df_merged_large.shape

(32434489, 11)

In [19]:
# Check to see if merged dataframe is a full match
df_merged_large['_merge'].value_counts()

_merge
both          32434489
left_only            0
right_only           0
Name: count, dtype: int64

### Merging with an outer join

In [20]:
# Outer-join df_ords_better (left) with df_ords_prior (right) on order_id,
# keeping unmatched rows from both sides; '_merge' records where each row came from
df_merged_trial = pd.merge(
    df_ords_better,
    df_ords_prior,
    how = 'outer',
    on = 'order_id',
    indicator = True,
)

In [21]:
# Check if we have a full match
df_merged_trial['_merge'].value_counts()

_merge
both          32434489
left_only       206209
right_only           0
Name: count, dtype: int64

In [22]:
# Check the first 10 rows of the outer-joined dataframe
df_merged_trial.head(10)

Unnamed: 0,order_id,user_id,order_number,order_day_of_week,order_hour_of_day,days_since_prior_order,new_customer,product_id,add_to_cart_order,reordered,_merge
0,1,112108,4,4,10,9.0,False,,,,left_only
1,2,202279,3,5,9,8.0,False,33120.0,1.0,1.0,both
2,2,202279,3,5,9,8.0,False,28985.0,2.0,1.0,both
3,2,202279,3,5,9,8.0,False,9327.0,3.0,0.0,both
4,2,202279,3,5,9,8.0,False,45918.0,4.0,1.0,both
5,2,202279,3,5,9,8.0,False,30035.0,5.0,0.0,both
6,2,202279,3,5,9,8.0,False,17794.0,6.0,1.0,both
7,2,202279,3,5,9,8.0,False,40141.0,7.0,1.0,both
8,2,202279,3,5,9,8.0,False,1819.0,8.0,1.0,both
9,2,202279,3,5,9,8.0,False,43668.0,9.0,0.0,both


In [23]:
# Check the shape of the merged dataframe
df_merged_trial.shape

(32640698, 11)

## 05. Exporting Data

In [24]:
# Write df_merged_large to the Prepared Data folder as a pickle file
# NOTE: the '_merge' indicator column created by the merge is still part of
# df_merged_large at this point, so it is saved in the pickle as well
export_path = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_combined.pkl')
df_merged_large.to_pickle(export_path)