# Import Merged DataFrame from Pickle File

In [1]:
import pandas as pd
import os

# Root folder of the project
path = r'C:\Users\luis\Desktop\Carrer Foundry boot camp\Python Fundamentals for Data Analysts'

# Full location of orders_products_combined.pkl below the project root
pickle_path = os.path.join(
    path,
    'Data',
    'Prepared Data',
    'orders_products_combined.pkl',
)

# Load the pickled dataframe.
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source.
df_merged_large = pd.read_pickle(pickle_path)

# Preview the first five rows to confirm that the import worked
df_merged_large.head(5)


Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge
0,2539329,1,1,2,8,,196,1,0,both
1,2539329,1,1,2,8,,14084,2,0,both
2,2539329,1,1,2,8,,12427,3,0,both
3,2539329,1,1,2,8,,26088,4,0,both
4,2539329,1,1,2,8,,26405,5,0,both


# Check the Shape of the Imported DataFrame

In [2]:
# Keep the (rows, columns) tuple of the imported dataframe for reference
df_merged_large_shape = df_merged_large.shape

# Build the report line first, then print it
imported_shape_message = f"The shape of the imported dataframe is: {df_merged_large_shape}"
print(imported_shape_message)


The shape of the imported dataframe is: (32434489, 10)


# Combining Orders and Products Data

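### Step 1: Import the Cleaned Products Data Set

> **Note:** the code cell in this step reads `orders_cleaned.csv`, and its preview shows order-level columns (`eval_set`, `order_number`, …) rather than product columns. Confirm which file was intended before relying on the merge in Step 2.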

In [3]:
# Path to the cleaned CSV, built from the shared project root `path` in the
# same way as the other data paths in this notebook, so the absolute project
# folder is written down in one place only.
# NOTE(review): the file read here is orders_cleaned.csv, although the section
# heading and the name `df_prods_clean` refer to products — confirm that this
# is the intended input.
path_cleaned_products = os.path.join(path, 'Data', 'Prepared Data', 'orders_cleaned.csv')

# Read the CSV into a dataframe
df_prods_clean = pd.read_csv(path_cleaned_products)


In [4]:
# Display the head of the cleaned products dataframe
df_prods_clean.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,0.0
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


### Step 2: Merge the DataFrames

In [11]:
# Store the join key 'order_id' as 32-bit integers, which is narrower than
# the int64 that read_csv infers for integer columns by default
order_id_as_int32 = df_prods_clean['order_id'].astype('int32')
df_prods_clean['order_id'] = order_id_as_int32

In [15]:
# Merge one chunk of the large dataframe with a second dataframe
def process_chunk(chunk, products_df, on='order_id', how='inner'):
    """Merge a single chunk with ``products_df`` and return the merged frame.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Slice of the large dataframe; used as the left side of the merge.
    products_df : pandas.DataFrame
        Dataframe joined onto the chunk; used as the right side of the merge.
    on : str or list of str, default 'order_id'
        Column name(s) to join on; must be present in both dataframes.
    how : str, default 'inner'
        Join type, passed straight to ``DataFrame.merge``.

    Returns
    -------
    pandas.DataFrame
        A new dataframe; neither input is modified. Columns other than the
        join key that exist in both inputs come back with the pandas default
        ``_x`` (left) and ``_y`` (right) suffixes.
    """
    return chunk.merge(products_df, on=on, how=how)

In [16]:
# Number of rows merged per iteration
chunk_size = 100000  # Adjust the chunk size as needed

# Number of slices needed to cover every row. Ceiling division avoids the
# extra, empty trailing chunk that `len // chunk_size + 1` produces whenever
# the row count is an exact multiple of chunk_size. max(1, ...) still
# schedules one (empty) chunk for an empty dataframe, so the later pd.concat
# always receives at least one frame.
total_rows = len(df_merged_large)
num_chunks = max(1, (total_rows + chunk_size - 1) // chunk_size)

In [17]:
# Accumulator that will collect the merged chunks
chunks = list()

In [18]:
# Merge the large dataframe slice by slice. Each slice is merged exactly once
# and the result is stored for the later concatenation; merging a second time
# inside append() would repeat the same work for every chunk.
chunks.clear()  # start empty so re-running this cell cannot append the same chunks twice
for i in range(num_chunks):
    start = i * chunk_size
    end = start + chunk_size
    # iloc clips `end` at the last row, so the final slice may be shorter
    chunk = df_merged_large.iloc[start:end]
    processed_chunk = process_chunk(chunk, df_prods_clean)
    chunks.append(processed_chunk)

In [19]:
# Stack the merged chunks on top of each other and renumber the rows from 0
df_final_combined = pd.concat(objs=chunks, axis=0, ignore_index=True)

In [20]:
# Report the (rows, columns) of the combined dataframe
final_combined_shape = df_final_combined.shape
print(f"The shape of the final combined dataframe is: {final_combined_shape}")

The shape of the final combined dataframe is: (32434489, 16)


In [21]:
# Preview the first five rows of the combined dataframe
df_final_combined.head(n=5)

Unnamed: 0,order_id,user_id_x,order_number_x,order_dow_x,order_hour_of_day_x,days_since_prior_order_x,product_id,add_to_cart_order,reordered,_merge,user_id_y,eval_set,order_number_y,order_dow_y,order_hour_of_day_y,days_since_prior_order_y
0,2539329,1,1,2,8,,196,1,0,both,1,prior,1,2,8,0.0
1,2539329,1,1,2,8,,14084,2,0,both,1,prior,1,2,8,0.0
2,2539329,1,1,2,8,,12427,3,0,both,1,prior,1,2,8,0.0
3,2539329,1,1,2,8,,26088,4,0,both,1,prior,1,2,8,0.0
4,2539329,1,1,2,8,,26405,5,0,both,1,prior,1,2,8,0.0


In [22]:
# Show the values of the `_merge` column.
# NOTE: `_merge` was already a column of df_merged_large when it was loaded
# from the pickle, and the chunked merge does not request an indicator, so
# these counts describe the earlier merge, not the one performed in this notebook.
print(df_final_combined['_merge'].value_counts())

# Check for this merge: an inner merge drops left rows without a match and
# repeats left rows whose key occurs more than once on the right, so a row
# count that differs from the input signals a problem.
rows_before_merge = len(df_merged_large)
rows_after_merge = len(df_final_combined)
print(f"Rows before merge: {rows_before_merge}; rows after merge: {rows_after_merge}")

_merge
both          32434489
left_only            0
right_only           0
Name: count, dtype: int64


In [24]:
# Write the combined dataframe to ords_prods_merge.pkl below the project root
final_combined_path = os.path.join(
    path,
    'Data',
    'Prepared Data',
    'ords_prods_merge.pkl',
)
pd.to_pickle(df_final_combined, final_combined_path)