In [1]:
import pandas as pd
import os

# Define the base path
base_path = "../data/"


def _read_table(file_name):
    """Load one CSV file located under ``base_path`` into a DataFrame.

    The Python parser engine is used for its more lenient parsing. Malformed
    rows are still skipped, but ``on_bad_lines='warn'`` reports each skipped
    row instead of discarding it silently, so any data loss is visible in the
    cell output.

    Parameters
    ----------
    file_name : str
        File name relative to ``base_path`` (e.g. ``"users.csv"``).

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    return pd.read_csv(
        os.path.join(base_path, file_name),
        engine='python',
        on_bad_lines='warn',
    )


# Read the CSV files with more flexible parsing options
users_df = _read_table("users.csv")
products_df = _read_table("products.csv")
orders_df = _read_table("orders.csv")
order_items_df = _read_table("order_items.csv")

# Preview the first rows of every raw table to get a feel for its structure.
# Each call prints the heading and the preview on separate lines.
print("Users DataFrame:", users_df.head(), sep="\n")
print("\nProducts DataFrame:", products_df.head(), sep="\n")
print("\nOrders DataFrame:", orders_df.head(), sep="\n")
print("\nOrder Items DataFrame:", order_items_df.head(), sep="\n")

# Assemble one wide table by following the foreign keys.
# Every join is a left join validated as many-to-one, so a duplicated key on
# the right-hand side raises instead of silently multiplying rows.

# 1. order items -> products (many order items can reference one product)
order_items_with_products = order_items_df.merge(
    products_df, on="product_id", how="left", validate="many_to_one"
)

# 2. orders -> users (many orders can be placed by one user)
orders_with_users = orders_df.merge(
    users_df, on="user_id", how="left", validate="many_to_one"
)

# 3. enriched order items -> enriched orders (many order items can belong to one order)
complete_df = order_items_with_products.merge(
    orders_with_users, on="order_id", how="left", validate="many_to_one"
)

# Preview of the joined table
print("\nCombined DataFrame:", complete_df.head(), sep="\n")

# Its dimensions as a (rows, columns) tuple
print(f"\nShape of the combined dataframe: {complete_df.shape}")

# The full list of column names after the joins
print("\nColumns in the combined dataframe:", complete_df.columns.tolist(), sep="\n")

Users DataFrame:
  user_id user_age_group user_gender user_location_city user_location_country  \
0  U00001          25-34      Female            Bangkok              Thailand   
1  U00002          18-24        Male         Chiang Mai              Thailand   
2  U00003          35-44        Male         Nonthaburi              Thailand   
3  U00004          45-54      Female             Phuket              Thailand   
4  U00005          25-34       Other            Bangkok              Thailand   

  user_location_segment user_signup_date user_signup_device user_loyalty_tier  \
0                 Urban       2022-03-15     Mobile_Android            Silver   
1                 Urban       2023-01-20         Mobile_iOS            Bronze   
2              Suburban       2021-11-05            Desktop              Gold   
3                 Urban       2022-08-10     Mobile_Android            Silver   
4                 Urban       2023-05-01         Mobile_iOS            Bronze   

  user_pr

In [2]:
# Gender-based recommendation model
print("\n--- Building Gender-Based Product Recommendation Model ---")

# Step 1: Summarise purchases per (gender, product) pair.
#   total_quantity_ordered -> sum of quantity_ordered
#   purchase_frequency     -> count of non-null order_item_id values
#   popularity_score       -> 0.7 * purchase_frequency + 0.3 * total_quantity_ordered
gender_product_counts = (
    complete_df
    .groupby(['user_gender', 'product_id'])
    .agg(
        total_quantity_ordered=('quantity_ordered', 'sum'),
        purchase_frequency=('order_item_id', 'count'),
    )
    .reset_index()
    .assign(
        popularity_score=lambda counts: (
            counts['purchase_frequency'] * 0.7 +
            counts['total_quantity_ordered'] * 0.3
        )
    )
)

# Step 2: Get product details for recommendations
# Placeholder text used wherever a descriptive product field is missing.
product_placeholders = {
    'product_name': 'Unnamed Product',
    'product_category_l1': 'Uncategorized',
    'product_brand': 'Unknown Brand',
}

# fillna with a mapping returns a new frame, so products_df itself is left
# untouched and only the listed columns are filled.
product_details = (
    products_df[['product_id', 'product_name', 'product_category_l1', 'product_brand']]
    .fillna(product_placeholders)
)

# Step 3: Merge product details with gender preferences
# The left join keeps every (gender, product) row. A product_id with no match
# in product_details comes back with NaN descriptive fields, so the same
# placeholders are applied again after the merge, not only before it.
gender_product_recommendations = pd.merge(
    gender_product_counts,
    product_details,
    on='product_id',
    how='left',
    validate='many_to_one'
).fillna(product_placeholders)


--- Building Gender-Based Product Recommendation Model ---


In [3]:
# Step 4: Create the recommendation model (dictionary)
# Maps each gender value to a DataFrame of its highest-scoring products.
TOP_N_PER_GENDER = 10  # number of products kept per gender
recommendation_columns = ['product_id', 'product_name', 'product_category_l1', 'popularity_score']
recommendation_placeholders = {
    'product_name': 'Unnamed Product',
    'product_category_l1': 'Uncategorized',
}

recommendation_model = {}

# groupby drops NaN keys by default, so rows without a gender are skipped, and
# sort=False keeps the genders in their order of first appearance. Grouping
# once also avoids re-scanning the whole frame with a boolean mask per gender.
for gender, gender_recs in gender_product_recommendations.groupby('user_gender', sort=False):
    # Rank by popularity, keep the display columns and the top N rows, then
    # fill any remaining gaps. fillna returns a new frame, so nothing is
    # assigned onto a slice of gender_product_recommendations.
    top_recs = (
        gender_recs
        .sort_values('popularity_score', ascending=False)
        .loc[:, recommendation_columns]
        .head(TOP_N_PER_GENDER)
        .fillna(recommendation_placeholders)
    )

    # Store top recommendations for this gender
    recommendation_model[gender] = top_recs

# Print a short, human-readable summary: the three best products per gender
print("\nRecommendation Model by Gender:")
for gender, recommendations in recommendation_model.items():
    print(f"\nTop recommendations for {gender}:")
    top_recs = recommendations.head(3)
    # Walk the four display columns in lockstep, one product per iteration
    for name, product_id, category, score in zip(
        top_recs['product_name'],
        top_recs['product_id'],
        top_recs['product_category_l1'],
        top_recs['popularity_score'],
    ):
        print(f"  - {name} (ID: {product_id}) - Category: {category}, Score: {score:.1f}")


Recommendation Model by Gender:

Top recommendations for Female:
  - Moisturizing Lipstick - Velvet Red (ID: P00016) - Category: Beauty, Score: 11.6
  - Vitamin C Serum - Brightening (ID: P00008) - Category: Beauty, Score: 9.3
  - Unnamed Product (ID: P00033) - Category: Uncategorized, Score: 7.6

Top recommendations for Male:
  - Comfy Cotton T-Shirt - Blue (ID: P00002) - Category: Fashion, Score: 11.1
  - Bluetooth Wireless Headphones (ID: P00011) - Category: Electronics, Score: 8.6
  - Smart Light Bulb - WiFi RGB (ID: P00041) - Category: Electronics, Score: 7.5

Top recommendations for Other:
  - Organic Green Tea Bags (50 count) (ID: P00032) - Category: Groceries, Score: 1.6
  - Hardcover Journal - Lined Pages (ID: P00034) - Category: Office Supplies, Score: 1.3
  - Unnamed Product (ID: P00009) - Category: Uncategorized, Score: 1.0
