<a href="https://colab.research.google.com/github/mohitchauhan/ml-samples/blob/main/product_recomm_by_customer_id.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task
Prepare the `shopify.orders` and `order_line` data for product recommendation model training.

In [None]:
!pip install tensorflow



## Load data

### Subtask:
Load the `shopify.orders` and `order_line` data into pandas DataFrames.


**Reasoning**:
Load the `shopify.orders` and `order_line` datasets into pandas DataFrames.



#IMPORTS

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Flatten, Concatenate, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import MeanSquaredError, MeanAbsoluteError
import plotly.express as px
from tensorflow.keras.regularizers import l2

In [4]:

# The following lines adjust the granularity of reporting.
pd.options.display.max_rows = 10
pd.options.display.float_format = "{:.1f}".format


# =============== Load & Aggregate ===============
orders_df = pd.read_csv(
    "https://drive.google.com/uc?export=download&id=1uS5r1vkQFn-VtdUFwMySpgcN4KIIhYP3"
)

# Aggregate to (customer_id, product_id) interactions
df = orders_df.groupby(['customer_id', 'product_id']).agg(
    purchase_count=('product_id', 'size'),
    product_type=('product_type', 'first'),
    price=('price', 'first')
).reset_index()

# Clean product_type
df['product_type'] = df['product_type'].fillna('furniture')
df.loc[df['product_type'] == '', 'product_type'] = 'furniture'

**Reasoning**:
The previous command failed because the file 'shopify.orders.csv' was not found. I need to check the available files in the current directory to confirm the correct file name and load the data again.



Data Analysis

In [5]:
print("Data desc", df.columns)
print(f'Count {df.size}')
print(df.sample(20))
px.scatter(df, x="customer_id", y="price")


Data desc Index(['customer_id', 'product_id', 'purchase_count', 'product_type', 'price'], dtype='object')
Count 36490
        customer_id     product_id  purchase_count   product_type   price
1477  6466758443158  7831378624662               2   Center Table 16380.0
730   6321601216662  7491240296598               8   Center Table 18900.0
5962  6739033096342  7816578531478               8  Bedroom Chair 10400.0
3221  6645228765334  7803717714070              10           Sofa 21500.0
1059  6425573654678  7718221578390               3         Marble 16200.0
...             ...            ...             ...            ...     ...
7121  7358722637974  8165209538710              12  Bedroom Chair 13200.0
2610  6610404409494  7663734587542               7     Dining Set 29300.0
1840  6499882533014  8011343298710               1         Marble 18000.0
645   6295406084246  7677861626006               5   Center Table 13230.0
6841  7137887322262  7979911413910               2        Chester 16

# =============== Encode IDs to contiguous indices ===============
# Important: embeddings need small contiguous integer indices.

In [None]:
cust_le = LabelEncoder()
prod_le = LabelEncoder()
ptype_le = LabelEncoder()
price_le = LabelEncoder()

df['customer_idx'] = cust_le.fit_transform(df['customer_id'].astype(str))
df['product_idx']  = prod_le.fit_transform(df['product_id'].astype(str))
df['ptype_idx']    = ptype_le.fit_transform(df['product_type'].astype(str))
df['price_idx']    = price_le.fit_transform(df['price'].astype(str))

# Target
df['purchase_count'] = df['purchase_count'].astype('float32')

# =============== Train / Test split ===============


In [None]:
feat_cols = ['customer_idx', 'product_idx', 'ptype_idx', 'price_idx']
work = df[feat_cols + ['purchase_count']].copy()
train_df, test_df = train_test_split(work, test_size=0.2, random_state=42)

# =============== Numpy tensors with CORRECT shapes & dtypes ===============


In [None]:
def to_int32_2d(a):
    a = np.asarray(a, dtype=np.int32)
    if a.ndim == 1:
        a = a.reshape(-1, 1)
    return a

train_customer = to_int32_2d(train_df['customer_idx'].values)
test_customer  = to_int32_2d(test_df['customer_idx'].values)

train_product = to_int32_2d(train_df['product_idx'].values)
test_product  = to_int32_2d(test_df['product_idx'].values)

train_ptype = to_int32_2d(train_df['ptype_idx'].values)
test_ptype  = to_int32_2d(test_df['ptype_idx'].values)

train_price = np.asarray(train_df['price_idx'].values, dtype=np.float32).reshape(-1, 1)
test_price  = np.asarray(test_df['price_idx'].values, dtype=np.float32).reshape(-1, 1)

y_train = np.asarray(train_df['purchase_count'].values, dtype=np.float32)
y_test  = np.asarray(test_df['purchase_count'].values, dtype=np.float32)


# Sanity checks
for name, arr in [
    ("train_customer", train_customer),
    ("train_product", train_product),
    ("train_ptype", train_ptype),
    ("train_price", train_price),
    ("y_train", y_train),
]:
    assert np.isfinite(arr).all(), f"Non-finite values in {name}"
    print(name, arr.shape, arr.dtype)

train_customer (5838, 1) int32
train_product (5838, 1) int32
train_ptype (5838, 1) int32
train_price (5838, 1) float32
y_train (5838,) float32


# =============== Build a small embedding model ===============


In [None]:
n_customers = int(df['customer_idx'].max()) + 1
n_products  = int(df['product_idx'].max()) + 1
n_ptypes    = int(df['ptype_idx'].max()) + 1
n_prices    = int(df['price_idx'].max()) + 1


# Inputs
inp_cust = Input(shape=(1,), dtype='int32', name='customer_idx')
inp_prod = Input(shape=(1,), dtype='int32', name='product_idx')
inp_ptyp = Input(shape=(1,), dtype='int32', name='ptype_idx')
inp_price = Input(shape=(1,), dtype='int32', name='price_idx')


# Embeddings (small dims are fine to start)
emb_cust = Embedding(input_dim=n_customers, output_dim=16, name='emb_customer')(inp_cust)
emb_prod = Embedding(input_dim=n_products,  output_dim=16, name='emb_product')(inp_prod)
emb_ptyp = Embedding(input_dim=n_ptypes,    output_dim=8,  name='emb_ptype')(inp_ptyp)
emb_price = Embedding(input_dim=n_prices,    output_dim=8,  name='emb_price')(inp_price)


# Flatten (since each input is a single index)
flat_cust = Flatten()(emb_cust)
flat_prod = Flatten()(emb_prod)
flat_ptyp = Flatten()(emb_ptyp)
flat_price = Flatten()(emb_price)


# Concatenate
x = Concatenate()([flat_cust, flat_prod, flat_ptyp, flat_price])

# MLP head
x = Dense(64, activation='relu', kernel_regularizer=l2(0.001))(x)
x = Dropout(0.2)(x)
x = Dense(32, activation='relu', kernel_regularizer=l2(0.001))(x)
out = Dense(1, activation='linear')(x)  # regression on purchase_count


model = Model(inputs=[inp_cust, inp_prod, inp_ptyp, inp_price], outputs=out)


# model.compile(
#     optimizer=Adam(learning_rate=0.001),
#     loss=MeanSquaredError(),
#     metrics=[MeanAbsoluteError(), MeanSquaredError()]
# )


model.compile(
    optimizer="adam",
    loss="mse",  # or "mae"
    metrics=["mae", "mse"]
)



model.summary()

# =============== Train ===============


In [None]:
train_inputs = [train_customer, train_product, train_ptype, train_price]
test_inputs  = [test_customer,  test_product,  test_ptype, test_price]

# history = model.fit(
#     train_inputs,
#     y_train,
#     epochs=10,
#     batch_size=256,
#     validation_data=(test_inputs, y_test),
#     verbose=1
# )

# Ensure y_train is 2D with shape (n_samples, 1)
y_train = y_train.reshape(-1, 1).astype("float32")
y_test  = y_test.reshape(-1, 1).astype("float32")

# Fit
history = model.fit(
    train_inputs,
    y_train,
    validation_data=(test_inputs, y_test),
    epochs=10,
    batch_size=32
)
print("Done. History keys:", history.history.keys())

Epoch 1/10
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 145.7824 - mae: 6.7671 - mse: 145.6793 - val_loss: 64.2105 - val_mae: 4.5360 - val_mse: 64.0819
Epoch 2/10
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 85.2184 - mae: 4.2845 - mse: 85.0861 - val_loss: 60.1922 - val_mae: 4.5069 - val_mse: 60.0423
Epoch 3/10
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 69.7175 - mae: 3.5795 - mse: 69.5617 - val_loss: 64.8414 - val_mae: 4.6894 - val_mse: 64.6649
Epoch 4/10
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 94.3510 - mae: 3.3639 - mse: 94.1670 - val_loss: 67.0076 - val_mae: 4.7070 - val_mse: 66.8181
Epoch 5/10
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 63.1020 - mae: 3.1560 - mse: 62.9050 - val_loss: 70.2542 - val_mae: 4.7098 - val_mse: 70.0479
Epoch 6/10
[1m183/183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

In [None]:
# Example: Recommend products for a specific customer

# Choose a customer ID for recommendation
example_customer_id = 6715565867158 #orders_df['customer_id'].iloc[10]  # Using the first customer

print(f"Recommending products for customer: {example_customer_id}")

# Encode the example customer ID
example_customer_idx = cust_le.transform([str(example_customer_id)])[0]

# Get all unique product indices
all_product_indices = df['product_idx'].unique()

# Create input arrays for the model for all products with the example customer
example_customer_inputs = np.full((len(all_product_indices), 1), example_customer_idx, dtype=np.int32)
example_product_inputs = to_int32_2d(all_product_indices)

# FIX: ensure one product_idx -> one ptype_idx
product_ptype_map = df[['product_idx', 'ptype_idx']].drop_duplicates().set_index('product_idx')
all_ptype_indices = product_ptype_map.loc[all_product_indices, 'ptype_idx'].values
example_ptype_inputs = to_int32_2d(all_ptype_indices)

# FIX: ensure one product_idx -> one price_idx
product_price_map = df[['product_idx', 'price_idx']].drop_duplicates().set_index('product_idx')
all_price_indices = product_price_map.loc[all_product_indices, 'price_idx'].values
example_price_inputs = to_int32_2d(all_price_indices)


# Predict purchase counts
predicted_purchase_counts = model.predict(
    [example_customer_inputs, example_product_inputs, example_ptype_inputs, example_price_inputs],
    verbose=0
).flatten()

# Build recommendation DataFrame
recommendations_df = pd.DataFrame({
    'product_idx': all_product_indices,
    'predicted_purchase_count': predicted_purchase_counts
})

# Merge with product details (including price)
recommendations_df = recommendations_df.merge(
    df[['product_idx', 'product_id', 'product_type', 'price']].drop_duplicates(),
    on='product_idx',
    how='left'
)

# Get actual purchase counts for the example customer
actual_purchases = df[df['customer_id'] == example_customer_id][['product_idx', 'purchase_count']]
actual_purchases = actual_purchases.rename(columns={'purchase_count': 'current_purchase_count'})

# Merge actual purchase counts into the recommendations DataFrame
recommendations_df = recommendations_df.merge(
    actual_purchases,
    on='product_idx',
    how='left'
)

# Fill NaN values in 'current_purchase_count' with 0 (for products not purchased)
recommendations_df['current_purchase_count'] = recommendations_df['current_purchase_count'].fillna(0)


# Sort all product types, not just one
top_recommendations = recommendations_df.sort_values(by='predicted_purchase_count', ascending=False)

# Display top products
print("\nTop Recommended Products:")
display(top_recommendations.head(30))
px.scatter(top_recommendations, x="product_id", y = "predicted_purchase_count")
print("\n \n")
px.scatter_3d(top_recommendations, x="product_id", y = "predicted_purchase_count", z = "product_type")

Recommending products for customer: 6715565867158

Top Recommended Products:


Unnamed: 0,product_idx,predicted_purchase_count,product_id,product_type,price,current_purchase_count
135,504,72.8,7642617348246,Sofa,12000.0,88.0
2962,2895,68.0,7968687390870,sofa,2500.0,0.0
919,1574,62.2,7778950119574,Sofa,20000.0,0.0
2039,1744,61.3,7784662761622,Dining Set,3200.0,0.0
2030,2134,60.3,7805865623702,Dining Set,22750.0,0.0
...,...,...,...,...,...,...
3059,1997,47.6,7800598266006,Sofa,4700.0,0.0
1421,2159,47.5,7806597791894,Bedroom Chair,14000.0,0.0
1422,205,47.4,7492101144726,Dining Set,47700.0,0.0
1253,457,47.3,7638264905878,Bed,71500.0,0.0



 

