Install and imports

In [19]:
# If you need packages (run once in a notebook cell). Uncomment to install.
# !pip install xgboost joblib tensorflow scikit-learn

# Imports
import os
import pandas as pd
import numpy as np
import joblib
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
import seaborn as sns

# Try import xgboost (optional)
try:
    from xgboost import XGBClassifier
    HAS_XGB = True
except Exception as e:
    print("xgboost not available; XGBoost model will be skipped. Install with `pip install xgboost` if desired.")
    HAS_XGB = False

# TensorFlow / Keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


Paths and load data

In [20]:
# Paths (change if your repo path differs)
DATA_FP = "C:/Users/awini/formative2-mlp/data/processed/merged_customer_data.csv"
MODELS_DIR = "C:/Users/awini/formative2-mlp/data/processed/models"
Path(MODELS_DIR).mkdir(parents=True, exist_ok=True)

# Load
df = pd.read_csv(DATA_FP)
print("Loaded:", DATA_FP)
print("Shape:", df.shape)
display(df.head())
df.info()

Loaded: C:/Users/awini/formative2-mlp/data/processed/merged_customer_data.csv
Shape: (117, 10)


Unnamed: 0,customer_id_new,social_media_platform,engagement_score,purchase_interest_score,review_sentiment,transaction_id,purchase_amount,purchase_date,product_category,customer_rating
0,100,"Twitter,Instagram",77.0,4.4,"Negative,Neutral",1113,172,2024-04-22,Clothing,4.0
1,100,"Twitter,Instagram",77.0,4.4,"Negative,Neutral",1147,387,2024-05-26,Books,4.6
2,101,Twitter,68.0,1.0,Neutral,1017,271,2024-01-17,Books,2.1
3,101,Twitter,68.0,1.0,Neutral,1021,192,2024-01-21,Groceries,3.4
4,101,Twitter,68.0,1.0,Neutral,1059,408,2024-02-28,Books,2.5


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117 entries, 0 to 116
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   customer_id_new          117 non-null    int64  
 1   social_media_platform    117 non-null    object 
 2   engagement_score         117 non-null    float64
 3   purchase_interest_score  117 non-null    float64
 4   review_sentiment         117 non-null    object 
 5   transaction_id           117 non-null    int64  
 6   purchase_amount          117 non-null    int64  
 7   purchase_date            117 non-null    object 
 8   product_category         117 non-null    object 
 9   customer_rating          117 non-null    float64
dtypes: float64(3), int64(3), object(4)
memory usage: 9.3+ KB


Quick cleaning & choose target/features

In [21]:
# Target
target_col = "product_category"
assert target_col in df.columns, f"Target column '{target_col}' not found."

# Drop rows without target
df = df.dropna(subset=[target_col]).copy()

# Candidate numeric and categorical features — adjust if columns differ
numeric_cols = [c for c in ['engagement_score', 'purchase_interest_score', 'purchase_amount', 'customer_rating'] if c in df.columns]
categorical_cols = [c for c in ['social_media_platform', 'review_sentiment'] if c in df.columns]

print("Numeric features:", numeric_cols)
print("Categorical features:", categorical_cols)

# If social_media_platform has combined values like "Twitter,Instagram", simplify to primary platform
if 'social_media_platform' in df.columns:
    df['primary_platform'] = df['social_media_platform'].astype(str).str.split(',').str[0].fillna('unknown')
    if 'primary_platform' not in categorical_cols:
        categorical_cols.append('primary_platform')
    if 'social_media_platform' in categorical_cols:
        categorical_cols.remove('social_media_platform')

# Fill missing numeric with median
for c in numeric_cols:
    if df[c].isna().sum() > 0:
        df[c].fillna(df[c].median(), inplace=True)

# Fill missing categorical with 'unknown'
for c in categorical_cols:
    df[c] = df[c].fillna('unknown')

print("After cleaning, sample:")
display(df[numeric_cols + categorical_cols + [target_col]].head())


Numeric features: ['engagement_score', 'purchase_interest_score', 'purchase_amount', 'customer_rating']
Categorical features: ['social_media_platform', 'review_sentiment']
After cleaning, sample:


Unnamed: 0,engagement_score,purchase_interest_score,purchase_amount,customer_rating,review_sentiment,primary_platform,product_category
0,77.0,4.4,172,4.0,"Negative,Neutral",Twitter,Clothing
1,77.0,4.4,387,4.6,"Negative,Neutral",Twitter,Books
2,68.0,1.0,271,2.1,Neutral,Twitter,Books
3,68.0,1.0,192,3.4,Neutral,Twitter,Groceries
4,68.0,1.0,408,2.5,Neutral,Twitter,Books


Build preprocessing pipeline and split

In [22]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

# One-hot encode categorical, scale numeric
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
], remainder='drop')

# Prepare X and y
X = df[numeric_cols + categorical_cols]
y = df[target_col].astype(str)

# Train/test split (stratify to keep class distribution)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)


Train shape: (93, 6) Test shape: (24, 6)


Helper: evaluate & save utilities

In [23]:
def evaluate_model(name, model, X_test, y_test):
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    f1m = f1_score(y_test, y_pred, average='macro')
    print(f"--- {name} ---")
    print("Accuracy:", acc)
    print("F1 (macro):", f1m)
    print("Classification report:")
    print(classification_report(y_test, y_pred))
    return {'name': name, 'accuracy': acc, 'f1_macro': f1m}

def save_sklearn_model(model, out_fp):
    joblib.dump(model, out_fp)
    print("Saved sklearn model to:", out_fp)

def save_keras_model(keras_model, out_fp):
    # Keras model saved in the TensorFlow SavedModel or .keras format
    keras_model.save(out_fp)
    print("Saved Keras model to:", out_fp)


Train Random Forest (pipeline)

In [24]:
rf_pipeline = Pipeline(steps=[
    ('preproc', preprocessor),
    ('clf', RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1))
])

rf_pipeline.fit(X_train, y_train)
rf_res = evaluate_model("Random Forest", rf_pipeline, X_test, y_test)
# save
rf_fp = os.path.join(MODELS_DIR, "product_recommender_rf.joblib")
save_sklearn_model(rf_pipeline, rf_fp)


--- Random Forest ---
Accuracy: 0.16666666666666666
F1 (macro): 0.12571428571428572
Classification report:
              precision    recall  f1-score   support

       Books       0.00      0.00      0.00         4
    Clothing       0.17      0.25      0.20         4
 Electronics       0.38      0.50      0.43         6
   Groceries       0.00      0.00      0.00         4
      Sports       0.00      0.00      0.00         6

    accuracy                           0.17        24
   macro avg       0.11      0.15      0.13        24
weighted avg       0.12      0.17      0.14        24

Saved sklearn model to: C:/Users/awini/formative2-mlp/data/processed/models\product_recommender_rf.joblib


Train Logistic Regression

In [25]:
lr_pipeline = Pipeline(steps=[
    ('preproc', preprocessor),
    ('clf', LogisticRegression(max_iter=1000, solver='saga', n_jobs=-1))
])
lr_pipeline.fit(X_train, y_train)
lr_res = evaluate_model("Logistic Regression", lr_pipeline, X_test, y_test)
lr_fp = os.path.join(MODELS_DIR, "product_recommender_lr.joblib")
save_sklearn_model(lr_pipeline, lr_fp)


--- Logistic Regression ---
Accuracy: 0.20833333333333334
F1 (macro): 0.19653679653679654
Classification report:
              precision    recall  f1-score   support

       Books       0.00      0.00      0.00         4
    Clothing       0.29      0.50      0.36         4
 Electronics       0.33      0.33      0.33         6
   Groceries       0.33      0.25      0.29         4
      Sports       0.00      0.00      0.00         6

    accuracy                           0.21        24
   macro avg       0.19      0.22      0.20        24
weighted avg       0.19      0.21      0.19        24

Saved sklearn model to: C:/Users/awini/formative2-mlp/data/processed/models\product_recommender_lr.joblib


Train XGBoost (if available)

In [26]:
from sklearn.preprocessing import LabelEncoder

# Encode target labels
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

# Now fit the pipeline
xgb_pipeline = Pipeline(steps=[
    ('preproc', preprocessor),
    ('clf', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42, n_jobs=-1))
])

xgb_pipeline.fit(X_train, y_train_enc)

# Evaluate
xgb_res = evaluate_model("XGBoost", xgb_pipeline, X_test, y_test_enc)

# Save model
xgb_fp = os.path.join(MODELS_DIR, "product_recommender_xgb.joblib")
save_sklearn_model(xgb_pipeline, xgb_fp)


--- XGBoost ---
Accuracy: 0.25
F1 (macro): 0.2188888888888889
Classification report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.20      0.25      0.22         4
           2       0.33      0.50      0.40         6
           3       0.25      0.25      0.25         4
           4       0.33      0.17      0.22         6

    accuracy                           0.25        24
   macro avg       0.22      0.23      0.22        24
weighted avg       0.24      0.25      0.23        24

Saved sklearn model to: C:/Users/awini/formative2-mlp/data/processed/models\product_recommender_xgb.joblib


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Prepare data for Keras (dense NN)

In [27]:
# Apply preprocessor to get numpy arrays (fit already on training pipelines above)
preprocessor.fit(X_train)  # ensure fitted
X_train_pre = preprocessor.transform(X_train)
X_test_pre = preprocessor.transform(X_test)

# Encode target labels to integers for Keras
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)
num_classes = len(le.classes_)
print("Number of classes:", num_classes)


Number of classes: 5


Build, train, and save Keras model

In [28]:
# Compute class weights to handle imbalance
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train_enc), y=y_train_enc)
class_weight_dict = {i: w for i, w in enumerate(class_weights)}
print("Class weights:", class_weight_dict)

# Keras model architecture (simple dense network)
input_shape = X_train_pre.shape[1]
keras_model = keras.Sequential([
    layers.Input(shape=(input_shape,)),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(num_classes, activation='softmax')
])

keras_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
keras_model.summary()

# Train
history = keras_model.fit(
    X_train_pre, y_train_enc,
    validation_split=0.1,
    epochs=30,
    batch_size=32,
    class_weight=class_weight_dict,
    verbose=2
)

# Evaluate
loss, acc = keras_model.evaluate(X_test_pre, y_test_enc, verbose=0)
yprob = keras_model.predict(X_test_pre)
y_pred_keras = np.argmax(yprob, axis=1)
f1m_keras = f1_score(y_test_enc, y_pred_keras, average='macro')
print("Keras Test Accuracy:", acc)
print("Keras Test F1 (macro):", f1m_keras)

# Save Keras model in .keras (Keras v3 format) or SavedModel format
keras_fp = os.path.join(MODELS_DIR, "product_recommender_keras.keras")
save_keras_model(keras_model, keras_fp)

# Save label encoder so we can map ints back to categories
joblib.dump(le, os.path.join(MODELS_DIR, "label_encoder.joblib"))
print("Saved label encoder.")


Class weights: {0: np.float64(1.1625), 1: np.float64(1.0333333333333334), 2: np.float64(0.8857142857142857), 3: np.float64(1.1625), 4: np.float64(0.8454545454545455)}


Epoch 1/30
3/3 - 2s - 589ms/step - accuracy: 0.1807 - loss: 1.6397 - val_accuracy: 0.1000 - val_loss: 1.6838
Epoch 2/30
3/3 - 0s - 67ms/step - accuracy: 0.3012 - loss: 1.5977 - val_accuracy: 0.2000 - val_loss: 1.6579
Epoch 3/30
3/3 - 0s - 40ms/step - accuracy: 0.2892 - loss: 1.5412 - val_accuracy: 0.3000 - val_loss: 1.6467
Epoch 4/30
3/3 - 0s - 37ms/step - accuracy: 0.3133 - loss: 1.5098 - val_accuracy: 0.3000 - val_loss: 1.6386
Epoch 5/30
3/3 - 0s - 65ms/step - accuracy: 0.4096 - loss: 1.4894 - val_accuracy: 0.2000 - val_loss: 1.6367
Epoch 6/30
3/3 - 0s - 57ms/step - accuracy: 0.3976 - loss: 1.4691 - val_accuracy: 0.2000 - val_loss: 1.6343
Epoch 7/30
3/3 - 0s - 37ms/step - accuracy: 0.4458 - loss: 1.4278 - val_accuracy: 0.2000 - val_loss: 1.6256
Epoch 8/30
3/3 - 0s - 35ms/step - accuracy: 0.5060 - loss: 1.4081 - val_accuracy: 0.3000 - val_loss: 1.6246
Epoch 9/30
3/3 - 0s - 38ms/step - accuracy: 0.4458 - loss: 1.3989 - val_accuracy: 0.3000 - val_loss: 1.6294
Epoch 10/30
3/3 - 0s - 44ms

Compare model results & choose best

In [29]:
results = [rf_res, lr_res]
if xgb_res:
    results.append(xgb_res)
# Keras result
results.append({'name': 'Keras NN', 'accuracy': float(acc), 'f1_macro': float(f1m_keras)})

res_df = pd.DataFrame(results).sort_values(by='f1_macro', ascending=False).reset_index(drop=True)
display(res_df)

best = res_df.iloc[0]
print("Best model by F1-macro:", best['name'], "F1:", best['f1_macro'])

# Map best model name to saved file
model_map = {
    'Random Forest': rf_fp,
    'Logistic Regression': lr_fp,
    'XGBoost': xgb_fp if HAS_XGB else None,
    'Keras NN': keras_fp
}
best_model_fp = model_map.get(best['name'])
print("Best model file:", best_model_fp)


Unnamed: 0,name,accuracy,f1_macro
0,XGBoost,0.25,0.218889
1,Logistic Regression,0.208333,0.196537
2,Random Forest,0.166667,0.125714
3,Keras NN,0.125,0.106667


Best model by F1-macro: XGBoost F1: 0.2188888888888889
Best model file: C:/Users/awini/formative2-mlp/data/processed/models\product_recommender_xgb.joblib


Save a pointer to the best model for downstream integration

In [30]:
# Save a small JSON/text pointer to the best model so integration scripts can load it
import json
out = {
    'best_model_name': best['name'],
    'best_model_path': str(best_model_fp),
    'label_encoder': str(os.path.join(MODELS_DIR, "label_encoder.joblib"))
}
with open(os.path.join(MODELS_DIR, 'best_model_info.json'), 'w') as f:
    json.dump(out, f, indent=2)
print("Saved best model info to", os.path.join(MODELS_DIR, 'best_model_info.json'))
display(out)


Saved best model info to C:/Users/awini/formative2-mlp/data/processed/models\best_model_info.json


{'best_model_name': 'XGBoost',
 'best_model_path': 'C:/Users/awini/formative2-mlp/data/processed/models\\product_recommender_xgb.joblib',
 'label_encoder': 'C:/Users/awini/formative2-mlp/data/processed/models\\label_encoder.joblib'}