In [8]:
import os
import pickle

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

# Train a random-forest classifier on the fashion dataset, tune it with a
# grid search, and pickle the best model together with the feature column
# names it was fitted on.

# Load the expanded dataset
df = pd.read_csv('dataset/fashion_data_large.csv')

# Convert the multi-label 'color' field (entries separated by ', ') into one
# binary indicator column per distinct value, then swap those columns in for
# the raw 'color' column.
colors = df['color'].str.get_dummies(sep=', ')
df = pd.concat([df.drop('color', axis=1), colors], axis=1)

# Everything except 'label' is a feature; 'label' is the prediction target.
features = df.drop('label', axis=1)
labels = df['label']

# One-hot encode the remaining categorical columns so the model receives
# numerical input only.
features = pd.get_dummies(features)

# Store the column names: they are saved next to the model so that whoever
# loads the pickle knows the exact column layout the model was trained on.
feature_names = features.columns.tolist()

# Split the data (80% train / 20% test, fixed seed for reproducibility).
# NOTE(review): X_test / y_test are not used anywhere in this cell -- confirm
# the held-out set is evaluated in a later cell.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Hyperparameter grid: 3 * 4 * 3 * 3 = 108 candidate combinations.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive search with 3-fold cross-validation on the training split,
# using all available CPU cores (n_jobs=-1).
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best model (the estimator built from the best-scoring parameters).
best_model = grid_search.best_estimator_

# Save the best model and feature names as a single (model, names) tuple.
model_path = 'models/recommendation_model.pkl'

# Make sure the target directory exists first: otherwise open() raises
# FileNotFoundError and the result of the whole grid search is lost.
model_dir = os.path.dirname(model_path)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

with open(model_path, 'wb') as f:
    pickle.dump((best_model, feature_names), f)

print(f"Best model and feature names saved to {model_path}")


Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best model and feature names saved to models/recommendation_model.pkl
