In [None]:
import pandas as pd
from io import StringIO
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split , cross_val_score
import joblib

In [None]:
# Raw string literal: the Windows path contains backslashes followed by letters
# (\D, \c, \p) that are not valid escape sequences. A normal string literal only
# works by accident and triggers a SyntaxWarning on recent Python versions.
with open(r'D:\DS\crwa\Data-Science-project\preprocess_data.json', 'r') as file:
    json_data = file.read()

# Wrap the text in StringIO so pandas parses it as JSON content.
df = pd.read_json(StringIO(json_data))

In [None]:
# Columns removed before modelling.
col = ['url', 'name', 'asin']
df = df.drop(columns=col)

In [None]:
# Keep only the rows whose department is not 0
# (presumably 0 marks a missing/unknown department — confirm against the preprocessing step).
has_department = df['department'] != 0
df = df.loc[has_department]

In [None]:
def balance_categorical_data(X, category_columns, ratio=3, random_state=42):
    """
    Oversample the rarest category of each listed column by duplicating its rows.

    The columns are processed in the given order, and each pass works on the frame
    produced by the previous one. For every column the most and the least frequent
    categories are located, ``majority_count // (minority_count * ratio)`` extra
    copies of the least frequent category's rows are appended, and the frame is
    shuffled. Only the single rarest category of a column is duplicated; other
    under-represented categories are left as they are.

    Parameters:
    X (pd.DataFrame): The data containing the categorical columns. It is not modified.
    category_columns (str or iterable of str): Column name(s) to balance.
    ratio (int or float): Divisor applied to the majority/minority ratio; larger values
        add fewer copies. Defaults to 3.
    random_state (int): Seed used when shuffling after each column. Defaults to 42.

    Returns:
    pd.DataFrame: The resampled data with a fresh 0..n-1 index.

    Raises:
    ValueError: If ``ratio`` is not strictly positive.
    """
    if ratio <= 0:
        raise ValueError("ratio must be a positive number")

    # A bare string would otherwise be iterated character by character.
    if isinstance(category_columns, str):
        category_columns = [category_columns]

    for category_column in category_columns:
        # Count the occurrences of each category in the current column
        category_counts = X[category_column].value_counts()

        # Categorical dtypes also list unused categories with a count of 0; such a
        # category has no rows to duplicate, so it must not be picked as the minority.
        category_counts = category_counts[category_counts > 0]
        if category_counts.empty:
            # Nothing to balance (no rows, or the column holds only missing values).
            continue

        # Least frequent category and the two extreme counts
        minority_category = category_counts.idxmin()
        majority_count = int(category_counts.max())
        minority_count = int(category_counts.min())

        # Number of extra copies of the minority rows to append
        duplication_factor = int(majority_count // (minority_count * ratio))

        if duplication_factor > 0:
            minority_data = X[X[category_column] == minority_category]
            X = pd.concat([X] + [minority_data] * duplication_factor, axis=0)

        # Shuffle so the appended copies are not clustered at the end of the frame
        X = X.sample(frac=1, random_state=random_state).reset_index(drop=True)

    return X

In [None]:
# Columns passed to balance_categorical_data, processed in this order.
# NOTE(review): the duplication happens before the train/validation/test split further
# down, so copies of the same row can land on both sides of that split — confirm this
# is intended. Re-running this cell duplicates rows again because df is overwritten.
imbalance_col = [
    'department',
    'origin',
    'Not Bleach',
    'Tumble Dry',
    'Tie',
    'No closure',
    'Elastic',
    'Lace Up',
    'Drawstring',
]
df = balance_categorical_data(X=df, category_columns=imbalance_col)

In [None]:
# Show the current state of df (Jupyter renders the last expression of a cell).
df

In [None]:
# Separate the regression target from the predictor columns.
target = 'price'
features = [name for name in df.columns if name != target]
x, y = df[features], df[target]

In [None]:
# First hold out 20 % of the rows, then cut that hold-out in half:
# the result is a train / validation / test partition.
x_train, x_temp, y_train, y_temp = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
x_val, x_test, y_val, y_test = train_test_split(
    x_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Baseline forest with default hyper-parameters; fit() returns the estimator itself.
model = RandomForestRegressor(random_state=42).fit(x_train, y_train)
model

In [None]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor

# Hyper-parameter values explored exhaustively by the grid search.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

# Seed the forest like the baseline model, the folds and the data splits: without it
# every run grows different trees, so the CV scores, the selected parameters and the
# refitted best model are not reproducible.
rf = RandomForestRegressor(random_state=42)
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Both metrics are recorded in cv_results_; the best candidate is chosen and refitted
# according to negative mean squared error.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=cv,
    scoring=['r2', 'neg_mean_squared_error'],
    refit='neg_mean_squared_error',
    verbose=2,
)
grid_search.fit(x_train, y_train)

In [None]:
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    median_absolute_error,
    r2_score,
)

# Evaluate the refitted best estimator on the held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
# "MAE" is the mean absolute error. The median absolute error is a different,
# outlier-robust metric, so it is reported under its own name.
mae = mean_absolute_error(y_test, y_pred)
medae = median_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Best Parameters: {grid_search.best_params_}')
print(f'MAE: {mae}')
print(f'MedAE: {medae}')
print(f'RMSE: {rmse}')
print(f'R²: {r2}')

In [None]:
# Persist the tuned estimator to the current working directory.
joblib.dump(value=best_model, filename="random_forest_price_prediction_model.pkl")

In [None]:
import numpy as np

# cv_results_ stores one entry per hyper-parameter combination tried by the search.
results = grid_search.cv_results_

# max_depth of every candidate, in the same order as the score arrays below
max_depths = [candidate['max_depth'] for candidate in results['params']]

# Mean cross-validated score of every candidate, per metric
mean_r2_scores, mean_neg_mse_scores = (
    results['mean_test_r2'],
    results['mean_test_neg_mean_squared_error'],
)

In [None]:
# The scorer reports negated MSE (higher is better); flip the sign back, then take the root.
mean_mse_scores = np.array(mean_neg_mse_scores) * -1
mean_rmse_scores = np.sqrt(mean_mse_scores)

In [None]:
import matplotlib.pyplot as plt

# max_depths holds one value per grid candidate, so each depth appears many times.
# Drawing a line straight through those points joins several candidates that share
# the same x. Instead, show every candidate as a faint marker and draw the line
# through the per-depth mean (averaged over the other hyper-parameters).
depth_scores = pd.DataFrame({
    'max_depth': max_depths,
    'R² Score': mean_r2_scores,
    'RMSE': mean_rmse_scores,
})
depth_means = depth_scores.groupby('max_depth').mean()

for metric, colour in (('R² Score', 'tab:blue'), ('RMSE', 'orange')):
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.scatter(
        depth_scores['max_depth'],
        depth_scores[metric],
        color=colour,
        alpha=0.35,
        label='Grid candidates',
    )
    ax.plot(depth_means.index, depth_means[metric], 'o-', color=colour, label=f'Mean {metric}')
    ax.set_title(f'{metric} vs Max Depth')
    ax.set_xlabel('Max Depth')
    ax.set_ylabel(metric)
    ax.set_xticks(list(depth_means.index))
    ax.grid(True)
    ax.legend()
    plt.show()
