In [59]:
#load library
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.metrics import mean_absolute_error

In [60]:
path="C:/Users/melika/Desktop/melika/ML_return_rate_prediction/csv/"
#load data
train=pd.read_csv(f"{path}train.csv",sep=",")
validation=pd.read_csv(f"{path}validation.csv",sep=",")
test=pd.read_csv(f"{path}test.csv",sep=",")

In [61]:
# Data Overview
print(train.head(2))
print(train.describe())
print(train.info())

               pid        brand  season  category subcategory subsubcategory  \
0  1b421e5362218a1  CanvasCraft  Winter  Sweaters       Adult     Waist Wrap   
1  1e735b1fbe1c94c  CanvasCraft  Winter  Sweaters       Adult      High-Neck   

   numeric_size  count_size  price  total_sold  ...  online_return  \
0             0           1  58.40           1  ...              0   
1             0           1  55.27           1  ...              0   

   online_percentage  unique_channel_count  SR  SO  introduction_time  \
0                1.0                     1   5   1           0.753425   
1                1.0                     1   5   1           0.950685   

   introduction_season  first_return  return_rate  yearly_return_rate  
0                    2           0.0          0.0                 0.0  
1                    3           0.0          0.0                 0.0  

[2 rows x 24 columns]
       numeric_size   count_size        price   total_sold  total_returned  \
count   407

In [62]:
# Encoding categorical variables
# Assuming 'season' and 'subcategory' are categorical
train_encoded = pd.get_dummies(train, columns=["season", "subcategory"], drop_first=True)
validation_encoded = pd.get_dummies(validation, columns=["season", "subcategory"], drop_first=True)

# Get the columns after encoding in the training set
train_columns = train_encoded.columns

# Reindex validation_encoded to match train_encoded columns, filling missing columns with 0
validation_encoded = validation_encoded.reindex(columns=train_columns, fill_value=0)

In [63]:
print(train_encoded.head(2))
print(train_encoded.dtypes)


               pid        brand  category subsubcategory  numeric_size  \
0  1b421e5362218a1  CanvasCraft  Sweaters     Waist Wrap             0   
1  1e735b1fbe1c94c  CanvasCraft  Sweaters      High-Neck             0   

   count_size  price  total_sold  total_returned  offline_sales  ...  SR  SO  \
0           1  58.40           1               0              0  ...   5   1   
1           1  55.27           1               0              0  ...   5   1   

   introduction_time  introduction_season  first_return  return_rate  \
0           0.753425                    2           0.0          0.0   
1           0.950685                    3           0.0          0.0   

   yearly_return_rate  season_Winter  subcategory_End Items  \
0                 0.0           True                  False   
1                 0.0           True                  False   

   subcategory_Petite  
0               False  
1               False  

[2 rows x 25 columns]
pid                       object
b

## linear regression

In [64]:
# Define response and predictors
X = train_encoded.drop(['yearly_return_rate', 'pid','brand', 'category', 'subsubcategory'], axis=1)
y = train_encoded['yearly_return_rate']

# Convert bool columns to int (if any)
X = X.applymap(lambda x: int(x) if isinstance(x, bool) else x)

# Fit the model with all features and get summary
model = sm.OLS(y, sm.add_constant(X)).fit()
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:     yearly_return_rate   R-squared:                       0.511
Model:                            OLS   Adj. R-squared:                  0.509
Method:                 Least Squares   F-statistic:                     235.6
Date:                Tue, 06 Aug 2024   Prob (F-statistic):               0.00
Time:                        17:36:17   Log-Likelihood:                 2451.9
No. Observations:                4079   AIC:                            -4866.
Df Residuals:                    4060   BIC:                            -4746.
Df Model:                          18                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                    -0.05

  X = X.applymap(lambda x: int(x) if isinstance(x, bool) else x)


In [65]:
# Prepare the validation set features and target
X_val = validation_encoded.drop(['yearly_return_rate', 'pid', 'brand', 'category', 'subsubcategory'], axis=1)
y_val = validation_encoded['yearly_return_rate']

# Convert boolean columns to integers
X_val = X_val.applymap(lambda x: int(x) if isinstance(x, bool) else x)

# Add constant to match the model fitting
X_val = sm.add_constant(X_val)

# Predict on the validation set
val_preds = model.predict(X_val)

# Calculate MAE
reg_mae = mean_absolute_error(y_val, val_preds)

# Create a DataFrame to store the results
results = pd.DataFrame({'Model': ['OLS'], 'MAE': [reg_mae]})
results


  X_val = X_val.applymap(lambda x: int(x) if isinstance(x, bool) else x)


Unnamed: 0,Model,MAE
0,OLS,0.090396


# ridge regression

In [66]:
from sklearn.linear_model import Ridge	

# Fit the model 
ridge_reg = Ridge(alpha=1.0)				
ridge_reg_model = ridge_reg .fit(X, y)			

# Predict on the validation set
X_val = X_val.drop(columns=["const"])
val_preds = ridge_reg_model.predict(X_val)

# Calculate MAE
ridge_reg_mae = mean_absolute_error(y_val, val_preds)

# Create a new row to add to the DataFrame
new_row = pd.DataFrame({'Model': ['Ridge'], 'MAE': [ridge_reg_mae]})

# Assuming results_df is your existing DataFrame
results = pd.concat([results, new_row], ignore_index=True)

results


Unnamed: 0,Model,MAE
0,OLS,0.090396
1,Ridge,0.090454


# lasso regression

In [67]:
from sklearn.linear_model import Lasso

# Fit the model 
lasso_reg = Lasso(alpha=0.0025)				
lasso_reg_model = lasso_reg .fit(X, y)			

# Predict on the validation set
val_preds = lasso_reg_model.predict(X_val)

# Calculate MAE
lasso_reg_mae = mean_absolute_error(y_val, val_preds)

# Create a new row to add to the DataFrame
new_row = pd.DataFrame({'Model': ['Laaso'], 'MAE': [lasso_reg_mae]})

# Assuming results_df is your existing DataFrame
results = pd.concat([results, new_row], ignore_index=True)

results


Unnamed: 0,Model,MAE
0,OLS,0.090396
1,Ridge,0.090454
2,Laaso,0.089965


# GAM

In [68]:
# Train Random Forest and get feature importances
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=700, min_samples_leaf=50, max_features=5, random_state=0)
rf.fit(X, y)
importance = rf.feature_importances_

# Get top 8 features
features = X.columns
top_features = [features[i] for i in importance.argsort()[-8:][::-1]]
top_features


['return_rate',
 'SR',
 'online_return',
 'count_size',
 'total_returned',
 'total_sold',
 'first_return',
 'introduction_time']

# decision_tree 

In [69]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_absolute_error

# Define the parameter grid
param_grid = {
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 6]
}

# Create the DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=42)

# Define the scoring metric
scoring = make_scorer(mean_absolute_error, greater_is_better=False)

# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=regressor, param_grid=param_grid, cv=5, scoring=scoring, n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X, y)

# Get the best parameters and the best score
best_params = grid_search.best_params_
best_score = -grid_search.best_score_  # Convert negative MSE back to positive
print(f"Best parameters: {best_params}")
print(f"Best mean squared error: {best_score}")



Best parameters: {'max_depth': 5, 'min_samples_leaf': 6, 'min_samples_split': 2}
Best mean squared error: 0.08292343486073399


In [70]:
#add to result data frame
results.loc[len(results.index)] = ['DT', best_score] 

results

Unnamed: 0,Model,MAE
0,OLS,0.090396
1,Ridge,0.090454
2,Laaso,0.089965
3,DT,0.082923


#  Bagging        


In [71]:
from sklearn.ensemble import RandomForestRegressor

# Assuming you have your training data as X_train and y_train
n_features = X.shape[1]  # Get the number of features

# Create a Random Forest Classifier
rf = RandomForestRegressor(max_features=n_features)

# Fit the model to your data
rf.fit(X, y)

# Make predictions
val_preds = rf.predict(X_val)

# Calculate MAE
bagging_mae = mean_absolute_error(y_val, val_preds)

#add to result data frame
results.loc[len(results.index)] = ['Bagging', bagging_mae] 
results

Unnamed: 0,Model,MAE
0,OLS,0.090396
1,Ridge,0.090454
2,Laaso,0.089965
3,DT,0.082923
4,Bagging,0.078303


# Random forest

In [72]:
n_features = X.shape[1]  # Get the number of features
#find the squre root of number of features
n_features=np.sqrt(n_features)
n_features


4.47213595499958

In [73]:
from sklearn.ensemble import RandomForestRegressor

# Create a Random Forest Classifier
rf = RandomForestRegressor(max_features=4)

# Fit the model to your data
rf.fit(X, y)

# Make predictions
val_preds = rf.predict(X_val)

# Calculate MAE
rf_mae = mean_absolute_error(y_val, val_preds)

#add to result data frame
results.loc[len(results.index)] = ['RF', rf_mae] 
results

Unnamed: 0,Model,MAE
0,OLS,0.090396
1,Ridge,0.090454
2,Laaso,0.089965
3,DT,0.082923
4,Bagging,0.078303
5,RF,0.079119


# Boosting

In [74]:
from sklearn.ensemble import GradientBoostingRegressor 

boosting = GradientBoostingRegressor (learning_rate=0.15)

# Fit the model to your data
boosting.fit(X, y)

# Make predictions
val_preds = boosting.predict(X_val)

# Calculate MAE
boosting_mae = mean_absolute_error(y_val, val_preds)

#add to result data frame
results.loc[len(results.index)] = ['boosting', boosting_mae] 
results


Unnamed: 0,Model,MAE
0,OLS,0.090396
1,Ridge,0.090454
2,Laaso,0.089965
3,DT,0.082923
4,Bagging,0.078303
5,RF,0.079119
6,boosting,0.078581


# XGBOOST

In [75]:
import xgboost as xgb

# Create an XGBoost Classifier
xgb_model = xgb.XGBRegressor(learning_rate=0.09)

# Fit the model to your data
xgb_model.fit(X, y)

# Make predictions
val_preds = xgb_model.predict(X_val)

# Calculate MAE
xgb_mae = mean_absolute_error(y_val, val_preds)

#add to result data frame
results.loc[len(results.index)] = ['XGBOOST', xgb_mae] 
results


Unnamed: 0,Model,MAE
0,OLS,0.090396
1,Ridge,0.090454
2,Laaso,0.089965
3,DT,0.082923
4,Bagging,0.078303
5,RF,0.079119
6,boosting,0.078581
7,XGBOOST,0.077711


In [79]:
train.columns


Index(['pid', 'brand', 'season', 'category', 'subcategory', 'subsubcategory',
       'numeric_size', 'count_size', 'price', 'total_sold', 'total_returned',
       'offline_sales', 'offline_return', 'online_sales', 'online_return',
       'online_percentage', 'unique_channel_count', 'SR', 'SO',
       'introduction_time', 'introduction_season', 'first_return',
       'return_rate', 'yearly_return_rate'],
      dtype='object')

In [81]:
train_x=train.drop(columns=['pid', 'yearly_return_rate'])
val_x=validation.drop(columns=['pid', 'yearly_return_rate'])

train_y=train[['yearly_return_rate']]
val_y=validation[['yearly_return_rate']]
train_x.columns

Index(['brand', 'season', 'category', 'subcategory', 'subsubcategory',
       'numeric_size', 'count_size', 'price', 'total_sold', 'total_returned',
       'offline_sales', 'offline_return', 'online_sales', 'online_return',
       'online_percentage', 'unique_channel_count', 'SR', 'SO',
       'introduction_time', 'introduction_season', 'first_return',
       'return_rate'],
      dtype='object')

In [84]:
import catboost as cb

# Define categorical features for CatBoost
categorical_features = [0,1,2,3,4,5]
# Create a CatBoostClassifier
catboost_model = cb.CatBoostRegressor(iterations=100, learning_rate=0.1, depth=6, cat_features=categorical_features, verbose=0)

# Fit the model to your data
catboost_model.fit(train_x, train_y)
# Make predictions
val_preds = catboost_model.predict(val_x)

# Calculate MAE
catboost_mae = mean_absolute_error(val_y, val_preds)

#add to result data frame
results.loc[len(results.index)] = ['catboost', catboost_mae] 
results

Unnamed: 0,Model,MAE
0,OLS,0.090396
1,Ridge,0.090454
2,Laaso,0.089965
3,DT,0.082923
4,Bagging,0.078303
5,RF,0.079119
6,boosting,0.078581
7,XGBOOST,0.077711
8,catboost,0.07791


# LIGHTGBM

In [89]:
import lightgbm as lgb

# Define categorical features for CatBoost
categorical_features = [0,1,2,3,4,5]
# Create a model for 
train_data = lgb.Dataset(X, label=y, categorical_feature=categorical_features)

params = {
    'objective': 'regression',  # or 'binary' for classification tasks
    'boosting_type': 'gbdt',  # Gradient Boosting Decision Tree
    'metric': 'mae',  # Metric for regression
    'learning_rate': 0.1,
    'num_leaves': 31,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5
}

# Training the model
gbm = lgb.train(params, train_data, num_boost_round=100)

val_preds = gbm.predict(X_val)

# Calculate MAE
gbm_mae = mean_absolute_error(y_val, val_preds)

#add to result data frame
results.loc[len(results.index)] = ['lightgbm', gbm_mae] 
results

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001564 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 826
[LightGBM] [Info] Number of data points in the train set: 4079, number of used features: 20
[LightGBM] [Info] Start training from score 0.142757


Unnamed: 0,Model,MAE
0,OLS,0.090396
1,Ridge,0.090454
2,Laaso,0.089965
3,DT,0.082923
4,Bagging,0.078303
5,RF,0.079119
6,boosting,0.078581
7,XGBOOST,0.077711
8,catboost,0.07791
9,lightgbm,0.079856
