<a href="https://colab.research.google.com/github/maleklachheb/Prediction-of-Product-Sales/blob/main/Sale_prediction_MV2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns',100)
import missingno
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn import set_config
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [35]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
def regression_metrics(y_true, y_pred, label='', verbose = True, output_dict=False):
  # Get metrics
  mae = mean_absolute_error(y_true, y_pred)
  mse = mean_squared_error(y_true, y_pred)
  rmse = mean_squared_error(y_true, y_pred, squared=False)
  r_squared = r2_score(y_true, y_pred)
  if verbose == True:
    # Print Result with Label and Header
    header = "-"*60
    print(header, f"Regression Metrics: {label}", header, sep='\n')
    print(f"- MAE = {mae:,.3f}")
    print(f"- MSE = {mse:,.3f}")
    print(f"- RMSE = {rmse:,.3f}")
    print(f"- R^2 = {r_squared:,.3f}")
  if output_dict == True:
      metrics = {'Label':label, 'MAE':mae,
                 'MSE':mse, 'RMSE':rmse, 'R^2':r_squared}
      return metrics
def evaluate_regression(reg, X_train, y_train, X_test, y_test, verbose = True,
                        output_frame=False):
  # Get predictions for training data
  y_train_pred = reg.predict(X_train)
  # Call the helper function to obtain regression metrics for training data
  results_train = regression_metrics(y_train, y_train_pred, verbose = verbose,
                                     output_dict=output_frame,
                                     label='Training Data')
  print()
  # Get predictions for test data
  y_test_pred = reg.predict(X_test)
  # Call the helper function to obtain regression metrics for test data
  results_test = regression_metrics(y_test, y_test_pred, verbose = verbose,
                                  output_dict=output_frame,
                                    label='Test Data' )
  # Store results in a dataframe if ouput_frame is True
  if output_frame:
    results_df = pd.DataFrame([results_train,results_test])
    # Set the label as the index
    results_df = results_df.set_index('Label')
    # Set index.name to none to get a cleaner looking result
    results_df.index.name=None
    # Return the dataframe
    return results_df.round(3)

In [50]:
df = pd.read_csv('/content/drive/MyDrive/dojo/sales_predictions_2023.csv')
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [63]:
df.drop_duplicates(inplace=True)
df['Item_Fat_Content'].replace("LF","Low Fat", inplace=True)
df['Item_Fat_Content'].replace("low fat","Low Fat", inplace=True)
df['Item_Fat_Content'].replace("reg","Regular", inplace=True)
df['Item_Fat_Content'].value_counts()
print(df.isna().sum())
average_item_weight = df['Item_Weight'].mean()

# Fill missing values in the 'Item_Weight' column with the average value
df['Item_Weight'].fillna(average_item_weight, inplace=True)
print(df.isna().sum())

Item_Identifier                 0
Item_Weight                     0
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64
Item_Identifier                 0
Item_Weight                     0
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64


In [64]:
# The target we are trying to predict
y = df['Item_Outlet_Sales']
# The features we will use to make the prediction
X = df.drop(columns=['Item_Identifier', 'Item_Outlet_Sales'])
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [65]:
impute_median = SimpleImputer(strategy='median')
scaler = StandardScaler()
num_pipe = make_pipeline(impute_median, scaler)
col=X_train.select_dtypes("number").columns
num_pipe.fit(X_train[col])


In [66]:
s = ['Medium','High', 'Small','Tier1','Tier2','Tier3','Supermarket Type1','Supermarket Type2','Supermarket Type3','Grocery Store','Low Fat','Regular', 'Tier 1', 'Tier 3', 'Tier 2']
ord_encoder = OrdinalEncoder(categories=s)
scale=StandardScaler()
ord_pipe= make_pipeline(ord_encoder,scale)
ord_cols = ['Outlet_Size','Outlet_Location_Type','Outlet_Type','Item_Fat_Content']

impute_na_ord = SimpleImputer(strategy='most_frequent', fill_value='NA')
ordinal_category_orders = [s,s,s,s]
ord_encoder = OrdinalEncoder(categories=ordinal_category_orders)

ord_pipe = make_pipeline(impute_na_ord, ord_encoder)
ord_pipe.fit(X_train[ord_cols])

In [67]:
ohe_cols =  ['Item_Type']
impute_na = SimpleImputer(strategy='constant', fill_value = "NA")
ohe_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# Fit the pipeline on the numeric training data
ohe_pipe = make_pipeline(
    impute_na,
    ohe_encoder
)
ohe_pipe.fit(X_train[ohe_cols])


# Transform the training data
X_train_ohe = ohe_pipe.transform(X_train[ohe_cols])
X_test_ohe = ohe_pipe.transform(X_test[ohe_cols])

pd.DataFrame(X_train_ohe).head()

pd.DataFrame(X_test_ohe).head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [68]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipe, col),
        ('other', ord_pipe, ord_cols)
    ],
    verbose_feature_names_out=False
)


X_train_processed = preprocessor.fit_transform(X_train)
X_train_processed = preprocessor.transform(X_train)

# Transform the test data
X_test_processed = preprocessor.transform(X_test)

In [69]:
X_train_processed_df = pd.DataFrame(X_train_processed)

# Check the data types of all columns
print("Data Types of Transformed Training Data:")
print(X_train_processed_df.dtypes)

# Check if original numeric features have been scaled
print("\nOriginal Numeric Features After Scaling:")

print(X_train_processed_df[X_train_processed_df.select_dtypes("number").columns].describe())

Data Types of Transformed Training Data:
0    float64
1    float64
2    float64
3    float64
4    float64
5    float64
6    float64
7    float64
dtype: object

Original Numeric Features After Scaling:
                  0             1             2             3            4  \
count  6.392000e+03  6.392000e+03  6.392000e+03  6.392000e+03  6392.000000   
mean   5.585853e-17 -6.169450e-17  4.668773e-17 -4.250806e-15     0.667240   
std    1.000078e+00  1.000078e+00  1.000078e+00  1.000078e+00     0.884083   
min   -1.978469e+00 -1.291052e+00 -1.767529e+00 -1.532139e+00     0.000000   
25%   -8.055738e-01 -7.624234e-01 -7.638272e-01 -1.293807e+00     0.000000   
50%   -9.180470e-03 -2.318711e-01  3.400912e-02  1.361872e-01     0.000000   
75%    7.598674e-01  5.596016e-01  7.172910e-01  7.320181e-01     2.000000   
max    2.005105e+00  5.132050e+00  1.994559e+00  1.327849e+00     2.000000   

                 5            6            7  
count  6392.000000  6392.000000  6392.000000  
me

In [70]:
from sklearn.ensemble import RandomForestRegressor
# Instantiate default random forest model
rf = RandomForestRegressor(random_state = 42)
# Model Pipeline
rf_pipe = make_pipeline(preprocessor, rf)
print(df.shape)

(8523, 12)


rf non enhance

In [71]:
rf_pipe.fit(X_train, y_train)

In [72]:
evaluate_regression(rf_pipe, X_train, y_train, X_test, y_test)

------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 298.489
- MSE = 184,021.148
- RMSE = 428.977
- R^2 = 0.938

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 775.172
- MSE = 1,234,298.146
- RMSE = 1,110.990
- R^2 = 0.553


In [None]:
rf_pipe.get_params()

In [74]:
params = {'randomforestregressor__max_depth': [None,10,15,20],
          'randomforestregressor__n_estimators':[10,100,150,200],
          'randomforestregressor__min_samples_leaf':[2,3,4],
          'randomforestregressor__max_features':['sqrt','log2',None],
          'randomforestregressor__oob_score':[True,False],
          }

In [75]:
gridsearch = GridSearchCV(rf_pipe, params, n_jobs=-1, cv = 3, verbose=1)
# Fit the gridsearch on training data
gridsearch.fit(X_train, y_train)

Fitting 3 folds for each of 288 candidates, totalling 864 fits


In [76]:
gridsearch.best_params_

{'randomforestregressor__max_depth': 10,
 'randomforestregressor__max_features': 'log2',
 'randomforestregressor__min_samples_leaf': 4,
 'randomforestregressor__n_estimators': 150,
 'randomforestregressor__oob_score': True}

In [77]:
best_rf = gridsearch.best_estimator_
evaluate_regression(best_rf, X_train, y_train, X_test, y_test)

------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 679.785
- MSE = 938,972.808
- RMSE = 969.006
- R^2 = 0.683

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 729.424
- MSE = 1,096,553.076
- RMSE = 1,047.164
- R^2 = 0.603


In [78]:
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg

In [82]:
# Fit the model on the training data
lr_pipe = make_pipeline(preprocessor,lin_reg)
lr_pipe.fit(X_train, y_train)
evaluate_regression(lr_pipe, X_train, y_train, X_test, y_test)

------------------------------------------------------------
Regression Metrics: Training Data
------------------------------------------------------------
- MAE = 969.904
- MSE = 1,680,199.515
- RMSE = 1,296.225
- R^2 = 0.432

------------------------------------------------------------
Regression Metrics: Test Data
------------------------------------------------------------
- MAE = 943.540
- MSE = 1,585,289.619
- RMSE = 1,259.083
- R^2 = 0.425


**Linear Regression:
R^2 = 0.425
RMSE = 1,259.083
Random Forest Regression:
R^2 = 0.553
RMSE = 1,110.990
The random forest model outperforms the linear regression model in terms of both R^2 and RMSE on the test data. This indicates that the random forest model provides better predictions.**

**WE  choose to work with random forst regression because we prioritize predictive accuracy. and  Forest Regression have a higher computational resources ** ****