In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
df = pd.read_csv('ecommerce_sales_analysis.csv')
df.head()

Unnamed: 0,product_id,product_name,category,price,review_score,review_count,sales_month_1,sales_month_2,sales_month_3,sales_month_4,sales_month_5,sales_month_6,sales_month_7,sales_month_8,sales_month_9,sales_month_10,sales_month_11,sales_month_12
0,1,Product_1,Clothing,190.4,1.7,220,479,449,92,784,604,904,446,603,807,252,695,306
1,2,Product_2,Home & Kitchen,475.6,3.2,903,21,989,861,863,524,128,610,436,176,294,772,353
2,3,Product_3,Toys,367.34,4.5,163,348,558,567,143,771,409,290,828,340,667,267,392
3,4,Product_4,Toys,301.34,3.9,951,725,678,59,15,937,421,670,933,56,157,168,203
4,5,Product_5,Books,82.23,4.2,220,682,451,649,301,620,293,411,258,854,548,770,257


In [None]:
# Data cleaning

In [5]:
print(df.info())
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   product_id      1000 non-null   int64  
 1   product_name    1000 non-null   object 
 2   category        1000 non-null   object 
 3   price           1000 non-null   float64
 4   review_score    1000 non-null   float64
 5   review_count    1000 non-null   int64  
 6   sales_month_1   1000 non-null   int64  
 7   sales_month_2   1000 non-null   int64  
 8   sales_month_3   1000 non-null   int64  
 9   sales_month_4   1000 non-null   int64  
 10  sales_month_5   1000 non-null   int64  
 11  sales_month_6   1000 non-null   int64  
 12  sales_month_7   1000 non-null   int64  
 13  sales_month_8   1000 non-null   int64  
 14  sales_month_9   1000 non-null   int64  
 15  sales_month_10  1000 non-null   int64  
 16  sales_month_11  1000 non-null   int64  
 17  sales_month_12  1000 non-null   in

In [6]:
df.drop('product_id', inplace=True, axis=1)

In [7]:
df.head()

Unnamed: 0,product_name,category,price,review_score,review_count,sales_month_1,sales_month_2,sales_month_3,sales_month_4,sales_month_5,sales_month_6,sales_month_7,sales_month_8,sales_month_9,sales_month_10,sales_month_11,sales_month_12
0,Product_1,Clothing,190.4,1.7,220,479,449,92,784,604,904,446,603,807,252,695,306
1,Product_2,Home & Kitchen,475.6,3.2,903,21,989,861,863,524,128,610,436,176,294,772,353
2,Product_3,Toys,367.34,4.5,163,348,558,567,143,771,409,290,828,340,667,267,392
3,Product_4,Toys,301.34,3.9,951,725,678,59,15,937,421,670,933,56,157,168,203
4,Product_5,Books,82.23,4.2,220,682,451,649,301,620,293,411,258,854,548,770,257


In [8]:
## get all the numeric features
num_features = [feature for feature in df.columns if df[feature].dtype != 'O']
print('Num of Numerical Features :', len(num_features))

Num of Numerical Features : 15


In [None]:
# Train and Split model

In [9]:
from sklearn.model_selection import train_test_split
X = df.drop(['product_name'], axis=1)
y = df['product_name']

In [10]:
X.head()

Unnamed: 0,category,price,review_score,review_count,sales_month_1,sales_month_2,sales_month_3,sales_month_4,sales_month_5,sales_month_6,sales_month_7,sales_month_8,sales_month_9,sales_month_10,sales_month_11,sales_month_12
0,Clothing,190.4,1.7,220,479,449,92,784,604,904,446,603,807,252,695,306
1,Home & Kitchen,475.6,3.2,903,21,989,861,863,524,128,610,436,176,294,772,353
2,Toys,367.34,4.5,163,348,558,567,143,771,409,290,828,340,667,267,392
3,Toys,301.34,3.9,951,725,678,59,15,937,421,670,933,56,157,168,203
4,Books,82.23,4.2,220,682,451,649,301,620,293,411,258,854,548,770,257


In [11]:
# Separate dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape

((800, 16), (200, 16))

In [12]:
# Create Column Transformer with 3 types of transformers
cat_features = X.select_dtypes(include="object").columns
num_features = X.select_dtypes(exclude="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(drop='first')

preprocessor = ColumnTransformer(
    [
         ("OneHotEncoder", oh_transformer, cat_features),
          ("StandardScaler", numeric_transformer, num_features)
    ]
)

In [13]:
preprocessor

In [14]:
# applying Transformation in training(fit_transform)
X_train=preprocessor.fit_transform(X_train)

In [15]:
pd.DataFrame(X_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,0.0,0.0,0.0,0.0,0.0,0.0,-1.534081,1.062609,0.584945,-0.029871,...,-0.450352,-1.150427,-1.436279,-0.364301,-1.758934,-0.668824,-0.353755,1.414920,-1.703713,1.430834
1,0.0,0.0,1.0,0.0,0.0,0.0,1.536060,-1.332003,0.868787,-0.774280,...,-0.041723,-1.431729,-1.260291,-0.402497,-0.479568,0.533471,1.080926,0.304433,1.436924,-0.227268
2,0.0,0.0,0.0,0.0,1.0,0.0,0.394086,-1.503046,1.382578,-0.322810,...,-0.167717,-0.049834,0.589310,-0.471944,-1.693414,-1.619637,-0.795735,1.684759,0.878589,0.625061
3,0.0,0.0,0.0,0.0,0.0,1.0,-1.594424,-0.391262,0.811300,-0.367612,...,-1.015622,-1.220753,-0.276828,1.583685,1.627420,1.580743,-1.258762,1.515245,-0.621938,-0.721476
4,0.0,0.0,0.0,1.0,0.0,0.0,1.433807,-1.332003,-1.172005,0.993691,...,1.194380,-1.561831,-1.391419,-1.492813,-1.748589,1.377490,1.165113,-0.439352,-0.984856,0.091460
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,0.0,0.0,0.0,0.0,0.0,0.0,-0.290269,-0.391262,-1.689389,1.369341,...,-1.090537,0.245532,-1.194726,1.302425,-0.434739,-0.017724,0.081209,0.435892,1.314788,-0.384842
796,0.0,0.0,0.0,0.0,0.0,0.0,1.073264,0.635000,-0.557611,-0.143600,...,0.938987,0.579578,1.645239,-0.315688,0.537717,0.006390,0.961662,-1.051677,-0.580063,-0.148482
797,1.0,0.0,0.0,0.0,0.0,0.0,0.957685,0.891565,1.508331,-0.253883,...,0.666567,0.604192,0.226982,-0.867791,-0.165761,1.360265,0.004038,-1.477191,-1.319857,1.681519
798,0.0,0.0,1.0,0.0,0.0,0.0,0.702847,-1.674090,-0.543240,1.665726,...,0.240912,-1.023842,0.703185,-0.447637,0.210117,0.822849,-0.115226,-0.947893,-0.911574,-0.370517


In [16]:
# applying Transformation in test(fit_transform)
X_test = preprocessor.fit_transform(X_test)

In [17]:
print(X_test)

[[ 0.          0.          1.         ...  1.6237362  -0.95933158
   0.86914824]
 [ 0.          0.          0.         ... -1.00428019  1.51083293
  -1.14260899]
 [ 0.          0.          0.         ... -1.63766364  1.38260035
   1.03285362]
 ...
 [ 0.          1.          0.         ... -1.16175121  0.2048853
   1.32024751]
 [ 0.          0.          0.         ...  1.4487684   0.74143743
   1.0692326 ]
 [ 0.          0.          1.         ...  1.56074779 -0.79397904
   1.2074727 ]]


In [18]:
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import BaggingClassifier, BaggingRegressor
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
#Scale Data

In [19]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [29]:
#Automatic Probelm type detection
if y.dtype == "object" or len(y.unique()) < 20:
    problem = "classification"
else:
    problem = "regression"

print("\nDetected Problem Type:", problem)


Detected Problem Type: classification


In [28]:
#Build Bagging Model
if problem == "classification":
    base = DecisionTreeClassifier()
    model = BaggingClassifier(
        estimator=base,
        n_estimators=100,
        max_samples=0.8,
        bootstrap=True,
        random_state=42
    )
else:
    base = DecisionTreeRegressor()
    model = BaggingRegressor(
        estimator=base,
        n_estimators=100,
        max_samples=0.8,
        bootstrap=True,
        random_state=42
    )


In [22]:
#Train Model
model.fit(X_train, y_train)

In [23]:
#Predict
y_pred = model.predict(X_test)

In [24]:
#Evaluate Model
if problem == "classification":
    print("\nAccuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))

else:
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print("\nRMSE:", rmse)
    print("R² Score:", r2)



Accuracy: 0.0

Classification Report:
               precision    recall  f1-score   support

  Product_10       0.00      0.00      0.00       0.0
 Product_101       0.00      0.00      0.00       0.0
 Product_102       0.00      0.00      0.00       1.0
 Product_108       0.00      0.00      0.00       1.0
 Product_109       0.00      0.00      0.00       0.0
  Product_11       0.00      0.00      0.00       1.0
 Product_111       0.00      0.00      0.00       1.0
 Product_121       0.00      0.00      0.00       1.0
 Product_125       0.00      0.00      0.00       0.0
 Product_131       0.00      0.00      0.00       0.0
 Product_132       0.00      0.00      0.00       0.0
 Product_137       0.00      0.00      0.00       1.0
 Product_138       0.00      0.00      0.00       1.0
 Product_140       0.00      0.00      0.00       1.0
 Product_148       0.00      0.00      0.00       0.0
 Product_150       0.00      0.00      0.00       0.0
 Product_156       0.00      0.00      0.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
