**2. Dataset Schema and Storage**



In [None]:
import pandas as pd
import os

In [None]:
dtypes = {
    'Channel': 'int16',
    'Region': 'int16',
    'Fresh': 'int64',  # Continuous spending monetary unit
    'Milk': 'int64',
    'Grocery': 'int64',
    'Frozen': 'int64',
    'Detergents_Paper': 'int64',
    'Delicassen': 'int64'
}

# Display feature types
for variable, ftype in dtypes.items():
    print(f"Column: {variable}, Type: {ftype}")

Column: Channel, Type: int16
Column: Region, Type: int16
Column: Fresh, Type: int64
Column: Milk, Type: int64
Column: Grocery, Type: int64
Column: Frozen, Type: int64
Column: Detergents_Paper, Type: int64
Column: Delicassen, Type: int64


In [None]:
%%time

ws_cust_df = pd.read_csv("/content/Wholesale customers data.csv", dtype = dtypes)

CPU times: user 7.27 ms, sys: 1.92 ms, total: 9.19 ms
Wall time: 12.2 ms


In [None]:
print("Minimum values for each column:")
print(ws_cust_df.min())

Minimum values for each column:
Channel              1
Region               1
Fresh                3
Milk                55
Grocery              3
Frozen              25
Detergents_Paper     3
Delicassen           3
dtype: int64


In [None]:
ws_cust_df.dtypes

Unnamed: 0,0
Channel,int16
Region,int16
Fresh,int64
Milk,int64
Grocery,int64
Frozen,int64
Detergents_Paper,int64
Delicassen,int64


In [None]:
ws_cust_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440 entries, 0 to 439
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Channel           440 non-null    int16
 1   Region            440 non-null    int16
 2   Fresh             440 non-null    int64
 3   Milk              440 non-null    int64
 4   Grocery           440 non-null    int64
 5   Frozen            440 non-null    int64
 6   Detergents_Paper  440 non-null    int64
 7   Delicassen        440 non-null    int64
dtypes: int16(2), int64(6)
memory usage: 22.5 KB


In [None]:
ws_cust_df.shape

(440, 8)

In [None]:
ws_cust_df.head(10)

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,2,3,12669,9656,7561,214,2674,1338
1,2,3,7057,9810,9568,1762,3293,1776
2,2,3,6353,8808,7684,2405,3516,7844
3,1,3,13265,1196,4221,6404,507,1788
4,2,3,22615,5410,7198,3915,1777,5185
5,2,3,9413,8259,5126,666,1795,1451
6,2,3,12126,3199,6975,480,3140,545
7,2,3,7579,4956,9426,1669,3321,2566
8,1,3,5963,3648,6192,425,1716,750
9,2,3,6006,11093,18881,1159,7425,2098


In [None]:
ws_cust_df2 = ws_cust_df[(ws_cust_df['Channel'].isin([1, 2]))                  # Condition for 'Channel'
    & (ws_cust_df['Region'].isin([1, 2, 3]))                      # Condition for 'Region'
]

In [None]:
ws_cust_df2.head(10)

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,2,3,12669,9656,7561,214,2674,1338
1,2,3,7057,9810,9568,1762,3293,1776
2,2,3,6353,8808,7684,2405,3516,7844
3,1,3,13265,1196,4221,6404,507,1788
4,2,3,22615,5410,7198,3915,1777,5185
5,2,3,9413,8259,5126,666,1795,1451
6,2,3,12126,3199,6975,480,3140,545
7,2,3,7579,4956,9426,1669,3321,2566
8,1,3,5963,3648,6192,425,1716,750
9,2,3,6006,11093,18881,1159,7425,2098


In [None]:
ws_cust_df2.to_parquet("/content/Wholesale_customers_data.parquet")

**3. Profiling the Dataset**


In [None]:
!pip install ydata_profiling

Collecting ydata_profiling
  Downloading ydata_profiling-4.12.0-py2.py3-none-any.whl.metadata (20 kB)
Collecting visions<0.7.7,>=0.7.5 (from visions[type_image_path]<0.7.7,>=0.7.5->ydata_profiling)
  Downloading visions-0.7.6-py3-none-any.whl.metadata (11 kB)
Collecting htmlmin==0.1.12 (from ydata_profiling)
  Downloading htmlmin-0.1.12.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting phik<0.13,>=0.11.1 (from ydata_profiling)
  Downloading phik-0.12.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)
Collecting multimethod<2,>=1.4 (from ydata_profiling)
  Downloading multimethod-1.12-py3-none-any.whl.metadata (9.6 kB)
Collecting imagehash==4.3.1 (from ydata_profiling)
  Downloading ImageHash-4.3.1-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting dacite>=1.8 (from ydata_profiling)
  Downloading dacite-1.8.1-py3-none-any.whl.metadata (15 kB)
Collecting PyWavelets (from imagehash==4.3.1->ydata_profiling)
  Downloading pywavelets-1.

In [None]:
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport

In [None]:
ws_cust_df2.head(10)

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,2,3,12669,9656,7561,214,2674,1338
1,2,3,7057,9810,9568,1762,3293,1776
2,2,3,6353,8808,7684,2405,3516,7844
3,1,3,13265,1196,4221,6404,507,1788
4,2,3,22615,5410,7198,3915,1777,5185
5,2,3,9413,8259,5126,666,1795,1451
6,2,3,12126,3199,6975,480,3140,545
7,2,3,7579,4956,9426,1669,3321,2566
8,1,3,5963,3648,6192,425,1716,750
9,2,3,6006,11093,18881,1159,7425,2098


In [None]:
profile = ProfileReport(ws_cust_df2, title="Pandas Profiling Report")

In [None]:
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
profile.to_file("Wholesale_Customers_Data_Profile.html")

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

**4. Train and Test data Split**


In [None]:
ws_cust_df2.columns

Index(['Channel', 'Region', 'Fresh', 'Milk', 'Grocery', 'Frozen',
       'Detergents_Paper', 'Delicassen'],
      dtype='object')

In [None]:
x_features = ['Channel', 'Region', 'Fresh', 'Milk', 'Grocery', 'Frozen',
       'Detergents_Paper', 'Delicassen']

In [None]:
num_vars = ['Fresh', 'Milk', 'Grocery', 'Frozen',
       'Detergents_Paper', 'Delicassen']

In [None]:
cat_vars = list(set(x_features) - set(num_vars))

In [None]:
cat_vars

['Channel', 'Region']

In [None]:
ws_cust_df2[x_features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440 entries, 0 to 439
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Channel           440 non-null    int16
 1   Region            440 non-null    int16
 2   Fresh             440 non-null    int64
 3   Milk              440 non-null    int64
 4   Grocery           440 non-null    int64
 5   Frozen            440 non-null    int64
 6   Detergents_Paper  440 non-null    int64
 7   Delicassen        440 non-null    int64
dtypes: int16(2), int64(6)
memory usage: 22.5 KB


Setting X and Y variables

In [None]:
X = ws_cust_df2[x_features]
y = ws_cust_df2['Region']

Data Splitting

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# First, split into training (60%) and temp (40%) sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, train_size=0.6, random_state=23)

# Now, split the temp (40%) data into test (20%) and production (20%) sets
X_test, X_prod, y_test, y_prod = train_test_split(X_temp, y_temp, train_size=0.5, random_state=23)

In [None]:
X_train.shape

(264, 8)

In [None]:
X_test.shape

(88, 8)

In [None]:
X_prod.shape

(88, 8)

In [None]:
X_train.to_parquet("train.parquet")

In [None]:
X_test.to_parquet("test.parquet")

In [None]:
X_prod.to_parquet("prod.parquet")

**5. Data Version Control**

In [None]:
!wget https://github.com/ajose26/Group13_Wholesale_Customer/raw/main/dataset/train.parquet -O train.parquet # Download the Parquet file from the raw URL

train_ws_cust_df = pd.read_parquet("train.parquet") # Read the downloaded Parquet file

--2024-12-03 18:31:13--  https://github.com/ajose26/Group13_Wholesale_Customer/raw/main/dataset/train.parquet
Resolving github.com (github.com)... 20.27.177.113
Connecting to github.com (github.com)|20.27.177.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/ajose26/Group13_Wholesale_Customer/main/dataset/train.parquet [following]
--2024-12-03 18:31:14--  https://raw.githubusercontent.com/ajose26/Group13_Wholesale_Customer/main/dataset/train.parquet
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 16618 (16K) [application/octet-stream]
Saving to: ‘train.parquet’


2024-12-03 18:31:14 (3.58 MB/s) - ‘train.parquet’ saved [16618/16618]



In [None]:
train_ws_cust_df

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
257,1,1,8475,1931,1883,5004,3593,987
366,1,3,9561,2217,1664,1173,222,447
5,2,3,9413,8259,5126,666,1795,1451
289,1,3,42786,286,471,1388,32,22
414,1,3,5969,1990,3417,5679,1135,290
...,...,...,...,...,...,...,...,...
237,1,1,18692,3838,593,4634,28,1215
31,1,3,2612,4339,3133,2088,820,985
40,1,3,24025,4332,4757,9510,1145,5864
230,2,1,11072,5989,5615,8321,955,2137


In [None]:
!wget https://github.com/ajose26/Group13_Wholesale_Customer/raw/main/dataset/test.parquet -O test.parquet # Download the Parquet file from the raw URL

test_ws_cust_df = pd.read_parquet("test.parquet") # Read the downloaded Parquet file

--2024-12-03 18:36:40--  https://github.com/ajose26/Group13_Wholesale_Customer/raw/main/dataset/test.parquet
Resolving github.com (github.com)... 20.27.177.113
Connecting to github.com (github.com)|20.27.177.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/ajose26/Group13_Wholesale_Customer/main/dataset/test.parquet [following]
--2024-12-03 18:36:41--  https://raw.githubusercontent.com/ajose26/Group13_Wholesale_Customer/main/dataset/test.parquet
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10093 (9.9K) [application/octet-stream]
Saving to: ‘test.parquet’


2024-12-03 18:36:41 (8.66 MB/s) - ‘test.parquet’ saved [10093/10093]



In [None]:
test_ws_cust_df

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
232,1,1,25962,1780,3838,638,284,834
382,1,3,34454,7435,8469,2540,1711,2893
357,2,3,37,1275,22272,137,6747,110
400,1,3,4446,906,1238,3576,153,1014
74,2,3,8190,6343,9794,1285,1901,1780
...,...,...,...,...,...,...,...,...
73,2,3,19899,5332,8713,8132,764,648
325,1,2,32717,16784,13626,60869,1272,5609
135,1,3,6300,1289,2591,1170,199,326
326,1,2,4414,1610,1431,3498,387,834


**6. ML Pipeline with Scikit Learn**

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score

In [None]:
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[('onehot',
                                           OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_vars),
        ('cat', categorical_transformer, cat_vars),
    ])

params = { "n_estimators": 400,
           "max_depth": 4 }

# Create the full pipeline combining preprocessing and model
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(**params))
])

# Train the model
model_pipeline.fit(X_train, y_train)

# Make predictions
y_pred = model_pipeline.predict(X_test)

# Calculate Precision, Recall, and F1-score with multiclass setting
precision = precision_score(y_test, y_pred, average='macro')  # Use 'macro', 'micro', or 'weighted'
recall = recall_score(y_test, y_pred, average='macro')        # Use 'macro', 'micro', or 'weighted'
f1 = f1_score(y_test, y_pred, average='macro')                # Use 'macro', 'micro', or 'weighted'

# Print the results
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

Precision: 1.0
Recall: 1.0
F1-score: 1.0


**7. ML Experimentation and Tracking with Weights and Biases**

In [None]:
!pip install wandb



Initializing Weights and Biases

In [None]:
import wandb
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
os.environ["WANDB_API_KEY"] = "98d04c41a10f0deaa9331c691925872d6f2dbdb4"

1. Experiment 1 - Logistic Regression Model

In [128]:
# Initialize W&B with the experiment name
wandb.init(project="mlops_wholesalecustomers", name="Logistic Regression Experiment")

# Logistic Regression Model
logreg = LogisticRegression(multi_class='ovr')

# K-Fold Cross-Validation
kfold = KFold(n_splits=5, shuffle=True)
cross_val_scores = cross_val_score(logreg, X_train, y_train, cv=kfold, scoring='accuracy')

# Log cross-validation metrics
wandb.log({
    "cross_val_mean": cross_val_scores.mean(),
    "cross_val_std": cross_val_scores.std(),
})

# Fit the model on the training data
logreg.fit(X_train, y_train)

# Predict on the test set
y_pred = logreg.predict(X_test)

# Calculate precision, recall, and F1-score
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# Log precision, recall, and F1-score to W&B
wandb.log({
    "precision": precision,
    "recall": recall,
    "f1_score": f1,
})

wandb.finish()



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

VBox(children=(Label(value='0.017 MB of 0.017 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
cross_val_mean,▁
cross_val_std,▁
f1_score,▁
precision,▁
recall,▁

0,1
cross_val_mean,0.8865
cross_val_std,0.03123
f1_score,0.61501
precision,0.57272
recall,0.66667


2. Experiment 2 - Decision Tree Classifiier

In [None]:
# Initialize W&B with the experiment name
wandb.init(project="mlops_wholesalecustomers", name="Decision Tree Experiment")

# Decision Tree Model
dt = DecisionTreeClassifier(max_depth=5)

# K-Fold Cross-Validation
kfold = KFold(n_splits=5, shuffle=True)
cross_val_scores = cross_val_score(dt, X_train, y_train, cv=kfold, scoring='accuracy')

# Log cross-validation metrics
wandb.log({
    "cross_val_mean": cross_val_scores.mean(),
    "cross_val_std": cross_val_scores.std(),
})

# Fit the model on the training data
dt.fit(X_train, y_train)

# Predict on the test set
y_pred = dt.predict(X_test)

# Calculate precision, recall, and F1-score
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# Log precision, recall, and F1-score to W&B
wandb.log({
    "precision": precision,
    "recall": recall,
    "f1_score": f1,
})

wandb.finish()



0,1
cross_val_mean,▁
cross_val_std,▁
f1_score,▁
precision,▁
recall,▁

0,1
cross_val_mean,1
cross_val_std,0
f1_score,1
precision,1
recall,1


3. Experiment 3 - Random Forest Model

In [None]:
# Initialize W&B with the experiment name
wandb.init(project="mlops_wholesalecustomers", name="Random Forest Experiment")

# Random Forest Model
rf = RandomForestClassifier(n_estimators=100, max_depth=5)

# K-Fold Cross-Validation
kfold = KFold(n_splits=5, shuffle=True)
cross_val_scores = cross_val_score(rf, X_train, y_train, cv=kfold, scoring='accuracy')

# Log cross-validation metrics
wandb.log({
    "cross_val_mean": cross_val_scores.mean(),
    "cross_val_std": cross_val_scores.std(),
})

# Fit the model on the training data
rf.fit(X_train, y_train)

# Predict on the test set
y_pred = rf.predict(X_test)

# Calculate precision, recall, and F1-score
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# Log precision, recall, and F1-score to W&B
wandb.log({
    "precision": precision,
    "recall": recall,
    "f1_score": f1,
})

wandb.finish()



0,1
cross_val_mean,▁
cross_val_std,▁
f1_score,▁
precision,▁
recall,▁

0,1
cross_val_mean,0.99245
cross_val_std,0.00924
f1_score,0.9246
precision,0.94872
recall,0.91667


4. Experiment 4 - Gradient Boosting Model

In [None]:
# Initialize W&B with the experiment name
wandb.init(project="mlops_wholesalecustomers", name="Gradient Boosting Experiment")

# Gradient Boosting Model
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5)

# K-Fold Cross-Validation
kfold = KFold(n_splits=5, shuffle=True)
cross_val_scores = cross_val_score(gb, X_train, y_train, cv=kfold, scoring='accuracy')

# Log cross-validation metrics
wandb.log({
    "cross_val_mean": cross_val_scores.mean(),
    "cross_val_std": cross_val_scores.std(),
})

# Fit the model on the training data
gb.fit(X_train, y_train)

# Predict on the test set
y_pred = gb.predict(X_test)

# Calculate precision, recall, and F1-score
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

# Log precision, recall, and F1-score to W&B
wandb.log({
    "precision": precision,
    "recall": recall,
    "f1_score": f1,
})

wandb.finish()



0,1
cross_val_mean,▁
cross_val_std,▁
f1_score,▁
precision,▁
recall,▁

0,1
cross_val_mean,1
cross_val_std,0
f1_score,1
precision,1
recall,1


5. Experiment 5 - Sweep Features

In [124]:
def train_decision_tree(config=None):
    # Initialize WandB
    with wandb.init(config=config):
        config = wandb.config

        dtree = DecisionTreeRegressor(max_depth=config.max_depth)

        dtree_model = Pipeline(steps=[('preprocessor', preprocessor),
                                      ('dt_model', dtree)])
        dtree_model.fit(X_train, y_train)


       # Calculate Precision, recall and F1-score
        precision = precision_score(y_test, y_pred, average='macro')
        recall = recall_score(y_test, y_pred, average='macro')
        f1 = f1_score(y_test, y_pred, average='macro')

       # Log metrics
        wandb.log({
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        })


In [125]:
sweep_config = {
    "method": "grid",  # Can be 'grid', 'random', or 'bayes'
    "metric": {"name": "r2", "goal": "maximize"},
    "parameters": {
        "max_depth": {
            "values": [4, 6, 8, 12]  # Depths to evaluate
        },
    },
}

In [126]:
sweep_id = wandb.sweep(sweep_config, project="mlops_wholesalecustomers")

Create sweep with ID: i0sv5e3j
Sweep URL: https://wandb.ai/ashish-jose6260-essex-lake-group/mlops_wholesalecustomers/sweeps/i0sv5e3j


In [127]:
wandb.agent(sweep_id,
            function=train_decision_tree)  # Run all experiments

[34m[1mwandb[0m: Agent Starting Run: 41uapjar with config:
[34m[1mwandb[0m: 	max_depth: 4


VBox(children=(Label(value='0.012 MB of 0.012 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
f1_score,▁
precision,▁
recall,▁

0,1
f1_score,1
precision,1
recall,1


[34m[1mwandb[0m: Agent Starting Run: vz04cs6g with config:
[34m[1mwandb[0m: 	max_depth: 6


0,1
f1_score,▁
precision,▁
recall,▁

0,1
f1_score,1
precision,1
recall,1


[34m[1mwandb[0m: Agent Starting Run: 5z9lm3m4 with config:
[34m[1mwandb[0m: 	max_depth: 8


VBox(children=(Label(value='0.012 MB of 0.012 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
f1_score,▁
precision,▁
recall,▁

0,1
f1_score,1
precision,1
recall,1


[34m[1mwandb[0m: Agent Starting Run: 0430qrt6 with config:
[34m[1mwandb[0m: 	max_depth: 12


0,1
f1_score,▁
precision,▁
recall,▁

0,1
f1_score,1
precision,1
recall,1


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.
