## **1. Import Package Modules**

In [81]:
# Import Packages
import os, mlflow
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle as pkl
from sklearn import datasets
from sklearn.base import clone
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

# Import module (Regression Model)
from sklearn.linear_model import LogisticRegression, Ridge, SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn import model_selection, preprocessing, datasets, metrics

# Import Pipeline
from sklearn.pipeline import Pipeline

## **2. Loading Dataset diamonds.csv**

In [82]:
# Read the diamonds CSV into a DataFrame and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; the cell will fail
# on any other machine until the location is made configurable.
csv_path = r"C:\Users\001057\Desktop\ML_Ciast\batch_1\assessment\datasets\diamonds.csv"
diamond_data = pd.read_csv(csv_path)
print(diamond_data.head())

   Unnamed: 0  carat      cut color clarity  depth  table  price     x     y  \
0           1   0.23    Ideal     E     SI2   61.5   55.0    326  3.95  3.98   
1           2   0.21  Premium     E     SI1   59.8   61.0    326  3.89  3.84   
2           3   0.23     Good     E     VS1   56.9   65.0    327  4.05  4.07   
3           4   0.29  Premium     I     VS2   62.4   58.0    334  4.20  4.23   
4           5   0.31     Good     J     SI2   63.3   58.0    335  4.34  4.35   

      z  
0  2.43  
1  2.31  
2  2.31  
3  2.63  
4  2.75  


In [83]:
# Inspect column names, dtypes and non-null counts of the raw dataset.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" to the output.
diamond_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  53940 non-null  int64  
 1   carat       53940 non-null  float64
 2   cut         53940 non-null  object 
 3   color       53940 non-null  object 
 4   clarity     53940 non-null  object 
 5   depth       53940 non-null  float64
 6   table       53940 non-null  float64
 7   price       53940 non-null  int64  
 8   x           53940 non-null  float64
 9   y           53940 non-null  float64
 10  z           53940 non-null  float64
dtypes: float64(6), int64(2), object(3)
memory usage: 4.5+ MB
None


In [84]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
diamond_data.describe()

Unnamed: 0.1,Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,26970.5,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,15571.281097,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,1.0,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,13485.75,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,26970.5,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,40455.25,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,53940.0,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [85]:
# Remove the x, y and z columns: the author treats them as cutting-tolerance
# measurements that are not needed as model features.
diamond_data = diamond_data.drop(columns=['x','y','z'])

In [86]:
# Drop the leftover row-number column that came in with the CSV.
diamond_data = diamond_data.drop(columns=['Unnamed: 0'])

In [87]:
# Confirm the remaining columns and dtypes after the drops.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" to the output.
diamond_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
dtypes: float64(3), int64(1), object(3)
memory usage: 2.9+ MB
None


## **3. EDA**

In [88]:
# Count the missing (NaN) values in each column.
print(diamond_data.isna().sum())

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
dtype: int64


In [89]:
# Count rows that exactly repeat an earlier row. This runs after the x, y, z and
# row-number columns were dropped, so rows are compared on the remaining columns only.
# NOTE(review): the recorded output reports 803 duplicates and no later visible
# cell removes them — confirm that keeping them in the training data is intended.
print(diamond_data.duplicated().sum())

803


## **4. Data Preprocessing**

In [90]:
# Frequency of each distinct full row (all remaining columns combined).
print(diamond_data.value_counts())

carat  cut        color  clarity  depth  table  price
0.79   Ideal      G      SI1      62.3   57.0   2898     5
0.30   Ideal      H      SI1      62.2   57.0   450      5
                  E      VS2      61.5   55.0   844      4
0.31   Very Good  E      SI1      63.4   55.0   698      4
       Ideal      D      VS2      61.5   56.0   734      4
                                                        ..
0.50   Very Good  E      VS2      61.8   57.0   1547     1
                                         58.0   1624     1
                                  62.1   62.0   1451     1
                                  62.4   57.0   1568     1
5.01   Fair       J      I1       65.5   59.0   18018    1
Name: count, Length: 53137, dtype: int64


In [91]:
# Number of diamonds in each 'cut' category.
diamond_data['cut'].value_counts()

cut
Ideal        21551
Premium      13791
Very Good    12082
Good          4906
Fair          1610
Name: count, dtype: int64

In [92]:
# Number of diamonds in each 'color' category.
diamond_data['color'].value_counts()

color
G    11292
E     9797
F     9542
H     8304
D     6775
I     5422
J     2808
Name: count, dtype: int64

In [93]:
# Number of diamonds in each 'clarity' category.
diamond_data['clarity'].value_counts()

clarity
SI1     13065
VS2     12258
SI2      9194
VS1      8171
VVS2     5066
VVS1     3655
IF       1790
I1        741
Name: count, dtype: int64

In [94]:
# Frequency of each distinct value of the target column 'price'.
diamond_data['price'].value_counts()

price
605      132
802      127
625      126
828      125
776      124
        ... 
8816       1
14704      1
14699      1
14698      1
9793       1
Name: count, Length: 11602, dtype: int64

In [95]:
# Number of non-null entries in each column.
diamond_data.count()

carat      53940
cut        53940
color      53940
clarity    53940
depth      53940
table      53940
price      53940
dtype: int64

In [96]:
# Preview the first rows before the categorical columns are encoded.
diamond_data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price
0,0.23,Ideal,E,SI2,61.5,55.0,326
1,0.21,Premium,E,SI1,59.8,61.0,326
2,0.23,Good,E,VS1,56.9,65.0,327
3,0.29,Premium,I,VS2,62.4,58.0,334
4,0.31,Good,J,SI2,63.3,58.0,335


In [97]:
# Before converting the object columns to numbers, list the distinct
# labels found in each categorical column.
for categorical_column in ('cut', 'color', 'clarity'):
    print (np.unique(diamond_data[[categorical_column]].values))


['Fair' 'Good' 'Ideal' 'Premium' 'Very Good']
['D' 'E' 'F' 'G' 'H' 'I' 'J']
['I1' 'IF' 'SI1' 'SI2' 'VS1' 'VS2' 'VVS1' 'VVS2']


In [98]:
# Replace the three categorical columns with numeric codes from an OrdinalEncoder.
# NOTE(review): with default settings the encoder orders categories alphabetically
# (e.g. 'Fair' < 'Good' < 'Ideal' < 'Premium' < 'Very Good'), which is not the
# quality order — confirm this is acceptable for the models used.
categorical_columns = ['cut','color','clarity']
ordinal_encoder = preprocessing.OrdinalEncoder()
encoded_values = ordinal_encoder.fit_transform(diamond_data[categorical_columns].values)
diamond_data[categorical_columns] = encoded_values
print(diamond_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  float64
 2   color    53940 non-null  float64
 3   clarity  53940 non-null  float64
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
dtypes: float64(6), int64(1)
memory usage: 2.9 MB
None


## **4. Data Splitting**

In [99]:
# Separate the target ('price') from the predictor columns.
labels = diamond_data['price']
features = diamond_data.drop(columns='price')

In [100]:
# Shuffle and split: 70% of the rows for training, the rest for testing.
# random_state is fixed so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    train_size=0.7,
    shuffle=True,
    random_state=42,
)

## **5. Pipeline Creation**

In [101]:
# Scalers to try, keyed by the step name used inside each pipeline.
scaler_dict = {
    'min_max' : MinMaxScaler(),
    'standard' : StandardScaler()
}

# Regressors to try, keyed by the step name used inside each pipeline.
# NOTE(review): the estimators are created without a random_state, so results
# of the stochastic models can differ between runs.
model_dict = {
    'sgd' : SGDRegressor(),
    'knn_r' : KNeighborsRegressor(),
    'svr' : SVR(),
    'decision_tree_r' : DecisionTreeRegressor(),
    'random_forest_r' : RandomForestRegressor()    
}

# Build one pipeline for every (scaler, model) combination.
# The dict entries are used only as templates: every pipeline receives its own
# clone() of the scaler and of the estimator. Without cloning, the same estimator
# object is placed in both the 'min_max' and the 'standard' pipeline, so fitting
# the later pipeline refits (overwrites) the model held by the earlier one and
# the earlier pipeline ends up with a scaler/model pair that never trained together.
pipelines = []
for scaler_name, scaler_class in scaler_dict.items():
    for model_name, model_class in model_dict.items():
        # Pipeline(list_of_tuples)
        pipeline = Pipeline([
            (scaler_name, clone(scaler_class)),
            (model_name, clone(model_class))
        ])
        pipelines.append(pipeline)

In [102]:
# Define a function to contain the pipeline training and evaluation code
def train_evaluate(pipeline,x_train,y_train,x_test,y_test):
    """Fit a pipeline on the training split and report test-set regression metrics.

    Parameters
    ----------
    pipeline : object exposing ``fit`` and ``predict`` (here an sklearn Pipeline)
        Fitted in place on ``x_train`` / ``y_train``.
    x_train, y_train
        Training features and target passed to ``pipeline.fit``.
    x_test, y_test
        Held-out features and target used for the printed metrics.

    Returns
    -------
    float
        R2 score of the fitted pipeline on the test split.

    Prints the mean squared error, mean absolute error, mean absolute
    percentage error and R2 score.
    """
    # Step 1: Perform pipeline training with scaler
    pipeline.fit(x_train,y_train) # Train together with feature scaling
    prediction = pipeline.predict(x_test)

    # Step 2: Evaluate pipeline
    print("Mean Squared Error:", metrics.mean_squared_error(y_test,prediction))
    print("Mean Absolute Error:", metrics.mean_absolute_error(y_test,prediction))
    print("Mean Absolute Percentage Error:", metrics.mean_absolute_percentage_error(y_test,prediction))
    # Reuse the prediction already computed above rather than calling
    # pipeline.score(), which would run predict() on the test set a second time
    # (slow for models such as SVR). For regressors both give the same R2.
    r2 = metrics.r2_score(y_test,prediction)
    print("R2 Score:", r2)
    return r2

In [103]:
# Train and evaluate every candidate pipeline, collecting one R2 score per
# pipeline in the same order as `pipelines`.
r2_list = []
for pipeline_number, pipeline in enumerate(pipelines, start=1):  # numbering shown to the reader is 1-based
    print("\nTraining and Evaluating pipeline #", pipeline_number)
    print("Steps:",pipeline.steps)
    r2_list.append(train_evaluate(pipeline,x_train,y_train,x_test,y_test))


Training and Evaluating pipeline # 1
Steps: [('min_max', MinMaxScaler()), ('sgd', SGDRegressor())]


2024/08/09 15:43:22 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '114dc0ffbb374232841b6910c982bcfe', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
2024/08/09 15:43:31 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'f7c5208699d24d5694a72fb62123f2b3', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Mean Squared Error: 1860855.0507421885
Mean Absolute Error: 923.4247292826412
Mean Absolute Percentage Error: 0.4310240010539155
R2 Score: 0.8806835455973899

Training and Evaluating pipeline # 2
Steps: [('min_max', MinMaxScaler()), ('knn_r', KNeighborsRegressor())]
Mean Squared Error: 603160.155755778
Mean Absolute Error: 402.7626622172785
Mean Absolute Percentage Error: 0.12505323064319565


2024/08/09 15:43:40 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'c43fd159718d4ace8fdf3772a68019e6', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


R2 Score: 0.9613258801683656

Training and Evaluating pipeline # 3
Steps: [('min_max', MinMaxScaler()), ('svr', SVR())]
Mean Squared Error: 13359451.020152168
Mean Absolute Error: 2285.266720425127
Mean Absolute Percentage Error: 0.8766739077231559


2024/08/09 15:51:37 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '22822e63698f4853bc47cd5d590876f0', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


R2 Score: 0.14340328234908306

Training and Evaluating pipeline # 4
Steps: [('min_max', MinMaxScaler()), ('decision_tree_r', DecisionTreeRegressor())]


2024/08/09 15:51:44 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'af22a8984bb04a06b9cc59ed84cc5dca', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Mean Squared Error: 516151.27349224384
Mean Absolute Error: 358.8992262125466
Mean Absolute Percentage Error: 0.10343609673795941
R2 Score: 0.9669048162220246

Training and Evaluating pipeline # 5
Steps: [('min_max', MinMaxScaler()), ('random_forest_r', RandomForestRegressor())]
Mean Squared Error: 306638.6087943577
Mean Absolute Error: 285.3854619603494
Mean Absolute Percentage Error: 0.08364263070940317


2024/08/09 15:52:06 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'b56192306f0c4469bbc261f045a05972', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


R2 Score: 0.9803385913536364

Training and Evaluating pipeline # 6
Steps: [('standard', StandardScaler()), ('sgd', SGDRegressor())]


2024/08/09 15:52:13 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '2c8a08d772064eebac8af1d1c4a792d5', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Mean Squared Error: 1859112.4602272017
Mean Absolute Error: 929.8506356163617
Mean Absolute Percentage Error: 0.45741978052010934
R2 Score: 0.8807952790296316

Training and Evaluating pipeline # 7
Steps: [('standard', StandardScaler()), ('knn_r', KNeighborsRegressor())]
Mean Squared Error: 776534.624852305
Mean Absolute Error: 489.53746137683845
Mean Absolute Percentage Error: 0.15910131206465888


2024/08/09 15:52:23 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '7959026748314857bf3d0d7c8a7947b1', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


R2 Score: 0.9502092556208052

Training and Evaluating pipeline # 8
Steps: [('standard', StandardScaler()), ('svr', SVR())]
Mean Squared Error: 9505115.868596125
Mean Absolute Error: 1675.8046744678197
Mean Absolute Percentage Error: 0.5136811203382545


2024/08/09 16:00:21 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'c514e4ce06d44c7884c173d2069cef76', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


R2 Score: 0.3905399973659738

Training and Evaluating pipeline # 9
Steps: [('standard', StandardScaler()), ('decision_tree_r', DecisionTreeRegressor())]


2024/08/09 16:00:29 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'cea5305652b14ecba8e9c22e15ca0821', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Mean Squared Error: 504209.73218091036
Mean Absolute Error: 355.9479016955936
Mean Absolute Percentage Error: 0.10289044948004936
R2 Score: 0.9676704977665395

Training and Evaluating pipeline # 10
Steps: [('standard', StandardScaler()), ('random_forest_r', RandomForestRegressor())]
Mean Squared Error: 307454.9700478274
Mean Absolute Error: 284.9478043020816
Mean Absolute Percentage Error: 0.08337546131309748
R2 Score: 0.980286246959463


In [104]:
# Locate the pipeline with the highest R2 score.
# np.where returns a tuple of index arrays; the first entry of the first array
# is the position of the (first) best score in r2_list.
best_pipeline_index = np.where(np.array(r2_list)==max(r2_list))
best_position = best_pipeline_index[0][0]
print(best_position)
print("Best Pipeline is #", best_position+1)
print("Step:", pipelines[best_position].steps)

4
Best Pipeline is # 5
Step: [('min_max', MinMaxScaler()), ('random_forest_r', RandomForestRegressor())]


In [105]:
# Keep a reference to the pipeline with the highest test-set R2
# (first matching position in the np.where result).
best_pipeline = pipelines[best_pipeline_index[0][0]]

## **6. MLFLOW**

In [106]:
# Show the current working directory; relative paths used later in the
# notebook (e.g. 'src/ordinal_encoder.pkl') are resolved against it.
print(os.getcwd())

C:\Users\001057\Desktop\ML_Ciast\batch_1\assessment


In [107]:
# Re-check the working directory before configuring MLflow.
# Disabled step that would move one directory level up:
# os.chdir("..")
print(os.getcwd())

C:\Users\001057\Desktop\ML_Ciast\batch_1\assessment


In [108]:
# Pin the working directory to the assessment folder and print it to confirm.
# NOTE(review): this is an absolute, machine-specific path and will raise on any
# other machine — consider making the folder configurable.
os.chdir(r"C:\Users\001057\Desktop\ML_Ciast\batch_1\assessment")
print(os.getcwd())

C:\Users\001057\Desktop\ML_Ciast\batch_1\assessment


In [109]:
# Create the training in mlflow experiment
# mlflow.create_experiment('diamond_experiment')

In [110]:
# Make 'diamond_experiment' the active MLflow experiment
# (set_experiment creates the experiment if it does not exist yet).
mlflow.set_experiment('diamond_experiment')

# Attach the tag diamond_version=2.15.1 to the active experiment.
# NOTE(review): '2.15.1' looks like a library release number rather than a
# dataset/model version — confirm what this tag is meant to track.
mlflow.set_experiment_tag('diamond_version','2.15.1')

In [111]:
# Regressors tracked in the MLflow runs, keyed by run / step name.
# SVR is left out of this round (in the earlier comparison it was the
# slowest and lowest-scoring candidate).
model_dict = dict(
    sgd=SGDRegressor(),
    knn_r=KNeighborsRegressor(),
    # svr=SVR(),
    decision_tree_r=DecisionTreeRegressor(),
    random_forest_r=RandomForestRegressor(),
)

In [112]:
# Turn on scikit-learn autologging once; it does not depend on the model, so it
# does not need to be re-enabled on every loop iteration.
mlflow.sklearn.autolog()

# Train one MinMax-scaled pipeline per model, each inside its own MLflow run.
for model_name, model_class in model_dict.items():
    pipeline = Pipeline([
        ('scaler', preprocessing.MinMaxScaler()),
        (model_name, model_class)
    ])
    # Start an MLflow run named after the model key
    with mlflow.start_run(run_name=model_name):
        pipeline.fit(x_train,y_train.values)
        prediction = pipeline.predict(x_test)
        # Test-set metrics. `mae` previously called mean_squared_error, so the
        # value stored under that name was actually the MSE.
        mae = metrics.mean_absolute_error(y_test.values,prediction)
        mse = metrics.mean_squared_error(y_test,prediction)
        mape = metrics.mean_absolute_percentage_error(y_test,prediction)
        r2 = pipeline.score(x_test,y_test)


## **7. Pickle**

In [113]:
# Persist the fitted OrdinalEncoder so the same category-to-code mapping can be
# reused outside this notebook.
# Create the target folder first: open(..., 'wb') raises FileNotFoundError when
# 'src/' does not exist under the current working directory.
# NOTE(review): only the encoder is saved here; best_pipeline is not persisted
# in the visible cells.
os.makedirs('src', exist_ok=True)
with open('src/ordinal_encoder.pkl','wb') as f:
    pkl.dump(ordinal_encoder,f)