In [1]:
!pip install mlflow --quiet --use-deprecated=legacy-resolver



In [2]:
import itertools

import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn import set_config
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


In [3]:
# Read the mushroom2020 CSV directly from its GitHub URL.
MUSHROOM_CSV_URL = 'https://raw.githubusercontent.com/pvateekul/2110446_DSDE_2023s2/main/code/Week03_ML/mushroom2020_dataset.csv'
df = pd.read_csv(MUSHROOM_CSV_URL)

# Column overview: non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5824 entries, 0 to 5823
Data columns (total 24 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   id                           5824 non-null   int64  
 1   label                        5764 non-null   object 
 2   cap-shape                    5824 non-null   object 
 3   cap-surface                  5797 non-null   object 
 4   bruises                      5725 non-null   object 
 5   odor                         5725 non-null   object 
 6   gill-attachment              5725 non-null   object 
 7   gill-spacing                 5694 non-null   object 
 8   gill-size                    5703 non-null   object 
 9   stalk-shape                  5703 non-null   object 
 10  stalk-root                   5793 non-null   object 
 11  stalk-surface-above-ring     5793 non-null   object 
 12  stalk-surface-below-ring     5793 non-null   object 
 13  veil-type         

In [4]:
# Discard rows whose target value (`label`) is missing.
df = df.dropna(subset=['label'])

# Remove the columns this analysis treats as irrelevant.
columns_to_drop = [
	'id',
	'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color-rate',
	'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring',
	'stalk-color-above-ring-rate', 'stalk-color-below-ring-rate',
	'veil-color-rate', 'veil-type',
]
df = df.drop(columns=columns_to_drop)

In [5]:
# Show the remaining columns with their non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5764 entries, 0 to 5823
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   label              5764 non-null   object 
 1   cap-shape          5764 non-null   object 
 2   cap-surface        5737 non-null   object 
 3   bruises            5665 non-null   object 
 4   odor               5665 non-null   object 
 5   stalk-shape        5643 non-null   object 
 6   ring-number        5702 non-null   object 
 7   ring-type          5702 non-null   object 
 8   spore-print-color  5708 non-null   object 
 9   population         5708 non-null   object 
 10  habitat            5733 non-null   object 
 11  cap-color-rate     5737 non-null   float64
dtypes: float64(1), object(11)
memory usage: 585.4+ KB


In [6]:
# Encode the target as integers: e (edible) -> 1, p (poisonous) -> 0.
# The explicit cast makes the column int64 on every pandas version instead of
# relying on the deprecated silent downcasting of object columns in `replace`.
# It also fails fast if a label other than 'e'/'p' is present.
df['label'] = df['label'].replace({'e': 1, 'p': 0}).astype('int64')

# Class balance after encoding.
df['label'].value_counts()

label
0    3660
1    2104
Name: count, dtype: int64

In [7]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,label,cap-shape,cap-surface,bruises,odor,stalk-shape,ring-number,ring-type,spore-print-color,population,habitat,cap-color-rate
0,0,x,s,t,p,e,o,p,k,s,u,1.0
1,1,x,s,t,a,e,o,p,n,n,g,2.0
2,1,b,s,t,l,e,o,p,n,n,m,3.0
3,0,x,y,t,p,e,o,p,k,s,u,3.0
4,1,x,s,f,n,t,o,e,n,a,g,4.0


Preprocessing Pipeline

In [8]:
# Numeric features: fill missing values with the column mean, then standardise.
numberPipeline = Pipeline(steps=[
	('imputer', SimpleImputer(strategy='mean')),
	('scale', StandardScaler()),
])

# Nominal features: fill missing values with the mode, then one-hot encode.
categoryPipeline = Pipeline(steps=[
	('imputer', SimpleImputer(strategy='most_frequent')),
	(
		'encode',
		OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False),
	),
])

In [9]:
# Select feature columns from the predictors only, so the result does not
# depend on the dtype of the target column `label` (it can neither be missing
# from the numeric set nor leak into the categorical set).
feature_frame = df.drop(columns=['label'])
numberColumns = feature_frame.select_dtypes(include=['int64', 'float64']).columns
categoryColumns = feature_frame.select_dtypes(include=['object']).columns

# Route numeric and categorical columns through their own preprocessing pipeline.
colTrans = ColumnTransformer(
	transformers=[
		('num', numberPipeline, numberColumns),
		('cat', categoryPipeline, categoryColumns),
	],
	n_jobs=-1,
)

Model Pipeline

In [10]:
# Render estimators as diagrams in the notebook output.
set_config(display='diagram')

# Complete estimator: preprocessing followed by a seeded random forest.
modelPipeline = Pipeline(steps=[
	('preprocess', colTrans),
	('model', RandomForestClassifier(random_state=2020)),
])

display(modelPipeline)

Train Test Split

In [11]:
# Hold out 20% of the rows as a test set, stratified on the label.
train, test = train_test_split(
	df,
	test_size=0.2,
	stratify=df['label'],
	random_state=2020,
)

# Separate predictors (X) from the target (Y) for each part.
Xtrain, Ytrain = train.drop(columns=['label']), train['label']
Xtest, Ytest = test.drop(columns=['label']), test['label']

# Class counts per split.
for split_labels in (Ytrain, Ytest):
	print(split_labels.value_counts())

label
0    2928
1    1683
Name: count, dtype: int64
label
0    732
1    421
Name: count, dtype: int64


Grid Search CV

In [12]:
# List every parameter of the pipeline; nested parameters use the
# `<step>__<param>` naming that the grid search keys rely on.
modelPipeline.get_params()

{'memory': None,
 'steps': [('preprocess', ColumnTransformer(n_jobs=-1,
                     transformers=[('num',
                                    Pipeline(steps=[('imputer', SimpleImputer()),
                                                    ('scale', StandardScaler())]),
                                    Index(['cap-color-rate'], dtype='object')),
                                   ('cat',
                                    Pipeline(steps=[('imputer',
                                                     SimpleImputer(strategy='most_frequent')),
                                                    ('encode',
                                                     OneHotEncoder(drop='first',
                                                                   handle_unknown='ignore',
                                                                   sparse_output=False))]),
                                    Index(['cap-shape', 'cap-surface', 'bruises', 'odor', 'stalk-shape',
     

In [13]:
# Candidate hyper-parameters for the 'model' step of the pipeline.
param_grid = {
	'model__criterion': ['gini', 'entropy'],
	'model__max_depth': [2, 3, 6],
	'model__min_samples_leaf': [2, 5, 10],
	'model__n_estimators': [100, 200],
	'model__random_state': [2020],
}

# Exhaustive search scored by accuracy with 5-fold stratified cross-validation.
grid_search = GridSearchCV(
	modelPipeline,
	param_grid=param_grid,
	scoring='accuracy',
	cv=StratifiedKFold(n_splits=5),
	verbose=1,
	n_jobs=-1,
)
grid_search.fit(Xtrain, Ytrain)

print(grid_search.best_params_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
{'model__criterion': 'gini', 'model__max_depth': 6, 'model__min_samples_leaf': 2, 'model__n_estimators': 100, 'model__random_state': 2020}


In [14]:
# Predict the test split with the fitted grid search and report the results.
Ypred = grid_search.predict(Xtest)

test_confusion = confusion_matrix(Ytest, Ypred)
test_report = classification_report(Ytest, Ypred, digits=4)
print(test_confusion)
print(test_report)

[[731   1]
 [  5 416]]
              precision    recall  f1-score   support

           0     0.9932    0.9986    0.9959       732
           1     0.9976    0.9881    0.9928       421

    accuracy                         0.9948      1153
   macro avg     0.9954    0.9934    0.9944      1153
weighted avg     0.9948    0.9948    0.9948      1153



MLflow

In [15]:
# Track runs in a local SQLite database next to the notebook.
local_registry = 'sqlite:///mlflow.db'
mlflow.set_tracking_uri(local_registry)

# `set_experiment` returns an Experiment object, not an id; keep both so that
# `experiment_id` really holds the id.
experiment = mlflow.set_experiment('mushroom2020')
experiment_id = experiment.experiment_id

def eval_metrics(actual, pred):
	"""Return the accuracy of `pred` against `actual`, as computed by `accuracy_score`."""
	return accuracy_score(actual, pred)

def train_model(criterion, max_depth, min_samples_leaf, n_estimators, random_state):
	"""Fit one random-forest configuration and record it as an MLflow run.

	A clone of the global `modelPipeline` is configured with the given
	hyper-parameters of its 'model' step, fitted on `Xtrain`/`Ytrain` and
	scored on `Xtest`/`Ytest`. The parameters, the accuracy and the fitted
	pipeline are logged to MLflow inside a single run.

	Args:
		criterion: Value for `model__criterion`.
		max_depth: Value for `model__max_depth`.
		min_samples_leaf: Value for `model__min_samples_leaf`.
		n_estimators: Value for `model__n_estimators`.
		random_state: Value for `model__random_state`.

	Returns:
		The accuracy of the fitted pipeline on the test split.
	"""
	params = {
		'criterion': criterion,
		'max_depth': max_depth,
		'min_samples_leaf': min_samples_leaf,
		'n_estimators': n_estimators,
		'random_state': random_state,
	}

	# Starting the experiment run
	with mlflow.start_run():
		# Work on a clone so the shared `modelPipeline` is never reconfigured
		# or refitted as a side effect of a run.
		# TODO: add cv=StratifiedKFold(n_splits=5) like in grid_search.
		# NOTE(review): accuracy is measured on the test split, so ranking runs
		# by it lets the test data take part in model selection.
		pipeline = clone(modelPipeline)
		pipeline.set_params(**{f'model__{name}': value for name, value in params.items()})
		pipeline.fit(Xtrain, Ytrain)

		# Predicting and scoring on the test split
		predictions = pipeline.predict(Xtest)
		accuracy = eval_metrics(Ytest, predictions)

		print("RandomForest Model (criterion={}, max_depth={}, min_samples_leaf={}, n_estimators={}, random_state={}):".format(criterion, max_depth, min_samples_leaf, n_estimators, random_state))
		print("  Accuracy: {}".format(accuracy))

		# Logging parameters (one batched call), metric and fitted model
		mlflow.log_params(params)
		mlflow.log_metric('accuracy', accuracy)
		mlflow.sklearn.log_model(pipeline, 'model')

		return accuracy

In [16]:
# Take every candidate list, including the seed, from `param_grid` so the
# MLflow runs cover exactly the same combinations as the grid search.
criterion = param_grid['model__criterion']
max_depth = param_grid['model__max_depth']
min_samples_leaf = param_grid['model__min_samples_leaf']
n_estimators = param_grid['model__n_estimators']
random_state = param_grid['model__random_state']

# `product` iterates in the same order as the equivalent nested loops
# (right-most argument varies fastest).
for combination in itertools.product(criterion, max_depth, min_samples_leaf, n_estimators, random_state):
	train_model(*combination)

RandomForest Model (criterion=gini, max_depth=2, min_samples_leaf=2, n_estimators=100, random_state=2020):
  Accuracy: 0.9722463139635733
RandomForest Model (criterion=gini, max_depth=2, min_samples_leaf=2, n_estimators=200, random_state=2020):
  Accuracy: 0.971379011274935
RandomForest Model (criterion=gini, max_depth=2, min_samples_leaf=5, n_estimators=100, random_state=2020):
  Accuracy: 0.9722463139635733
RandomForest Model (criterion=gini, max_depth=2, min_samples_leaf=5, n_estimators=200, random_state=2020):
  Accuracy: 0.9705117085862967
RandomForest Model (criterion=gini, max_depth=2, min_samples_leaf=10, n_estimators=100, random_state=2020):
  Accuracy: 0.9722463139635733
RandomForest Model (criterion=gini, max_depth=2, min_samples_leaf=10, n_estimators=200, random_state=2020):
  Accuracy: 0.9705117085862967
RandomForest Model (criterion=gini, max_depth=3, min_samples_leaf=2, n_estimators=100, random_state=2020):
  Accuracy: 0.9774501300954033
RandomForest Model (criterion=gin

In [17]:
# Best 5 runs
best_runs = mlflow.search_runs(order_by=['metrics.accuracy DESC'], max_results=5)
best_runs

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.accuracy,params.n_estimators,params.max_depth,params.min_samples_leaf,params.criterion,params.random_state,tags.mlflow.user,tags.mlflow.log-model.history,tags.mlflow.source.type,tags.mlflow.source.name,tags.mlflow.runName
0,b7875cc03b124f8395c05b33f5463b8e,1,FINISHED,file:///d:/Code/CU/Data_Sci/Assignment5/mlruns...,2024-02-12 15:57:56.598000+00:00,2024-02-12 15:57:58.977000+00:00,0.999133,200,6,2,gini,2020,bossy,"[{""run_id"": ""b7875cc03b124f8395c05b33f5463b8e""...",LOCAL,C:\Users\bossy\AppData\Roaming\Python\Python31...,invincible-yak-210
1,33fe77e596a74b9781f11430eb054560,1,FINISHED,file:///d:/Code/CU/Data_Sci/Assignment5/mlruns...,2024-02-12 15:55:33.642000+00:00,2024-02-12 15:55:36.116000+00:00,0.999133,200,6,2,gini,2020,bossy,"[{""run_id"": ""33fe77e596a74b9781f11430eb054560""...",LOCAL,C:\Users\bossy\AppData\Roaming\Python\Python31...,welcoming-mare-702
2,d2c5b34d89304633904d7f4999be6bbb,1,FINISHED,file:///d:/Code/CU/Data_Sci/Assignment5/mlruns...,2024-02-12 15:58:06.139000+00:00,2024-02-12 15:58:08.582000+00:00,0.998265,200,6,10,gini,2020,bossy,"[{""run_id"": ""d2c5b34d89304633904d7f4999be6bbb""...",LOCAL,C:\Users\bossy\AppData\Roaming\Python\Python31...,auspicious-hare-799
3,41207372ef884e18b11f92382ba59060,1,FINISHED,file:///d:/Code/CU/Data_Sci/Assignment5/mlruns...,2024-02-12 15:58:01.352000+00:00,2024-02-12 15:58:03.859000+00:00,0.998265,200,6,5,gini,2020,bossy,"[{""run_id"": ""41207372ef884e18b11f92382ba59060""...",LOCAL,C:\Users\bossy\AppData\Roaming\Python\Python31...,nebulous-frog-886
4,4003ec22c1fd42e8a91ac7247ef42320,1,FINISHED,file:///d:/Code/CU/Data_Sci/Assignment5/mlruns...,2024-02-12 15:55:43.152000+00:00,2024-02-12 15:55:45.519000+00:00,0.998265,200,6,10,gini,2020,bossy,"[{""run_id"": ""4003ec22c1fd42e8a91ac7247ef42320""...",LOCAL,C:\Users\bossy\AppData\Roaming\Python\Python31...,unleashed-goat-212


In [18]:
# Reload the pipeline that was logged by the top-ranked run.
run_id = best_runs['run_id'].iloc[0]
model_uri = f'runs:/{run_id}/model'
model = mlflow.sklearn.load_model(model_uri)

print("run id:", run_id)
print("model:", model)

run id: b7875cc03b124f8395c05b33f5463b8e
model: Pipeline(steps=[('preprocess',
                 ColumnTransformer(n_jobs=-1,
                                   transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scale',
                                                                   StandardScaler())]),
                                                  Index(['cap-color-rate'], dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('encode',
                                                                   OneHotEnc

In [19]:
# Predicting the model
Ypred = model.predict(Xtest)

# Calculating the accuracy
print(classification_report(Ytest, Ypred, digits=4))
print(confusion_matrix(Ytest, Ypred))

              precision    recall  f1-score   support

           0     1.0000    0.9986    0.9993       732
           1     0.9976    1.0000    0.9988       421

    accuracy                         0.9991      1153
   macro avg     0.9988    0.9993    0.9991      1153
weighted avg     0.9991    0.9991    0.9991      1153

[[731   1]
 [  0 421]]
