In [2]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [3]:
df = pd.read_csv("mushrooms.csv")

In [4]:
df.head()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8124 entries, 0 to 8123
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   class                     8124 non-null   object
 1   cap-shape                 8124 non-null   object
 2   cap-surface               8124 non-null   object
 3   cap-color                 8124 non-null   object
 4   bruises                   8124 non-null   object
 5   odor                      8124 non-null   object
 6   gill-attachment           8124 non-null   object
 7   gill-spacing              8124 non-null   object
 8   gill-size                 8124 non-null   object
 9   gill-color                8124 non-null   object
 10  stalk-shape               8124 non-null   object
 11  stalk-root                8124 non-null   object
 12  stalk-surface-above-ring  8124 non-null   object
 13  stalk-surface-below-ring  8124 non-null   object
 14  stalk-color-above-ring  

In [6]:
df.replace({

'class': {'e': 'edible', 'p': 'poisonous'},
'cap-shape': {'x':'convex', 'b':'bell', 's':'sunken', 'f' :'flat','k':'knobbed', 'c':'conical'},
'cap-surface': {'s':'smooth', 'y':'scaly', 'f':'fibrous', 'g':'grooves'},
'cap-color': {'n':'brown', 'y':'yellow', 'w':'white', 'g':'green', 'e':'red', 'p':'pink', 'b':'buff', 'u':'purple', 'c':'cinnamon', 'r':'green'},
'bruises': {'t':'bruises', 'f':'noBruises'},
'odor': {'p':'pungent', 'a':'almond', 'l':'anise', 'n':'none', 'f':'foul', 'c':'creosote', 'y':'fishy', 's':'spicy', 'm':'musty'},
'gill-attachment': {'f':'free', 'a':'attached'},
'gill-spacing': {'c':'close', 'w':'crowded'},
'gill-size': {'n':'narrow', 'b':'broad'},
'gill-color': {'k':'black', 'n':'brown', 'g':'gray', 'p':'pink', 'w':'white', 'h':'chocolate', 'u':'purple', 'e':'red', 'b':'buff', 'r':'green', 'y':'yellow', 'o':'orange'},
'stalk-shape': {'e':'enlarging', 't':'tapering'},
'stalk-root': {'e':'equal', 'c':'club', 'b':'bulbous', 'r':'rooted', '?':'missing'},
'stalk-surface-above-ring': {'s':'smooth', 'f':'fibrous', 'k':'silky', 'y':'scaly'},
'stalk-surface-below-ring': {'s':'smooth', 'f':'fibrous', 'k':'silky', 'y':'scaly'},
'stalk-color-above-ring': {'w':'white', 'g':'gray', 'p' :'pink','n' :'brown','b':'buff', 'e':'red', 'o':'orange', 'c':'cinnamon', 'y':'yellow'},
'stalk-color-below-ring': {'w':'white', 'g':'gray', 'p' :'pink','n' :'brown','b':'buff', 'e':'red', 'o':'orange', 'c':'cinnamon', 'y':'yellow'},
'veil-type': {'p':'partial'},
'veil-color':{'w':'white','n' :'brown', 'o':'orange','y':'yellow'},
'ring-number': {'o':'one','t':'two', 'n':'none'},
'ring-type': {'p':'pendant', 'e':'evanescent', 'l' :'large','f':'flaring', 'n':'none'},
'spore-print-color': {'k':'black','n':'brown', 'u':'purple', 'h':'chocolate', 'w':'white','r':'green', 'o':'orange', 'y':'yellow', 'b':'buff'},
'population': {'s':'scattered', 'n':'numerous', 'a':'abundant', 'v':'several', 'y':'solitary' ,'c':'clustered'},
'habitat': {'u':'urban', 'g':'grasses','m':'meadows', 'd':'woods', 'p':'paths', 'w':'waste', 'l':'leaves'}
    }, inplace=True)


In [9]:
# droping columns veil-type as it has only one value
df.drop('veil-type', axis=1, inplace=True)

KeyError: "['veil-type'] not found in axis"

In [10]:
# Splitting features and target variable
X = df.drop(columns=['class'])
y = df['class']

In [11]:
# selecting features
features = X.select_dtypes(include=[object]).columns

features= list(features.difference(['class']))

print('\n','Features','\n',features,'\n')


 Features 
 ['bruises', 'cap-color', 'cap-shape', 'cap-surface', 'gill-attachment', 'gill-color', 'gill-size', 'gill-spacing', 'habitat', 'odor', 'population', 'ring-number', 'ring-type', 'spore-print-color', 'stalk-color-above-ring', 'stalk-color-below-ring', 'stalk-root', 'stalk-shape', 'stalk-surface-above-ring', 'stalk-surface-below-ring', 'veil-color'] 



In [12]:
# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
#exporting features into meta data
def summarize(df,features):
  results=[]

  for column in df[features]:
      # Get the unique members of the column
      members = df[column].unique().tolist()
      # Append the column name and its unique members to the results list
      results.append([column, members])

  return pd.DataFrame(results, columns=['Column Name', 'Members'])

# Create a DataFrame from the results list
summarize(X_train,features)

Unnamed: 0,Column Name,Members
0,bruises,"[noBruises, bruises]"
1,cap-color,"[red, brown, yellow, green, white, buff, pink,..."
2,cap-shape,"[knobbed, convex, flat, bell, sunken, conical]"
3,cap-surface,"[smooth, scaly, fibrous, grooves]"
4,gill-attachment,"[free, attached]"
5,gill-color,"[buff, purple, black, white, red, pink, brown,..."
6,gill-size,"[narrow, broad]"
7,gill-spacing,"[close, crowded]"
8,habitat,"[woods, paths, leaves, meadows, waste, urban, ..."
9,odor,"[spicy, foul, fishy, none, anise, pungent, alm..."


In [14]:
summarize(df,features).to_dict()

{'Column Name': {0: 'bruises',
  1: 'cap-color',
  2: 'cap-shape',
  3: 'cap-surface',
  4: 'gill-attachment',
  5: 'gill-color',
  6: 'gill-size',
  7: 'gill-spacing',
  8: 'habitat',
  9: 'odor',
  10: 'population',
  11: 'ring-number',
  12: 'ring-type',
  13: 'spore-print-color',
  14: 'stalk-color-above-ring',
  15: 'stalk-color-below-ring',
  16: 'stalk-root',
  17: 'stalk-shape',
  18: 'stalk-surface-above-ring',
  19: 'stalk-surface-below-ring',
  20: 'veil-color'},
 'Members': {0: ['bruises', 'noBruises'],
  1: ['brown',
   'yellow',
   'white',
   'green',
   'red',
   'pink',
   'buff',
   'purple',
   'cinnamon'],
  2: ['convex', 'bell', 'sunken', 'flat', 'knobbed', 'conical'],
  3: ['smooth', 'scaly', 'fibrous', 'grooves'],
  4: ['free', 'attached'],
  5: ['black',
   'brown',
   'gray',
   'pink',
   'white',
   'chocolate',
   'purple',
   'red',
   'buff',
   'green',
   'yellow',
   'orange'],
  6: ['narrow', 'broad'],
  7: ['close', 'crowded'],
  8: ['urban', 'grasses

In [15]:
# EXPORTING FOR DE

my_feature_dict = {'FEATURES' : summarize(df,features).to_dict()}

my_feature_dict

{'FEATURES': {'Column Name': {0: 'bruises',
   1: 'cap-color',
   2: 'cap-shape',
   3: 'cap-surface',
   4: 'gill-attachment',
   5: 'gill-color',
   6: 'gill-size',
   7: 'gill-spacing',
   8: 'habitat',
   9: 'odor',
   10: 'population',
   11: 'ring-number',
   12: 'ring-type',
   13: 'spore-print-color',
   14: 'stalk-color-above-ring',
   15: 'stalk-color-below-ring',
   16: 'stalk-root',
   17: 'stalk-shape',
   18: 'stalk-surface-above-ring',
   19: 'stalk-surface-below-ring',
   20: 'veil-color'},
  'Members': {0: ['bruises', 'noBruises'],
   1: ['brown',
    'yellow',
    'white',
    'green',
    'red',
    'pink',
    'buff',
    'purple',
    'cinnamon'],
   2: ['convex', 'bell', 'sunken', 'flat', 'knobbed', 'conical'],
   3: ['smooth', 'scaly', 'fibrous', 'grooves'],
   4: ['free', 'attached'],
   5: ['black',
    'brown',
    'gray',
    'pink',
    'white',
    'chocolate',
    'purple',
    'red',
    'buff',
    'green',
    'yellow',
    'orange'],
   6: ['narrow', '

In [16]:
# save dictionary to person_data.pkl file
with open('feature_dict.pkl', 'wb') as fp:
    pickle.dump(my_feature_dict, fp)
    print('dictionary saved successfully to file')

dictionary saved successfully to file


In [32]:
#creating pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

features = X_train.columns.tolist()

# Use OneHotEncoder directly
preprocessor_stage_1 = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore',sparse_output=False, drop='first'), features)
    ]
)

# Final pipeline
preprocessor_stack = Pipeline(steps=[
    ('preprocessor_stage_1', preprocessor_stage_1)
])

In [33]:
preprocessor_stack

0,1,2
,steps,"[('preprocessor_stage_1', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [34]:
#fitting the pipeline
preprocessor_stack.fit(X_train)

0,1,2
,steps,"[('preprocessor_stage_1', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [35]:
# preprocessor_stack.fit(X_train)

# Transform data
X_encoded = preprocessor_stack.transform(X_train)

# Get feature names
encoder = preprocessor_stack.named_steps['preprocessor_stage_1']
feature_names = encoder.get_feature_names_out()

# Convert to DataFrame
import pandas as pd
df_encoded = pd.DataFrame(X_encoded, columns=feature_names)

print(df_encoded.head())

   cat__cap-shape_conical  cat__cap-shape_convex  cat__cap-shape_flat  \
0                     0.0                    0.0                  0.0   
1                     0.0                    1.0                  0.0   
2                     0.0                    0.0                  1.0   
3                     0.0                    0.0                  1.0   
4                     0.0                    0.0                  0.0   

   cat__cap-shape_knobbed  cat__cap-shape_sunken  cat__cap-surface_grooves  \
0                     1.0                    0.0                       0.0   
1                     0.0                    0.0                       0.0   
2                     0.0                    0.0                       0.0   
3                     0.0                    0.0                       0.0   
4                     0.0                    0.0                       0.0   

   cat__cap-surface_scaly  cat__cap-surface_smooth  cat__cap-color_buff  \
0                

In [36]:
y_train

7873    poisonous
6515    poisonous
6141    poisonous
2764       edible
438        edible
          ...    
5226    poisonous
5390       edible
860        edible
7603    poisonous
7270       edible
Name: class, Length: 6499, dtype: object

In [37]:
from sklearn.ensemble import RandomForestClassifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_stack),
    ('classifier', RandomForestClassifier(
    max_depth=5,
    min_samples_split=10,
    min_samples_leaf=5,
    n_estimators=100,
    max_features='sqrt',
    random_state=42
))
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,steps,"[('preprocessor_stage_1', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,5
,min_samples_split,10
,min_samples_leaf,5
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [38]:
X_train.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-color,ring-number,ring-type,spore-print-color,population,habitat
7873,knobbed,smooth,red,noBruises,spicy,free,close,narrow,buff,tapering,...,smooth,silky,pink,white,white,one,evanescent,white,several,woods
6515,convex,smooth,brown,noBruises,foul,free,close,narrow,buff,tapering,...,silky,smooth,white,white,white,one,evanescent,white,several,paths
6141,flat,scaly,red,noBruises,fishy,free,close,narrow,buff,tapering,...,smooth,smooth,pink,white,white,one,evanescent,white,several,leaves
2764,flat,fibrous,brown,bruises,none,free,close,broad,purple,tapering,...,smooth,smooth,gray,pink,white,one,pendant,brown,several,woods
438,bell,scaly,yellow,bruises,anise,free,close,broad,black,enlarging,...,smooth,smooth,white,white,white,one,pendant,brown,numerous,meadows


In [39]:
# Checking Training Accuracy
y_train_pred = pipeline.predict(X_train)

# Evaluate the model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

print("Accuracy:", accuracy_score(y_train,y_train_pred))
print("\nClassification Report:\n", classification_report(y_train,y_train_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_train,y_train_pred))

Accuracy: 0.9899984613017387

Classification Report:
               precision    recall  f1-score   support

      edible       0.98      1.00      0.99      3365
   poisonous       1.00      0.98      0.99      3134

    accuracy                           0.99      6499
   macro avg       0.99      0.99      0.99      6499
weighted avg       0.99      0.99      0.99      6499


Confusion Matrix:
 [[3365    0]
 [  65 3069]]


In [45]:
import dill

# save trained pipeline file

with open('pipeline1.pkl', 'wb') as file:
    dill.dump(pipeline, file)

print('pipeline saved successfully to file')

pipeline saved successfully to file


In [46]:
#Load the saved pipeline from the file

with open('pipeline1.pkl', 'rb') as file:
    loaded_pipeline = dill.load(file)

print('pipeline loaded successfully to file')

pipeline loaded successfully to file


In [47]:
loaded_pipeline.__getstate__()

{'steps': [('preprocessor',
   Pipeline(steps=[('preprocessor_stage_1',
                    ColumnTransformer(transformers=[('cat',
                                                     OneHotEncoder(drop='first',
                                                                   handle_unknown='ignore',
                                                                   sparse_output=False),
                                                     ['cap-shape', 'cap-surface',
                                                      'cap-color', 'bruises',
                                                      'odor', 'gill-attachment',
                                                      'gill-spacing', 'gill-size',
                                                      'gill-color', 'stalk-shape',
                                                      'stalk-root',
                                                      'stalk-surface-above-ring',
                                                 