In [17]:
import numpy as np
from numpy import array 
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import pydotplus

## FE

In [18]:
data = pd.read_csv('./data/mushrooms.csv')
# replacing the datapoints having '?' as data with numpy "nan"
data["stalk-root"]= data["stalk-root"].replace('?', np.nan)

### Dropping unnecessary features

In [19]:
# finding the number of unique values in each column
for cols in data.columns:
    print(cols, data[cols].nunique())

class 2
cap-shape 6
cap-surface 4
cap-color 10
bruises 2
odor 9
gill-attachment 2
gill-spacing 2
gill-size 2
gill-color 12
stalk-shape 2
stalk-root 4
stalk-surface-above-ring 4
stalk-surface-below-ring 4
stalk-color-above-ring 9
stalk-color-below-ring 9
veil-type 1
veil-color 4
ring-number 3
ring-type 5
spore-print-color 9
population 6
habitat 7


veil-type column has only one unique value, so it is not going to contribute towards classification, drop it.

In [20]:
data= data.drop(['veil-type'], axis=1) # dropping veil-type

In [21]:
data.columns

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-color', 'ring-number', 'ring-type',
       'spore-print-color', 'population', 'habitat'],
      dtype='object')

Since we will be using xgboost, which is a tree based model, we don't need to scale the data.

# train_test_split

In [22]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [36]:
## Independent and dependent features
X = data.drop(['class'], axis=1)  
y = data["class"]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Selecting the best features for our model:
##### using SelectKBest method with score_func as chi2

In [37]:
from sklearn.feature_selection import SelectKBest, chi2

In [40]:
select = SelectKBest(score_func=chi2, k=12)
fs= select.fit_transform(X_train, y_train) # fitting & transorming train data
print("After selecting best 12 features:", fs.shape)

ValueError: could not convert string to float: 'k'

We've selected 12 best features in x data. To identify the selected features we use get_support() function and filter out them from the features name list.  The fs object contains selected x data. 

In [34]:
filter = select.get_support()
filter

AttributeError: 'SelectKBest' object has no attribute 'scores_'

In [None]:
filter = select.get_support()
features = array(X_train.columns)
 
print("All features:")
print(features)
print("                               ") 
print("Selected best 12:")
print(features[filter])

All features:
['cap-shape' 'cap-surface' 'cap-color' 'bruises' 'odor' 'gill-attachment'
 'gill-spacing' 'gill-size' 'gill-color' 'stalk-shape' 'stalk-root'
 'stalk-surface-above-ring' 'stalk-surface-below-ring'
 'stalk-color-above-ring' 'stalk-color-below-ring' 'veil-color'
 'ring-number' 'ring-type' 'spore-print-color' 'population' 'habitat']
                               
Selected best 12:
['cap-surface' 'bruises' 'gill-spacing' 'gill-size' 'gill-color'
 'stalk-root' 'stalk-surface-above-ring' 'stalk-surface-below-ring'
 'ring-type' 'spore-print-color' 'population' 'habitat']


In [None]:
X_train_new= X_train[features[filter]] # creating new X_train with selected features

In [None]:
X_train_new.columns

Index(['cap-surface', 'bruises', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-root', 'stalk-surface-above-ring', 'stalk-surface-below-ring',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

In [None]:
X_test_new= X_test[features[filter]] # creating new X_test with selected features

In [None]:
# Define which columns should be ordinal-encoded and which should be scaled
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns
# select numerical and catagorical datas

In [None]:
from sklearn.impute import SimpleImputer ## HAndling Missing Values
# if no outlyers use mean, if outlyers are there use median or mode
# for catagorical features use most frequent value
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import LabelEncoder # Label Encoding i.e., converting nominal catagorical features to numerical features(Feature Engineering)
# handling missing values-->feature scaling-->ordinal encoding....data should be handled in this order. This is what pipelining is all about
## pipelines: pipeline is combining multiple steps one after the other
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer # to group the pipelines together

In [None]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())

    ]

)

# Categorigal Pipeline
cat_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='categorical')),
    ('get_dummies',LabelEncoder()),
    ('scaler',StandardScaler())
    ]

)
# combine numerical and catagorical pipeline
preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,numerical_cols),
('cat_pipeline',cat_pipeline,categorical_cols)
])


In [None]:
## Train test split

#from sklearn.model_selection import train_test_split

#X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.30,random_state=30)

In [None]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())
# fit_transform training data, only transform for test data & convert to dataframe

In [None]:
filter = select.get_support()
features = array(X_train.columns)
X_train= X_train[features[filter]] # creating new X_train with selected features
X_test= X_test[features[filter]] # creating new X_test with selected features

In [None]:
X_train.head()

Unnamed: 0,num_pipeline__cap-surface,num_pipeline__bruises,num_pipeline__gill-spacing,num_pipeline__gill-size,num_pipeline__gill-color,num_pipeline__stalk-root,num_pipeline__stalk-surface-above-ring,num_pipeline__stalk-surface-below-ring,num_pipeline__ring-type,num_pipeline__spore-print-color,num_pipeline__population,num_pipeline__habitat
0,0.136921,-0.842237,-0.438473,1.49821,-1.354131,1.525962,0.688059,-0.887615,-1.26185,1.419717,0.287342,-0.877177
1,0.136921,-0.842237,-0.438473,1.49821,-1.354131,-0.65317,-0.91873,0.593034,-1.26185,1.419717,0.287342,1.443749
2,0.951054,-0.842237,-0.438473,1.49821,-1.354131,-0.65317,0.688059,0.593034,-1.26185,1.419717,0.287342,0.283286
3,-1.491346,1.187314,-0.438473,-0.667463,1.177739,-0.65317,0.688059,0.593034,0.954123,-0.255816,0.287342,-0.877177
4,0.951054,1.187314,-0.438473,-0.667463,-0.228856,0.436396,0.688059,0.593034,0.954123,-0.255816,-1.30129,0.863517


In [None]:
## Model Training
from sklearn.svm import SVC        # models
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


from sklearn.metrics import r2_score, classification_report, confusion_matrix, accuracy_score     # parameters

In [None]:
XGBoost=XGBClassifier()       # create object of the model
XGBoost.fit(X_train,y_train)     # apply object to train data

In [None]:
from sklearn import metrics


In [None]:
import numpy as np
def evaluate_model(true, predicted):
    report= classification_report(true, predicted)
    cm = confusion_matrix(true, predicted)
    accuracy = metrics.accuracy_score(true, predicted)
    return report, cm, accuracy

In [None]:
## Train multiple models

models={
    'XGBoost':XGBClassifier(),          # keys for different models: values for models; so this is key-value pair
    'Random_forest':RandomForestClassifier(),
    'SVC':SVC()
}
trained_model_list=[]
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    #mae, rmse, r2_square=evaluate_model(y_test,y_pred)
    accuracy, report, cm= model_report(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    
    print("accuracy",accuracy*100)
    print("report", report)
    print("confusion matrix", cm)

    r2_list.append(accuracy)
    
    print('='*35)
    print('\n')


NameError: name 'X_train' is not defined

In [None]:
model_list

['XGBoost', 'Random_forest', 'SVC']