# Binary Prediction of Poisonous Mushrooms - Modeling

[Competition Link](https://www.kaggle.com/competitions/playground-series-s4e8/data)

Goal of the competition is to predict if a mushroom is poisonous or not based on various mushroom parameters.

| Date | User | Change Type | Remarks |  
| ---- | ---- | ----------- | ------- |
| 29/08/2024   | Martin | Create   | Notebook created. Feature engineering and XGBoost | 
| 17/09/2024   | Martin | Update   | Feature engineering exploration | 


# Content

* [Feature Engineering](#feature-engineering)
* [Baseline - XGBoost](#baseline---xgboost)

# Feature Engineering

In [1]:
import os
os.chdir("/tmp/poison_mushrooms")

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import useful_functions as uf

import string

In [28]:
df = pd.read_csv("./data/train.csv")
df_test = pd.read_csv("./data/test.csv")

## General cleaning

In [29]:
df.head()

Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,e,8.8,f,s,u,f,a,c,w,...,,,w,,,f,f,,d,a
1,1,p,4.51,x,h,o,f,a,c,n,...,,y,o,,,t,z,,d,w
2,2,e,6.94,f,s,b,f,x,c,w,...,,s,n,,,f,f,,l,w
3,3,e,3.88,f,y,g,f,s,,g,...,,,w,,,f,f,,d,u
4,4,e,5.85,x,l,w,f,d,,w,...,,,w,,,f,f,,g,a


In [30]:
df_test.head()

Unnamed: 0,id,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,3116945,8.64,x,,n,t,,,w,11.13,...,b,,w,u,w,t,g,,d,a
1,3116946,6.9,o,t,o,f,,c,y,1.27,...,,,n,,,f,f,,d,a
2,3116947,2.0,b,g,n,f,,c,n,6.18,...,,,n,,,f,f,,d,s
3,3116948,3.47,x,t,n,f,s,c,n,4.98,...,,,w,,n,t,z,,d,u
4,3116949,6.17,x,h,y,f,p,,y,6.73,...,,,y,,y,t,,,d,u


In [31]:
# Remove columns with too many Null
columns_to_remove = [
  "id",
  "stem-root",
  "veil-type",
  "veil-color",
  "spore-print-color"
]
df = df.drop(columns_to_remove, axis=1)

df_test_id = df_test['id']
df_test = df_test.drop(columns_to_remove, axis=1)

In [5]:
# Check which columns contain Nan values and how many
df.isna().sum()

class                         0
cap-diameter                  4
cap-shape                    40
cap-surface              671023
cap-color                    12
does-bruise-or-bleed          8
gill-attachment          523936
gill-spacing            1258435
gill-color                   57
stem-height                   0
stem-width                    0
stem-surface            1980861
stem-color                   38
has-ring                     24
ring-type                128880
habitat                      45
season                        0
dtype: int64

In [40]:
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer, SimpleImputer

import keras
from keras import Sequential
from keras.layers import Embedding, Dense, Flatten

In [32]:
le = LabelEncoder()
mapper = {}

# Set invalid categorical values to NA for each column
valid_values = {
  'cap-shape': list(string.ascii_lowercase),
  'cap-surface': list(string.ascii_lowercase), 
  'cap-color': list(string.ascii_lowercase), 
  'does-bruise-or-bleed': ["f", "t"],
  'gill-attachment': list(string.ascii_lowercase),
  'gill-spacing': ["c", "d", "e", "f"],
  'gill-color': list(string.ascii_lowercase),
  'stem-surface': list(string.ascii_lowercase),
  'stem-color': list(string.ascii_lowercase),
  'has-ring': ["f", "t"],
  'ring-type': list(string.ascii_lowercase),
  'habitat': list(string.ascii_lowercase),
  'season': ['a', 'w', 'u', 's']
}

for col, l in valid_values.items():
  # Replace all invalid characters with NA
  df[col] = df[col].apply(lambda x: np.nan if x not in l else x)

  # Add column and entry to mapper, map non-NA values
  col_subset = df.loc[df[col].notna(), col]
  unique_values = col_subset.unique()
  mapper[col] = {unique_values[i]: i for i in range(len(unique_values))}
  col_subset = col_subset.apply(lambda x: mapper[col][x])
  df.loc[df[col].notna(), col] = col_subset

# Convert remaining class into label
df['class'] = le.fit_transform(df['class'])

# Use most-frequent to fill missing values
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
cat_filled = imputer.fit_transform(df[valid_values.keys()])
cat_filled = pd.DataFrame(cat_filled, columns=valid_values.keys())

# Join back to main dataframe
df = pd.concat([df[['class', 'cap-diameter', 'stem-width', 'stem-height']], cat_filled], axis=1)

In [33]:
df1 = df.copy()

Unnamed: 0,class,cap-diameter,stem-width,stem-height,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-surface,stem-color,has-ring,ring-type,habitat,season
0,0,8.80,15.39,4.51,0,0,0,0,0,0,0,1,0,0,0,0,0
1,1,4.51,6.48,4.79,1,1,1,0,0,0,1,0,1,1,1,0,1
2,0,6.94,9.93,6.85,0,0,2,0,1,0,0,1,2,0,0,1,1
3,0,3.88,6.53,4.16,0,2,3,0,2,0,2,1,0,0,0,0,2
4,0,5.85,8.36,3.37,1,3,4,0,3,0,0,1,0,0,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3116940,0,9.29,18.81,12.14,0,4,5,1,0,0,0,1,0,1,5,0,2
3116941,0,10.88,26.97,6.65,6,4,4,1,3,0,6,1,0,0,0,0,2
3116942,1,7.82,11.06,9.51,1,5,6,0,0,0,0,1,3,1,1,0,0
3116943,0,9.45,17.77,9.13,2,8,5,1,4,0,6,0,0,1,3,0,2


In [44]:
input_dim = len(df1['cap-shape'].unique())
embedding_size = min(50, round((input_dim + 1) / 2))

model = Sequential()
model.add(Embedding(input_dim=input_dim, output_dim=embedding_size, input_length=1, name='embedding'))
model.add(Flatten())
model.add(Dense(50, activation='relu'))
model.add(Dense(15, activation='relu'))
model.add(Dense(1))
model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])
model.fit(
  x=df1['cap-shape'].astype(np.float32),
  y=df1['class'],
  epochs=5,
  batch_size=10000
)

Epoch 1/5





[1m286/312[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 887us/step - accuracy: 0.5240 - loss: 0.2786




[1m312/312[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.5256 - loss: 0.2760
Epoch 2/5
[1m312/312[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 945us/step - accuracy: 0.5498 - loss: 0.2408
Epoch 3/5
[1m312/312[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 961us/step - accuracy: 0.5502 - loss: 0.2408
Epoch 4/5
[1m312/312[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5496 - loss: 0.2408  
Epoch 5/5
[1m312/312[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.5495 - loss: 0.2409


<keras.src.callbacks.history.History at 0x7f40926cb6d0>

In [48]:
pd.DataFrame(model.layers[0].get_weights()[0])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,-0.256273,-0.200032,-0.484405,0.253479,0.379109,-0.484333,-0.466125,-0.330997,-0.32455,-0.44919,-0.308003,-0.101675
1,-0.256698,0.0037,-0.504553,0.295949,0.502532,-0.508184,-0.188055,-0.097733,-0.270953,-0.452042,-0.236684,-0.04001
2,-0.058196,-0.04233,0.166906,0.218829,0.223736,0.187058,0.279696,-0.109741,-0.152099,-0.215615,-0.206389,0.303283
3,-0.68056,0.26757,-0.715011,0.483869,0.431844,-0.754758,-0.605892,0.053042,-0.324664,-0.772865,-0.419146,0.089518
4,-0.370258,0.074879,-0.492895,-0.53994,0.357646,-0.516324,-0.397758,0.561153,-0.351302,-0.507786,0.257314,0.220742
5,-0.299434,-0.027957,-0.494453,-0.397255,0.2728,-0.479064,-0.2634,0.129398,-0.277471,-0.424497,0.214545,0.280124
6,-0.277047,-0.018433,-0.053668,0.270567,0.264657,0.265747,0.241233,-0.244508,-0.114206,-0.292297,-0.332875,0.388399
7,-0.147263,-0.286842,0.207211,0.282406,0.310294,-0.304281,0.31277,-0.33271,-0.314463,-0.227769,-0.278999,0.176693
8,-0.11913,-0.082461,-0.041041,0.069651,0.157039,0.144814,0.128465,0.017562,-0.025441,-0.125619,-0.148188,0.136951
9,-0.162016,0.134899,0.236065,0.18,0.115736,0.167139,0.218488,-0.019224,-0.207096,-0.187263,-0.22127,0.15988


In [15]:
# Tested with KNNImputer but too long
# imputer = KNNImputer()
# # perform imputation on categorical variables
# imputer.fit_transform(df1[['cap-shape', 'cap-color', 'cap-surface']])


In [8]:
# cap-diameter, stem-height and stem-width are numerical values

0          f
1          x
2          f
3          f
4          x
          ..
3116940    f
3116941    s
3116942    x
3116943    p
3116944    x
Name: cap-shape, Length: 3116945, dtype: object

* https://towardsdatascience.com/deep-embeddings-for-categorical-variables-cat2vec-b05c8ab63ac0
* https://contrib.scikit-learn.org/category_encoders/catboost.html
* https://xgboost.readthedocs.io/en/stable/get_started.html

# Baseline XGBoost

In [20]:
import xgboost as xgb
from xgboost import XGBClassifier

In [5]:
df.dtypes

class                    object
cap-diameter            float64
cap-shape                object
cap-surface              object
cap-color                object
does-bruise-or-bleed     object
gill-attachment          object
gill-spacing             object
gill-color               object
stem-height             float64
stem-width              float64
stem-surface             object
stem-color               object
has-ring                 object
ring-type                object
habitat                  object
season                   object
dtype: object

In [9]:
# Split variables
y = df['class']
X = df.drop('class', axis=1)

mapper = {
  'e': 0,
  'p': 1
}
y = [mapper[i] for i in y]


In [13]:
# Setting categorical variables
for t, col in zip(X.dtypes, X.columns):
  if t == 'object':
    X[col] = X[col].astype("category")

In [17]:
# Define XGBoost model
clf = XGBClassifier(
  tree_method='hist',
  enable_categorical=True,
  device='cuda'
)
clf.fit(X, y)
clf.save_model("models/baseline_xgb.json")

In [25]:
# Predictions
ids = df_test['id']
df_test = df_test.drop('id', axis=1)

# Setting columns
for t, col in zip(df_test.dtypes, df_test.columns):
  if t == 'object':
    df_test[col] = df_test[col].astype("category")

preds = clf.predict(df_test, device='cuda')

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [32]:
# Creating output
reverse_mapper = {v: k for k, v in mapper.items()}
result = [reverse_mapper[i] for i in preds]

final = pd.DataFrame({
  'id': ids,
  'class': result
})

final.to_csv('results/baseline_xgb.csv', index=False)

Score on Kaggle: 0.17899