# Anyone craving wild mushrooms?

## Importing Libraries and Data

In [1]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import matthews_corrcoef, accuracy_score, precision_score, recall_score


import torch
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [2]:
train_df = pd.read_csv('/kaggle/input/playground-series-s4e8/train.csv')
test_df = pd.read_csv('/kaggle/input/playground-series-s4e8/test.csv')

## Exploration

In [3]:
train_df.head()

Unnamed: 0,id,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,0,e,8.8,f,s,u,f,a,c,w,...,,,w,,,f,f,,d,a
1,1,p,4.51,x,h,o,f,a,c,n,...,,y,o,,,t,z,,d,w
2,2,e,6.94,f,s,b,f,x,c,w,...,,s,n,,,f,f,,l,w
3,3,e,3.88,f,y,g,f,s,,g,...,,,w,,,f,f,,d,u
4,4,e,5.85,x,l,w,f,d,,w,...,,,w,,,f,f,,g,a


In [4]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3116945 entries, 0 to 3116944
Data columns (total 22 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   class                 object 
 2   cap-diameter          float64
 3   cap-shape             object 
 4   cap-surface           object 
 5   cap-color             object 
 6   does-bruise-or-bleed  object 
 7   gill-attachment       object 
 8   gill-spacing          object 
 9   gill-color            object 
 10  stem-height           float64
 11  stem-width            float64
 12  stem-root             object 
 13  stem-surface          object 
 14  stem-color            object 
 15  veil-type             object 
 16  veil-color            object 
 17  has-ring              object 
 18  ring-type             object 
 19  spore-print-color     object 
 20  habitat               object 
 21  season                object 
dtypes: float64(3), int64(1), object(18)
memory

In [5]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2077964 entries, 0 to 2077963
Data columns (total 21 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   cap-diameter          float64
 2   cap-shape             object 
 3   cap-surface           object 
 4   cap-color             object 
 5   does-bruise-or-bleed  object 
 6   gill-attachment       object 
 7   gill-spacing          object 
 8   gill-color            object 
 9   stem-height           float64
 10  stem-width            float64
 11  stem-root             object 
 12  stem-surface          object 
 13  stem-color            object 
 14  veil-type             object 
 15  veil-color            object 
 16  has-ring              object 
 17  ring-type             object 
 18  spore-print-color     object 
 19  habitat               object 
 20  season                object 
dtypes: float64(3), int64(1), object(17)
memory usage: 332.9+ MB


In [6]:
train_df.isna().sum()

id                            0
class                         0
cap-diameter                  4
cap-shape                    40
cap-surface              671023
cap-color                    12
does-bruise-or-bleed          8
gill-attachment          523936
gill-spacing            1258435
gill-color                   57
stem-height                   0
stem-width                    0
stem-root               2757023
stem-surface            1980861
stem-color                   38
veil-type               2957493
veil-color              2740947
has-ring                     24
ring-type                128880
spore-print-color       2849682
habitat                      45
season                        0
dtype: int64

In [7]:
train_df.nunique()

id                      3116945
class                         2
cap-diameter               3913
cap-shape                    74
cap-surface                  83
cap-color                    78
does-bruise-or-bleed         26
gill-attachment              78
gill-spacing                 48
gill-color                   63
stem-height                2749
stem-width                 5836
stem-root                    38
stem-surface                 60
stem-color                   59
veil-type                    22
veil-color                   24
has-ring                     23
ring-type                    40
spore-print-color            32
habitat                      52
season                        4
dtype: int64

In [8]:
test_df.nunique()

id                      2077964
cap-diameter               3745
cap-shape                    62
cap-surface                  59
cap-color                    57
does-bruise-or-bleed         22
gill-attachment              66
gill-spacing                 35
gill-color                   56
stem-height                2664
stem-width                 5610
stem-root                    31
stem-surface                 54
stem-color                   55
veil-type                    15
veil-color                   23
has-ring                     23
ring-type                    36
spore-print-color            33
habitat                      39
season                        4
dtype: int64

In [9]:
results = []

for i , column in enumerate(train_df.select_dtypes(include=['object', 'category']).columns):
    unique_vals = train_df[column].unique()
    results.append({'Index': i, 'Column': column, 'Unique Values': unique_vals})

unique_values_df = pd.DataFrame(results)
unique_values_df.drop(columns=['Index'], inplace=True)
unique_values_df

Unnamed: 0,Column,Unique Values
0,class,"[e, p]"
1,cap-shape,"[f, x, p, b, o, c, s, d, e, n, nan, w, k, l, 1..."
2,cap-surface,"[s, h, y, l, t, e, g, nan, d, i, w, k, 15.94, ..."
3,cap-color,"[u, o, b, g, w, n, e, y, r, p, k, l, i, h, d, ..."
4,does-bruise-or-bleed,"[f, t, d, has-ring, w, o, b, x, p, nan, g, y, ..."
5,gill-attachment,"[a, x, s, d, e, nan, f, p, l, m, b, 32.54, n, ..."
6,gill-spacing,"[c, nan, d, f, x, b, a, 3.61, 2.69, k, 4.8, e,..."
7,gill-color,"[w, n, g, k, y, f, p, o, b, u, e, r, d, t, 3.4..."
8,stem-root,"[nan, b, c, r, s, f, 5.59, 2.77, 20.01, y, o, ..."
9,stem-surface,"[nan, y, s, t, g, h, k, i, f, l, d, x, 12.04, ..."


## Preprocessing

In [10]:
all_data = pd.concat([test_df, train_df.drop('class', axis=1)], axis=0)
categoricals = all_data.select_dtypes(include=['object', 'category']).copy()
alphabet_values = [chr(i) for i in range(97, 123)]
for col in categoricals.columns:
    categoricals[col] = categoricals[col].apply(lambda x: x if x in alphabet_values else 'UD')

ohe = OneHotEncoder()
ohe.fit(categoricals)
del all_data, categoricals

I fitted the OHE on the combined dataset to have a standard output when encoding either set.

In [11]:
#preparing categorical data:
def categorical_data(df):
    categoricals = df.select_dtypes(include=['object', 'category']).copy()
    
    for col in categoricals.columns:
        categoricals[col] = categoricals[col].apply(lambda x: x if x in alphabet_values else 'UD')

    fa = ohe.transform(categoricals).toarray()
    fl = ohe.get_feature_names_out()
    categoricals = pd.DataFrame(fa, columns=fl)
    categoricals = categoricals.astype('int8')
    
    return categoricals

In [12]:
#Preparing numerical data:
def Numerical_data(df):
    numericals = df.select_dtypes(include=['float64'])

    for i in numericals.columns:
        mean = numericals[i].mean()
        numericals[i] = numericals[i].fillna(mean)


    scaler = StandardScaler()
    numericals = pd.DataFrame(scaler.fit_transform(numericals),columns=numericals.columns)
    
    return numericals

In [13]:
def preprocessing(df):
    numericals = Numerical_data(df)
    categoricals = categorical_data(df)
    
    X = pd.concat([numericals, categoricals], axis=1)
    
    return X

In [14]:
Y = train_df['class'].replace({'e':0,'p':1})
X = preprocessing(train_df.drop('class', axis=1))

  Y = train_df['class'].replace({'e':0,'p':1})


## Building and training the model

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [16]:
class Data(Dataset):
    
    # Constructor
    def __init__(self, X, Y):
        self.x = torch.tensor(X.values, dtype=torch.float32).to(device)
        self.y = torch.tensor(Y.values, dtype=torch.float32).to(device)
            
    # Getter
    def __getitem__(self, index):    
        return self.x[index], self.y[index]
    
    # Get Length
    def __len__(self):
        return self.x.size(0)


In [17]:
class Net(nn.Module):
    def __init__(self, Layers):
        super(Net, self).__init__()
        self.hidden = nn.ModuleList()
        self.dropout = nn.Dropout(0)  

        for input_size, output_size in zip(Layers, Layers[1:]):
            linear = nn.Linear(input_size, output_size)
            torch.nn.init.kaiming_uniform_(linear.weight, nonlinearity='relu')
            self.hidden.append(linear)
            
        
    # Prediction
    def forward(self, x):
        L = len(self.hidden)
        for (l, linear_transform) in zip(range(L), self.hidden):
            if l < L - 1:
                x = F.relu(linear_transform(x))
                x = self.dropout(x)  
            else:
                x = torch.sigmoid(linear_transform(x))
        return x

In [18]:
def train(model, criterion, train_loader, optimizer, epochs):
    loss_accuracy = []
    mcc_scores = []
    accuracy_scores = []
    precision_scores = []
    recall_scores = []
    
    model.to(device)  # Move model to GPU
    
    for epoch in range(epochs):
        all_preds = []
        all_labels = []
        
        for i, (x, y) in enumerate(train_loader):
            x, y = x.to(device), y.to(device)  # Move data to GPU
            optimizer.zero_grad()
            z = model(x)
            y = y.view(-1, 1)
            loss = criterion(z, y)
            loss.backward()
            optimizer.step()
            loss_accuracy.append(loss.item())
            
            preds = (z > 0.5).float()
            all_preds.extend(preds.cpu().detach().numpy())
            all_labels.extend(y.cpu().detach().numpy())
        
        mcc = matthews_corrcoef(all_labels, all_preds)
        accuracy = accuracy_score(all_labels, all_preds)
        precision = precision_score(all_labels, all_preds)
        recall = recall_score(all_labels, all_preds)
        
        mcc_scores.append(mcc)
        accuracy_scores.append(accuracy)
        precision_scores.append(precision)
        recall_scores.append(recall)

        
        print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item()}, MCC: {mcc}, Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}')
    
    return loss_accuracy, mcc_scores, accuracy_scores, precision_scores, recall_scores

In [19]:
Data_= Data(X,Y)
train_loader = DataLoader(dataset=Data_, batch_size=1200, shuffle=True)
layers = [384, 64, 32, 16, 4, 2, 1]
criterion = nn.BCELoss()
model = Net(layers)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
loss_accuracy, mcc_scores, accuracy_scores, precision_scores, recall_scores= train(model, criterion, train_loader, optimizer, epochs=20)

Epoch 1/20, Loss: 0.03712790086865425, MCC: 0.9417432100003382, Accuracy: 0.9709693305464164, Precision: 0.9598571884996763, Recall: 0.9882719321494832
Epoch 2/20, Loss: 0.03931239992380142, MCC: 0.9822506877360307, Accuracy: 0.9912016413507457, Precision: 0.9928387397685793, Recall: 0.9910677637334672
Epoch 3/20, Loss: 0.019682707265019417, MCC: 0.9829426832243414, Accuracy: 0.9915446053748141, Precision: 0.9931582209142149, Recall: 0.9913756101222239
Epoch 4/20, Loss: 0.04947298392653465, MCC: 0.9831833288970065, Accuracy: 0.9916639530052663, Precision: 0.9932489387035589, Recall: 0.9915034396703171
Epoch 5/20, Loss: 0.037872813642024994, MCC: 0.9834312659423671, Accuracy: 0.991786829732318, Precision: 0.9933646148821538, Recall: 0.991612505248048
Epoch 6/20, Loss: 0.05867915228009224, MCC: 0.9835561951973703, Accuracy: 0.9918487493362892, Precision: 0.9934215898030236, Recall: 0.9916687971591349
Epoch 7/20, Loss: 0.02335531823337078, MCC: 0.9837161678020411, Accuracy: 0.991927993596

## Predictions and Submission

In [20]:
#preprocessing
X_test = preprocessing(test_df)
X_test = torch.tensor(X_test.values,  dtype=torch.float32).to(device)

In [21]:
#prediction
Y_test = model.forward(X_test)
Y_test = (Y_test>0.5).cpu().detach().numpy().astype(int)

In [22]:
#formatting submission
Submission = pd.concat([test_df['id'], pd.DataFrame(Y_test, columns=['class']).replace({0 : 'e', 1:'p'})], axis = 1)
Submission

Unnamed: 0,id,class
0,3116945,e
1,3116946,p
2,3116947,p
3,3116948,p
4,3116949,e
...,...,...
2077959,5194904,p
2077960,5194905,p
2077961,5194906,p
2077962,5194907,e


In [24]:
Submission.to_csv('Submission.csv', index=False)