In [1]:
import numpy as np
import pandas as pd

from TPNN_2023.lab2.Perceptron import *
from TPNN_2023.lab2.activation_functions import *
from TPNN_2023.lab2.tools import default_init, test_init, Adam, Default
from TPNN_2023.lab2.metrics import *
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

import plotly.express as px
import plotly.graph_objects as go

### Reusable helper functions

In [2]:
# Column names of the dataset, in file order: the "edibility" label first,
# followed by the 22 attribute columns.
columns = ["edibility", "cap-shape", "cap-surface", "cap-color", "bruises", "odor",
           "gill-attachment", "gill-spacing", "gill-size", "gill-color",
           "stalk-shape", "stalk-root", "stalk-surface-above-ring",
           "stalk-surface-below-ring", "stalk-color-above-ring",
           "stalk-color-below-ring", "veil-type", "veil-color", "ring-number",
           "ring-type", "spore-print-color", "population", "habitat"]


def from_file(file_name, sep: str) -> pd.DataFrame:
    """Read a delimited text file into a DataFrame labelled with ``columns``.

    Every non-blank line is split on ``sep`` and becomes one row of string
    values. Blank lines are skipped.

    Args:
        file_name: Path of the text file to read.
        sep: Field separator used inside each line.

    Returns:
        A DataFrame with one row per non-blank line, a 0-based integer index
        and the module-level ``columns`` as column labels.

    Raises:
        AssertionError: If a line does not contain exactly ``len(columns)``
            fields. The exception type matches the original ``assert``, but it
            is raised explicitly so the check also runs under ``python -O``.
    """
    expected_fields = len(columns)
    rows = []

    # The context manager closes the file even when a malformed line aborts parsing.
    with open(file_name, 'r') as dataset_file:
        for line_number, line in enumerate(dataset_file, start=1):
            record = line.replace("\n", "")
            if not record.strip():
                # Tolerate empty lines, e.g. extra newlines at the end of the file.
                continue

            fields = record.split(sep)
            if len(fields) != expected_fields:
                raise AssertionError(
                    f"{file_name}, line {line_number}: expected {expected_fields} "
                    f"fields, got {len(fields)}"
                )
            rows.append(fields)

    return pd.DataFrame(data=rows, index=np.arange(len(rows)), columns=columns)

def encode_df(df, dst_columns):
    """One-hot encode a DataFrame, split into input and target parts.

    Args:
        df: DataFrame whose columns are all encoded.
        dst_columns: Names of the target columns; every other column of
            ``df`` is treated as an input attribute.

    Returns:
        A tuple ``(inputs, targets)`` of DataFrames holding the dense one-hot
        encodings of the input attributes and of ``dst_columns``.
    """
    one_hot = OneHotEncoder(handle_unknown='ignore')

    # setdiff1d returns the remaining column names sorted and de-duplicated,
    # so the encoded inputs follow that order rather than the order in df.
    feature_names = np.setdiff1d(np.array(df.columns), dst_columns)

    # The same encoder object is re-fitted for the targets; after this call it
    # holds the target categories only.
    features_dense = one_hot.fit_transform(df[feature_names]).toarray()
    targets_dense = one_hot.fit_transform(df[dst_columns]).toarray()

    return pd.DataFrame(features_dense), pd.DataFrame(targets_dense)

### plot functions: ###
def plot_loss(loss_list):
    """Show the recorded losses as a line chart, one point per epoch.

    Args:
        loss_list: Sequence of loss values; the i-th entry is plotted at
            epoch ``i + 1``.
    """
    epochs = np.arange(1, len(loss_list) + 1)
    losses = np.array(loss_list)

    # Exactly one epoch number per loss value.
    assert len(epochs) == len(losses)

    figure = px.line(x=epochs, y=losses, labels={'x':'epoch', 'y':'loss'})
    figure.show()


### Read dataset

In [3]:
# Guard against accidental edits of the column list: from_file expects 23 fields per row.
assert len(columns) == 23
# Path is relative to the notebook's working directory.
dataset_file_name = "../datasets/mushroom_dataset.txt"
df = from_file(dataset_file_name, ',')
# Display the loaded frame as the cell output.
df

Unnamed: 0,edibility,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [4]:
# Sanity check of the loaded data: (rows, columns).
df.shape

(8124, 23)

### Encode dataset

In [5]:
# One-hot encode the data: 'edibility' is the target, all other columns are inputs.
input_encoded_df, output_encoded_df = encode_df(df, ['edibility'])
# Display the encoded input features.
input_encoded_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,107,108,109,110,111,112,113,114,115,116
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
8120,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
8121,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
8122,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [6]:
# Display the one-hot encoded 'edibility' target.
output_encoded_df

Unnamed: 0,0,1
0,0.0,1.0
1,1.0,0.0
2,1.0,0.0
3,0.0,1.0
4,1.0,0.0
...,...,...
8119,1.0,0.0
8120,1.0,0.0
8121,1.0,0.0
8122,0.0,1.0


In [7]:
# Both frames must have the same number of rows; the column counts are the
# network's input and output widths.
print(input_encoded_df.shape)
print(output_encoded_df.shape)

(8124, 117)
(8124, 2)


### Build net

In [8]:
# Assemble the network: a 120-neuron sigmoid layer, a 2-neuron sigmoid layer
# and a 2-way softmax (one output per one-hot target column).
# NOTE(review): if Dense applies its activation before Softmax, the softmax
# inputs are confined to (0, 1), which limits how confident the output can
# become and keeps the log loss away from zero -- confirm whether the second
# Dense layer should be linear.
net = Net()
net.add_layer(Dense(120, sigmoid))
net.add_layer(Dense(2, sigmoid))
net.add_layer(Softmax(2))
# The input width equals the number of one-hot encoded feature columns.
net.init_net(init_strategy=default_init, input_shape=(input_encoded_df.shape[1], ))
# Print a short summary of the layer configuration.
net.print_net_config(level=ConfigLevel.LOW)

layers count=3
layers:
[0]
  Dense layer:
    neuron count=120
[1]
  Dense layer:
    neuron count=2
[2]
  Softmax layer:
  dimension: 2


### Train net

In [9]:
# Prepare data: convert the encoded frames to NumPy arrays and hold out 30%
# of the rows for testing.

input_data = input_encoded_df.to_numpy()
output_data = output_encoded_df.to_numpy()

# A fixed random_state makes the shuffled split identical on every
# Restart & Run All, so results built on it are comparable between runs.
train_in_data, test_in_data, train_out_data, test_out_data = train_test_split(
    input_data, output_data, test_size=0.3, shuffle=True, random_state=42)

print("Train data shapes:")
print(train_in_data.shape)
print(train_out_data.shape)

print("Test data shapes:")
print(test_in_data.shape)
print(test_out_data.shape)

Train data shapes:
(5686, 117)
(5686, 2)
Test data shapes:
(2438, 117)
(2438, 2)


In [10]:
# Train on the training split only, with the Adam optimizer and log loss,
# for 10 epochs at learning rate 0.01.
net.train(input_train_data=train_in_data, target_train_data=train_out_data, optimizer=Adam(), learning_rate=0.01, epoch_count=10, loss=log_loss)

loss=0.5656013682786174
loss=0.5726892827643814
loss=0.5726892826152121


KeyboardInterrupt: 

In [None]:
# Plot the loss values stored on the network object.
plot_loss(net.loss_list)