## Classifying From Tabular Data

In [2]:
# To classify tabular data we use fastai.tabular
from fastai.tabular.all import * 
import matplotlib.pyplot as plt
import numpy
import random

## Reading Data

In [3]:
# Path to the data set.
# The test data has already been split off into a separate file, so this CSV is
# expected to hold only the training and validation data.
from pathlib import Path
data_path = Path('./data/mecs/MECS_2-Phase-Steels.csv')

In [4]:
# Read the semicolon-separated CSV file containing the training and validation data
import pandas as pd
dataframe = pd.read_csv(data_path, sep=';')

# Show the first rows as a quick sanity check of the parsed columns
dataframe.head()

Unnamed: 0,class,equiv. diameter,major axis length,minor axis length,perimeter,equiv. radius,max feret diameter,min feret diameter,mean feret diameter,convex perimeter,...,std. relativ area,std. convex area/filled area,std. axial ratio,std. aspect ratio,std. roundness,std. circularity,std. sphericity,std. convex per./filled per.,std. form factor,std. convexity
0,category_3,15.225748,11.47827,6.516211,101.586092,7.612874,28.476331,14.601479,21.538905,69.846721,...,0.00263,0.360265,0.247471,0.172865,0.167522,0.150141,0.991127,0.257917,0.254766,0.725854
1,category_3,1.151113,0.974243,0.385487,5.609659,0.575557,2.580962,0.888856,1.734909,5.555157,...,0.193671,0.34532,0.108488,0.097143,0.10452,0.10756,0.357665,0.126119,0.195196,0.270121
2,category_3,1.450391,1.068283,0.524281,5.629028,0.725196,2.312959,1.14431,1.728635,5.545392,...,0.102618,0.441363,0.205836,0.167468,0.075052,0.079354,0.199565,0.134383,0.116518,0.255049
3,category_3,3.561641,2.350421,1.400185,13.099335,1.780821,5.162738,3.039382,4.10106,12.63286,...,0.040841,0.34714,0.234101,0.159523,0.16086,0.144939,1.299666,0.328635,0.25397,1.025188
4,category_3,6.676158,5.226249,2.784733,35.279133,3.338079,12.53648,6.907373,9.721926,30.297397,...,0.009716,0.257991,0.253047,0.162753,0.15665,0.134192,1.073103,0.273935,0.238783,0.793738


## Data Loading and Encoding 

In [42]:
# Scratch check of column-wise statistics on a tiny 3x2 array ([[0,1],[2,3],[4,5]]):
# axis=0 reduces over the rows, giving one mean / standard deviation per column.
_demo = np.arange(6).reshape(3, 2)
m = _demo.mean(axis=0)
s = _demo.std(axis=0)

In [70]:
# Mini-batch size.
# NOTE(review): the training cell passes a literal 32 to train_one_epoch rather than this value.
batch_size = 4

# Every column except one (the label column) is counted as a feature
n_features = dataframe.shape[1] - 1

def extract_numpy_from_df( dataframe: pd.DataFrame, y_column: int ):
    """Split a data frame into a feature matrix and a label vector.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Table holding the label column and the feature columns.
    y_column : int
        Positional index of the label column in ``dataframe.columns``.

    Returns
    -------
    (X, Y) : tuple of np.ndarray
        ``X`` contains all remaining columns converted to float32;
        ``Y`` contains the label column with whatever dtype pandas assigned to it.
    """
    label_name = dataframe.columns[y_column]
    feature_frame = dataframe.drop(columns=label_name)

    X = feature_frame.to_numpy(dtype=np.float32)
    Y = dataframe[label_name].to_numpy()
    return X, Y

def clean_data( X ):
    """Return a copy of the 2-D array X without the rows that contain at least one NaN.

    Only X is filtered here; any label array that belongs to X has to be
    filtered with the same row mask by the caller.
    """
    row_has_nan = np.isnan(X).any(axis=1)
    return X[np.logical_not(row_has_nan)]
    
def get_validation_and_training_indices( dataset_length, train_fraction=0.8 ):
    """Randomly split the sample indices 0..dataset_length-1 into two disjoint sets.

    Note the return order: despite the function name, the TRAINING indices come
    first and the VALIDATION indices second.

    The shuffle uses NumPy's global random state; call ``np.random.seed(...)``
    beforehand if the split has to be reproducible.

    Parameters
    ----------
    dataset_length : int
        Number of samples to split.
    train_fraction : float, optional
        Share of the samples assigned to the training set (default 0.8).
        The training set size is rounded down.

    Returns
    -------
    (indices_train, indices_validation) : tuple of np.ndarray

    Raises
    ------
    ValueError
        If ``train_fraction`` lies outside [0, 1].
    """
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError("train_fraction must be between 0 and 1")

    indices = np.random.permutation(dataset_length)
    t = int(dataset_length * train_fraction)

    return indices[:t], indices[t:]

def normalize( X ):
    """Standardise X column-wise to zero mean and unit standard deviation.

    Columns whose standard deviation is exactly zero (constant columns) are
    only centred, so they become all zeros instead of NaN.

    Parameters
    ----------
    X : np.ndarray
        Array whose first axis enumerates the samples.

    Returns
    -------
    np.ndarray
        Array of the same shape as X.
    """
    m = X.mean(axis=0)
    s = X.std(axis=0)

    # A constant column has zero spread; dividing by it would produce NaN (0/0).
    # Dividing by 1 instead leaves such a column at exactly 0 after centring.
    s = np.where(s == 0, np.ones_like(s), s)

    return (X - m) / s

def hot_1_encode( Y, codes ):
    """One-hot encode a 1-D array of class labels.

    Parameters
    ----------
    Y : np.ndarray
        1-D array of class labels.
    codes : dict
        Maps every label to its column index in the encoded matrix.

    Returns
    -------
    np.ndarray
        Float array of shape (len(Y), len(codes)) with a single 1 per row.

    Raises
    ------
    KeyError
        If Y contains a label that is missing from ``codes``.
    """
    n_samples = Y.shape[0]
    Y_encoded = np.zeros((n_samples, len(codes)))

    # Look labels up with codes[label] rather than codes.get(label): .get() returns
    # None for an unknown label, and NumPy interprets None as np.newaxis, which
    # would silently set the WHOLE row to 1 instead of reporting the bad label.
    columns = np.fromiter((codes[label] for label in Y), dtype=np.intp, count=n_samples)
    Y_encoded[np.arange(n_samples), columns] = 1

    return Y_encoded

def create_batch( permutation, batch_no, batch_size, X ):
    """Gather one mini-batch of rows from X as a float32 tensor.

    Parameters
    ----------
    permutation : sequence of int
        Row indices into X in the order in which they should be visited.
    batch_no : int
        Zero-based number of the batch to extract.
    batch_size : int
        Number of rows per batch.
    X : np.ndarray
        2-D array to take the rows from.

    Returns
    -------
    torch.Tensor
        float32 tensor of shape (k, X.shape[1]). ``k`` equals ``batch_size``
        except when ``permutation`` runs out, in which case the batch is
        shorter (possibly empty) instead of raising an IndexError.
    """
    start = batch_no * batch_size
    indices = np.asarray(permutation[start: start + batch_size], dtype=np.intp)

    # Fancy indexing copies all selected rows in one step (no per-row Python loop)
    return torch.as_tensor(X[indices], dtype=torch.float32)

# Mapping from the class label as it appears in the CSV to its one-hot column index
codes = { 'category_1' : 0, 'category_2' : 1, 'category_3' : 2 }

X,Y = extract_numpy_from_df( dataframe, 0 )

# clean_data() removes the rows of X that contain NaN. The same rows have to be
# removed from Y first, otherwise features and labels no longer line up row by row.
rows_without_nan = ~np.isnan( X ).any( axis=1 )
Y   = Y[rows_without_nan]
X   = clean_data( X )
assert X.shape[0] == Y.shape[0], "features and labels must have the same number of rows"

indices_train,indices_validation = get_validation_and_training_indices( X.shape[0] )

# NOTE(review): the normalisation statistics are computed over all rows,
# including the ones selected for validation.
X   = normalize( X )
Y   = hot_1_encode( Y, codes )

## Creating a Network Architecture

In [77]:
def create_model( n_in, n_hidden, n_out ):
    """Build a fully connected feed-forward network.

    Every hidden layer is a Linear layer followed by a ReLU; the output layer
    is a plain Linear layer without activation.

    Parameters
    ----------
    n_in : int
        Number of input features.
    n_hidden : sequence of int
        Width of each hidden layer, in order. May be empty, in which case the
        model is a single Linear layer from n_in to n_out.
    n_out : int
        Number of outputs.

    Returns
    -------
    nn.Sequential
    """
    # Consecutive pairs of this list give the (in, out) sizes of the hidden layers
    layer_sizes = [n_in, *n_hidden]

    layers = []
    for size_in, size_out in zip(layer_sizes[:-1], layer_sizes[1:]):
        layers.append(nn.Linear(size_in, size_out))
        layers.append(nn.ReLU())
    layers.append(nn.Linear(layer_sizes[-1], n_out))

    return nn.Sequential( *layers )

# One input per feature column, two hidden layers (50 and 25 units), one output per class column
hidden_layer_sizes = [50, 25]
model = create_model( n_in=X.shape[1], n_hidden=hidden_layer_sizes, n_out=Y.shape[1] )

In [78]:
# Show the layer structure of the network as a sanity check of the layer sizes
print(model)

Sequential(
  (0): Linear(in_features=106, out_features=50, bias=True)
  (1): ReLU()
  (2): Linear(in_features=50, out_features=25, bias=True)
  (3): ReLU()
  (4): Linear(in_features=25, out_features=3, bias=True)
)


## Training Loop

In [79]:
# Number of rows (samples) in the feature matrix X
X.shape[0]

10734

In [None]:
def accuracy_metric( y, y_hat ):
    """Fraction of samples whose predicted class equals the true class.

    Parameters
    ----------
    y : torch.Tensor
        Targets of shape (batch, n_classes); the true class of a row is the
        position of its largest entry (e.g. one-hot rows).
    y_hat : torch.Tensor
        Model outputs of shape (batch, n_classes); the predicted class of a
        row is the position of its largest entry.

    Returns
    -------
    float
        Value in [0, 1].
    """
    true_classes      = torch.argmax(y, dim=1)
    predicted_classes = torch.argmax(y_hat, dim=1)

    # Compare class indices for equality. Averaging |predicted - true| instead
    # would weight a 0<->2 confusion twice as much as a 0<->1 confusion and
    # could push the result below zero.
    return (predicted_classes == true_classes).to(torch.float32).mean().item()

def train_one_epoch( epoch_index, indices_train, X, Y, optimizer, loss_fn, batch_size ):
    """Run one pass over the training samples and update the model weights.

    Relies on the notebook-level ``model``, ``create_batch`` and
    ``accuracy_metric``; ``optimizer`` must have been built from
    ``model.parameters()``.

    Parameters
    ----------
    epoch_index : int
        Number of the current epoch (currently unused).
    indices_train : sequence of int
        Row indices of X / Y that belong to the training set.
    X : np.ndarray
        Feature matrix, one row per sample.
    Y : np.ndarray
        One-hot encoded targets, one row per sample.
    optimizer : torch.optim.Optimizer
        Optimizer used for the weight update.
    loss_fn : callable
        Called as ``loss_fn(model_output, class_indices)``, e.g.
        ``torch.nn.CrossEntropyLoss()``.
    batch_size : int
        Number of samples per mini-batch.

    Returns
    -------
    (losses, accuracies) : tuple of list of float
        One entry per processed mini-batch.
    """
    losses           = []
    accuracies       = []

    # Iterate over the training indices only (not all rows of X) and use integer
    # division: range() does not accept a float. A trailing incomplete batch is skipped.
    n_batches = len(indices_train) // batch_size

    model.train()
    for batch_no in range(n_batches):
        x_batch = create_batch(indices_train, batch_no, batch_size, X)
        y_batch = create_batch(indices_train, batch_no, batch_size, Y)

        # Gradients accumulate across backward() calls, so reset them for every batch
        optimizer.zero_grad()

        y_hat = model(x_batch)
        # Pass class indices rather than one-hot rows as the loss target
        loss  = loss_fn(y_hat, torch.argmax(y_batch, dim=1))
        loss.backward()
        optimizer.step()

        losses.append(loss.item())
        accuracies.append(accuracy_metric(y_batch, y_hat.detach()))

    return losses, accuracies

# Adam optimiser over all model parameters; cross-entropy loss on the network outputs
optimizer = torch.optim.Adam( params = model.parameters(), lr=0.001 )
loss_fn   = torch.nn.CrossEntropyLoss()

# Metrics returned by train_one_epoch, concatenated over all epochs
losses     = []
accuracies = []
n_epochs   = 50
for i in range( n_epochs ):
    metrics = train_one_epoch( i, indices_train, X, Y, optimizer, loss_fn, 32 )
    loss_per_epoch, accuracy_per_epoch = metrics
    losses.extend( loss_per_epoch )
    accuracies.extend( accuracy_per_epoch )
    # Report the last accuracy value recorded in this epoch
    print("epoch",i,"/",n_epochs,"accuracy",accuracy_per_epoch[-1])

In [None]:
# Plot the recorded training loss and accuracy on one axis; the labels and the
# legend are needed to tell the two curves apart.
fig,axis = plt.subplots( 1,1, figsize=(16,8) )
axis.plot( losses, label="loss" )
axis.plot( accuracies, label="accuracy" )
axis.set_xlabel( "training step" )
axis.set_title( "Training loss and accuracy" )
axis.legend()
plt.show()

## Hyperparameter Tuning