# Task
Please use the attached dataset to provide a binary classification of the column “Type”. The dataset is provided into two different tables with unique identifier of column “ID”. Tip: This column (ID) can be used to match the two tables.

## Setup

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, random_split
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from SimpleNNModel import SimpleNNModel
from torch.utils.tensorboard import SummaryWriter
import os
from train_model import train_model, create_tqdm_bar

# Preset matplotlib figure sizes
matplotlib.rcParams['figure.figsize'] = [9, 6]

## Read Data

In [2]:
# CSV file path
csv_file_path1 = './Task1_1.csv'
csv_file_path2 = './Task1_2.csv'

# Read into DataFrame
df1 = pd.read_csv(csv_file_path1, delimiter=';')
df2 = pd.read_csv(csv_file_path2, delimiter=';')

# merge df
dataset = pd.merge(df1, df2, on='ID')
dataset.head()

Unnamed: 0,ID,UKL,GJAH,ZIK,HUI,ERZ,CDx,BJZHD,NKJUD,LPI,BJKG,POUG,TRE,ZUB,VOL,UIO,VBNM,Type,OIN
0,0,160,oooo,x,oooo,www,5.0,vvvv,80.0,800000.0,qqqq,1,1.75,t,f,uuuu,t,n,17.92
1,1,153,rrr,,uuu,pppp,0.0,mmm,200.0,2000000.0,qqqq,0,0.29,f,f,wwww,f,n,16.92
2,1,153,rrr,,uuu,pppp,0.0,mmm,200.0,2000000.0,qqqq,0,0.29,f,f,wwww,f,n,16.92
3,2,5,oooo,x,oooo,www,19.0,hh,96.0,960000.0,hh,1,0.0,f,f,wwww,t,n,31.25
4,3,9,oooo,,oooo,www,120.0,kkk,0.0,0.0,qqq,0,0.335,f,f,uuuu,f,n,48.17


In [3]:
dataset.describe(include='all')

Unnamed: 0,ID,UKL,GJAH,ZIK,HUI,ERZ,CDx,BJZHD,NKJUD,LPI,BJKG,POUG,TRE,ZUB,VOL,UIO,VBNM,Type,OIN
count,4475.0,4475.0,4401,1867,4475,4401,4475.0,4398,4361.0,4361.0,4398,4475.0,4475.0,4475,4475,4430,4475,4475,4429.0
unique,,,3,2,3,3,,12,,,8,,,2,2,2,2,2,
top,,,oooo,x,oooo,www,,vvvv,,,qqqq,,,f,t,wwww,t,y,
freq,,,3704,1226,4145,3704,,936,,,2459,,,2310,3842,2895,2704,4135,
mean,1838.97743,96.316872,,,,,2286.035531,,163.132538,1631325.0,,4.13743,3.39201,,,,,,32.70718
std,1069.445348,56.013921,,,,,8938.486172,,154.868928,1548689.0,,6.73863,4.281415,,,,,,12.602387
min,0.0,1.0,,,,,0.0,,0.0,0.0,,0.0,0.0,,,,,,13.75
25%,898.5,47.0,,,,,0.0,,0.0,0.0,,0.0,0.5,,,,,,22.92
50%,1838.0,101.0,,,,,113.0,,120.0,1200000.0,,2.0,1.75,,,,,,28.67
75%,2766.5,152.0,,,,,1000.0,,280.0,2800000.0,,6.0,5.0,,,,,,40.0


In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4475 entries, 0 to 4474
Data columns (total 19 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ID      4475 non-null   int64  
 1   UKL     4475 non-null   int64  
 2   GJAH    4401 non-null   object 
 3   ZIK     1867 non-null   object 
 4   HUI     4475 non-null   object 
 5   ERZ     4401 non-null   object 
 6   CDx     4475 non-null   float64
 7   BJZHD   4398 non-null   object 
 8   NKJUD   4361 non-null   float64
 9   LPI     4361 non-null   float64
 10  BJKG    4398 non-null   object 
 11  POUG    4475 non-null   int64  
 12  TRE     4475 non-null   float64
 13  ZUB     4475 non-null   object 
 14  VOL     4475 non-null   object 
 15  UIO     4430 non-null   object 
 16  VBNM    4475 non-null   object 
 17  Type    4475 non-null   object 
 18  OIN     4429 non-null   float64
dtypes: float64(5), int64(3), object(11)
memory usage: 699.2+ KB


## Transform

In [5]:
# append Type column at the end
# The ID column can be dropped since each row is unique
cols = [x for x in dataset.columns if x not in ['ID', 'Type']]
cols.append('Type')

dataset = dataset[cols]

numerical_features = [x for x in dataset if dataset[x].dtypes != 'object']
categorical_features = [x for x in dataset if dataset[x].dtypes == 'object']

# Define the transformations for each column
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Apply the transformations on the DataFrame
dataset = preprocessor.fit_transform(dataset)

## Split Data
- convert DataFrame into TensorDataset
- Train, Val, Test: 60, 20, 20

In [6]:
# Create a TensorDataset with x_train and y_train
x_tensor = torch.Tensor(dataset[:, :len(cols)-1])
y_tensor = torch.Tensor(dataset[:, len(cols) - 1])
dataset = TensorDataset(x_tensor, y_tensor)

# Split the dataset into train, validation, and test sets
train_per, val_per, test_per = 0.6, 0.2, 0.2
total_size = len(dataset)
train_size = int(train_per * total_size)
val_size = int(val_per * total_size)
test_size = int(test_per * total_size)
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

# Create data loaders for train, validation, and test sets
batch_size = 64
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_data_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
test_data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

## Neural Network


### Hyperparameters

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
hparams = {
    "input_size": x_tensor.shape[1],
    "hidden_size": 200,
    "num_classes": 2,
    "batch_size": batch_size,
    "learning_rate": 6e-4,
    "lr_decay": 9e-2,
    "num_workers": 8,
    "device": device,
    "dropout": 0.5,
    "gamma": 0.9,
    "epochs": 10,
}

### Train Model

In [8]:
# Create a tensorboard logger.
path = "logs"
num_of_runs = len(os.listdir(path)) if os.path.exists(path) else 0
path = os.path.join(path, f'run_{num_of_runs + 1}')

tb_logger = SummaryWriter(path)

# Binary Cross Entropy, since we have binary classification
loss_func = nn.BCELoss()
model = SimpleNNModel(hparams).to(device)
train_model(model,train_data_loader, val_data_loader, loss_func, tb_logger, epochs=hparams['epochs'], name="Default")

print()
print("Finished training!")
print(f"Training Acc: {model.getTestAcc(train_data_loader)[1] * 100}%")
print(f"Validation Acc: {model.getTestAcc(val_data_loader)[1] * 100}%")

Training Epoch [1/10]:   0%|                                                                                                   | 0/42 [00:00<?, ?it/s]


torch.Size([64])


IndexError: Dimension out of range (expected to be in range of [-2, 1], but got 2)

## Test Set

Used only once at the end after fine tuning the model, to check overall performance.

## References

`SimpleNNModel.py`, `train_model.py`: Some parts of the code were used from the Introduction to Deep Learning course that I am working through at TUM.