In [605]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from torch import nn
import sklearn
import pandas as pd 

In [606]:
# Toy customer table with two numeric columns (age, income) and three string
# columns (sex, bought, test). Written one tuple per row, in the column order
# listed in `columns`, so the literal reads like the rendered table.
raw_data = pd.DataFrame(
    [
        (25, 50000, 'M', 'N', 'A'),
        (30, 70000, 'F', 'N', 'B'),
        (35, 90000, 'F', 'Y', 'C'),
        (20, 30000, 'M', 'N', 'A'),
        (25, 40000, 'Na', 'Y', 'B'),
        (30, 60000, 'Na', 'Y', 'C'),
        (35, 80000, 'M', 'Y', 'A'),
        (40, 100000, 'F', 'N', 'B'),
        (45, 150000, 'Na', 'Y', 'C'),
        (50, 200000, 'F', 'Y', 'A'),
    ],
    columns=['age', 'income', 'sex', 'bought', 'test'],
)
raw_data

Unnamed: 0,age,income,sex,bought,test
0,25,50000,M,N,A
1,30,70000,F,N,B
2,35,90000,F,Y,C
3,20,30000,M,N,A
4,25,40000,Na,Y,B
5,30,60000,Na,Y,C
6,35,80000,M,Y,A
7,40,100000,F,N,B
8,45,150000,Na,Y,C
9,50,200000,F,Y,A


In [607]:
from sklearn.preprocessing import StandardScaler

# One-hot encode 'sex' into indicator columns, then standardise the two
# numeric columns. The scaler is fitted and applied on the same rows.
data = pd.get_dummies(raw_data, columns=['sex'])
scaler = StandardScaler().fit(data[['age', 'income']])
data[['age', 'income']] = scaler.transform(data[['age', 'income']])
data

Unnamed: 0,age,income,bought,test,sex_F,sex_M,sex_Na
0,-0.948847,-0.742828,N,A,False,True,False
1,-0.390702,-0.341299,N,B,True,False,False
2,0.167444,0.060229,Y,C,True,False,False
3,-1.506993,-1.144357,N,A,False,True,False
4,-0.948847,-0.943592,Y,B,False,False,True
5,-0.390702,-0.542064,Y,C,False,False,True
6,0.167444,-0.140535,Y,A,False,True,False
7,0.725589,0.260994,N,B,True,False,False
8,1.283735,1.264815,Y,C,False,False,True
9,1.84188,2.268637,Y,A,True,False,False


In [608]:
# Merge the two label columns into one string label per row
# (e.g. 'N' + 'A' -> 'NA') and drop the originals, leaving 'result'
# as the only target column.
data = (
    data
    .assign(result=data['bought'] + data['test'])
    .drop(columns=['bought', 'test'])
)
data


Unnamed: 0,age,income,sex_F,sex_M,sex_Na,result
0,-0.948847,-0.742828,False,True,False,
1,-0.390702,-0.341299,True,False,False,NB
2,0.167444,0.060229,True,False,False,YC
3,-1.506993,-1.144357,False,True,False,
4,-0.948847,-0.943592,False,False,True,YB
5,-0.390702,-0.542064,False,False,True,YC
6,0.167444,-0.140535,False,True,False,YA
7,0.725589,0.260994,True,False,False,NB
8,1.283735,1.264815,False,False,True,YC
9,1.84188,2.268637,True,False,False,YA


In [609]:
# Replace the string labels in 'result' with integer codes whose numbering
# follows the order of `class_names`, and keep lookup tables in both
# directions (code -> name, name -> code).
class_names = data['result'].unique()
data['result'] = pd.Categorical(data['result'], categories=class_names).codes
label_to_class_name = {label: name for label, name in enumerate(class_names)}
class_name_to_label = {name: label for label, name in enumerate(class_names)}
data.head()

Unnamed: 0,age,income,sex_F,sex_M,sex_Na,result
0,-0.948847,-0.742828,False,True,False,0
1,-0.390702,-0.341299,True,False,False,1
2,0.167444,0.060229,True,False,False,2
3,-1.506993,-1.144357,False,True,False,0
4,-0.948847,-0.943592,False,False,True,3


In [610]:
# Display both lookup tables: class name -> integer label, then integer label -> class name.
class_name_to_label, label_to_class_name

({'NA': 0, 'NB': 1, 'YC': 2, 'YB': 3, 'YA': 4},
 {0: 'NA', 1: 'NB', 2: 'YC', 3: 'YB', 4: 'YA'})

In [611]:
# Cast the one-hot 'sex_*' indicator columns to plain integers; every other
# column is left untouched.
data = data.astype({'sex_F': int, 'sex_M': int, 'sex_Na': int})
data.head()

Unnamed: 0,age,income,sex_F,sex_M,sex_Na,result
0,-0.948847,-0.742828,0,1,0,0
1,-0.390702,-0.341299,1,0,0,1
2,0.167444,0.060229,1,0,0,2
3,-1.506993,-1.144357,0,1,0,0
4,-0.948847,-0.943592,0,0,1,3


In [612]:
# Triple the number of rows and add a little Gaussian noise (std 0.1) to the
# numeric features so the three copies of each base row are not exact
# duplicates. ignore_index gives the stacked frame a fresh 0..n-1 index
# instead of repeating every original label three times.
# NOTE: this cell reassigns `data` from itself, so re-running it on its own
# triples the frame again; run the notebook from the top.
data = pd.concat([data, data, data], ignore_index=True)

# A dedicated, seeded generator makes the jitter identical on every
# Restart & Run All (the legacy global np.random.randn was unseeded).
noise_rng = np.random.default_rng(42)
data['age'] = data['age'] + noise_rng.standard_normal(len(data)) * 0.1
data['income'] = data['income'] + noise_rng.standard_normal(len(data)) * 0.1

print(len(data))
data.head()


30


Unnamed: 0,age,income,sex_F,sex_M,sex_Na,result
0,-0.832676,-0.806041,0,1,0,0
1,-0.359604,-0.23244,1,0,0,1
2,0.195743,0.114314,1,0,0,2
3,-1.487125,-0.918608,0,1,0,0
4,-0.86425,-0.965894,0,0,1,3


In [613]:
# Distinct values currently stored in the 'result' column.
classes = data['result'].unique()

class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset over a DataFrame with feature columns and one target column.

    Args:
        dataframe: Source frame. Every column other than ``target_col`` is
            treated as a feature and must be castable to float32 (numeric
            and boolean columns both work).
        target_col: Name of the target column. Defaults to ``'result'``.
        target_dtype: dtype of the target tensor returned by ``__getitem__``.
            Defaults to ``torch.float32``.

    Attributes:
        data: The frame passed in (kept by reference, not copied).
        features: NumPy array of the feature columns, one row per sample.
        targets: NumPy array of shape ``(n_samples, 1)`` holding the target column.

    NOTE(review): with the defaults, targets come back as float32 with shape
    ``(1,)``. If they are meant to feed ``nn.CrossEntropyLoss`` as class
    indices, pass ``target_dtype=torch.long`` and squeeze the trailing
    dimension in the training loop -- confirm against the loss actually used.
    """

    def __init__(self, dataframe, target_col='result', target_dtype=torch.float32):
        self.data = dataframe
        self.target_col = target_col
        self.target_dtype = target_dtype
        self.features = dataframe.drop([target_col], axis=1).values
        self.targets = dataframe[[target_col]].values

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(features, target)`` tensors for the row(s) at ``index``."""
        # Cast explicitly before handing the row to torch: a frame that mixes
        # float and bool columns (e.g. straight out of pd.get_dummies) yields
        # an object-dtype array from .values, which torch.tensor() rejects.
        x = torch.tensor(np.asarray(self.features[index], dtype=np.float32))
        y = torch.tensor(self.targets[index], dtype=self.target_dtype)
        return x, y

In [614]:
# 80/20 train/test split. Rows are shuffled first so the split does not
# follow the original row order; the fixed random_state makes the split
# membership reproducible across runs.
# NOTE(review): the rows were tripled with small noise earlier in the
# notebook, so jittered copies of the same base row can land in both splits;
# any test metric computed on this split will be optimistic.
split = int(0.8 * len(data))
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
train_rows = data.iloc[:split]
test_rows = data.iloc[split:]
len(train_rows), len(test_rows)

(24, 6)

In [615]:
# Wrap each split of the frame in a CustomDataset.
train_data, test_data = (CustomDataset(rows) for rows in (train_rows, test_rows))

len(train_data), len(test_data)

(24, 6)

In [616]:
# Peek at the first four feature rows and their targets as stored on the dataset.
train_data.features[:4], train_data.targets[:4]

(array([[-0.86424999, -0.96589407,  0.        ,  0.        ,  1.        ],
        [ 0.11203336, -0.31376575,  0.        ,  1.        ,  0.        ],
        [-0.83267581, -0.80604123,  0.        ,  1.        ,  0.        ],
        [-0.35960409, -0.23244021,  1.        ,  0.        ,  0.        ]]),
 array([[3],
        [4],
        [0],
        [1]], dtype=int8))

In [617]:
from torch.utils.data import DataLoader

# Mini-batches of two rows each; only the training loader shuffles.
train_dataloader = DataLoader(dataset=train_data, batch_size=2, shuffle=True)
test_dataloader = DataLoader(dataset=test_data, batch_size=2, shuffle=False)

# Pull a single batch to check the tensor shapes coming out of the loader.
train_features_batch, train_labels_batch = next(iter(train_dataloader))
train_features_batch.shape, train_labels_batch.shape

(torch.Size([2, 5]), torch.Size([2, 1]))

In [618]:
class CustomModel(nn.Module):
    """Small fully connected network: Linear -> ReLU -> Linear.

    The output layer returns raw, unbounded scores (logits), one per output
    unit.

    Args:
        input_shape: Number of input features per sample.
        hidden_units: Width of the single hidden layer.
        output_shape: Number of output units (e.g. number of classes).
    """

    def __init__(self, input_shape: int, hidden_units: int, output_shape: int) -> None:
        super().__init__()
        self.layer_stack = nn.Sequential(
            nn.Linear(in_features=input_shape, out_features=hidden_units),
            nn.ReLU(),
            # No activation after the output layer: a trailing ReLU would
            # clamp every negative logit to zero, discarding information the
            # loss needs and zeroing the gradient for those outputs.
            nn.Linear(in_features=hidden_units, out_features=output_shape),
        )

    def forward(self, x):
        """Run ``x`` through the layer stack and return the raw output scores."""
        return self.layer_stack(x)

In [None]:
# Size the network from the data: one input per feature column and one output
# per distinct class label. hidden_units is a free hyperparameter; 8 is an
# arbitrary starting value for this tiny dataset.
model_0 = CustomModel(
    input_shape=train_data.features.shape[1],
    hidden_units=8,
    output_shape=len(class_names),
)