In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import random
from zipfile import ZipFile
from sklearn.model_selection import train_test_split
import datetime
from torch.utils.tensorboard.writer import SummaryWriter

In [None]:
# Read the first archive member of pc6.zip as CSV, using its first column as the index.
with ZipFile('pc6.zip') as archive, archive.open(archive.namelist()[0]) as csv_stream:
    df = pd.read_csv(csv_stream, index_col=0)

In [None]:
# Summary statistics for every column, transposed so each column of df is one row.
df.describe(include='all').T

In [None]:
# Pairwise correlations between all columns except 'PC6'.
variables = df.columns.drop('PC6')
data = df[variables]
correlation_matrix = data.corr()
# True on and above the diagonal, so only the lower triangle is drawn.
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    mask=mask,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Matrix')
plt.show()


In [None]:
# Replace the MAN and VROUW columns with the ratio VROUW / INWONER, stored as P_VROUW.
# NOTE: this cell is not re-runnable on its own result, because VROUW is dropped here.
df = (
    df
    .assign(P_VROUW=df['VROUW'] / df['INWONER'])
    .drop(columns=['MAN', 'VROUW'])
)

In [None]:
# Binary target: True where WOZWONING is strictly above its column mean.
# NOTE(review): rows with a missing WOZWONING compare False and therefore get the
# negative label - confirm this is intended.
y = df['WOZWONING'].gt(df['WOZWONING'].mean())

# Features: everything except the target source and 'PC6'; missing values become -1.
X = df.drop(columns=['WOZWONING', 'PC6']).fillna(value=-1)

Setting the random seeds for reproducibility

In [None]:
# Seed every random number generator used in this notebook.
# torch.initial_seed() only *returns* the current seed; torch.manual_seed() is the
# call that actually sets it (it governs weight initialisation, among other things).
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)

In [None]:
# 70/30 train/test split. The split is done on NumPy arrays (the input type
# scikit-learn supports) and each part is wrapped in a float32 tensor afterwards.
# Labels are reshaped to a single column so they match the model's (batch, 1) output.
X_train, X_test, y_train, y_test = (
    torch.from_numpy(part)
    for part in train_test_split(X.to_numpy(dtype=np.float32),
                                 y.to_numpy(dtype=np.float32).reshape(-1, 1),
                                 train_size=0.7,
                                 shuffle=True,
                                 random_state=0)
)

# TensorDataset indexes the tensors directly instead of materialising one Python
# tuple per row. The training loader reshuffles every epoch, which SGD relies on;
# a dedicated seeded generator keeps that order reproducible.
train_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(X_train, y_train),
                                           batch_size=10,
                                           shuffle=True,
                                           generator=torch.Generator().manual_seed(0),
                                           num_workers=1)
# Evaluation order does not matter, so the test loader stays unshuffled.
test_loader = torch.utils.data.DataLoader(torch.utils.data.TensorDataset(X_test, y_test),
                                          batch_size=10,
                                          num_workers=1)

In [None]:
class MyModel(nn.Module):
    """Small two-layer perceptron for binary classification.

    Architecture: Linear(in_features, hidden_features) -> ReLU -> Linear(hidden_features, 1)
    -> sigmoid.

    Args:
        in_features: Number of input columns per sample (default 18).
        hidden_features: Width of the hidden layer (default 4).
    """

    def __init__(self, in_features=18, hidden_features=4):
        super().__init__()

        self.d1 = nn.Linear(in_features, hidden_features)
        self.d2 = nn.Linear(hidden_features, 1)

    def forward(self, x):
        """Return the predicted probability of the positive class.

        Args:
            x: Tensor of shape (batch, in_features).

        Returns:
            Tensor of shape (batch, 1) with values in (0, 1).
        """
        hidden = nn.functional.relu(self.d1(x))
        logits = self.d2(hidden)
        # A softmax across dim=1 of a single-column tensor is identically 1.0,
        # which makes the output constant; sigmoid is the single-output equivalent.
        return torch.sigmoid(logits)

In [None]:
# Create the MyModel instance that the optimizer and the training loop operate on.
model = MyModel()

In [None]:
# Plain SGD with momentum over all model parameters.
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# The labels are a single 0/1 column and the model emits one value per row.
# CrossEntropyLoss treats dim 1 as the class axis; with only one class its
# log-softmax is identically 0, so the loss is always 0 and no gradient flows.
# BCELoss is the binary criterion: it takes inputs in [0, 1] (which the model's
# forward output satisfies) and float targets of the same shape.
loss_function = torch.nn.BCELoss()

In [None]:
def train_one_epoch(epoch_index, tb_writer, log_interval=1000):
    """Run one optimisation pass over ``train_loader``.

    Uses the notebook-level ``model``, ``optimizer``, ``loss_function`` and
    ``train_loader``.

    Args:
        epoch_index: Zero-based epoch number, used to compute the global
            TensorBoard step.
        tb_writer: Object exposing ``add_scalar(tag, value, step)``.
        log_interval: Number of batches per reporting window (default 1000).

    Returns:
        The mean per-batch loss of the most recently completed reporting window.
        If the loader yields fewer than ``log_interval`` batches, no window ever
        completes, so the mean over all batches seen is returned instead
        (0.0 for an empty loader).
    """
    running_loss = 0.
    last_loss = 0.
    batches_seen = 0
    windows_logged = 0

    for i, data in enumerate(train_loader):
        inputs, labels = data

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = loss_function(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        batches_seen = i + 1
        if batches_seen % log_interval == 0:
            last_loss = running_loss / log_interval  # loss per batch
            print('  batch {} loss: {}'.format(batches_seen, last_loss))
            tb_x = epoch_index * len(train_loader) + batches_seen
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.
            windows_logged += 1

    # Without this, a short loader would report a misleading loss of 0.0.
    if windows_logged == 0 and batches_seen > 0:
        last_loss = running_loss / batches_seen

    return last_loss

In [None]:
epoch_number = 0
EPOCHS = 5
best_vloss = 1_000_000.

# One TensorBoard run directory per execution, named by start time.
timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter('runs/fashion_trainer_{}'.format(timestamp))

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    # Make sure gradient tracking is on, and do a pass over the data
    model.train(True)
    avg_loss = train_one_epoch(epoch_number, writer)

    # Evaluate on the held-out set in eval mode, without building autograd graphs.
    running_vloss = 0.0
    n_vbatches = 0
    model.eval()

    with torch.no_grad():
        for vinputs, vlabels in test_loader:
            voutputs = model(vinputs)
            # .item() keeps the accumulator a plain float so it prints as a number.
            running_vloss += loss_function(voutputs, vlabels).item()
            n_vbatches += 1

    # Count batches explicitly so an empty test loader yields NaN, not a NameError.
    avg_vloss = running_vloss / n_vbatches if n_vbatches else float('nan')
    print(f'LOSS train {avg_loss} valid {avg_vloss}')

    # Log both averages per epoch so the curves can be compared in TensorBoard.
    writer.add_scalars('Training vs. Validation Loss',
                       {'Training': avg_loss, 'Validation': avg_vloss},
                       epoch_number + 1)
    writer.flush()

    # Remember the best validation loss seen so far.
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss

    epoch_number += 1

# Release the event file once training is done.
writer.close()