In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import random
from zipfile import ZipFile
from sklearn.model_selection import train_test_split

In [None]:
with ZipFile('pc6.zip') as zip_file:
    names = zip_file.namelist()
    with zip_file.open(names[0]) as data_file:
        df = pd.read_csv(data_file, index_col=0)

In [None]:
pd.DataFrame(df.describe(include='all').transpose())

In [None]:
variables = df.columns.drop('PC6')
data = df[variables]
correlation_matrix = data.corr()
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5, mask=mask)
plt.title('Correlation Matrix')
plt.show()


In [None]:
df['P_VROUW'] = df['VROUW']/df['INWONER']
df = df.drop(columns=['MAN', 'VROUW'])

In [146]:
y = df['WOZWONING'] > df['WOZWONING'].mean()
X = df.drop(columns=['WOZWONING','PC6'])
# X = X.fillna()

In [152]:
df2 = df.dropna()
y = df2['WOZWONING'] > df2['WOZWONING'].mean()
X = df2.drop(columns=['WOZWONING','PC6'])

Setting the random seeds for reproducibility

In [153]:
np.random.seed(0)
random.seed(0)

In [154]:
n_bins = 100
X_train, X_test, y_train, y_test = train_test_split(torch.tensor(X.values, dtype=torch.float32), 
                                                    torch.tensor(y.values, dtype=torch.float32).reshape(-1, 1), 
                                                    train_size=0.7, 
                                                    shuffle=True,
                                                    random_state=0)
train_loader = torch.utils.data.DataLoader(list(zip(X_train, y_train)),
                                     batch_size=n_bins,
                                     num_workers=4,)
test_loader = torch.utils.data.DataLoader(list(zip(X_test, y_test)),
                                     batch_size=int(len(X_test)/4) + 1,
                                     num_workers=4,)

In [155]:
class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()

        self.d1 = nn.Linear(18, 9)
        self.d2 = nn.Linear(9, 1)

    def forward(self, x):
        x = self.d1(x)
        x = nn.functional.relu(x)

        x = self.d2(x)
        x = nn.functional.softmax(x, dim=1)
        return x

In [156]:
def train(n_epochs, model, x_train, y_train, x_test, y_test):

    train_losses = []
    train_accuracies = []
    test_losses = []
    test_accuracies = []

    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr = 0.05)

    for epoch in range(n_epochs):
        y1 = model(x_train)
        loss = criterion(y_train, y1)

        train_losses.append(loss.item() )
        train_accuracies.append((y1.eq(y_train).sum()).item() / y1.size(0))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # with torch.no_grad():
        #     y1 = model(x_test)
        #     loss = criterion(y_test, y1)
        #     test_losses.append(loss.item() )
        #     test_accuracies.append((y1.eq(y_test).sum()).item() / y1.size(0))

    return train_accuracies, train_losses, test_accuracies, test_losses

In [157]:
mod = MyModel()

In [158]:
train (8, mod, X_train, y_train, X_test, y_test)

([0.37985865724381623, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
 [62.01413345336914, nan, nan, nan, nan, nan, nan, nan],
 [],
 [])