In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import random
from zipfile import ZipFile
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Read the table from the first member of the pc6.zip archive; the CSV's
# first column is used as the DataFrame index.
with ZipFile('pc6.zip') as zip_file:
    names = zip_file.namelist()
    first_member = names[0]
    with zip_file.open(first_member) as data_file:
        df = pd.read_csv(data_file, index_col=0)

PC6
INWONER
MAN
VROUW
INW_014
INW_1524
INW_2544
INW_4564
INW_65PL
P_NL_ACHTG
P_WE_MIG_A
P_NW_MIG_A
AANTAL_HH
TOTHH_EENP
TOTHH_MPZK
GEM_HH_GR
WONING
WONVOOR45
P_HUURWON
WOZWONING
UITKMINAOW

In [None]:
# Summary statistics for every column (numeric or not), one row per column.
df.describe(include='all').T

In [None]:
# Pairwise correlations between all columns except PC6.
variables = df.columns.drop('PC6')
data = df.loc[:, variables]
correlation_matrix = data.corr()

# Hide the upper triangle (diagonal included) so each pair is drawn only once.
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    mask=mask,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Matrix')
plt.show()


In [None]:
# Replace the MAN / VROUW columns with a single relative feature:
# VROUW divided by INWONER.
df = (
    df
    .assign(P_VROUW=df['VROUW'] / df['INWONER'])
    .drop(columns=['MAN', 'VROUW'])
)

In [None]:
# Number of missing values in each column.
df.isna().sum(axis=0)

In [None]:
# The three origin shares (P_NL_ACHTG, P_WE_MIG_A, P_NW_MIG_A) are treated here
# as percentages that add up to roughly 100 — TODO confirm against the data
# dictionary. When the shares that ARE known in a row already exceed 89, any
# share still missing in that row is set to 0.
#
# Every term is NaN-filled before summing: NaN propagates through `+` and
# `NaN > 89` is False, so leaving P_NL_ACHTG unfilled would exclude every row
# where it is missing, and the P_NL_ACHTG fill below could never change a value.
origin_cols = ['P_NL_ACHTG', 'P_WE_MIG_A', 'P_NW_MIG_A']
zero_fills = (
    df['P_NL_ACHTG'].fillna(0)
    + df['P_WE_MIG_A'].fillna(0)
    + df['P_NW_MIG_A'].fillna(0)
) > 89
for origin_col in origin_cols:
    df.loc[zero_fills, origin_col] = df.loc[zero_fills, origin_col].fillna(0)

In [None]:
# Every column that still contains missing values gets a companion 0/1
# "<col>_na" flag marking the originally-missing rows; the gaps themselves
# are then filled with 0.
na_columns = df.columns[df.isna().any()]
for col in na_columns:
    df[f'{col}_na'] = df[col].isna().astype(int)
    df[col] = df[col].fillna(0)

In [None]:
# Missing INWONER / AANTAL_HH values are imputed with 5 rather than 0.
# The generic imputation cell above has already replaced every NaN with 0 by
# the time this cell runs, so a plain fillna(5) finds nothing left to fill.
# The "<col>_na" indicator written by that cell is therefore used to locate
# the rows that were originally missing.
for count_col in ('INWONER', 'AANTAL_HH'):
    na_flag = count_col + '_na'
    if na_flag in df.columns:
        df.loc[df[na_flag] == 1, count_col] = 5
    # Still covers the case where this cell runs before the generic fill.
    df[count_col] = df[count_col].fillna(5)

In [None]:
# Binary target: True where a row's WOZWONING lies above the column mean.
# NOTE(review): the mean is taken after the earlier imputation cells have run,
# so rows whose WOZWONING was missing take part with their filled-in value —
# confirm that is intended.
y = df['WOZWONING'].gt(df['WOZWONING'].mean())

# Features: every column except the target and PC6; any NaN that is still
# present becomes -1.
# NOTE(review): a 'WOZWONING_na' indicator, if the imputation cell created one,
# is NOT dropped here and would reveal which targets were missing — verify.
x = df.drop(columns=['PC6', 'WOZWONING']).fillna(-1)

In [None]:
# Rescale every feature column with a min-max scaler; `x` is rebound to the
# scaled result.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed in a later cell, so test rows influence the scaling.
mms = MinMaxScaler()
x = mms.fit(x).transform(x)

Setting the random seeds

In [None]:
# Seed every random number generator the notebook relies on, so a
# Restart & Run All reproduces the same numbers.
SEED = 1
random.seed(SEED)
np.random.seed(SEED)
# torch keeps its own generator (used for the initial layer weights); the
# numpy / stdlib seeds above do not affect it.
torch.manual_seed(SEED)

In [None]:
# (rows, features) — the feature count has to equal the input width of the
# model's first layer (33).
np.shape(x)

In [None]:
# Convert features and labels to float32 tensors; the labels are reshaped to a
# single column so they line up with the model's one-column output.
features = torch.tensor(x, dtype=torch.float32)
labels = torch.tensor(y.values, dtype=torch.float32).reshape(-1, 1)

# Shuffled 70 % / 30 % train/test split with a fixed random_state.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    train_size=0.7,
    shuffle=True,
    random_state=0,
)

# Mini-batch loader over the training pairs (100 rows per batch, 4 workers).
# NOTE(review): no later cell visible here reads `loader`; training below is
# done full-batch on x_train / y_train.
train_pairs = list(zip(x_train, y_train))
loader = torch.utils.data.DataLoader(
    train_pairs,
    batch_size=100,
    num_workers=4,
)

In [None]:
class MyModel(nn.Module):
    """Binary classifier with a single hidden layer.

    Architecture: ``Linear -> ReLU -> Linear -> sigmoid``. The output has one
    column per input row, holding a value between 0 and 1.

    Args:
        n_features: Width of the input (number of feature columns). Defaults
            to 33, the value that used to be hard-coded.
        n_hidden: Width of the hidden layer. Defaults to ``n_features``, which
            reproduces the original 33 -> 33 -> 1 network.
    """

    def __init__(self, n_features=33, n_hidden=None):
        super().__init__()
        if n_hidden is None:
            n_hidden = n_features

        # Attribute names are unchanged so existing state_dicts still load.
        self.l1 = nn.Linear(n_features, n_hidden)
        self.l2 = nn.Linear(n_hidden, 1)

    def forward(self, x):
        """Map a ``(batch, n_features)`` tensor to a ``(batch, 1)`` tensor of sigmoid outputs."""
        hidden = nn.functional.relu(self.l1(x))
        return nn.functional.sigmoid(self.l2(hidden))

In [None]:
def train_model(iter, model, x_train, y_train, x_test, y_test, lr):
    """Fit ``model`` with full-batch Adam and record accuracy at every step.

    Each iteration runs one forward/backward pass over the whole of
    ``x_train`` with binary cross-entropy as the loss, applies one Adam
    update, and then multiplies the learning rate by 0.99.

    Args:
        iter: Number of optimisation steps to run.
        model: Module whose output is thresholded at 0.5 and passed to
            ``BCELoss``.
        x_train, y_train: Training inputs and targets.
        x_test, y_test: Held-out inputs and targets, used for evaluation only.
        lr: Initial Adam learning rate.

    Returns:
        ``(train_acc, test_acc)``: two lists holding one 0-dim tensor per
        iteration. Training accuracy comes from the forward pass made
        *before* that iteration's weight update; test accuracy is measured
        *after* it.
    """

    def _accuracy(predicted, target):
        # Share of rows where prediction and target fall on the same side of 0.5.
        hits = (predicted > 0.5) == (target > 0.5)
        return hits.sum() / len(target)

    loss_fn = torch.nn.BCELoss()
    adam = torch.optim.Adam(model.parameters(), lr=lr)
    lr_decay = torch.optim.lr_scheduler.ExponentialLR(adam, 0.99)

    train_acc, test_acc = [], []
    for _ in range(iter):
        predicted = model(x_train)
        train_acc.append(_accuracy(predicted, y_train))

        adam.zero_grad()
        loss_fn(predicted, y_train).backward()
        adam.step()
        lr_decay.step()

        # Evaluation only: no graph is needed for the held-out pass.
        with torch.no_grad():
            test_acc.append(_accuracy(model(x_test), y_test))

    return train_acc, test_acc

In [None]:
# Fresh, untrained network; re-running this cell discards any trained weights.
model = MyModel()

In [None]:
# 500 full-batch steps starting from a learning rate of 0.01; keeps the
# per-step accuracy histories for plotting.
train_acc, test_acc = train_model(
    iter=500,
    model=model,
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    lr=0.01,
)

In [None]:
# Learning curves: one accuracy value per training iteration for the
# training set and the held-out set.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(train_acc, label='train')
ax.plot(test_acc, label='test')
# Labels and a title so the figure can be read on its own.
ax.set_xlabel('epoch')
ax.set_ylabel('accuracy')
ax.set_title('Training vs. test accuracy')
# Trailing semicolon keeps the Legend repr out of the cell output.
ax.legend();

In [None]:
# Test accuracy after the last training iteration, shown with 3 significant digits.
f'The test accuracy is {test_acc[-1].item():.3}'