SSN 
Olga Krupa
Ewa Roszczyk


In [20]:
import torch, torchvision
from torchvision import transforms
import torch.nn as nn
import torch.optim as optim
import torch.functional as F
import numpy as np
import pandas as pd
import torch.utils.data as data
import seaborn as sns
import matplotlib.pyplot as plt

In [21]:
# Use the GPU when one is available; otherwise fall back to the CPU so that the
# later `.to(device)` calls do not fail on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [22]:
# Seed PyTorch's random number generators so that runs are repeatable.
TORCH_SEED = 42
# The default (CPU) generator was previously left unseeded.
torch.manual_seed(TORCH_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(TORCH_SEED)
    torch.cuda.manual_seed_all(TORCH_SEED)

# The attribute is spelled `deterministic`; the previous misspelling only created
# an unused attribute and left the real flag at its default.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [23]:
# Read the training table; the path is relative to the current working directory.
TRAIN_CSV_PATH = "train_data.csv"
train_data = pd.read_csv(TRAIN_CSV_PATH)

In [24]:
# Columns removed from the table before any features are built.
unused_columns = [
    "YrSold",
    "MonthSold",
    "N_FacilitiesNearBy(Total)",
    "N_SchoolNearBy(Total)",
]
train_data = train_data.drop(columns=unused_columns)

In [26]:
categorical_columns = [
    "HallwayType",
    "HeatingType",
    "AptManageType",
    "TimeToBusStop",
    "TimeToSubway",
    "SubwayStation",
]
# One-hot encode the categorical columns into their own frame.
categorical_values = pd.get_dummies(train_data[categorical_columns])

# Turn the price into a binary label: 1 when it exceeds the threshold, else 0.
# NOTE: this overwrites 'SalePrice', so re-running the cell on the already
# binarized column yields all zeros.
price_threshold = 300000
train_data['SalePrice'] = train_data['SalePrice'].gt(price_threshold).astype("int64")

In [27]:
# The raw categorical columns are no longer needed once their one-hot encoding
# lives in `categorical_values`.
train_data = train_data.drop(columns=categorical_columns)

In [28]:
# Fix NumPy's global RNG so the random train/validation split is repeatable.
SPLIT_SEED = 23
np.random.seed(SPLIT_SEED)

In [29]:
# Draw one uniform number per row; rows whose draw exceeds 0.3 go to the
# training split and the rest are held out. `train_indices` is a boolean mask.
split_draws = np.random.rand(len(train_data))
train_indices = split_draws > 0.3
train_indices

array([ True,  True,  True, ..., False,  True,  True])

In [30]:
def _to_float_tensor(array):
    """Wrap a NumPy array as a float32 torch tensor."""
    return torch.from_numpy(array).float()


table_values = train_data.values
dummy_values = categorical_values.values
held_out_mask = ~train_indices

# Column 0 of the table is used as the target; the remaining columns are features.
numerical_data = _to_float_tensor(table_values[train_indices, 1:])
categorical_data = _to_float_tensor(dummy_values[train_indices])
targets = _to_float_tensor(table_values[train_indices, 0])

test_numerical_data = _to_float_tensor(table_values[held_out_mask, 1:])
test_categorical_data = _to_float_tensor(dummy_values[held_out_mask])
test_targets = _to_float_tensor(table_values[held_out_mask, 0])

In [31]:
# Each dataset item is a (numerical features, one-hot features, target) triple.
train_tensors = (numerical_data, categorical_data, targets)
test_tensors = (test_numerical_data, test_categorical_data, test_targets)

train_dataset = data.TensorDataset(*train_tensors)
test_dataset = data.TensorDataset(*test_tensors)

In [32]:
def get_accuracy(model, data_loader):
    """Return the fraction of samples in ``data_loader`` that ``model`` gets right.

    The model is switched to evaluation mode and is left in that mode, so a
    caller that keeps training afterwards must call ``model.train()`` again.

    Args:
        model: module called as ``model(x, cat_x)``. An output greater than 0
            counts as the positive class.
        data_loader: iterable yielding ``(x, cat_x, labels)`` batches of tensors.

    Returns:
        ``correct / total`` as a float, or ``0.0`` if the loader yields no samples.
    """
    model.eval()

    # Move batches to the device that holds the model's weights instead of
    # relying on a notebook-level global. A model without parameters has no
    # device of its own, so its batches are left where they are.
    first_param = next(model.parameters(), None)
    target_device = None if first_param is None else first_param.device

    correct = 0
    total = 0
    # Evaluation needs no gradients; skipping the autograd graph saves memory.
    with torch.no_grad():
        for x, cat_x, labels in data_loader:
            if target_device is not None:
                x = x.to(target_device)
                cat_x = cat_x.to(target_device)
                labels = labels.to(target_device)
            output = model(x, cat_x)
            pred = output > 0
            correct += pred.eq(labels.view_as(pred)).sum().item()
            total += x.shape[0]

    # Guard against an empty loader rather than dividing by zero.
    return correct / total if total else 0.0

In [33]:
class House_classifier_embeddings(nn.Module):
    """Binary classifier over numerical features plus one-hot categorical features.

    The one-hot block first passes through a square linear layer followed by
    ``tanh``. The result is concatenated with the numerical features and fed to
    two hidden blocks (linear -> batch norm -> LeakyReLU -> dropout) of width 40
    and 20, and then to a single-unit linear output. The output is a raw score
    with no sigmoid applied.

    Args:
        num_numerical_features: width of the numerical input ``x``. When omitted
            it is read from the notebook global ``train_data`` as
            ``train_data.shape[1] - 1``.
        num_categorical_features: width of the one-hot input ``cat_x``. When
            omitted it is read from the notebook global ``categorical_data`` as
            ``categorical_data.shape[1]``.
    """

    def __init__(self, num_numerical_features=None, num_categorical_features=None):
        super().__init__()
        # Fall back to the notebook globals only when sizes are not given, so
        # existing `House_classifier_embeddings()` calls keep working while the
        # class can also be built without those globals.
        if num_categorical_features is None:
            num_categorical_features = categorical_data.shape[1]
        if num_numerical_features is None:
            num_numerical_features = train_data.shape[1] - 1

        self.emb_layer = nn.Linear(num_categorical_features, num_categorical_features)
        self.act_emb = nn.Tanh()
        self.layer1 = nn.Linear(num_numerical_features + num_categorical_features, 40)
        self.bn1 = nn.BatchNorm1d(40)
        self.act_1 = nn.LeakyReLU()
        self.d1 = nn.Dropout(0.5)
        self.layer2 = nn.Linear(40, 20)
        self.bn2 = nn.BatchNorm1d(20)
        self.act_2 = nn.LeakyReLU()
        self.d2 = nn.Dropout(0.5)
        self.layer3 = nn.Linear(20, 1)

    def forward(self, x, cat_x):
        """Compute one raw score per row.

        Args:
            x: numerical features, one row per sample.
            cat_x: one-hot categorical features, one row per sample.

        Returns:
            Tensor with one column holding the un-squashed score for each row.
        """
        cat_x_embedded = self.act_emb(self.emb_layer(cat_x))
        x = torch.cat([x, cat_x_embedded], dim=1)

        x = self.bn1(self.layer1(x))
        activation1 = self.d1(self.act_1(x))

        x = self.bn2(self.layer2(activation1))
        activation2 = self.d2(self.act_2(x))

        output = self.layer3(activation2)
        return output

In [34]:
# Training hyper-parameters.
NUM_EPOCHS = 250
TRAIN_BATCH_SIZE = 32
EVAL_BATCH_SIZE = 64
LEARNING_RATE = 0.0005

model = House_classifier_embeddings().to(device)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=EVAL_BATCH_SIZE, shuffle=False)
# The model outputs raw scores, so the sigmoid is folded into the loss.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Per-epoch history: epoch number, mean training loss, and both accuracies.
iters = []
losses = []
train_acc = []
val_acc = []
for n in range(NUM_EPOCHS):
    # get_accuracy() leaves the model in eval mode, so re-enable training
    # behaviour (dropout, batch-norm statistics) at the start of each epoch.
    model.train()
    epoch_losses = []
    for x, cat_x, labels in train_loader:
        x, cat_x, labels = x.to(device), cat_x.to(device), labels.to(device)

        optimizer.zero_grad()
        # Drop only the trailing unit dimension. A bare squeeze() would also
        # collapse a batch of size 1 to a 0-d tensor that no longer matches
        # `labels`.
        out = model(x, cat_x).squeeze(1)

        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()
        epoch_losses.append(loss.item())

    loss_mean = np.array(epoch_losses).mean()
    iters.append(n)
    losses.append(loss_mean)
    # Accuracy on the held-out split, reported as "test_acc" in the log line.
    test_acc = get_accuracy(model, test_loader)
    print(f"Epoch {n} loss {loss_mean:.3} test_acc: {test_acc:.3}")
    train_acc.append(get_accuracy(model, train_loader))
    val_acc.append(test_acc)


print("Final Training Accuracy: {}".format(train_acc[-1]))
print("Final Validation Accuracy: {}".format(val_acc[-1]))

Epoch 0 loss 0.672 test_acc: 0.751
Epoch 1 loss 0.556 test_acc: 0.858
Epoch 2 loss 0.477 test_acc: 0.862
Epoch 3 loss 0.425 test_acc: 0.868
Epoch 4 loss 0.399 test_acc: 0.866
Epoch 5 loss 0.375 test_acc: 0.866
Epoch 6 loss 0.365 test_acc: 0.855
Epoch 7 loss 0.35 test_acc: 0.865
Epoch 8 loss 0.334 test_acc: 0.869
Epoch 9 loss 0.328 test_acc: 0.865
Epoch 10 loss 0.323 test_acc: 0.884
Epoch 11 loss 0.316 test_acc: 0.876
Epoch 12 loss 0.318 test_acc: 0.865
Epoch 13 loss 0.318 test_acc: 0.854
Epoch 14 loss 0.312 test_acc: 0.911
Epoch 15 loss 0.3 test_acc: 0.903
Epoch 16 loss 0.303 test_acc: 0.915
Epoch 17 loss 0.294 test_acc: 0.904
Epoch 18 loss 0.3 test_acc: 0.915
Epoch 19 loss 0.283 test_acc: 0.915
Epoch 20 loss 0.295 test_acc: 0.921
Epoch 21 loss 0.304 test_acc: 0.918
Epoch 22 loss 0.29 test_acc: 0.916
Epoch 23 loss 0.288 test_acc: 0.922
Epoch 24 loss 0.283 test_acc: 0.914
Epoch 25 loss 0.279 test_acc: 0.915
Epoch 26 loss 0.272 test_acc: 0.918
Epoch 27 loss 0.283 test_acc: 0.922
Epoch 28

In [35]:
# Load the unlabeled test table and remove the same columns that were removed
# from the training table.
test_unused_columns = [
    "YrSold",
    "MonthSold",
    "N_FacilitiesNearBy(Total)",
    "N_SchoolNearBy(Total)",
]
test_data = pd.read_csv("test_data.csv").drop(columns=test_unused_columns)

In [36]:
categorical_columns_test = ["HallwayType", "HeatingType", "AptManageType", "TimeToBusStop", "TimeToSubway", "SubwayStation"]

# One-hot encode the test categoricals, then align the result to the dummy
# columns produced for the training table (`categorical_values`). Without this,
# a category that is absent from (or only present in) the test file would change
# the number or order of one-hot columns and silently misalign the model input.
# Missing categories become all-zero columns; categories unseen in training are
# dropped. The cast gives a single numeric dtype so `.values` is a plain array.
categorical_values_test = (
    pd.get_dummies(test_data[categorical_columns_test])
    .reindex(columns=categorical_values.columns, fill_value=0)
    .astype("float32")
)

# NOTE: `test_data` is rebound without the categorical columns, so this cell
# cannot be re-run without reloading `test_data` first.
test_data = test_data.drop(columns=categorical_columns_test)

numerical_data_test = torch.from_numpy(test_data.values)
categorical_data_test = torch.from_numpy(categorical_values_test.values)


In [37]:
# Put the model in evaluation mode explicitly (dropout off, batch norm using
# running statistics) rather than relying on an earlier cell having left it
# that way, and skip gradient tracking since this is inference only.
model.eval()
with torch.no_grad():
    preds = model(numerical_data_test.to(device).float(), categorical_data_test.to(device).float())

# Save the raw model outputs (one score per test row) to disk.
pd.DataFrame(preds.cpu().numpy()).to_csv("results.csv", index=False)

In [38]:
# Column '0' holds the raw model outputs that were written to results.csv.
results = pd.read_csv("results.csv")

# Column '1' is the predicted class: 1 for a positive score, 0 otherwise.
results['1'] = results['0'].gt(0).astype("int64")

results.to_csv("borowiki.csv", index=False)
results

Unnamed: 0,0,1
0,-10.147328,0
1,-9.173476,0
2,-4.406162,0
3,-3.876463,0
4,-1.481103,0
...,...,...
1762,-1.528491,0
1763,-9.326757,0
1764,-0.758648,0
1765,1.304965,1
