In [1]:
# Smoke-test cell: prints a fixed greeting.
print('hello world')

hello world


In [1]:
import torch
import torch.nn as nn
import torch.utils.data as data
import pandas as pd
import numpy as np

In [2]:
# Load the raw listing table from disk.
train_data = pd.read_csv('train_data.csv')

# Show the distinct values of every categorical column so the encodings used
# during preprocessing can be checked against the data.
# Each pair is (label printed in front of the values, column to inspect).
for _label, _column in (
    ('HallwayType: ', 'HallwayType'),
    ('HeatingType', 'HeatingType'),
    ('AptManageType', 'AptManageType'),
    ('TimeToBusStop', 'TimeToBusStop'),
    ('TimeToSubway', 'TimeToSubway'),
    ('SubwayStation', 'SubwayStation'),
):
    print(_label, train_data[_column].unique())

# `df` is the working name used by the following cells; it refers to the same
# object as `train_data` (no copy is made here).
df = train_data


HallwayType:  ['terraced' 'corridor' 'mixed']
HeatingType ['individual_heating' 'central_heating']
AptManageType ['management_in_trust' 'self_management']
TimeToBusStop ['5min~10min' '0~5min' '10min~15min']
TimeToSubway ['10min~15min' '5min~10min' '0-5min' '15min~20min' 'no_bus_stop_nearby']
SubwayStation ['Kyungbuk_uni_hospital' 'Daegu' 'Sin-nam' 'Myung-duk' 'Chil-sung-market'
 'Bangoge' 'Banwoldang' 'no_subway_nearby']


In [3]:

from sklearn.preprocessing import MinMaxScaler

# Build the model-ready frame `df` from the raw `train_data`.
# The cell deliberately starts from `train_data` rather than from a previously
# processed `df`, so it can be re-run any number of times with the same result.
#
# NOTE(review): both scalers below are fitted on the full table, before the
# train/validation split performed later in the notebook, so validation rows
# influence the scaling. Consider fitting them on the training split only.

# Separate the target variable
target = train_data['SalePrice']
df = train_data.drop('SalePrice', axis=1)

# Scale the target variable to [0, 1]. `target_scaler` is kept so scaled
# predictions can be converted back to prices with `inverse_transform`.
target_scaler = MinMaxScaler()
target_scaled = target_scaler.fit_transform(target.values.reshape(-1, 1))

# `target_scaled` has shape (n, 1); flatten it to assign a plain 1-D column.
df['SalePrice_scaled'] = target_scaled.ravel()

# One-hot encode the SubwayStation column.
# An explicit integer dtype keeps the indicator columns numeric on every pandas
# version (newer releases default to bool, which turns `df.values` into an
# object array once mixed with float columns).
df = pd.get_dummies(df, columns=['SubwayStation'], dtype='uint8')
# 'no_subway_nearby' is the reference category: all station indicators are 0.
df = df.drop('SubwayStation_no_subway_nearby', axis=1)

# Map the TimeToSubway column to numerical values
time_to_subway_map = {'no_bus_stop_nearby': 0, '0-5min': 1, '5min~10min': 0.8, '10min~15min': 0.6, '15min~20min': 0.4}
df['TimeToSubway'] = df['TimeToSubway'].map(time_to_subway_map)

# Map the HallwayType column to numerical values
hallway_type_map = {'terraced': 0, 'mixed': 1, 'corridor': 2}
df['HallwayType'] = df['HallwayType'].map(hallway_type_map)

# Mapping for HeatingType column
heating_type_mapping = {'individual_heating': 0, 'central_heating': 1}
df['HeatingType'] = df['HeatingType'].map(heating_type_mapping)

# Mapping for AptManageType column
apt_manage_type_mapping = {'management_in_trust': 0, 'self_management': 1}
df['AptManageType'] = df['AptManageType'].map(apt_manage_type_mapping)

# Mapping for TimeToBusStop column
time_to_bus_stop_mapping = {'0~5min': 0, '5min~10min': 0.5, '10min~15min': 1.0}
df['TimeToBusStop'] = df['TimeToBusStop'].map(time_to_bus_stop_mapping)

# `.map` turns any category that is missing from its dictionary into NaN
# without complaint; fail loudly here instead of training on NaN features.
mapped_cols = ['TimeToSubway', 'HallwayType', 'HeatingType', 'AptManageType', 'TimeToBusStop']
unmapped = df[mapped_cols].isna().sum()
assert not unmapped.any(), f"Unmapped or missing categories found:\n{unmapped[unmapped > 0]}"

# Scale the numerical features using MinMaxScaler.
# TimeToSubway is not listed: its mapping above already lies in [0, 1].
num_cols = ['YearBuilt', 'Size(sqf)', 'Floor', 'HallwayType', 'HeatingType', 'AptManageType', 'N_Parkinglot(Ground)', 'N_Parkinglot(Basement)', 'TimeToBusStop', 'N_manager', 'N_elevators', 'N_FacilitiesInApt', 'N_FacilitiesNearBy(Total)', 'N_SchoolNearBy(Total)']
scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Combine the preprocessed data with the (unscaled) target variable so that
# both 'SalePrice' and 'SalePrice_scaled' are available in `df`.
df = pd.concat([target, df], axis=1)

In [4]:
# Display the preprocessed frame (a bare last expression is rendered as the cell output).
df

Unnamed: 0,SalePrice,YearBuilt,Size(sqf),Floor,HallwayType,HeatingType,AptManageType,N_Parkinglot(Ground),N_Parkinglot(Basement),TimeToBusStop,...,N_FacilitiesNearBy(Total),N_SchoolNearBy(Total),SalePrice_scaled,SubwayStation_Bangoge,SubwayStation_Banwoldang,SubwayStation_Chil-sung-market,SubwayStation_Daegu,SubwayStation_Kyungbuk_uni_hospital,SubwayStation_Myung-duk,SubwayStation_Sin-nam
0,141592,0.756757,0.308356,0.047619,0.0,0.0,0.0,0.155680,0.139288,0.5,...,0.3750,0.529412,0.194867,0,0,0,0,1,0,0
1,51327,0.189189,0.205268,0.166667,1.0,0.0,1.0,0.112202,0.057532,0.0,...,0.7500,0.235294,0.031276,0,0,0,1,0,0,0
2,48672,0.189189,0.205268,0.119048,1.0,0.0,1.0,0.112202,0.057532,0.0,...,0.7500,0.235294,0.026464,0,0,0,1,0,0,0
3,380530,0.756757,0.872389,0.166667,0.0,0.0,0.0,0.349229,0.405753,0.0,...,0.1875,0.411765,0.627907,0,0,0,0,0,0,1
4,78318,0.378378,0.231153,0.023810,0.5,0.0,1.0,0.199158,0.059803,0.5,...,0.5625,0.823529,0.080193,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4119,570796,0.783784,0.814260,0.547619,0.0,0.0,0.0,0.000000,0.961393,0.0,...,0.5625,0.588235,0.972735,0,0,0,0,1,0,0
4120,307079,1.000000,0.231153,0.500000,0.0,0.0,0.0,0.143058,0.302801,0.0,...,0.4375,0.647059,0.494788,0,0,0,1,0,0,0
4121,357522,0.783784,0.332879,0.452381,0.0,0.0,0.0,0.000000,0.961393,0.0,...,0.5625,0.588235,0.586208,0,0,0,0,1,0,0
4122,312389,0.000000,0.541326,0.000000,1.0,0.0,1.0,0.122020,0.000000,0.0,...,0.4375,0.647059,0.504411,0,0,0,0,1,0,0


In [4]:
from sklearn.model_selection import train_test_split

# Features are every column except the raw and the scaled sale price;
# the regression target is the scaled sale price.
X = df.drop(columns=['SalePrice', 'SalePrice_scaled'])
y = df['SalePrice_scaled']

# Hold out 20% of the rows for validation; the fixed random_state makes the
# split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [5]:
def _as_float_tensor(frame):
    """Return the values of a pandas DataFrame/Series as a float32 tensor."""
    return torch.tensor(frame.values, dtype=torch.float32)


# Pair each feature matrix with its target vector as a TensorDataset.
train_dataset = data.TensorDataset(_as_float_tensor(X_train), _as_float_tensor(y_train))
val_dataset = data.TensorDataset(_as_float_tensor(X_val), _as_float_tensor(y_val))

In [42]:
def calc_accuracy(valloader, model, scaler=None, thresholds=(100000, 350000)):
    """Return the fraction of samples whose predicted price lands in the right price class.

    Predictions and labels are converted back to unscaled prices with
    ``scaler.inverse_transform`` and then bucketed into classes delimited by
    ``thresholds``. With the default ``(100000, 350000)`` the classes are
    cheap (``<= 100000``), average (``> 100000`` and ``<= 350000``) and
    expensive (``> 350000``). A sample counts as correct when prediction and
    label fall into the same class.

    Args:
        valloader: Iterable yielding ``(inputs, labels)`` batches of tensors.
        model: Module mapping ``inputs`` to one prediction per sample. It is
            switched to eval mode and left in that mode.
        scaler: Object with an ``inverse_transform`` method accepting an
            ``(n, 1)`` array. Defaults to the notebook-level ``target_scaler``.
        thresholds: Increasing upper bounds (inclusive) of all classes except
            the last one, in unscaled price units.

    Returns:
        Accuracy in ``[0, 1]`` as a float, or ``nan`` when ``valloader``
        yields no samples (instead of raising ``ZeroDivisionError``).
    """
    if scaler is None:
        scaler = target_scaler
    bounds = np.asarray(thresholds, dtype=float)

    correct = 0
    total = 0
    model.eval()
    with torch.no_grad():
        for inputs, labels in valloader:
            outputs = model(inputs)
            # Hand plain CPU NumPy arrays to the scaler rather than relying on
            # implicit tensor -> array conversion (which fails off-CPU).
            preds = scaler.inverse_transform(outputs.detach().cpu().numpy().reshape(-1, 1))
            truth = scaler.inverse_transform(labels.detach().cpu().numpy().reshape(-1, 1))

            # right=True gives bins of the form (lower, upper], matching the
            # "<= threshold" class boundaries described above.
            pred_class = np.digitize(preds, bounds, right=True)
            true_class = np.digitize(truth, bounds, right=True)

            # NaN belongs to no class, so a NaN prediction or label is never correct.
            comparable = ~(np.isnan(preds) | np.isnan(truth))
            correct += int(((pred_class == true_class) & comparable).sum())
            total += labels.size(0)

    if total == 0:
        return float("nan")
    return correct / total

In [58]:
# training loop
def train_epoch(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    The model is put in training mode. For every ``(features, targets)`` batch
    the predictions are flattened to 1-D, the loss is computed, gradients are
    reset and back-propagated, and the optimizer takes one step.

    Returns:
        The sum of the per-batch loss values divided by ``len(dataloader)``.
    """
    model.train()
    batch_losses = []
    for features, targets in dataloader:
        # Forward pass; flatten so predictions line up with the 1-D targets.
        predictions = model(features).reshape(-1)
        batch_loss = loss_fn(predictions, targets)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(dataloader)

In [59]:
from torch.nn import Linear, ReLU, BatchNorm1d

# Seed PyTorch's global RNG so weight initialisation and the shuffling order of
# the training loader are the same on every run of this cell.
torch.manual_seed(42)

# Mini-batch loaders: the training set is reshuffled each epoch, the
# validation set is read in its stored order.
train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True
)

val_data_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=32,
    shuffle=False
)

# Width of the network input, read from the feature tensor of the training
# dataset so the model follows any change in the number of feature columns.
n_features = train_dataset.tensors[0].shape[1]

# sequential model: three hidden blocks (Linear -> BatchNorm -> ReLU) of
# decreasing width, followed by a single linear output unit for regression.
model = nn.Sequential(
    Linear(n_features, 128),
    BatchNorm1d(128),
    ReLU(),
    Linear(128, 64),
    BatchNorm1d(64),
    ReLU(),
    Linear(64, 32),
    BatchNorm1d(32),
    ReLU(),
    Linear(32, 1)
)

# loss function: mean squared error
loss_fn = nn.MSELoss()

# optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

In [60]:
# train the model
# Each epoch runs one optimisation pass over the training loader, then scores
# the model on the validation loader and prints both numbers.
epochs = 30
for t in range(epochs):
    # Mean per-batch training loss for this epoch.
    avg_epoch_loss = train_epoch(train_data_loader, model, loss_fn, optimizer)
    # Fraction of validation samples whose predicted price falls in the same
    # price bucket as the true price (see calc_accuracy).
    acc = calc_accuracy(val_data_loader, model)
    print(f"Epoch {t+1}. Training loss: {avg_epoch_loss:.3f}. Validation accuracy: {acc:.3f}")
print("Done!")


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 1. Training loss: 0.135. Validation accuracy: 0.712


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 2. Training loss: 0.037. Validation accuracy: 0.781


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 3. Training loss: 0.019. Validation accuracy: 0.788


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 4. Training loss: 0.016. Validation accuracy: 0.787


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 5. Training loss: 0.013. Validation accuracy: 0.804


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 6. Training loss: 0.014. Validation accuracy: 0.784


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 7. Training loss: 0.013. Validation accuracy: 0.832


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 8. Training loss: 0.013. Validation accuracy: 0.827


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 9. Training loss: 0.013. Validation accuracy: 0.840


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 10. Training loss: 0.011. Validation accuracy: 0.801


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 11. Training loss: 0.012. Validation accuracy: 0.825


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 12. Training loss: 0.011. Validation accuracy: 0.828


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 13. Training loss: 0.012. Validation accuracy: 0.818


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 14. Training loss: 0.011. Validation accuracy: 0.818


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 15. Training loss: 0.011. Validation accuracy: 0.806


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 16. Training loss: 0.011. Validation accuracy: 0.802


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 17. Training loss: 0.011. Validation accuracy: 0.813


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 18. Training loss: 0.010. Validation accuracy: 0.816


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 19. Training loss: 0.009. Validation accuracy: 0.805


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 20. Training loss: 0.010. Validation accuracy: 0.841


KeyboardInterrupt: 