In [1]:
print('hello world')

hello world


In [1]:
import torch
import torch.nn as nn
import torch.utils.data as data
import pandas as pd
import numpy as np

In [2]:
train_data = pd.read_csv('train_data.csv')

print('HallwayType: ', train_data['HallwayType'].unique())
print('HeatingType', train_data['HeatingType'].unique())
print('AptManageType', train_data['AptManageType'].unique())
print('TimeToBusStop', train_data['TimeToBusStop'].unique())
print('TimeToSubway', train_data['TimeToSubway'].unique())
print('SubwayStation', train_data['SubwayStation'].unique())

df = train_data


HallwayType:  ['terraced' 'corridor' 'mixed']
HeatingType ['individual_heating' 'central_heating']
AptManageType ['management_in_trust' 'self_management']
TimeToBusStop ['5min~10min' '0~5min' '10min~15min']
TimeToSubway ['10min~15min' '5min~10min' '0-5min' '15min~20min' 'no_bus_stop_nearby']
SubwayStation ['Kyungbuk_uni_hospital' 'Daegu' 'Sin-nam' 'Myung-duk' 'Chil-sung-market'
 'Bangoge' 'Banwoldang' 'no_subway_nearby']


In [3]:

from sklearn.preprocessing import MinMaxScaler

# Separate the target variable
target = df['SalePrice']
df = df.drop('SalePrice', axis=1)

# Scale the target variable
target_scaler = MinMaxScaler()
target_scaled = target_scaler.fit_transform(target.values.reshape(-1, 1))

df['SalePrice_scaled'] = target_scaled

# One-hot encode the SubwayStation column
df = pd.get_dummies(df, columns=['SubwayStation'])
df = df.drop('SubwayStation_no_subway_nearby', axis=1)

# Map the TimeToSubway column to numerical values
time_to_subway_map = {'no_bus_stop_nearby': 0, '0-5min': 1, '5min~10min': 0.8, '10min~15min': 0.6, '15min~20min': 0.4}
df['TimeToSubway'] = df['TimeToSubway'].map(time_to_subway_map)

# Map the HallwayType column to numerical values
hallway_type_map = {'terraced': 0, 'mixed': 1, 'corridor': 2}
df['HallwayType'] = df['HallwayType'].map(hallway_type_map)

# Mapping for HeatingType column
heating_type_mapping = {'individual_heating': 0, 'central_heating': 1}
df['HeatingType'] = df['HeatingType'].map(heating_type_mapping)

# Mapping for AptManageType column
apt_manage_type_mapping = {'management_in_trust': 0, 'self_management': 1}
df['AptManageType'] = df['AptManageType'].map(apt_manage_type_mapping)

# Mapping for TimeToBusStop column
time_to_bus_stop_mapping = {'0~5min': 0, '5min~10min': 0.5, '10min~15min': 1.0}
df['TimeToBusStop'] = df['TimeToBusStop'].map(time_to_bus_stop_mapping)

# Scale the numerical features using MinMaxScaler
num_cols = ['YearBuilt', 'Size(sqf)', 'Floor', 'HallwayType', 'HeatingType', 'AptManageType', 'N_Parkinglot(Ground)', 'N_Parkinglot(Basement)', 'TimeToBusStop', 'N_manager', 'N_elevators', 'N_FacilitiesInApt', 'N_FacilitiesNearBy(Total)', 'N_SchoolNearBy(Total)']
scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Combine the preprocessed data with the target variable
df = pd.concat([target, df], axis=1)

In [4]:
df

Unnamed: 0,SalePrice,YearBuilt,Size(sqf),Floor,HallwayType,HeatingType,AptManageType,N_Parkinglot(Ground),N_Parkinglot(Basement),TimeToBusStop,...,N_FacilitiesNearBy(Total),N_SchoolNearBy(Total),SalePrice_scaled,SubwayStation_Bangoge,SubwayStation_Banwoldang,SubwayStation_Chil-sung-market,SubwayStation_Daegu,SubwayStation_Kyungbuk_uni_hospital,SubwayStation_Myung-duk,SubwayStation_Sin-nam
0,141592,0.756757,0.308356,0.047619,0.0,0.0,0.0,0.155680,0.139288,0.5,...,0.3750,0.529412,0.194867,0,0,0,0,1,0,0
1,51327,0.189189,0.205268,0.166667,1.0,0.0,1.0,0.112202,0.057532,0.0,...,0.7500,0.235294,0.031276,0,0,0,1,0,0,0
2,48672,0.189189,0.205268,0.119048,1.0,0.0,1.0,0.112202,0.057532,0.0,...,0.7500,0.235294,0.026464,0,0,0,1,0,0,0
3,380530,0.756757,0.872389,0.166667,0.0,0.0,0.0,0.349229,0.405753,0.0,...,0.1875,0.411765,0.627907,0,0,0,0,0,0,1
4,78318,0.378378,0.231153,0.023810,0.5,0.0,1.0,0.199158,0.059803,0.5,...,0.5625,0.823529,0.080193,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4119,570796,0.783784,0.814260,0.547619,0.0,0.0,0.0,0.000000,0.961393,0.0,...,0.5625,0.588235,0.972735,0,0,0,0,1,0,0
4120,307079,1.000000,0.231153,0.500000,0.0,0.0,0.0,0.143058,0.302801,0.0,...,0.4375,0.647059,0.494788,0,0,0,1,0,0,0
4121,357522,0.783784,0.332879,0.452381,0.0,0.0,0.0,0.000000,0.961393,0.0,...,0.5625,0.588235,0.586208,0,0,0,0,1,0,0
4122,312389,0.000000,0.541326,0.000000,1.0,0.0,1.0,0.122020,0.000000,0.0,...,0.4375,0.647059,0.504411,0,0,0,0,1,0,0


In [5]:
from sklearn.model_selection import train_test_split

# Separate the target variable
y = df['SalePrice_scaled']
X = df.drop(['SalePrice', 'SalePrice_scaled'], axis=1)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
train_dataset = data.TensorDataset(torch.tensor(X_train.values, dtype=torch.float32), torch.tensor(y_train.values, dtype=torch.float32))
val_dataset = data.TensorDataset(torch.tensor(X_val.values, dtype=torch.float32), torch.tensor(y_val.values, dtype=torch.float32))

In [7]:
def calc_accuracy(valloader, model):
    correct = 0
    total = 0
    model.eval()
    with torch.no_grad():
        for data in valloader:
            inputs, labels = data
            outputs = model(inputs)
            outputs_unscaled = target_scaler.inverse_transform(outputs)
            labels_unscaled = target_scaler.inverse_transform(labels.reshape(-1, 1))
            cheap_labels = labels_unscaled <= 100000
            cheap_outputs = outputs_unscaled <= 100000
            average_labels = (labels_unscaled > 100000) & (labels_unscaled <= 350000)
            average_outputs = (outputs_unscaled > 100000) & (outputs_unscaled <= 350000)
            expensive_labels = labels_unscaled > 350000
            expensive_outputs = outputs_unscaled > 350000
            cheap_correct = (cheap_labels * cheap_outputs).sum().item()
            average_correct = (average_labels * average_outputs).sum().item()
            expensive_correct = (expensive_labels * expensive_outputs).sum().item()
            correct += cheap_correct + average_correct + expensive_correct
            total += labels.size(0)
    return correct / total

In [8]:
# training loop
def train_epoch(dataloader, model, loss_fn, optimizer):
    avg_loss = 0
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X)
        pred = pred.reshape(-1)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss = loss.item()
        avg_loss += loss
    
    avg_loss /= len(dataloader)

    return avg_loss

In [9]:
from torch.nn import Linear, ReLU, BatchNorm1d, LeakyReLU, Dropout


train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True
)

val_data_loader = torch.utils.data.DataLoader(
    val_dataset,
    batch_size=32,
    shuffle=False
)

# sequential model
model = nn.Sequential(
    nn.Linear(22, 128),
    nn.ReLU(),
    nn.Linear(128, 128),
    nn.ReLU(),
    nn.Linear(128, 1)
)

# loss function
loss_fn = nn.MSELoss()

# optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, threshold=0.001, mode='min', factor=0.5, patience=5, verbose=True)

In [10]:
# train the model
epochs = 100
for t in range(epochs):
    avg_epoch_loss = train_epoch(train_data_loader, model, loss_fn, optimizer)
    acc = calc_accuracy(val_data_loader, model)
    scheduler.step(avg_epoch_loss)
    print(f"Epoch {t+1}. Training loss: {avg_epoch_loss:.4f}. Validation accuracy: {acc:.4f}", end="\r")
print("Done!")


Epoch 00053: reducing learning rate of group 0 to 5.0000e-05.
Epoch 00060: reducing learning rate of group 0 to 2.5000e-05.
Epoch 00068: reducing learning rate of group 0 to 1.2500e-05.
Epoch 00083: reducing learning rate of group 0 to 6.2500e-06.
Epoch 00093: reducing learning rate of group 0 to 3.1250e-06.
Done! 100. Training loss: 0.0059. Validation accuracy: 0.8448


In [11]:
all_train_dataset = data.TensorDataset(torch.tensor(X.values, dtype=torch.float32), torch.tensor(y.values, dtype=torch.float32))

In [12]:
all_train_dataloader = torch.utils.data.DataLoader(
    all_train_dataset,
    batch_size=32,
    shuffle=True
)

In [18]:
finalModel = nn.Sequential(
    nn.Linear(22, 128),
    nn.ReLU(),
    nn.Linear(128, 128),
    nn.ReLU(),
    nn.Linear(128, 1)
)

loss_fn = nn.MSELoss()

# optimizer
optimizer = torch.optim.Adam(finalModel.parameters(), lr=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, threshold=0.001, mode='min', factor=0.5, patience=5, verbose=True)

In [19]:
epochs = 100
for t in range(epochs):
    avg_epoch_loss = train_epoch(all_train_dataloader, finalModel, loss_fn, optimizer)
    scheduler.step(avg_epoch_loss)
    print(f"Epoch {t+1}. Training loss: {avg_epoch_loss:.4f}", end="\r")
print("Done!")


Epoch 00078: reducing learning rate of group 0 to 5.0000e-05.
Epoch 00087: reducing learning rate of group 0 to 2.5000e-05.
Epoch 00099: reducing learning rate of group 0 to 1.2500e-05.
Done! 100. Training loss: 0.0058


In [21]:
# same but without the target variable
df_test = pd.read_csv('test_data.csv')

In [22]:
# Preprocess the test data
# One-hot encode the SubwayStation column
df_test = pd.get_dummies(df_test, columns=['SubwayStation'])
df_test = df_test.drop('SubwayStation_no_subway_nearby', axis=1)

# Map the TimeToSubway column to numerical values
df_test['TimeToSubway'] = df_test['TimeToSubway'].map(time_to_subway_map)

# Map the HallwayType column to numerical values
df_test['HallwayType'] = df_test['HallwayType'].map(hallway_type_map)

# Mapping for HeatingType column
df_test['HeatingType'] = df_test['HeatingType'].map(heating_type_mapping)

# Mapping for AptManageType column
df_test['AptManageType'] = df_test['AptManageType'].map(apt_manage_type_mapping)

# Mapping for TimeToBusStop column
df_test['TimeToBusStop'] = df_test['TimeToBusStop'].map(time_to_bus_stop_mapping)

# Scale the numerical features using MinMaxScaler
num_cols = ['YearBuilt', 'Size(sqf)', 'Floor', 'HallwayType', 'HeatingType', 'AptManageType', 'N_Parkinglot(Ground)', 'N_Parkinglot(Basement)', 'TimeToBusStop', 'N_manager', 'N_elevators', 'N_FacilitiesInApt', 'N_FacilitiesNearBy(Total)', 'N_SchoolNearBy(Total)']
scaler = MinMaxScaler()
df_test[num_cols] = scaler.fit_transform(df_test[num_cols])


In [23]:
test_dataset = data.TensorDataset(torch.tensor(df_test.values, dtype=torch.float32))

In [24]:
test_data_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False
)

In [29]:
def predict(dataloader, model):
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch, (X,) in enumerate(dataloader):
            pred = model(X)
            pred_unscaled = target_scaler.inverse_transform(pred)
            pred = pred_unscaled.reshape(-1)            
            predictions.extend(pred.tolist())
    return predictions

In [30]:
predictions = predict(test_data_loader, finalModel)

In [32]:
result = [0 if x <= 100000 else 1 if x <= 350000 else 2 for x in predictions]

In [33]:
# save the predictions to a csv file without the index
pd.DataFrame(result).to_csv('result.csv', index=False, header=False)

[1, 1, 2, 1, 1, 2, 1, 0, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 0, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 0, 1, 1, 0, 1, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 2, 2, 1, 1, 1, 0, 1, 0, 2, 2, 1, 2, 0, 1, 1, 0, 1, 1, 0, 0, 0, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 0, 0, 1, 1, 2, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 2, 1, 1, 2, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 2, 0, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 0, 2, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 2, 1, 0, 1, 1, 0, 1, 2, 2, 1, 1, 0, 0, 2, 2, 2, 1, 