In [65]:
import io
import torch
import torchvision
import torch.nn as nn
import pandas as pd
import torch.nn.functional as F
import seaborn as sns
from torchvision.datasets.utils import download_url
from torch.utils.data import DataLoader, TensorDataset, random_split
import copy
from collections import Counter
import numpy as np

In [66]:
train_headers = ['price', 'type_of_destination', 'rent', 'ownership', 'price_per_meter', 'url', 'number_of_rooms', 'empty1', 'meters', 'used', 'max_floor', 'type_of_building', 'empty2', 'empty3', 'description', 'floor', 'empty4', 'heating', 'windows_type', 'built_in_year', 'empty5', 'date', 'type_of_material', 'ad', 'empty6', 'additionals'] 
train_dataframe_from_file = pd.read_csv('./train/train.tsv', names = train_headers, sep='\t')

test_headers = ['type_of_destination', 'rent', 'ownership', 'price_per_meter', 'url', 'number_of_rooms', 'empty1', 'meters', 'used', 'max_floor', 'type_of_building', 'empty2', 'empty3', 'description', 'floor', 'empty4', 'heating', 'windows_type', 'built_in_year', 'empty5', 'date', 'type_of_material', 'ad', 'empty6', 'additionals'] 
dev_dataframe_from_file = pd.read_csv('./dev-0/in.tsv', names = test_headers, sep='\t')
test_dataframe_from_file = pd.read_csv('./test-A/in.tsv', names = test_headers, sep='\t')

expected_test_header = ['price']
expected_dev_dataframe_from_file = pd.read_csv('./dev-0/expected.tsv', names = expected_test_header, sep='\t')
expected_dev_dataframe_from_file['price'] = expected_dev_dataframe_from_file['price'].astype(float)

In [67]:
train_dataframe = copy.copy(train_dataframe_from_file)
dev_dataframe = copy.copy(dev_dataframe_from_file)
dev_expected_dataframe = copy.copy(expected_dev_dataframe_from_file)
test_dataframe = copy.copy(test_dataframe_from_file)


In [68]:
def preprocesing_datasets(data, is_train_set=False):    
    # zamiana str 'zł' na liczby
    data.drop(['url', 'ad', 
                    'empty1', 'empty2', 'empty3', 'empty4', 'empty5', 'empty6',
                    'date', 'built_in_year', 'description', 'additionals'], axis=1, inplace=True)
    data['rent'] = data['rent'].replace({'zł': ''}, regex=True)
    data['rent'] = data['rent'].replace({' ': ''}, regex=True)
  
    # liczby na float
    data['price_per_meter'] = data['price_per_meter'].astype(float)
    data['meters'] = data['meters'].astype(str)
    data['meters'] = data['meters'] = data['meters'].astype(str) 
    data['meters'] = data['meters'].replace({' ': ''}, regex=True)
    data['meters'] = data['meters'].astype(float)
    
    mapping_type_of_destination = [(3, 'do zamieszkania'), (2, 'do wykończenia'), (1, 'do remontu'), (0, 'nan')]

    data['type_of_destination'] = data['type_of_destination'].replace('do zamieszkania', 3)
    data['type_of_destination'] = data['type_of_destination'].replace('do wykończenia', 2)
    data['type_of_destination'] = data['type_of_destination'].replace('do remontu', 1)
    data['type_of_destination'] = data['type_of_destination'].fillna(0)

    data['type_of_destination'] = data['type_of_destination'].astype(int)
    data['rent'] = data['rent'].fillna(0)
    data['max_floor'] = data['max_floor'].fillna(0)


    mapping_type_of_building = [
        ('blok', 3), ('szeregowiec', 5), ('kamienica', 4), ('nan', 0), ('apartamentowiec', 7), ('dom wolnostojący', 6),
        ('plomba', 1), ('loft', 2)]

    data['type_of_building'] = data['type_of_building'].fillna(0)
    data['type_of_building'] = data['type_of_building'].replace('plomba', 1)
    data['type_of_building'] = data['type_of_building'].replace('loft', 2)
    data['type_of_building'] = data['type_of_building'].replace('blok', 3)
    data['type_of_building'] = data['type_of_building'].replace('kamienica', 4)
    data['type_of_building'] = data['type_of_building'].replace('szeregowiec', 5)
    data['type_of_building'] = data['type_of_building'].replace('dom wolnostojący', 6)
    data['type_of_building'] = data['type_of_building'].replace('apartamentowiec', 7)


    mapping_ownership = [
        ('spółdzielcze własnościowe', 3), ('pełna własność', 4), ('nan', 0), ('spółdzielcze wł. z KW', 2),
        ('udział', 1)]

    data['ownership'] = data['ownership'].replace('pełna własność', 4)
    data['ownership'] = data['ownership'].replace('spółdzielcze własnościowe', 3)
    data['ownership'] = data['ownership'].replace('spółdzielcze wł. z KW', 2)
    data['ownership'] = data['ownership'].replace('udział', 1)
    data['ownership'] = data['ownership'].fillna(0)


    mapping_heating = [
        ('gazowe', 6), ('miejskie', 4), ('nan', 0), ('inne', 1), ('elektryczne', 5), ('kotłownia', 2),
        ('piece kaflowe', 3)]

    data['heating'] = data['heating'].replace('gazowe', 6)
    data['heating'] = data['heating'].replace('elektryczne', 5)
    data['heating'] = data['heating'].replace('miejskie', 4)
    data['heating'] = data['heating'].replace('piece kaflowe', 3)
    data['heating'] = data['heating'].replace('kotłownia', 2)
    data['heating'] = data['heating'].replace('inne', 1)
    data['heating'] = data['heating'].fillna(0)


    mapping_windows_type = [('aluminiowe', 3), ('drewniane', 2), ('plastikowe', 1), ('nan', 0)]

    data['windows_type'] = data['windows_type'].replace('aluminiowe', 3)
    data['windows_type'] = data['windows_type'].replace('drewniane', 2)
    data['windows_type'] = data['windows_type'].replace('plastikowe', 1)
    
    data['windows_type'] = data['windows_type'].fillna(0)


    mapping_type_of_material = [ 
        ('cegła', 8),('wielka płyta', 1), ('silikat', 6), ('pustak', 5), ('żelbet', 4), ('beton', 3),
        ('beton komórkowy', 2), ('inne', 7), ('nan', 0)]

    data['type_of_material'] = data['type_of_material'].replace('cegła', 7)
    data['type_of_material'] = data['type_of_material'].replace('silikat', 6)
    data['type_of_material'] = data['type_of_material'].replace('pustak', 5)
    data['type_of_material'] = data['type_of_material'].replace('żelbet', 4)
    data['type_of_material'] = data['type_of_material'].replace('beton', 3)
    data['type_of_material'] = data['type_of_material'].replace('beton komórkowy', 2)
    data['type_of_material'] = data['type_of_material'].replace('inne', 2)
    data['type_of_material'] = data['type_of_material'].replace('wielka płyta', 1)
    data['type_of_material'] = data['type_of_material'].fillna(0)


    mapping_used = [('wtórny', 1), ('pierwotny',  2)]

    data['used'] = data['used'].replace('pierwotny', 2)
    data['used'] = data['used'].replace('wtórny', 1)
    data['floor'] = data['floor'].replace('> 10',  12)
    data['floor'] = data['floor'].replace('poddasze', 11)
    data['floor'] = data['floor'].replace('suterena', -1)
    data['floor'] = data['floor'].replace('parter', 0)
    data['floor'] = data['floor'].fillna(-2)

    data['number_of_rooms'] = data['number_of_rooms'].replace('więcej niż 10',  11)
    data['floor'] = data['floor'].astype(int)
    data['rent'] = data['rent'].astype(float)
    data['number_of_rooms'] = data['number_of_rooms'].astype(int)
    data['used'] = data['used'].astype(int)
    data['type_of_material'] = data['type_of_material'].astype(int)
    data['windows_type'] = data['windows_type'].astype(int)
    data['heating'] = data['heating'].astype(int)
    data['ownership'] = data['ownership'].astype(int)
    data['type_of_building'] = data['type_of_building'].astype(int)
    data['type_of_destination'] = data['type_of_destination'].astype(int) 
    data['max_floor'] = data['max_floor'].astype(int)
    
    if is_train_set is True:
        data['price'] = data['price'].astype(float)
    return data

In [69]:
# Preprocesing train_dataframe
training_dataframe = preprocesing_datasets(train_dataframe, is_train_set=True)

In [70]:
# Preprocesing dev_dataframe
dev_dataframe = preprocesing_datasets(dev_dataframe)

In [71]:
# Preprocesing test_dataframe
test_dataframe = preprocesing_datasets(test_dataframe)

In [72]:
t_num_rows = len(train_dataframe)
print(F"Train set liczba wierszy: {t_num_rows}")

d_num_rows = len(dev_dataframe)
print(F"Dev set liczba wierszy: {d_num_rows}")

test_num_rows = len(test_dataframe)
print(F"Test set liczba wierszy: {test_num_rows}")

Train set liczba wierszy: 2547
Dev set liczba wierszy: 462
Test set liczba wierszy: 418


In [73]:
input_cols = ['rent','price_per_meter', 'number_of_rooms', 'meters', 'max_floor', 'floor']
categorical_cols = ['type_of_destination', 'ownership', 'used', 'type_of_building', 'heating', 'windows_type', 'type_of_material']
output_cols = ['price']

In [74]:
def dataframe_to_arrays(dataframe):
    # Make a copy of the original dataframe
    dataframe1 = dataframe.copy(deep=True)
    # Convert non-numeric categorical columns to numbers
    
    for col in categorical_cols:
        dataframe1[col] = dataframe1[col].astype('category').cat.codes
        
    # Extract input & outupts as numpy arrays
    inputs_array = dataframe1[input_cols].to_numpy()
    targets_array = dataframe1[output_cols].to_numpy()
    return inputs_array, targets_array

In [75]:
def test_dataframe_to_arrays(dataframe):
    # Make a copy of the original dataframe
    dataframe1 = dataframe.copy(deep=True)
    # Convert non-numeric categorical columns to numbers
    
    for col in categorical_cols:
        dataframe1[col] = dataframe1[col].astype('category').cat.codes
        
    # Extract input & outupts as numpy arrays
    inputs_array = dataframe1[input_cols].to_numpy()
    return inputs_array

In [76]:
# Create train input and traget arrays 
train_inputs_array, train_targets_array = dataframe_to_arrays(train_dataframe)

In [77]:
# dev set
dev_dataframe['price'] = dev_expected_dataframe['price'] 
dev_inputs_array, dev_targets_array = dataframe_to_arrays(dev_dataframe)

In [78]:
def check_len(input_array, target_array):
    current_len = len(input_array[0])
    for num, i in enumerate(input_array):
        if len(i) != current_len:
            print('break INPUT:', num)
            break
            
    current_len = len(target_array[0])
    for num, i in enumerate(target_array):
        if len(i) != current_len:
            print('break TARGET: ', num)
            break
    
    if len(input_array) != len(target_array):
        print(f"{len(input_array)} \!\= {len(target_array)}")
        


In [79]:
# test set input array 
test_inputs_array = test_dataframe_to_arrays(test_dataframe)

In [80]:
# Convert to float
# train set
t_inputs = torch.from_numpy(train_inputs_array).float()
t_targets = torch.from_numpy(train_targets_array).float()
print("TRAIN:")
print(t_inputs[:2])
print(t_targets[:2])

# dev set
d_inputs = torch.from_numpy(dev_inputs_array).float()
d_targets = torch.from_numpy(dev_targets_array).float()
print("DEV:")
print(d_inputs[:2])
print(d_targets[:2])

# test set
test_inputs = torch.from_numpy(test_inputs_array).float()
test_targets = torch.from_numpy(np.zeros((len(test_inputs),1)))
print("TEST:")
print(test_inputs[:2])
print(test_targets[:2])


TRAIN:
tensor([[3.9000e+02, 7.1130e+03, 2.0000e+00, 4.3440e+01, 4.0000e+00, 1.0000e+00],
        [0.0000e+00, 7.3920e+03, 2.0000e+00, 4.2600e+01, 2.0000e+00, 1.0000e+00]])
tensor([[309000.],
        [314900.]])
DEV:
tensor([[2.5000e+02, 6.3110e+03, 3.0000e+00, 5.9100e+01, 4.0000e+00, 2.0000e+00],
        [0.0000e+00, 7.8680e+03, 2.0000e+00, 3.8000e+01, 1.2000e+01, 4.0000e+00]])
tensor([[373000.],
        [299000.]])
TEST:
tensor([[0.0000e+00, 6.9380e+03, 3.0000e+00, 6.1990e+01, 7.0000e+00, 2.0000e+00],
        [0.0000e+00, 6.0780e+03, 4.0000e+00, 6.4000e+01, 4.0000e+00, 0.0000e+00]])
tensor([[0.],
        [0.]], dtype=torch.float64)


In [81]:
# Create Train TensorDataset
train_dataset = TensorDataset(t_inputs, t_targets)

In [82]:
# Create Dev TensorDataset
dev_dataset = TensorDataset(d_inputs, d_targets)

In [83]:
# Create Test TensorDataset

test_dataset = TensorDataset(test_inputs, test_targets)

In [84]:
# Size of datasets
t_val_size = t_num_rows
d_val_size = d_num_rows


train_ds = copy.copy(train_dataset)

val_ds = copy.copy(dev_dataset)

test_ds = copy.copy(test_dataset)

In [85]:
batch_size = 100
train_loader = DataLoader(train_ds, batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size)
test_loader = DataLoader(test_ds, batch_size)

In [86]:
# Step 3: Create a Linear Regression Model

In [87]:
input_size = len(input_cols)
output_size = len(output_cols)

In [88]:
class PriceModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(input_size, output_size)
        
    def forward(self, xb):
        out = self.linear(xb)
        return out
    
    def training_step(self, batch):
        inputs, targets = batch 
        # Generate predictions
        out = self(inputs)          
        # Calculate loss
        loss = F.l1_loss(out, targets)
        return loss
    
    def validation_step(self, batch):
        inputs, targets = batch
        # Generate predictions
        out = self(inputs)
        # Calculate loss
        loss = F.l1_loss(out, targets)  
        return {'val_loss': loss.detach()}
        
    def validation_epoch_end(self, outputs):
        batch_losses = [x['val_loss'] for x in outputs]
        epoch_loss = torch.stack(batch_losses).mean()
        return {'val_loss': epoch_loss.item()}
    
    def epoch_end(self, epoch, result, num_epochs):
        # Print result every 20th epoch
        if (epoch+1) % 20 == 0 or epoch == num_epochs-1:
            print("Epoch [{}], val_loss: {:.4f}".format(epoch+1, result['val_loss']))

In [89]:
model = PriceModel()

In [90]:
def evaluate(model, val_loader):
    outputs = [model.validation_step(batch) for batch in val_loader]
    return model.validation_epoch_end(outputs)

def fit(epochs, lr, model, train_loader, val_loader, opt_func=torch.optim.SGD):
    history = []
    optimizer = opt_func(model.parameters(), lr)
    for epoch in range(epochs):
        # Training Phase 
        for batch in train_loader:
            loss = model.training_step(batch)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        # Validation phase
        result = evaluate(model, val_loader)
        model.epoch_end(epoch, result, epochs)
        history.append(result)
    return history

In [91]:
result = evaluate(model, val_loader) # Use the evaluate function
print(result)

{'val_loss': 421839.34375}


In [98]:
epochs = 100
lr = 1e-2
history1 = fit(epochs, lr, model, train_loader, val_loader)

Epoch [20], val_loss: 196420.6250
Epoch [40], val_loss: 188458.5938
Epoch [60], val_loss: 173731.3281
Epoch [80], val_loss: 151665.8906
Epoch [100], val_loss: 214904.7188


In [99]:
epochs = 100
lr = 1e-3
history2 = fit(epochs, lr, model, train_loader, val_loader)

Epoch [20], val_loss: 134672.7500
Epoch [40], val_loss: 135195.6250
Epoch [60], val_loss: 136107.3281
Epoch [80], val_loss: 133085.2969
Epoch [100], val_loss: 135847.9531


In [100]:
epochs = 100
lr = 1e-4
history3 = fit(epochs, lr, model, train_loader, val_loader)

Epoch [20], val_loss: 133568.2188
Epoch [40], val_loss: 133462.8281
Epoch [60], val_loss: 133212.8906
Epoch [80], val_loss: 133837.1719
Epoch [100], val_loss: 133276.5938


In [101]:
epochs = 100
lr = 1e-5
history4 = fit(epochs, lr, model, train_loader, val_loader)

Epoch [20], val_loss: 133502.0938
Epoch [40], val_loss: 133486.7969
Epoch [60], val_loss: 133458.5938
Epoch [80], val_loss: 133489.4375
Epoch [100], val_loss: 133473.2812


In [102]:
epochs = 100
lr = 1e-6
history5 = fit(epochs, lr, model, train_loader, val_loader)

Epoch [20], val_loss: 133471.0469
Epoch [40], val_loss: 133480.4062
Epoch [60], val_loss: 133475.5938
Epoch [80], val_loss: 133475.8438
Epoch [100], val_loss: 133484.0156


In [103]:
epochs = 100
lr = 1e-7
history6 = fit(epochs, lr, model, train_loader, val_loader)

Epoch [20], val_loss: 133483.7188
Epoch [40], val_loss: 133483.5938
Epoch [60], val_loss: 133483.4688
Epoch [80], val_loss: 133483.2031
Epoch [100], val_loss: 133483.0156


In [104]:
val_loss = [result] + history1 + history2 + history3 + history4 + history5+ history6
val_loss_list = [vl['val_loss'] for vl in val_loss]

In [105]:
val_loss = [result] + history1 + history2 + history3 + history4 + history5
val_loss_list = [vl['val_loss'] for vl in val_loss]

In [106]:
def predict_single(input, target, model):
    inputs = input.unsqueeze(0)
    predictions = model(inputs)
    prediction = predictions[0].detach()
    print("Input:", input)
    print("Target:", target)
    print("Prediction:", prediction)

In [107]:
input, target = val_ds[0]
predict_single(input, target, model)

Input: tensor([2.5000e+02, 6.3110e+03, 3.0000e+00, 5.9100e+01, 4.0000e+00, 2.0000e+00])
Target: tensor([373000.])
Prediction: tensor([350319.9688])


In [108]:
input, target = val_ds[10]
predict_single(input, target, model)

Input: tensor([0.0000e+00, 7.8650e+03, 2.0000e+00, 3.8500e+01, 3.0000e+00, 1.0000e+00])
Target: tensor([302800.])
Prediction: tensor([362455.4062])


In [109]:
input, target = val_ds[23]
predict_single(input, target, model)

Input: tensor([0.0000e+00, 5.5200e+03, 3.0000e+00, 6.9750e+01, 5.0000e+00, 5.0000e+00])
Target: tensor([385000.])
Prediction: tensor([344435.8125])


In [110]:
# Test-A

def predict_test_single(input, model):
    
    inputs = input.unsqueeze(0)
    predictions = model(inputs)
    prediction = predictions[0].detach()
    
    #print("Input:", input)
    #print("Out:", prediction)
    return prediction.numpy()[0]
    
predictions=[]
for num, (xb, yb) in enumerate(test_ds):
    predictions.append(predict_test_single(xb, model))



In [111]:
dev_predictions=[]
for num, (xb, yb) in enumerate(val_ds):
    dev_predictions.append(predict_test_single(xb, model))

In [112]:
#generowanie test-A/out.tsv 
import csv
with open('./test-A/out.tsv', 'wt') as f:
    writer = csv.writer(f, delimiter='\t')
    for row in predictions:
        writer.writerow([row])

In [113]:
#generowanie dev-0/out.tsv 

with open('./dev-0/out.tsv', 'w') as f:
    writer = csv.writer(f, delimiter='\t')
    for row in list(dev_predictions):
        writer.writerow([row])