In [194]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import numpy as np

In [195]:
def dframe_to_dloader(dframe: pd.DataFrame, batch_size: int, label_col: str = None) -> data.DataLoader:
    labels_dframe= None
    features_dframe = None
    if label_col:
        labels_dframe = dframe[label_col].values
        features_dframe = dframe.drop(label_col, axis=1).values
    else:
        labels_dframe = pd.DataFrame(np.zeros(dframe.shape[0])).values
        features_dframe = dframe.values

    tensor_dataset = data.TensorDataset(
        torch.tensor(features_dframe, dtype=torch.float),
        torch.tensor(labels_dframe, dtype=torch.float)
    )

    return data.DataLoader(
        tensor_dataset,
        batch_size=batch_size,
        shuffle=True,
        drop_last=False
    )

In [196]:
def mae(y_true,y_pred):
    return np.absolute(np.subtract(y_true, y_pred)).mean()

In [197]:
print(torch.cuda.is_available())
device = torch.device("cuda")

True


In [198]:
training_data_file_path = 'df_train.csv'

feature_cols = [
    'MedInc',
    'HouseAge',
    'AveRooms',
    'AveBedrms',
    'Population',
    'AveOccup',
    'Latitude',
    'Longitude'
]
label_col = 'MedHouseVal'

file_dframe = pd.read_csv(
    training_data_file_path,
    delimiter=',',
    usecols=feature_cols + [label_col]
)

split = 0.8
train_dframe = file_dframe.sample(frac=split, random_state=200)
test_dframe = file_dframe.drop(train_dframe.index)

In [199]:
params = torch.rand(len(feature_cols), requires_grad=True)
train_loss_func = nn.MSELoss()
test_loss_func = mae
lr = 1e-4
epoch_count = 1000
batch_size = 256

class LinearRegression(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.linear1 = nn.Linear(input_size, 1)

    def forward(self, x):
        return self.linear1(x).squeeze(dim=1)

model = LinearRegression(len(feature_cols))
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)

# Train
for _ in range(epoch_count):
    train_dloader = dframe_to_dloader(train_dframe, batch_size, label_col)
    for features, labels in train_dloader:
        features, labels = features.to(device), labels.to(device)
        preds = model.forward(features)

        loss = train_loss_func(labels, preds)

        optimizer.zero_grad()

        loss.backward()

        optimizer.step()

# Test
test_dloader = dframe_to_dloader(test_dframe, batch_size, label_col)
for features, labels in train_dloader:
    features, labels = features.to(device), labels.to(device)
    preds = model.forward(features)
    loss = test_loss_func(labels.detach().cpu().numpy(), preds.detach().cpu().numpy())
    print(f'MAE: {loss}')

MAE: 0.5799770951271057
MAE: 0.601913571357727
MAE: 0.5702310800552368
MAE: 0.5565762519836426
MAE: 0.5107356905937195
MAE: 0.5646263957023621
MAE: 0.5178350806236267
MAE: 0.6214296817779541
MAE: 0.5978156924247742
MAE: 0.5863064527511597
MAE: 0.6030198335647583
MAE: 0.5345337390899658
MAE: 0.5726086497306824
MAE: 0.5992660522460938
MAE: 0.5841449499130249
MAE: 0.611670732498169
MAE: 0.5821232795715332
MAE: 0.5632288455963135
MAE: 0.6086298227310181
MAE: 0.586023211479187
MAE: 0.5537835359573364
MAE: 0.5743560791015625
MAE: 0.5817962884902954
MAE: 0.6046843528747559
MAE: 0.663558304309845
MAE: 0.5642755627632141
MAE: 0.5667545795440674
MAE: 0.5749059915542603
MAE: 0.5625280141830444
MAE: 0.6137263774871826
MAE: 0.6102809906005859
MAE: 0.5796045660972595
MAE: 0.5303040742874146
MAE: 0.6153019070625305
MAE: 0.5386835336685181
MAE: 0.5845038890838623
MAE: 0.5694732666015625
MAE: 0.5749933123588562
MAE: 0.5547628402709961
MAE: 0.5492566823959351
MAE: 0.5899598598480225
MAE: 0.5784472227096

In [200]:
test_data_file_path = 'X_test.csv'

feature_cols = [
    'MedInc',
    'HouseAge',
    'AveRooms',
    'AveBedrms',
    'Population',
    'AveOccup',
    'Latitude',
    'Longitude'
]

file_dframe = pd.read_csv(
    test_data_file_path,
    delimiter=',',
    usecols=feature_cols
)

In [202]:
test_dloader = dframe_to_dloader(file_dframe, batch_size)
combined_preds = np.empty((0,))
for features, _ in test_dloader:
    features = features.to(device)
    preds = model.forward(features)
    preds_np = preds.detach().cpu().numpy()
    combined_preds = np.concatenate((combined_preds, preds_np))
np.savetxt('piatek_Kubiszyn_Sobiech.csv', combined_preds, delimiter=',', fmt='%.10f')