In [141]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.nn import Module
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter
import yaml
import os
import shutil
import json
import time
from datetime import datetime
from math import sqrt

import joblib



In [9]:
# Read the numeric tabular listings file into a module-level DataFrame.
# NOTE(review): no later visible cell reads this `df`; load_airbnb() re-reads
# the CSV itself into a local variable of the same name.
df = pd.read_csv('num_tabular_data.csv')

In [10]:

def load_airbnb(csv_file_path):
    """Read the listings CSV and return it as a ``(features, labels)`` tuple.

    The ``Price_Night`` column is the label vector. Every other column is run
    through ``pd.to_numeric(..., errors='coerce')``, so values that cannot be
    parsed as numbers become NaN in the feature matrix.

    Parameters:
        csv_file_path: path of the CSV file to read.

    Returns:
        tuple ``(features, labels)`` of numpy arrays, both cast to float32;
        ``features`` holds one row per CSV row, ``labels`` one price per row.
    """
    table = pd.read_csv(csv_file_path)

    # Everything except the target column is treated as a model input.
    numeric_inputs = table.drop(columns=['Price_Night']).apply(pd.to_numeric, errors='coerce')
    feature_matrix = numeric_inputs.values.astype(np.float32)

    nightly_price = table["Price_Night"].values.astype(np.float32)

    return feature_matrix, nightly_price

# Load the (features, labels) tuple once at module level.
# NOTE(review): `ds` is not referenced by any later visible cell; the Dataset
# class below calls load_airbnb() on its own.
ds = load_airbnb('num_tabular_data.csv')


In [11]:
class AirbnbNightlyPriceDataset(Dataset):
    """Map-style dataset yielding ``(features, nightly_price)`` tensor pairs.

    Parameters:
        csv_file_path: CSV file handed to ``load_airbnb``. Defaults to
            'num_tabular_data.csv', the file this notebook has always used,
            so ``AirbnbNightlyPriceDataset()`` keeps working unchanged.
    """

    def __init__(self, csv_file_path='num_tabular_data.csv'):
        super().__init__()
        # The arrays are kept as numpy float32; each sample is converted to a
        # tensor lazily in __getitem__.
        self.X, self.y = load_airbnb(csv_file_path)
        assert len(self.X) == len(self.y)  # Data and labels have to be of equal length

    def __getitem__(self, index):
        """Return the ``index``-th sample as a ``(features, label)`` tensor tuple."""
        return (torch.tensor(self.X[index]), torch.tensor(self.y[index]))

    def __len__(self):
        """Number of samples (rows of the feature matrix)."""
        return len(self.X)


# Create the dataset; constructing it reads the CSV via load_airbnb().
dataset = AirbnbNightlyPriceDataset()

In [12]:
def split_data(dataset, seed=None):
    """Randomly split ``dataset`` into training, validation and test subsets.

    70% of the samples (rounded down) go to training. The remaining samples
    are split in half: the validation set gets the rounded-down half and the
    test set gets the rest, i.e. roughly 15% of the full dataset each.

    Parameters:
        dataset: any object accepted by ``torch.utils.data.random_split``.
        seed: optional integer. When given, both splits draw from a dedicated
            ``torch.Generator`` seeded with it, so the same seed always yields
            the same partition. When ``None`` (default) the global torch RNG
            is used, as before.

    Returns:
        tuple ``(train_dataset, validation_dataset, test_dataset)``.
    """
    generator = None if seed is None else torch.Generator().manual_seed(seed)

    def _split(data, first_length):
        # Two-way split; the second part takes whatever the first leaves over.
        lengths = [first_length, len(data) - first_length]
        if generator is None:
            return random_split(data, lengths)
        return random_split(data, lengths, generator=generator)

    # Splits data into 70% training and 30% held out
    train_dataset, held_out = _split(dataset, int(len(dataset) * 0.7))

    # Splits the held-out part in half: 15% validation and 15% test overall
    validation_dataset, test_dataset = _split(held_out, int(len(held_out) * 0.5))

    print(f"    Training: {len(train_dataset)}")
    print(f"    Validation: {len(validation_dataset)}")
    print(f"    Testing: {len(test_dataset)}")

    return train_dataset, validation_dataset, test_dataset


# Wrap each split in a DataLoader that yields batches of 4 samples.
# shuffle: reorder the samples on every pass, good for training
# num_workers: number of loading subprocesses; more can load faster
# !!! IF YOU GET AN ERROR DURING LOADING, SET num_workers TO 0 !!!

train_dataset, validation_dataset, test_dataset = split_data(dataset)

# train loader (shuffled)
train_loader = DataLoader(dataset = train_dataset, batch_size=4, shuffle=True, num_workers=0)


# validation loader (NOTE(review): also shuffled — confirm this is intended)
validation_loader = DataLoader(dataset = validation_dataset, batch_size =4, shuffle=True, num_workers=0)


# test loader (kept in a fixed order)
test_loader = DataLoader(dataset = test_dataset, batch_size = 4, shuffle=False, num_workers=0)

# Debugging aid: uncomment to inspect one training batch.
# dataiter = iter(train_loader)
# data = next(dataiter)
# features, labels = data
# print(features, labels)

    Training: 581
    Validation: 124
    Testing: 125


In [142]:
def save_neural_model(model, hyperparameters, metrics, folder):
    """Save a trained model, its hyperparameters and its metrics into ``folder``.

    Parameters:
        model: the trained model object. A ``torch.nn.Module`` has its
            ``state_dict`` written to ``model.pt``; any other object is dumped
            with joblib to ``model.joblib``.
        hyperparameters: JSON-serialisable dict of the hyperparameters used to
            train the model; written to ``hyperparameters.json``.
        metrics: JSON-serialisable dict of performance metrics; written to
            ``metrics.json``.
        folder: directory the files are written to. It is created (including
            parents) if missing; a trailing path separator is optional.
    """
    # Create the folder if it doesn't exist
    os.makedirs(folder, exist_ok=True)

    # Save the model
    if isinstance(model, torch.nn.Module):
        # os.path.join keeps the file inside `folder` even when `folder` has no
        # trailing separator (plain concatenation would write "<folder>model.pt").
        model_path = os.path.join(folder, 'model.pt')
        torch.save(model.state_dict(), model_path)
    else:
        model_path = os.path.join(folder, "model.joblib")
        joblib.dump(model, model_path)

    # Save the hyperparameters as a JSON file
    hyperparameters_path = os.path.join(folder, "hyperparameters.json")
    with open(hyperparameters_path, "w") as f:
        json.dump(hyperparameters, f)

    # Save the performance metrics as a JSON file
    metrics_path = os.path.join(folder, "metrics.json")
    with open(metrics_path, "w") as f:
        json.dump(metrics, f)

    print('Model is saved')


   
    


In [144]:
def train(model, dataloader, nn_config, dataset_type):
    """Fit ``model`` on ``dataloader``, log the loss and save the model with its metrics.

    Parameters:
        model: torch module; its output is compared against the labels with MSE.
        dataloader: iterable of ``(inputs, labels)`` batches. Labels get a
            trailing dimension added so they line up with the model output.
        nn_config: dict with the keys 'epochs', 'optimiser' ('SGD' or 'Adam')
            and 'learning_rate'. It is also saved as the hyperparameters file.
        dataset_type: name of the sub-folder of
            ./models/regression/neural_networks/ the run is saved under.

    Raises:
        ValueError: if nn_config['optimiser'] is neither 'SGD' nor 'Adam'.

    Saved metrics (averages run over the batches whose forward pass succeeded;
    a metric is NaN when there was nothing to average):
        training_time: wall-clock seconds spent up to the end of the loop.
        inference_latency: training_time divided by the number of processed
            batches (so it includes the backward pass and the optimiser step).
        RMSE_loss: mean of the per-batch RMSE.
        R_squared: mean of the per-batch R^2 over batches with >= 2 samples.
    """
    training_start_time = time.time()
    num_epochs = int(nn_config['epochs'])

    if nn_config['optimiser'] == 'SGD':
        optimiser_class = torch.optim.SGD
    elif nn_config['optimiser'] == 'Adam':
        optimiser_class = torch.optim.Adam
    else:
        raise ValueError("Invalid optimiser specified in nn_config.")
    optimiser = optimiser_class(model.parameters(), nn_config['learning_rate'])

    writer = SummaryWriter()
    batch_idx = 0      # global step for the writer == number of processed batches
    rmse_total = 0.0
    r2_total = 0.0
    r2_batches = 0     # batches that contributed an R^2 value

    try:
        for _ in range(num_epochs):
            for i, (inputs, labels) in enumerate(dataloader):
                inputs = inputs.type(torch.float32)
                labels = labels.unsqueeze(1)  # to add an extra dimension to the labels tensor.
                # Forward pass and loss; a failing batch is reported and skipped.
                try:
                    output = model(inputs)
                except Exception as e:
                    print(f"Error occurred on row {i}: {e}")
                    continue
                loss = F.mse_loss(output, labels)

                # Clear gradients first so nothing left over from earlier use
                # of the model leaks into this step.
                optimiser.zero_grad()
                loss.backward()   # back propagation
                optimiser.step()  # update weights

                rmse_total += torch.sqrt(loss.detach()).item()

                # R^2 is undefined for fewer than two samples: skip such a
                # batch instead of turning the whole accumulator into NaN.
                prediction_detached = output.detach().cpu().numpy().flatten()
                y_detached = labels.detach().cpu().numpy().flatten()
                if len(prediction_detached) >= 2:
                    r2_total += float(r2_score(y_detached, prediction_detached))
                    r2_batches += 1

                writer.add_scalar('Loss', loss.item(), batch_idx)
                batch_idx += 1
    finally:
        # Flush and release the event file even if training raised.
        writer.close()

    training_duration = time.time() - training_start_time
    nan = float('nan')

    metrics_dic = {
        'training_time': training_duration,
        'inference_latency': training_duration / batch_idx if batch_idx else nan,
        'RMSE_loss': rmse_total / batch_idx if batch_idx else nan,
        'R_squared': r2_total / r2_batches if r2_batches else nan,
    }

    # NOTE(review): the timestamp contains ':' and ', ', which are not valid in
    # Windows folder names — confirm the target platform before relying on it.
    model_name = datetime.now().strftime("%d-%m-%Y, %H:%M:%S")

    os.makedirs('./models/regression/neural_networks', exist_ok=True)
    model_path = f'./models/regression/neural_networks/{dataset_type}/{model_name}/'

    save_neural_model(model, nn_config, metrics_dic, model_path)
    


    
  # NOTE: stale comment — train() above runs every epoch in nn_config['epochs']; it has no early break.


In [7]:
def get_nn_config(yaml_file):
    """Read the neural network configuration stored in a YAML file.

    Parameters:
        yaml_file: path to the .yaml file containing the hyperparameters

    Outputs:
        the object ``yaml.safe_load`` parses from the file (for a
        hyperparameter file, a dict mapping names to values)
    """
    with open(yaml_file, 'r') as config_stream:
        return yaml.safe_load(config_stream)


In [145]:
class LinearRegression(nn.Module):
    """Linear regression model made of a single fully connected layer."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # The only layer: an affine map from input_dim to output_dim values.
        self.lin = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        prediction = self.lin(x)
        return prediction



# Linear model with 9 inputs and 1 output.
# NOTE(review): assumes the feature matrix has 9 columns — TODO confirm
# (NeuralNet in this notebook uses 11 inputs).
model = LinearRegression(9, 1)

# NOTE(review): train() creates its own optimiser from nn_config and computes
# the loss with F.mse_loss, so none of the five values below are passed to it;
# n_iters is not referenced by any visible cell.
num_epochs = 2
criterion = nn.MSELoss()
learning_rate = 0.0001
n_iters = 100
optimiser = torch.optim.Adam(model.parameters(), lr=learning_rate)
# 3) Train the model with the hyperparameters read from nn_config.yaml
nn_config = get_nn_config('nn_config.yaml')
train(model, train_loader,  nn_config, 'train_dataset')


Model is saved


In [None]:
class NeuralNet(torch.nn.Module):
    """Fully connected regression network with two hidden layers.

    Layout: Linear -> ReLU -> Dropout -> Linear -> ReLU -> Dropout -> Linear.

    Parameters:
        nn_config: dict providing 'hidden_layer_width' (units in each hidden
            layer) and 'dropout' (dropout probability).
        input_dim: number of input features. Defaults to 11, the width that
            used to be hard-coded.
        output_dim: number of outputs. Defaults to 1, as before.
    """

    def __init__(self, nn_config, input_dim=11, output_dim=1):
        super(NeuralNet, self).__init__()
        # define layers
        self.hidden_layer_width = nn_config['hidden_layer_width']
        self.dropout = nn_config['dropout']

        # The positions inside the Sequential are unchanged, so state_dict keys
        # ('layers.0', 'layers.3', 'layers.6') stay the same.
        self.layers = torch.nn.Sequential(
            torch.nn.Linear(input_dim, self.hidden_layer_width),  # uses the same width in all layers of the model
            torch.nn.ReLU(),
            torch.nn.Dropout(self.dropout),
            torch.nn.Linear(self.hidden_layer_width, self.hidden_layer_width),
            torch.nn.ReLU(),
            torch.nn.Dropout(self.dropout),
            torch.nn.Linear(self.hidden_layer_width, output_dim),
        )

    def forward(self, X):
        """Run ``X`` through the layer stack and return the output."""
        return self.layers(X)

In [125]:
# Run the training routine on the validation split.
# train() takes (model, dataloader, nn_config, dataset_type); the previous
# five-argument call raised a TypeError against that signature.
# NOTE(review): this updates the model weights on validation data — confirm
# that is intended rather than an evaluation-only pass.
train(model, validation_loader, nn_config, 'validation_dataset')