In [1]:
# Import the required libraries
import os
import zipfile
import pandas as pd
import numpy as np

In [2]:
# Read the training and test CSV files into pandas dataframes. Both files must be placed by hand
# in a folder called 'data' next to this notebook.
# Note: the files are quite large, so loading may take a while (~40 seconds)
train_df, test_df = (
    pd.read_csv(os.path.join('data', csv_name))
    for csv_name in ('training_set_VU_DM.csv', 'test_set_VU_DM.csv')
)

In [3]:
# Report the size of both dataframes, one line per dataset.
print(
    f"Train data contains {train_df.shape[0]:,} rows and {train_df.shape[1]} columns",
    f"Test data contains {test_df.shape[0]:,} rows and {test_df.shape[1]} columns",
    sep="\n",
)

Train data contains 4,958,347 rows and 54 columns
Test data contains 4,959,183 rows and 50 columns


### Data Columns

| Column Name                 | Data Type | Description                                                                                                                                                                                                       |
|-----------------------------|-----------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| position                    | Integer   | Hotel position on Expedia's search results page. This is only provided for the training data, but not the test data.                                                                                              |
| gross_booking_usd           | Float     | Total value of the transaction. This can differ from the price_usd due to taxes, fees, conventions on multiple day bookings and purchase of a room type other than the one shown in the search                    |
| click_bool                  | Boolean   | 1 if the user clicked on the property, 0 if not.                                                                                              |
| booking_bool                | Boolean   | 1 if the user booked the property, 0 if not.                    |
|                             |           ||
| srch_id                     | Integer   | The ID of the search                                                                                                                                                                                              |
| date_time                   | Date/time | Date and time of the search                                                                                                                                                                                       |
| site_id                     | Integer   | ID of the Expedia point of sale (i.e. Expedia.com, Expedia.co.uk, Expedia.co.jp, ..)                                                                                                                              |
| visitor_location_country_id | Integer   | The ID of the country the customer is located in                                                                                                                                                                  |
| visitor_hist_starrating     | Float     | The mean star rating of hotels the customer has previously purchased; null signifies there is no purchase history on the customer                                                                                 |
| visitor_hist_adr_usd        | Float     | The mean price per night (in US$) of the hotels the customer has previously purchased; null signifies there is no purchase history on the customer                                                                |
| prop_country_id             | Integer   | The ID of the country the hotel is located in                                                                                                                                                                     |
| prop_id                     | Integer   | The ID of the hotel                                                                                                                                                                                               |
| prop_starrating             | Integer   | The star rating of the hotel, from 1 to 5, in increments of 1. A 0 indicates the property has no stars, the star rating is not known or cannot be publicized.                                                     |
| prop_review_score           | Float     | The mean customer review score for the hotel on a scale out of 5, rounded to 0.5 increments. A 0 means there have been no reviews, null that the information is not available.                                    |
| prop_brand_bool             | Integer   | +1 if the hotel is part of a major hotel chain; 0 if it is an independent hotel                                                                                                                                   |
| prop_location_score1        | Float     | A (first) score outlining the desirability of a hotel’s location                                                                                                                                                  |
| prop_location_score2        | Float     | A (second) score outlining the desirability of the hotel’s location                                                                                                                                               |
| prop_log_historical_price   | Float     | The logarithm of the mean price of the hotel over the last trading period. A 0 will occur if the hotel was not sold in that period.                                                                               |
| price_usd                   | Float     | Displayed price of the hotel for the given search. Note that different countries have different conventions regarding displaying taxes and fees and the value may be per night or for the whole stay              |
| promotion_flag              | Integer   | +1 if the hotel had a sale price promotion specifically displayed                                                                                                                                                 |
| srch_destination_id         | Integer   | ID of the destination where the hotel search was performed                                                                                                                                                        |
| srch_length_of_stay         | Integer   | Number of nights stay that was searched                                                                                                                                                                           |
| srch_booking_window         | Integer   | Number of days in the future the hotel stay started from the search date                                                                                                                                          |
| srch_adults_count           | Integer   | The number of adults specified in the hotel room                                                                                                                                                                  |
| srch_children_count         | Integer   | The number of (extra occupancy) children specified in the hotel room                                                                                                                                              |
| srch_room_count             | Integer   | Number of hotel rooms specified in the search                                                                                                                                                                     |
| srch_saturday_night_bool    | Boolean   | +1 if the stay includes a Saturday night, starts from Thursday with a length of stay less than or equal to 4 nights (i.e. weekend); otherwise 0                                                                   |
| srch_query_affinity_score   | Float     | The log of the probability a hotel will be clicked on in Internet searches (hence the values are negative). A null signifies there are no data (i.e. hotel did not register in any searches)                      |
| orig_destination_distance   | Float     | Physical distance between the hotel and the customer at the time of search. A null means the distance could not be calculated.                                                                                    |
| random_bool                 | Boolean   | +1 when the displayed sort was random, 0 when the normal sort order was displayed                                                                                                                                 |
| comp*x*_rate                | Integer   | '*x*' denotes the competitor number. +1 if Expedia has a lower price than competitor *x* for the hotel; 0 if the same; -1 if Expedia’s price is higher than competitor *x*; null signifies there is no competitive data |
| comp*x*_inv                 | Integer   | '*x*' denotes the competitor number. +1 if competitor *x* does not have availability in the hotel; 0 if both Expedia and competitor *x* have availability; null signifies there is no competitive data                  |
| comp*x*_rate_percent_diff   | Float     | '*x*' denotes the competitor number. The absolute percentage difference (if one exists) between Expedia and competitor *x*’s price (with Expedia’s price as the denominator); null signifies there is no competitive data |


**Models**

In [1]:
# Relevant imports
import torch
import torch.nn as nn

*Simple Neural model*

In [None]:
class RecommenderNet(nn.Module):
    """
    Feed-forward network that scores how likely a user is to book a room at a
    given hotel.

    The input passes through three hidden linear layers
    (num_features -> hidden_size -> 2*hidden_size -> hidden_size), each followed
    by ReLU and dropout (p=0.5), and then through a final linear layer with a
    single output. No sigmoid is applied here, so the output is a raw score
    (logit) per input row.

    Args:
        num_features: size of each input feature vector.
        hidden_size: width of the first and last hidden layer; the middle
            hidden layer is twice as wide.
    """
    def __init__(self, num_features, hidden_size=64):
        super().__init__()
        wide_size = 2 * hidden_size
        # Layers are created in a fixed order so that seeded initialisation
        # and the state_dict layout stay stable.
        self.embedding = nn.Linear(num_features, hidden_size)
        self.fc1 = nn.Linear(hidden_size, wide_size)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(wide_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, 1)

    def forward(self, features):
        """Return one raw score per row of `features` (last dimension of size 1)."""
        hidden = features
        # Every hidden layer shares the same pattern: linear -> ReLU -> dropout.
        for layer in (self.embedding, self.fc1, self.fc2):
            hidden = self.dropout(nn.functional.relu(layer(hidden)))
        return self.fc3(hidden)

_Training_

In [None]:
import torch.optim as optim

num_epochs = 100 #TODO
num_features = 0 #TODO

# Must yield (features, targets) batches. The loop below assumes `targets` holds one
# 0/1 label per row, i.e. shape (batch,) — TODO confirm once the loader is defined.
train_loader = None #TODO

model = RecommenderNet(num_features)
# The model outputs raw logits, so the sigmoid is applied inside the loss.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters())

# Make sure dropout is active while training, even if the model was put in eval mode before.
model.train()

for epoch in range(num_epochs):
    for features, targets in train_loader:
        optimizer.zero_grad()

        outputs = model(features)
        # BCEWithLogitsLoss requires floating-point targets with the same shape as the
        # (batch, 1) logits; integer 0/1 labels would raise a dtype error otherwise.
        loss = criterion(outputs, targets.float().unsqueeze(1))
        loss.backward()

        optimizer.step()

_Evaluating_

In [None]:
# Define dataloader for test dataset. Each element of the test loader should be a tensor of
# shape (num_user_hotel_combinations, num_features) containing feature vectors for all user/hotel combinations
test_loader = None #TODO

# Switch to inference mode: with dropout still active the predicted ranking would be random.
model.eval()

with open("simple_nn_results.txt", "w") as f:
    # Header and rows each end with a newline so every record is on its own line.
    f.write("SearchId, PropertyId\n")
    # No gradients are needed for prediction.
    with torch.no_grad():
        for features in test_loader:
            outputs = model(features)
            # squeeze(1) drops only the output dimension, so a batch with a single
            # row still yields a 1-D tensor that can be sorted and indexed.
            probabilities = torch.sigmoid(outputs.squeeze(1))

            # Rows ordered from most to least likely to be booked.
            sorted_indices = probabilities.argsort(descending=True)
            ranked_features = features[sorted_indices]

            search_id = ranked_features[:, 0] #TODO extract search ID from feature vector correctly
            property_ids = ranked_features[:, 1] #TODO extract property ID from features

            for search, prop in zip(search_id, property_ids):
                # Write plain integers rather than the tensor repr ("tensor(1.)").
                f.write(f"{int(search)}, {int(prop)}\n")

**k-NN**

In [None]:
import numpy as np
from sklearn.neighbors import NearestNeighbors

# assume X is a feature matrix of shape (num_user_hotel_combinations, num_features)
X = None #TODO

# assume y is a target vector of shape (num_user_hotel_combinations,)
y = None #TODO

# initialize a k-NN model with k=5
k = 5
knn = NearestNeighbors(n_neighbors=k)

# fit the k-NN model to the data
knn.fit(X)

# Labels as an array so they can be indexed with the (num_rows, k) neighbour index matrix
train_labels = np.asarray(y)


# Define dataloader for test dataset. Each element of the test loader should be a tensor of
# shape (num_user_hotel_combinations, num_features) containing feature vectors for all user/hotel combinations
test_loader = None #TODO

with open("k_nn_results.txt", "w") as f:
    # Header and rows each end with a newline so every record is on its own line.
    f.write("SearchId, PropertyId\n")
    for features in test_loader:
        candidates = np.asarray(features)

        # For every candidate row, look up its k nearest rows in the training matrix X.
        # The returned indices refer to rows of X (and y), not to rows of `features`.
        indices = knn.kneighbors(candidates, return_distance=False)

        # Score each candidate by the fraction of its neighbours that carry a positive label.
        scores = (train_labels[indices] == 1).mean(axis=1)

        # Highest score first; the stable sort keeps the input order among ties.
        ranking = np.argsort(-scores, kind="stable")

        search_id = candidates[ranking, 0] #TODO extract search ID from feature vector correctly
        property_ids = candidates[ranking, 1] #TODO extract property ID from features

        for search, prop in zip(search_id, property_ids):
            f.write(f"{int(search)}, {int(prop)}\n")
