<a href="https://colab.research.google.com/github/milesfking/2020-presidential-election-model/blob/main/CS-671-Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Packages

In [2]:
# Import necessary libraries
import os
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split

import argparse
import os, sys
import time
import datetime
from tqdm import tqdm_notebook as tqdm
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

import torch.nn as nn
import torch.optim as optim

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import random

# Load in Dataset

In [99]:
train_df = pd.read_csv("kaggle_train.csv")
test_df = pd.read_csv("kaggle_test.csv")

# Data Cleaning

## Dropping columns

We want to begin by dropping any features that are not relevant for prediction, including metadata like website URLs, index, etc.)

In [100]:
model_features = [
       'host_id', 'host_since', 'host_is_superhost',
       'host_listings_count', 'host_total_listings_count',
       'host_verifications', 'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates',
       'bathrooms_text', 'beds', 'amenities', 'minimum_nights',
       'maximum_nights', 'minimum_minimum_nights', 'maximum_minimum_nights',
       'minimum_maximum_nights', 'maximum_maximum_nights',
       'minimum_nights_avg_ntm', 'maximum_nights_avg_ntm', 'has_availability',
       'availability_30', 'availability_60', 'availability_90',
       'availability_365', 'calendar_last_scraped', 'number_of_reviews',
       'number_of_reviews_ltm', 'number_of_reviews_l30d', 'instant_bookable',
       'calculated_host_listings_count',
       'calculated_host_listings_count_entire_homes',
       'calculated_host_listings_count_private_rooms',
       'calculated_host_listings_count_shared_rooms']

X_train = train_df[model_features]
y_train = train_df['price']

X_test = test_df[model_features]

## Drop missing data

In [101]:
# Drop rows with NaN values in X_train
X_train_clean = X_train.dropna()

# Get the indices of the dropped rows
dropped_indices = X_train.index.difference(X_train_clean.index)

# Drop corresponding rows in y_train
y_train_clean = y_train.drop(dropped_indices)

## Feature engineering

In [102]:
from datetime import datetime

# Convert 'host_since' to datetime type
X_train_clean.loc[:, 'host_since'] = pd.to_datetime(X_train_clean['host_since'], format='%Y-%m-%d')
X_test.loc[:, 'host_since'] = pd.to_datetime(X_train_clean['host_since'], format='%Y-%m-%d')

# Use the current date or the maximum date from your dataset as the reference date
current_date = X_train_clean['host_since'].max()

# Calculate the number of years as a host
X_train_clean.loc[:, 'years_as_host'] = (current_date - X_train_clean['host_since']).dt.days / 365.25
X_test.loc[:, 'years_as_host'] = (current_date - X_train_clean['host_since']).dt.days / 365.25

# Optionally, drop the original 'host_since' column
X_train_clean = X_train_clean.drop(columns=['host_since'])
X_test = X_test.drop(columns=['host_since'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train_clean.loc[:, 'host_since'] = pd.to_datetime(X_train_clean['host_since'], format='%Y-%m-%d')
  X_train_clean.loc[:, 'host_since'] = pd.to_datetime(X_train_clean['host_since'], format='%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test.loc[:, 'host_since'] = pd.to_datetime(X_train_clean['host_since'], format='%Y-%m-%d')
  X_test.loc[:, 'host_since'] = pd.to_datetime(X_train_clean['host_since'], format='%Y-%m-%d')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .lo

## Encode and normalize data

In [103]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Separate features into numeric and categorical
numeric_columns = X_train_clean.select_dtypes(include=['int64', 'float64']).columns
categorical_columns = X_train_clean.select_dtypes(include=['object']).columns

# Define a function to replace less frequent categories with 'Other'
def replace_rare_categories(data, column, threshold=0.02):
    series = pd.value_counts(data[column])
    mask = series / series.sum() <= threshold
    rare_categories = series[mask].index
    data.loc[data[column].isin(rare_categories), column] = 'Other'
    return data

# Apply the function to each categorical column
for col in categorical_columns:
    X_train_clean = replace_rare_categories(X_train_clean, col)

# Rest of your code for preprocessing remains the same
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
numeric_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns)
    ])

X_train_preprocessed = preprocessor.fit_transform(X_train_clean)
y_train_preprocessed = y_train_clean.to_numpy().astype(np.float32)

# Model

In [179]:
def set_all_seeds(RANDOM_SEED):
    random.seed(RANDOM_SEED)     # python random generator
    np.random.seed(RANDOM_SEED)  # numpy random generator

    torch.manual_seed(RANDOM_SEED)
    torch.cuda.manual_seed_all(RANDOM_SEED)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
set_all_seeds(42)

In [180]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [181]:
# Define a simple fully connected network for regression
class OrdinalClassifier(nn.Module):
    def __init__(self, input_size):
        super(OrdinalClassifier, self).__init__()
        self.fc1 = nn.Linear(input_size, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 6)  # 6 output units for 6 categories

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)  # No softmax here, it's included in nn.CrossEntropyLoss
        return x

In [182]:
# Load your data (assuming X_train_preprocessed and y_train_preprocessed are numpy arrays)
X_train, X_val, y_train, y_val = train_test_split(X_train_preprocessed, y_train_preprocessed, test_size=0.2, random_state=42)

In [183]:
# Convert your data to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
X_val = torch.tensor(X_val, dtype=torch.float32)
y_val = torch.tensor(y_val, dtype=torch.long)

# Create datasets and dataloaders
train_dataset = torch.utils.data.TensorDataset(X_train, y_train)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataset = torch.utils.data.TensorDataset(X_val, y_val)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=32, shuffle=False)

# Initialize the network and optimizer
INITIAL_LR = 0.01
input_size = X_train.shape[1]
net = OrdinalClassifier(input_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=INITIAL_LR, momentum=0.9)

In [184]:
EPOCHS = 30
CHECKPOINT_FOLDER = "./saved_model"
best_val_loss = float('inf')

# Training Loop
train_losses = []
val_losses = []
for i in range(EPOCHS):
    net.train()

    # Record training loss
    train_loss = 0

    # Looping through training loader
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        inputs, targets = inputs.to(device), targets.to(device)  # Send input and target to device

        optimizer.zero_grad()  # Zero the parameter gradients
        outputs = net(inputs)  # Forward pass: compute the model output
        loss = criterion(outputs, targets)  # Compute loss
        loss.backward()  # Backward pass: compute gradient of the loss with respect to model parameters
        optimizer.step()  # Perform a single optimization step

        train_loss += loss.item()  # Sum up batch loss

    # calculate average training loss
    avg_train_loss = train_loss / len(train_loader)
    train_losses.append(avg_train_loss)

    # Evaluate the validation set performance
    net.eval()

    # Record validation loss
    val_loss = 0
    correct = 0
    total = 0

    # Disable gradient during validation, which can save GPU memory
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(val_loader):
            inputs, targets = inputs.to(device), targets.to(device)  # Send input and target to device

            outputs = net(inputs)
            loss = criterion(outputs, targets)
            val_loss += loss.item()

            _, predicted = torch.max(outputs.data, 1)
            total += targets.size(0)
            correct += (predicted == targets).sum().item()

    avg_val_loss = val_loss / len(val_loader)
    val_losses.append(avg_val_loss)

    val_accuracy = correct / total
    print(f'Epoch {i}: Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')

    # Save the model if validation loss has decreased
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        if not os.path.exists(CHECKPOINT_FOLDER):
            os.makedirs(CHECKPOINT_FOLDER)
        torch.save(net.state_dict(), os.path.join(CHECKPOINT_FOLDER, 'best_model.bin'))

print(f"Best Validation Loss: {best_val_loss:.4f}")


Epoch 0: Train Loss: 1.4347, Validation Loss: 1.2565, Validation Accuracy: 0.4792
Epoch 1: Train Loss: 1.2293, Validation Loss: 1.2423, Validation Accuracy: 0.4795
Epoch 2: Train Loss: 1.1780, Validation Loss: 1.1651, Validation Accuracy: 0.4936
Epoch 3: Train Loss: 1.1527, Validation Loss: 1.1876, Validation Accuracy: 0.4970
Epoch 4: Train Loss: 1.1367, Validation Loss: 1.1550, Validation Accuracy: 0.5134
Epoch 5: Train Loss: 1.1197, Validation Loss: 1.1251, Validation Accuracy: 0.5235
Epoch 6: Train Loss: 1.1083, Validation Loss: 1.1369, Validation Accuracy: 0.5185
Epoch 7: Train Loss: 1.0993, Validation Loss: 1.1545, Validation Accuracy: 0.5131
Epoch 8: Train Loss: 1.0882, Validation Loss: 1.1312, Validation Accuracy: 0.5178
Epoch 9: Train Loss: 1.0748, Validation Loss: 1.1291, Validation Accuracy: 0.5161
Epoch 10: Train Loss: 1.0671, Validation Loss: 1.1330, Validation Accuracy: 0.5175
Epoch 11: Train Loss: 1.0547, Validation Loss: 1.1158, Validation Accuracy: 0.5279
Epoch 12: Trai

## Prediction

In [185]:
# Transform X_test with the same preprocessor
X_test_preprocessed = preprocessor.transform(X_test)

# Convert to DataFrame (optional, for handling column alignment)
X_train_preprocessed_df = pd.DataFrame(X_train_preprocessed, columns=preprocessor.get_feature_names_out())
X_test_preprocessed_df = pd.DataFrame(X_test_preprocessed, columns=preprocessor.get_feature_names_out())

# Add missing columns in X_test with zeros
missing_cols = set(X_train_preprocessed_df.columns) - set(X_test_preprocessed_df.columns)
for c in missing_cols:
    X_test_preprocessed_df[c] = 0

# Ensure the order of columns is the same
X_test_preprocessed_df = X_test_preprocessed_df[X_train_preprocessed_df.columns]
X_test_preprocessed = X_test_preprocessed_df.to_numpy()

In [188]:
# Load the best model
best_model_path = os.path.join(CHECKPOINT_FOLDER, 'best_model.bin')
state_dict = torch.load(best_model_path)
net.load_state_dict(state_dict)

# Prepare your test data
# Assuming X_test_preprocessed is your test data and already a numpy array
X_test_tensor = torch.tensor(X_test_preprocessed, dtype=torch.float32).to(device)

# Create a DataLoader for the test set if necessary
# Assuming batch size of 32 for consistency
test_dataset = torch.utils.data.TensorDataset(X_test_tensor)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)

# Make predictions
net.eval()  # Set the model to evaluation mode
predictions = []
with torch.no_grad():
    for inputs in test_loader:
        inputs = inputs[0].to(device)  # Move inputs to the device
        outputs = net(inputs)
        _, predicted_classes = torch.max(outputs, 1)  # Get the predicted classes
        predictions.extend(predicted_classes.cpu().numpy())

# Convert predictions to a numpy array
predictions = np.array(predictions)

In [202]:
# Create a DataFrame
df_predictions = pd.DataFrame({
    'price': predictions
})

# Display the DataFrame
print(df_predictions)

      price
0         3
1         3
2         3
3         3
4         3
...     ...
6286      5
6287      4
6288      1
6289      0
6290      3

[6291 rows x 1 columns]


In [204]:
df_predictions.to_csv("submission1.csv", index=True, index_label='id')