In [68]:
import numpy as np
import pandas as pd
import math

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler, normalize, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

In [69]:
# Get datasets

requests_train = pd.read_csv(filepath_or_buffer='/work/data/housing_assistance/requests_train.csv',
                             sep=',',
                             low_memory=False,
                             error_bad_lines=False)
requests_test = pd.read_csv(filepath_or_buffer='/work/data/housing_assistance/requests_test.csv',
                            sep=',',
                            low_memory=False,
                            error_bad_lines=False)

individuals_train = pd.read_csv(filepath_or_buffer='/work/data/housing_assistance/individuals_train.csv',
                                sep=',',
                                low_memory=False,
                                error_bad_lines=False)
individuals_test = pd.read_csv(filepath_or_buffer='/work/data/housing_assistance/individuals_test.csv',
                               sep=',',
                               low_memory=False,
                               error_bad_lines=False)



  requests_train = pd.read_csv(filepath_or_buffer='/work/data/housing_assistance/requests_train.csv',


  requests_test = pd.read_csv(filepath_or_buffer='/work/data/housing_assistance/requests_test.csv',


  individuals_train = pd.read_csv(filepath_or_buffer='/work/data/housing_assistance/individuals_train.csv',


  individuals_test = pd.read_csv(filepath_or_buffer='/work/data/housing_assistance/individuals_test.csv',


# Data preparation

## define function

In [70]:
# Cyclically encodes given data using sin and cos

def cyclical_encoding(data, col, col_name, max_val):
    data[col_name + '_sin'] = np.sin(2 * np.pi * col/max_val)
    data[col_name + '_cos'] = np.cos(2 * np.pi * col/max_val)
    return data

In [71]:
# Encodes timestamp by seperating year, month, day and cyclically encoding month and day

def date_encoding(data, var):
    data[f'{var}_creation_date'] = pd.to_datetime(data[f'{var}_creation_date'], yearfirst=True)
    data[f'{var}_creation_year'] = data[f'{var}_creation_date'].dt.year
    data = cyclical_encoding(data, data[f'{var}_creation_date'].dt.month, f'{var}_creation_month', 12.0) #to do: check
    data = cyclical_encoding(data, data[f'{var}_creation_date'].dt.day, f'{var}_creation_day', 365.0)
    data.drop(columns=[f'{var}_creation_date'], inplace=True)
    return data

### preprocess_initial_dataset

In [72]:
def preprocess_initial_dataset(request_df, indiv_df, request_drops, indiv_drops):
    # Encode timestamps
    request_df = date_encoding(request_df, "answer")
    request_df = date_encoding(request_df, "group")
    request_df = date_encoding(request_df, "request")

    indiv_df = date_encoding(indiv_df, "individual")

    # Drop unneeded columns- if too many modalities or new data present in test set and not training set
    request_df.drop(request_drops, axis=1, inplace=True)
    indiv_df.drop(indiv_drops, axis=1, inplace=True)

    indiv_df["childcare_center_supervision"] = indiv_df[
        "childcare_center_supervision"
    ].fillna("x")
    indiv_df["disabled_worker_certification"] = indiv_df[
        "disabled_worker_certification"
    ].fillna("x")

    # Group individuals
    indiv_df = indiv_df.groupby(["request_id"]).agg(["mean", "median"])

    # Flatten individuals data
    indiv_df.columns = [" ".join(col).strip() for col in indiv_df.columns.values]

    # Merge request and individual data
    X_train = pd.merge(request_df, indiv_df, how="left", on="request_id")
    return X_train, indiv_df

### preprocess_train

In [73]:
def preprocess_train(X, numeric_features, categorical_features):
    cols = numeric_features + categorical_features
    X = X[cols].copy()

    numeric_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="mean")),
            ("scaler", StandardScaler()),
        ]
    )

    categorical_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            (
                "ohe",
                OneHotEncoder(handle_unknown="ignore", max_categories=30, sparse=False),
            ),
        ]
    )
    # to do: check if need mode or "unknown"

    X_preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ]
    )

    X_transformed = X_preprocessor.fit_transform(X)

    return X_transformed, X_preprocessor

### preprocess_prediction

In [74]:
def preprocess_prediction(X, pipeline):
    return pipeline.transform(X)

## CONSTANT

In [75]:
request_drops = [
    "granted_number_of_nights",
    "group_id",
    "group_main_requester_id",
    "housing_situation_label",
    "request_backoffice_creator_id",
    "social_situation_id",
    "victim_of_violence_type",
]

indiv_drops = [
    "individual_id",
    "housing_situation_2_label",
    "individual_role",
    "individual_role_2_label",
    "marital_status_label",
]

TARGET = "granted_number_of_nights"

req_numeric_features = [
    "answer_creation_year",
    "answer_creation_month_sin",
    "answer_creation_month_cos",
    "answer_creation_day_sin",
    "answer_creation_day_cos",
    "group_composition_id",
    "group_creation_year",
    "group_creation_month_sin",
    "group_creation_month_cos",
    "group_creation_day_sin",
    "group_creation_day_cos",
    "housing_situation_id",
    "number_of_underage",
    "request_creation_year",
    "request_creation_month_sin",
    "request_creation_month_cos",
    "request_creation_day_sin",
    "request_creation_day_cos",
]


req_categorical_features = [
    "animal_presence",
    "child_to_come",
    "group_type",
    "long_term_housing_request",
    "requester_type",
    "town",
    "victim_of_violence",
    "district",
    "child_situation",
]

## Preprocessing

### copy df

In [76]:
# request
req_train = requests_train.copy()
req_train_outputs = requests_train[TARGET].copy().values

req_test = requests_test.copy()

# indiv
indiv_train = individuals_train.copy()
indiv_test = individuals_test.copy()

In [77]:
print(req_train.shape)
print(req_test.shape)

(238191, 24)
(59548, 24)


### preprocess_initial_dataset

In [78]:
X_train_validate, indiv_train = preprocess_initial_dataset(
    req_train,
    indiv_train,
    request_drops,
    indiv_drops,
)

X_test, indiv_test = preprocess_initial_dataset(
    req_test,
    indiv_test,
    request_drops,
    indiv_drops,
)

  indiv_df = indiv_df.groupby(["request_id"]).agg(["mean", "median"])
  indiv_df = indiv_df.groupby(["request_id"]).agg(["mean", "median"])


In [79]:
print(X_train_validate.shape)
print(X_test.shape)

(238191, 49)
(59548, 49)


### train test split

In [80]:
# Split training dataset
X_train, X_validate, y_train, y_validate = train_test_split(X_train_validate, req_train_outputs, test_size=0.20, 
                                                            stratify=req_train_outputs)

In [81]:
print(X_train.shape)
print(X_validate.shape)

(190552, 49)
(47639, 49)


### encoder

In [82]:
numeric_features = req_numeric_features + indiv_train.columns.values.tolist()
categorical_features = req_categorical_features

In [83]:
X_train_transformed, pipeline = preprocess_train(
    X_train,
    numeric_features,
    categorical_features,
)

In [84]:
X_validate_transformed = preprocess_prediction(X_validate, pipeline)

In [85]:
X_test_transformed = preprocess_prediction(X_test, pipeline)

In [86]:
print(X_train_transformed.shape)
print(X_validate_transformed.shape)
print(X_test_transformed.shape)

(190552, 119)
(47639, 119)
(59548, 119)


# Model

## Prep data for model

In [87]:
pd.set_option('display.max_columns', None)

In [88]:
# transform to torch tensors

X_train_tensor = torch.Tensor(X_train_transformed)
y_train_tensor = torch.Tensor(y_train)
y_train_tensor = y_train_tensor.type(torch.LongTensor)
# y_train_tensor = .reshape(-1, 1)

X_validate_tensor = torch.Tensor(X_validate_transformed)
y_validate_tensor = torch.Tensor(y_validate)
y_validate_tensor = y_validate_tensor.type(torch.LongTensor)

X_test_tensor = torch.Tensor(X_test_transformed)

In [89]:
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
validate_dataset = TensorDataset(X_validate_tensor, y_validate_tensor)


test_dataset = TensorDataset(
    X_test_tensor, torch.Tensor(np.random.randint(4, size=X_test_tensor.shape[0]))
)

In [90]:
# create dataloaders
batch_size = 100

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
validate_dataloader = DataLoader(validate_dataset, batch_size=batch_size, shuffle=False)

test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## Create model

In [91]:
# # Create model

# input_size = X_train.shape[1] # + 1 for bias
# hidden_sizes = [100, 50]
# output_size = 4

# model = torch.nn.Sequential(
#     nn.Linear(input_size, hidden_sizes[0]),
#     nn.ReLU(),
#     nn.Linear(hidden_sizes[0], hidden_sizes[1]),
#     nn.ReLU(),
#     nn.Linear(hidden_sizes[1], output_size),
#     nn.LogSoftmax(dim=1), # activation function for classification, normalizes values along axis 1
# )
# # drop out layer
# # activation func, find diff funcs that work
# # hidden layer- size, amounr, etc
# # loss

In [92]:
# # Model 2- hidden sizes

# input_size = X_train.shape[1]
# hidden_sizes = [45, 10]
# output_size = 4

# model = torch.nn.Sequential(
#     nn.Linear(input_size, hidden_sizes[0]),
#     nn.ReLU(),
#     nn.Linear(hidden_sizes[0], hidden_sizes[1]),
#     nn.ReLU(),
#     nn.Linear(hidden_sizes[1], output_size),
#     nn.LogSoftmax(dim=1), 
# )

In [93]:
# # Model 3- dropout layers

# input_size = X_train.shape[1]
# hidden_sizes = [45, 10]
# output_size = 4

# model = torch.nn.Sequential(
#     nn.Linear(input_size, hidden_sizes[0]),
#     nn.Dropout(),
#     nn.ReLU(),
#     nn.Linear(hidden_sizes[0], hidden_sizes[1]),
#     nn.Dropout(),
#     nn.ReLU(),
#     nn.Linear(hidden_sizes[1], output_size),
#     nn.LogSoftmax(dim=1), 
# )

In [94]:
# Model 4- Prevent dying neurons

# input_size = X_train.shape[1]
# hidden_sizes = [512, 128]
# output_size = 4

# model = torch.nn.Sequential(
#     nn.Linear(input_size, hidden_sizes[0]),
#     nn.LeakyReLU(),
#     torch.nn.Dropout(0.1),
#     nn.Linear(hidden_sizes[0], hidden_sizes[1]),
#     nn.LeakyReLU(),
#     torch.nn.Dropout(0.4),
#     nn.Linear(hidden_sizes[1], output_size),
    
# )

In [95]:
# # Model 5- different final activation func

# input_size = X_train.shape[1]
# hidden_sizes = [45, 10]
# output_size = 4

# model = torch.nn.Sequential(
#     nn.Linear(input_size, hidden_sizes[0]),
#     nn.Dropout(0.3),
#     nn.LeakyReLU(),
#     nn.Linear(hidden_sizes[0], hidden_sizes[1]),
#     nn.LeakyReLU(),
#     nn.Linear(hidden_sizes[1], output_size),
#     nn.Softmax(dim=1), 
# )

In [96]:
# # Model 5- different final activation func

# input_size = X_train.shape[1]
# hidden_sizes = [45, 10]
# output_size = 4

# model = torch.nn.Sequential(
#     nn.Linear(input_size, hidden_sizes[0]),
#     nn.Dropout(0.3),
#     nn.LeakyReLU(),
#     nn.Linear(hidden_sizes[0], hidden_sizes[1]),
#     nn.LeakyReLU(),
#     nn.Linear(hidden_sizes[1], output_size),
#     nn.Softmax(dim=1), 
# )

## Train and validate

In [97]:
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self, input_size, h0, h1, h2, output_size):
        super(Model, self).__init__()

        self.fc1 = nn.Linear(input_size, h0)
        self.dp1 = nn.Dropout(0.2)
        self.fc2 = nn.Linear(h0, h1)
        self.dp2 = torch.nn.Dropout(0.2)
        self.fc3 = nn.Linear(h1, h2)
        self.dp3 = torch.nn.Dropout(0.3)
        self.fc4 = nn.Linear(h2, output_size)
        

    def forward(self, X):
        # layer1
        out = F.leaky_relu(self.fc1(X))
        out = self.dp1(out)
        # layer2
        out = F.leaky_relu(self.fc2(out))
        out = self.dp2(out)
        # layer3
        out = F.relu(self.fc3(out))
        out = self.dp3(out)
        # layer3
        out = self.fc4(out)
        
        return out

In [98]:

class Model(nn.Module):
    def __init__(self, input_size, h0, h1, h2, output_size):
        super(Model, self).__init__()

        self.fc1 = nn.Linear(input_size, h0)
        self.dp1 = nn.Dropout(0.2)
        self.fc2 = nn.Linear(h0, h1)
        self.dp2 = torch.nn.Dropout(0.2)
        self.fc3 = nn.Linear(h1, output_size)
        

    def forward(self, X):
        # layer1
        out = F.leaky_relu(self.fc1(X))
        out = self.dp1(out)
        # layer2
        out = F.leaky_relu(self.fc2(out))
        out = self.dp2(out)
        # layer3
        out = self.fc3(out)
        
        return out

In [99]:
#criterion = nn.NLLLoss() # to do: account for unbalanced
#criterion = nn.PoissonNLLLoss()

weight = torch.Tensor([10**el for el in range(4)])
print(f"weight: {weight}")


# Define model
input_size = X_train_transformed.shape[1]
hidden_sizes = [150, 60, 30]
output_size = 4

weight: tensor([   1.,   10.,  100., 1000.])


In [100]:
model = Model(input_size, hidden_sizes[0], hidden_sizes[1], hidden_sizes[2], output_size)

device = "cuda"
model = model.to(device)

In [101]:
# model = torch.nn.Sequential(
#     nn.Linear(input_size, hidden_sizes[0]),
#     nn.LeakyReLU(),
# #     torch.nn.Dropout(0.2),
#     nn.Linear(hidden_sizes[0], hidden_sizes[1]),
#     nn.LeakyReLU(),
# #     torch.nn.Dropout(0.2),
#     nn.Linear(hidden_sizes[1], output_size),
# )
# model = model.cuda()
# print(model)


In [102]:
n_epoch = 30

print(model)

criterion = nn.CrossEntropyLoss(weight=weight.cuda())
# criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001) #lr=0.001


# Train model

model.train()
for epoch in range(n_epoch):
    train_loss = 0.0
    
    for i, data in enumerate(train_dataloader):     
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)
        
        # Perform forward pass, backpropogation, and optimizer step
        outputs = model(inputs)
        loss = criterion(outputs, labels)
    
        optimizer.zero_grad()
        loss.backward()  # model learning
        
        optimizer.step() # optimizing weights
        train_loss += loss.item()
    print(train_loss / len(train_dataloader))

    # to do: validate later

Model(
  (fc1): Linear(in_features=119, out_features=150, bias=True)
  (dp1): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=150, out_features=60, bias=True)
  (dp2): Dropout(p=0.2, inplace=False)
  (fc3): Linear(in_features=60, out_features=4, bias=True)
)
0.9707436874305589
0.8134481151669373
0.7645778346255594
0.7405097114952264
0.7282349711356358
0.7188937951109967
0.7129046597986132
0.7070022132032691
0.7004978077140209
0.6945794878482568
0.6890070446359399
0.6850863202530715
0.679951983352523
0.6760937691766843
0.6727434562798688
0.6685160892113559
0.6683365847888048
0.6636286321304278
0.6602959308269016
0.6562066148235565
0.6524591565538679
0.651687609045478
0.6465869450713004
0.6455509856046184
0.6418374467011764
0.638337201322551
0.6396013053773207
0.6368685240885643
0.6351564720992277
0.6296330052667375


In [103]:
print(model)

Model(
  (fc1): Linear(in_features=119, out_features=150, bias=True)
  (dp1): Dropout(p=0.2, inplace=False)
  (fc2): Linear(in_features=150, out_features=60, bias=True)
  (dp2): Dropout(p=0.2, inplace=False)
  (fc3): Linear(in_features=60, out_features=4, bias=True)
)


In [104]:
# valid_loss = 0.0
# model.eval()

# with torch.no_grad():
#     for i, data in enumerate(validate_dataloader):
#         inputs, labels = data
#         outputs = model(inputs)
#         labels = labels.squeeze_()
#         loss = criterion(outputs, labels)
#         valid_loss = loss.item()
        
# train_loss /= len(X_train)
# valid_loss /= len(X_validate)
# print(f'Epoch: {epoch+1}/{epoch_range} \nTraining loss: {train_loss} \nValidation Loss: {valid_loss}')

## Evaluate on test set

In [105]:
preds = []
test_loss = 0.0

with torch.no_grad():
    for i, inputs in enumerate(test_dataloader):
        # print(inputs)
        model.eval()
        
        X = inputs[0].to(device)  
        
        outputs = model.forward(X)
        outputs = F.softmax(outputs, dim=1)
        # print(outputs)
#         pred = outputs.argmax(1)
        pred = outputs.cpu().data.numpy()
        #print(pred)
        preds.append(pred)
    
# print(preds)

In [106]:
p_flatten = np.concatenate(preds, axis=0)

In [107]:
p_flatten

array([[0.48310304, 0.07251088, 0.36292577, 0.0814603 ],
       [0.03820433, 0.09640943, 0.45239067, 0.4129955 ],
       [0.00680584, 0.04137591, 0.42570946, 0.52610874],
       ...,
       [0.00501883, 0.04434477, 0.9468825 , 0.00375384],
       [0.01922334, 0.07781976, 0.6810838 , 0.22187316],
       [0.90508896, 0.03084334, 0.05370409, 0.01036356]], dtype=float32)

In [108]:
p_flatten

array([[0.48310304, 0.07251088, 0.36292577, 0.0814603 ],
       [0.03820433, 0.09640943, 0.45239067, 0.4129955 ],
       [0.00680584, 0.04137591, 0.42570946, 0.52610874],
       ...,
       [0.00501883, 0.04434477, 0.9468825 , 0.00375384],
       [0.01922334, 0.07781976, 0.6810838 , 0.22187316],
       [0.90508896, 0.03084334, 0.05370409, 0.01036356]], dtype=float32)

In [109]:
assert len(X_test) == len(p_flatten)

In [110]:
# Define test scorer

def competition_scorer(y_true, y_pred):
    return log_loss(y_true, y_pred, sample_weight=10**y_true)

In [111]:
random_preds = np.random.uniform(size=(requests_test.shape[0], 4))
y_true = requests_test.granted_number_of_nights.values
print(y_true)

random_score = competition_scorer(y_true, random_preds)
print(f'Random test score: {random_score}')

[0 0 0 ... 2 1 0]
Random test score: 1.6673729796281185


In [112]:
y_true

array([0, 0, 0, ..., 2, 1, 0])

In [113]:
preds = np.array(preds)

  preds = np.array(preds)


In [114]:
model_score = competition_scorer(y_true, p_flatten)
print(f'Model test score: {model_score}')

Model test score: 0.6586562687987075


In [115]:
# 1.3862943649291992