# Colab commands

In [None]:
# Colab only: mount Google Drive at /content/drive so the copy commands in the
# following cell can read from it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!cp /content/drive/MyDrive/Colab\ Notebooks/build_model/TrainValidate.py /content
!cp /content/drive/MyDrive/Colab\ Notebooks/datasets/final_features_removed.csv /content
!cp /content/drive/MyDrive/Colab\ Notebooks/datasets/final_labels_removed.csv /content
!cp /content/drive/MyDrive/Colab\ Notebooks/datasets/final_sources_removed.csv /content
!cp /content/drive/MyDrive/all_removed_weighted_sampler_full_epoch_1000_fold_2.pth /content
!cp /content/drive/MyDrive/Colab\ Notebooks/datasets/final_features_prices.csv /content
!mkdir /content/models

In [None]:
!pip install --quiet torchinfo
!pip install --quiet torch_snippets

# Import packages

In [2]:
import os

import numpy as np
import pandas as pd
import seaborn as sns

import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler
from torch.utils.tensorboard import SummaryWriter

import torchinfo
from torch_snippets import Report
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

from TrainValidate import TrainValidate, create_weighted_sampler

# Select the compute device: CUDA when available, otherwise Apple's MPS backend,
# otherwise the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
# torch.has_mps is deprecated and only reports whether the build includes MPS;
# is_available() also checks that the backend can be used at runtime.
# The getattr guard covers torch builds that predate torch.backends.mps.
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = "mps"

In [3]:
# Display which device string was selected above.
device

'mps'

# Basic model
## Load datasets

In [4]:
# Read the feature and label tables from the local datasets folder.
# NOTE(review): the Colab cell copies these CSVs into /content, while this cell
# reads ../datasets/ — confirm which working directory is intended.
feature_csv = "../datasets/final_features_prices.csv"
label_csv = "../datasets/final_labels_removed.csv"

features = pd.read_csv(feature_csv)
labels = pd.read_csv(label_csv)

## Create a class for collecting data

In [6]:
class StatusDataset(Dataset):
    """Map-style dataset that pairs each feature row with one status label.

    Args:
        features: table of input features; must provide ``to_numpy()`` and
            ``len()`` (the notebook passes pandas DataFrames).
        labels: table of labels with the same number of rows as ``features``.
            Only its first column is used as the target.

    Each item is a ``(features, status)`` tuple of float32 tensors placed on
    the notebook-level ``device``; ``status`` has shape ``(1,)``.

    Raises:
        AssertionError: if ``features`` and ``labels`` differ in length.
    """

    def __init__(self, features, labels):
        assert len(features) == len(labels), "features and labels must have the same number of rows"
        self.features = features
        self.labels = labels
        # Convert to NumPy once. Doing this inside __getitem__ rebuilds the
        # whole array for every sample, which is O(rows * cols) per item on
        # mixed-dtype frames. Note this is a snapshot: later edits to the
        # source tables are not guaranteed to be visible through the dataset.
        self._feature_rows = features.to_numpy()
        self._label_rows = labels.to_numpy()

    def __getitem__(self, item):
        """Return the ``(features, status)`` tensors for row ``item``."""
        features = torch.tensor(self._feature_rows[item]).float().to(device)

        # Only the first label column is used as the prediction target.
        label_row = self._label_rows[item]
        status = torch.tensor([label_row[0]]).float().to(device)

        return features, status

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.features)

In [17]:
# Build a throwaway dataset over the full tables to read the input width and
# eyeball one (features, status) pair.
temp = StatusDataset(features, labels)
first_sample = temp[0]
in_features = len(first_sample[0])
first_sample

(tensor([3.6100e+03, 0.0000e+00, 3.0000e+00, 1.0000e+00, 1.0000e+00, 0.0000e+00,
         6.0000e+00, 2.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00, 1.0000e+00,
         1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 1.0000e+00, 0.0000e+00, 1.0000e+00, 1.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00, 1.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 6.2576e+02, 6.0000e+02],
        device='mps:0'),
 tensor([1.], device='mps:0'))

In [9]:
# Display the number of input features per sample.
in_features

53

## Build the model

In [22]:
class PredictStatus(nn.Module):
    """Fully connected binary classifier that outputs a probability in (0, 1).

    The trunk (``self.hidden``) is five Linear -> BatchNorm1d -> [Dropout(0.5)]
    -> ReLU stages with widths 128, 256, 512, 256, 128. The head
    (``self.status``) is a single Linear(128, 1) followed by a Sigmoid.

    Args:
        in_features: number of input features per sample.
    """

    def __init__(self, in_features):
        super().__init__()

        # (output width, whether the stage has dropout). The order fixes the
        # module indices inside the Sequential, and therefore the state_dict
        # keys of saved checkpoints.
        stage_plan = [(128, False), (256, True), (512, False), (256, True), (128, True)]

        trunk = []
        width_in = in_features
        for width_out, with_dropout in stage_plan:
            trunk.append(nn.Linear(width_in, width_out))
            trunk.append(nn.BatchNorm1d(width_out))
            if with_dropout:
                trunk.append(nn.Dropout(0.5))
            trunk.append(nn.ReLU())
            width_in = width_out

        self.hidden = nn.Sequential(*trunk)
        self.status = nn.Sequential(nn.Linear(width_in, 1), nn.Sigmoid())

    def forward(self, x):
        """Map a ``(batch, in_features)`` input to ``(batch, 1)`` probabilities."""
        return self.status(self.hidden(x))

In [23]:
# Instantiate the network and print a per-layer summary for a single-sample input.
model = PredictStatus(in_features)
summary_input_shape = (1, in_features)
torchinfo.summary(model, input_size=summary_input_shape)

Layer (type:depth-idx)                   Output Shape              Param #
PredictStatus                            [1, 1]                    --
├─Sequential: 1-1                        [1, 128]                  --
│    └─Linear: 2-1                       [1, 128]                  6,912
│    └─BatchNorm1d: 2-2                  [1, 128]                  256
│    └─ReLU: 2-3                         [1, 128]                  --
│    └─Linear: 2-4                       [1, 256]                  33,024
│    └─BatchNorm1d: 2-5                  [1, 256]                  512
│    └─Dropout: 2-6                      [1, 256]                  --
│    └─ReLU: 2-7                         [1, 256]                  --
│    └─Linear: 2-8                       [1, 512]                  131,584
│    └─BatchNorm1d: 2-9                  [1, 512]                  1,024
│    └─ReLU: 2-10                        [1, 512]                  --
│    └─Linear: 2-11                      [1, 256]                  1

## Data standardization

In [24]:
# Standardize every feature column with a StandardScaler fitted on the table.
# NOTE(review): the scaler is fitted on ALL rows before the train/test split in
# the next section, so held-out rows influence the scaling statistics — consider
# fitting on the training rows only.
scaler = StandardScaler()
scaler.fit(features)
# Build a new float frame instead of assigning into `features[:]`: writing float
# values into columns that were read as integers is a deprecated silent upcast
# in recent pandas releases. Column names and index are preserved.
features = pd.DataFrame(
    scaler.transform(features),
    columns=features.columns,
    index=features.index,
)

## Split datasets

In [25]:
# Hold out 10% of the rows as a test set; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.1,
    random_state=1,
)

## Cross validation

In [None]:
# 5-fold cross-validation on the training split: every fold trains a fresh
# PredictStatus model and saves its weights to MODEL_DIR.
BATCH_SIZE = 16
LEARNING_RATE = 1e-3
MODEL_DIR = "models"

kfold = KFold(n_splits=5, shuffle=True, random_state=1)
torch.manual_seed(13)
epochs = 1000

# Create the checkpoint folder before any training starts, in case save_model
# does not create it itself; otherwise a missing folder would only surface
# after a full fold has been trained.
os.makedirs(MODEL_DIR, exist_ok=True)

for fold, (train_id, val_id) in enumerate(kfold.split(x_train.index)):
    # KFold yields positional indices, hence .iloc.
    train_feature, train_label = x_train.iloc[train_id], y_train.iloc[train_id]
    val_feature, val_label = x_train.iloc[val_id], y_train.iloc[val_id]
    print("\n\n-------------This is fold {}----------------".format(fold))

    train_data = StatusDataset(train_feature, train_label)
    val_data = StatusDataset(val_feature, val_label)
    # One input per feature column; read it from the frame rather than
    # materialising a sample tensor just to measure its length.
    in_features = train_feature.shape[1]

    # NOTE(review): the meaning of the [0.1, 0] argument is defined by
    # create_weighted_sampler in TrainValidate.py — confirm there.
    train_sampler = create_weighted_sampler(train_label["Completed"].values, [0.1, 0])
    # NOTE(review): validation batches are also drawn through a weighted sampler
    # with drop_last=True — confirm this is intended for validation metrics.
    val_sampler = create_weighted_sampler(val_label["Completed"].values, [0.1, 0])

    # shuffle stays False because ordering is delegated to the samplers.
    train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=False, drop_last=True, sampler=train_sampler)
    val_loader = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False, drop_last=True, sampler=val_sampler)

    model = PredictStatus(in_features).to(device)
    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

    # The model ends in a Sigmoid, so BCELoss receives probabilities.
    train_validate = TrainValidate(model, nn.BCELoss(), optimizer)
    train_validate.set_loader(train_loader, val_loader)
    train_validate.train(epochs)

    train_validate.save_model("{}/status_append_ws_removed_full_no_freeze_fold_{}.pth".format(MODEL_DIR, fold))



-------------This is fold 0----------------
