# Colab commands

In [None]:
# Mount Google Drive inside the Colab runtime; the copy commands in the next
# cell read from this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Copy the training helper module, the three CSV files and a .pth file from the
# mounted Drive into /content.
!cp /content/drive/MyDrive/Colab\ Notebooks/build_model/TrainValidate.py /content
!cp /content/drive/MyDrive/Colab\ Notebooks/datasets/final_features_removed.csv /content
!cp /content/drive/MyDrive/Colab\ Notebooks/datasets/final_labels_removed.csv /content
!cp /content/drive/MyDrive/Colab\ Notebooks/datasets/final_sources_removed.csv /content
!cp /content/drive/MyDrive/all_removed_weighted_sampler_full_epoch_1000_fold_2.pth /content
# -p keeps the cell re-runnable: plain mkdir errors out once the directory exists.
!mkdir -p /content/models

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# TODO(review): pin the versions once the working ones are known.
%pip install --quiet torchinfo
%pip install --quiet torch_snippets

# Import packages

In [1]:
import os

import numpy as np
import pandas as pd
import seaborn as sns

import torch
from torch import nn
from torch import optim
from torch.utils.data import DataLoader, Dataset, WeightedRandomSampler

import torchinfo
from torch_snippets import Report
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from TrainValidate import TrainValidate, create_weighted_sampler

# Pick the compute device: CUDA first, then Apple MPS, otherwise CPU.
# torch.has_mps is deprecated; torch.backends.mps.is_available() is the
# supported runtime check. getattr guards torch builds without the MPS backend.
_mps_backend = getattr(torch.backends, "mps", None)

device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
elif _mps_backend is not None and _mps_backend.is_available():
    device = "mps"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Display the device string selected in the import cell.
device

'mps'

# Residual Connection
## Build the model

In [5]:
class ResidualBlock(nn.Module):
    """Fully connected residual block whose output width equals its input width.

    The inner stack is Linear -> ReLU -> Linear -> Dropout -> ReLU -> Linear ->
    BatchNorm1d. Its result is added to the block input and passed through a
    final ReLU.

    Args:
        in_features: Width of the input and of the output.
        neurons: Width of the two hidden linear layers.
        dropout: Drop probability of the dropout layer. Defaults to 0.5, the
            value that was previously hard-coded.
    """

    def __init__(self, in_features, neurons, dropout=0.5):
        super().__init__()

        # Sub-module names end up in the state_dict keys, so they are kept as-is.
        self.hidden = nn.Sequential()
        self.hidden.add_module("linear1", nn.Linear(in_features, neurons))
        self.hidden.add_module("a1", nn.ReLU())
        self.hidden.add_module("linear2", nn.Linear(neurons, neurons))
        self.hidden.add_module("dropout", nn.Dropout(dropout))
        self.hidden.add_module("a2", nn.ReLU())
        self.hidden.add_module("linear3", nn.Linear(neurons, in_features))
        self.hidden.add_module("bn1", nn.BatchNorm1d(in_features))

        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``relu(hidden(x) + x)``; the result has the same shape as ``x``."""
        out = self.hidden(x)
        # Out-of-place add: neither the BatchNorm output nor ``x`` is mutated.
        out = out + x

        return self.relu(out)

class PredictStatus(nn.Module):
    """Binary classifier: three residual blocks followed by Linear + Sigmoid.

    Args:
        in_features: Number of input features per sample.
    """

    def __init__(self, in_features):
        super().__init__()

        # Three residual blocks with hidden widths 256, 512 and 128; each one
        # maps in_features back to in_features.
        self.hidden = nn.Sequential()
        self.hidden.add_module("res1", ResidualBlock(in_features, 256))
        self.hidden.add_module("res2", ResidualBlock(in_features, 512))
        self.hidden.add_module("res3", ResidualBlock(in_features, 128))

        # Head: one logit per sample, squashed into (0, 1) by the sigmoid.
        self.status = nn.Sequential()
        self.status.add_module("linear1", nn.Linear(in_features, 1))
        self.status.add_module("a1", nn.Sigmoid())

    def forward(self, x):
        """Return one sigmoid output per row of ``x``, with the trailing size-1 axis removed."""
        hidden = self.hidden(x)
        out = self.status(hidden)

        # squeeze(-1) removes only the trailing size-1 axis. A bare squeeze()
        # would also drop the batch axis for a single-row batch, producing a
        # 0-d tensor that no longer lines up with a (1,)-shaped target.
        return out.squeeze(-1)

In [6]:
# Instantiate the network for a 54-wide input and show its layer summary for a
# single-row batch.
# NOTE(review): the dataset probe further down reports 53 features, so the 54
# used here is presumably illustrative only — confirm.
summary_width = 54
model = PredictStatus(summary_width)
torchinfo.summary(model, input_size=(1, summary_width))

Layer (type:depth-idx)                   Output Shape              Param #
PredictStatus                            --                        --
├─Sequential: 1-1                        [1, 54]                   --
│    └─ResidualBlock: 2-1                [1, 54]                   --
│    │    └─Sequential: 3-1              [1, 54]                   93,858
│    │    └─ReLU: 3-2                    [1, 54]                   --
│    └─ResidualBlock: 2-2                [1, 54]                   --
│    │    └─Sequential: 3-3              [1, 54]                   318,626
│    │    └─ReLU: 3-4                    [1, 54]                   --
│    └─ResidualBlock: 2-3                [1, 54]                   --
│    │    └─Sequential: 3-5              [1, 54]                   30,626
│    │    └─ReLU: 3-6                    [1, 54]                   --
├─Sequential: 1-2                        [1, 1]                    --
│    └─Linear: 2-4                       [1, 1]                    55
│ 

## Create class for collecting data

In [7]:
class StatusDataset(Dataset):
    """Map-style dataset yielding ``(feature_row, first_label_column)`` tensor pairs.

    Args:
        features: Object with ``__len__`` and ``to_numpy()`` (e.g. a DataFrame),
            one row per sample.
        labels: Object with ``__len__`` and ``to_numpy()`` of the same length.
            Only element 0 of each label row is returned as the target.
        target_device: Device the returned tensors are moved to. When omitted,
            the notebook-level ``device`` variable is used, as before.

    Note:
        The NumPy views of ``features`` and ``labels`` are taken once at
        construction; later changes to the originals are not guaranteed to be
        reflected.
    """

    def __init__(self, features, labels, target_device=None):
        assert len(features) == len(labels)
        self.features = features
        self.labels = labels

        # Convert once here: calling to_numpy() inside __getitem__ rebuilt the
        # array for the whole table on every single sample access.
        self._feature_array = features.to_numpy()
        self._label_array = labels.to_numpy()
        self._device = target_device

    def __getitem__(self, item):
        # Fall back to the notebook global only when no device was supplied.
        target = device if self._device is None else self._device

        features = torch.tensor(self._feature_array[item]).float().to(target)

        # Element 0 of the label row is the training target.
        label_row = self._label_array[item]
        status = torch.tensor(label_row[0]).float().to(target)

        return features, status

    def __len__(self):
        return len(self.features)

## Load datasets

In [8]:
# Read the feature table and the label table from the sibling datasets folder.
# NOTE(review): the features file is the "_prices" variant while the labels file
# is the "_removed" variant — confirm the two are meant to be paired.
features_csv = "../datasets/final_features_prices.csv"
labels_csv = "../datasets/final_labels_removed.csv"

features = pd.read_csv(features_csv)
labels = pd.read_csv(labels_csv)

## Data standardization

In [9]:
# Fit the scaler on the training rows only, then apply it to every row.
# Fitting on the full table lets statistics of the held-out test rows leak into
# the training features.
#
# The row positions reproduce the split made in the "Split datasets" cell:
# train_test_split partitions by row count, random_state and test_size alone,
# so the two arguments here must stay equal to the ones used in that cell.
train_positions, _held_out_positions = train_test_split(
    np.arange(len(features)), random_state=1, test_size=0.1
)

scaler = StandardScaler()
scaler.fit(features.iloc[train_positions])
features[:] = scaler.transform(features)

## Split datasets

In [10]:
# Hold out 10% of the rows as the test set; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.1,
    random_state=1,
)

In [11]:
# Wrap the training split in a dataset and read the feature count off the
# feature tensor of its first sample.
temp = StatusDataset(x_train, y_train)
first_sample = temp[0]
in_features = len(first_sample[0])
in_features

53

## Train the model

In [12]:
# 5-fold cross-validation over the training split: each fold trains a fresh
# model and saves it under models/.
kfold = KFold(n_splits=5, shuffle=True, random_state=1)
torch.manual_seed(13)
epochs = 1000
batch_size = 16
learning_rate = 5e-4

# Create the output folder up front so the save at the end of a fold does not
# depend on the folder already existing. save_model lives in TrainValidate.py
# (not shown here); presumably it does not create missing directories.
os.makedirs("models", exist_ok=True)

for fold, (train_id, val_id) in enumerate(kfold.split(x_train.index)):
    train_feature, train_label = x_train.iloc[train_id], y_train.iloc[train_id]
    val_feature, val_label = x_train.iloc[val_id], y_train.iloc[val_id]
    print("\n\n-------------This is fold {}----------------".format(fold))

    train_data = StatusDataset(train_feature, train_label)
    val_data = StatusDataset(val_feature, val_label)
    in_features = len(train_data[0][0])

    # NOTE(review): the validation loader also uses a weighted sampler and
    # drop_last=True, so validation metrics are presumably computed on a
    # resampled subset of the fold — confirm this is intended.
    train_sampler = create_weighted_sampler(train_label["Completed"].values)
    val_sampler = create_weighted_sampler(val_label["Completed"].values)

    # shuffle stays False because the sampler decides the draw order.
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=False, drop_last=True, sampler=train_sampler)
    val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False, drop_last=True, sampler=val_sampler)

    # Fresh model and optimizer per fold so no weights carry over between folds.
    model = PredictStatus(in_features).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    train_validate = TrainValidate(model, nn.BCELoss(), optimizer)
    train_validate.set_loader(train_loader, val_loader)
    train_validate.train(epochs)

    train_validate.save_model("models/status_append_ws_removed_full_no_freeze_fold_{}.pth".format(fold))



-------------This is fold 0----------------
EPOCH: 0.364	train_loss: 0.567	(7.22s - 19826.07s remaining))

KeyboardInterrupt: 