In [1]:
#!/usr/bin/env python
import numpy as np
import pandas as pd
import argparse
import copy
import torch
import torch.nn as nn
import time
import sys
import os
from eval_sparseloader import DataLoader
from eval_data import LibSVMData, LibCSVData, LibSVMRegData
from eval_sparse_data import LibSVMDataSp
sys.path.append('/Users/nelvis/Documents/R/GrowNet/Regression')
from models.mlp import MLP_1HL, MLP_2HL, MLP_3HL
from models.dynamic_net import DynamicNet, ForwardType
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from torch.optim import SGD, Adam

# Prepare Data

In [2]:
# Source table for the whole notebook; its train_ind column is used further down
# to separate training rows from held-out rows.
# NOTE(review): absolute, machine-specific path.
df = pd.read_csv('/Users/nelvis/Documents/R/GrowNet/Regression/data/datwTestTrainSplit.csv')

# Input columns, in the order they end up in the saved feature matrix.
list_of_features = [
    "claim_no",
    "occurrence_time", 
    "notidel", 
    "development_period", 
    "pmt_no",
    "log1_paid_cumulative",
    "max_paid_dev_factor",
    "min_paid_dev_factor",
]
# Target column, once as a one-element list and once as a bare name.
# Only `youtput` is referenced by the cells visible in this notebook.
output_field = ["claim_size"]
youtput="claim_size"

In [79]:
# Split on the precomputed train_ind flag: rows with train_ind == 1 form the
# training set, rows with train_ind == 0 are held out for the final validation.
# df_one is simply the first row of df, whatever its train_ind value is.
model_columns = list_of_features + [youtput]

# Select the relevant columns by name. Plain label indexing raises KeyError when
# a column is missing, whereas DataFrame.filter(items=...) silently drops it and
# would shift every positional index used afterwards.
df_train = df.query('train_ind == 1')[model_columns]
df_test = df.query('train_ind == 0')[model_columns]
df_one = df.head(1)[model_columns]

# Extract values by column name instead of the hard-coded positions 0..7 / 8:
# features become a 2-D NumPy array, the target stays a one-column DataFrame.
X_train, y_train = df_train[list_of_features].values, df_train[[youtput]]
X_test, y_test = df_test[list_of_features].values, df_test[[youtput]]
X_one, y_one = df_one[list_of_features].values, df_one[[youtput]]

In [80]:
# Save each split as an .npz archive holding two arrays: 'features' and 'labels'.
# The file names really do start with a backslash character. Raw strings keep
# that exact name while avoiding the invalid "\d" escape sequence, which newer
# Python versions report as a SyntaxWarning.
# NOTE(review): these are relative paths, so the archives land in the current
# working directory, while the Options cell reads them from .../Regression/ckpt/.
# Confirm the notebook is run from that directory.
np.savez(r'\datwTestTrainSplit_train.npz', features=X_train, labels=y_train)
np.savez(r'\datwTestTrainSplit_test.npz', features=X_test, labels=y_test)
np.savez(r'\datwTestTrainSplit_one.npz', features=X_one, labels=y_one)

# Load Model

In [170]:
class Options:
    """Plain attribute bag standing in for the command-line options object.

    Attributes set by the constructor:
        feat_d, hidden_d: stored unchanged from the arguments of the same name.
        sparse: always False.
        batch_size, normalization: stored unchanged.
        tr, te, one: the ``train``, ``test`` and ``one`` arguments respectively
            (used as dataset file paths in this notebook).
    """

    def __init__(self, feat_d, hidden_d, batch_size, normalization, train, test, one) -> None:
        # Dimensions
        self.feat_d, self.hidden_d = feat_d, hidden_d

        # Data handling switches; sparse input is never enabled here.
        self.sparse = False
        self.batch_size, self.normalization = batch_size, normalization

        # Dataset locations, stored under abbreviated attribute names.
        self.tr, self.te, self.one = train, test, one

In [176]:
# Evaluation settings. feat_d=8 matches the eight entries of list_of_features.
# The train/test file names contain a literal backslash; it is written as "\\"
# so the path is unchanged while the invalid "\d" escape sequence is avoided.
# NOTE(review): `one` points at .../GrowNet/data/datwTestTrainSplit_tr.npz, not
# at the \datwTestTrainSplit_one.npz archive saved earlier. Confirm that this is
# intended.
opt = Options(
    feat_d=8,
    hidden_d=32,
    batch_size=2048,
    normalization=True,
    train="/Users/nelvis/Documents/R/GrowNet/Regression/ckpt/\\datwTestTrainSplit_train.npz",
    test="/Users/nelvis/Documents/R/GrowNet/Regression/ckpt/\\datwTestTrainSplit_test.npz",
    one="/Users/nelvis/Documents/R/GrowNet/data/datwTestTrainSplit_tr.npz",
)

In [177]:
def _load_split(npz_path):
    """Build a LibSVMRegData dataset for one file using the shared options."""
    return LibSVMRegData(npz_path, opt.feat_d, opt.normalization)


# One dataset object per configured file, created in the order train, test, one.
train = _load_split(opt.tr)
test = _load_split(opt.te)
one = _load_split(opt.one)

8
8
8


In [178]:
def _make_loader(dataset):
    """Wrap a dataset in a DataLoader with the settings shared by all three splits."""
    return DataLoader(
        dataset,
        opt.batch_size,
        shuffle=True,
        drop_last=False,
        num_workers=0,
    )


# Every loader shuffles, including the evaluation ones, and keeps the last
# partial batch.
train_loader = _make_loader(train)
test_loader = _make_loader(test)
one_loader = _make_loader(one)

In [179]:
# Build the ensemble from the saved checkpoint file. The callback passed to
# DynamicNet.from_file creates an MLP_2HL model for a given stage using `opt`.
# NOTE(review): the checkpoint path is relative to the working directory.
net_ensemble = DynamicNet.from_file("./datwTestTrainSplit_cls.pth", lambda stage: MLP_2HL.get_model(stage, opt))

In [180]:
# Presumably switches the ensemble to evaluation mode before scoring.
# TODO confirm in models.dynamic_net.
net_ensemble.to_eval()

In [174]:
def root_mse(net_ensemble, loader):
    """Return the root-mean-squared error of ``net_ensemble`` over ``loader``.

    Squared errors are summed across all batches and divided by the total
    number of samples, so the result is the RMSE over the whole dataset rather
    than an average of per-batch values. The sum is accumulated in float64 so
    that large targets do not lose precision when squared.

    Args:
        net_ensemble: Object whose ``forward(x)`` returns a 2-tuple; the second
            element is used as the prediction tensor, one value per sample.
        loader: Iterable yielding ``(x, y)`` batches, where ``y`` is a tensor
            holding one target per sample.

    Returns:
        sqrt(sum of squared errors / number of samples).

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    sq_err_sum = 0.0
    total = 0

    for x, y in loader:
        # Inference only: no autograd graph is needed for the forward pass.
        with torch.no_grad():
            _, out = net_ensemble.forward(x)
        n_batch = len(y)
        # Force both sides to (n_batch, 1) so the subtraction cannot broadcast
        # a 1-D prediction against a 2-D target into an (n, n) matrix.
        targets = y.cpu().numpy().reshape(n_batch, 1).astype(np.float64)
        preds = out.cpu().numpy().reshape(n_batch, 1).astype(np.float64)
        sq_err_sum += float(np.sum((targets - preds) ** 2))
        total += n_batch

    if total == 0:
        raise ValueError("root_mse: loader yielded no samples")
    return np.sqrt(sq_err_sum / total)

In [181]:
# RMSE of the loaded ensemble on the `one` dataset.
# NOTE(review): the recorded value (~7.67e6) is far above the ~2e5 RMSE values
# in the training log loaded at the end of this notebook. Check that the
# checkpoint, the data file and the normalization match the training run.
root_mse(net_ensemble, one_loader)

7666172.336520171

In [130]:
# Run the ensemble over every training batch, keeping predictions and targets
# in matching order, then concatenate each list once at the end.
pred_chunks = []
target_chunks = []

for x, y in train_loader:
    with torch.no_grad():
        _, batch_pred = net_ensemble.forward(x)
    pred_chunks.append(batch_pred)
    target_chunks.append(y)

# The leading empty tensor keeps the result defined even when no batch was seen.
out = torch.cat([torch.empty(0), *pred_chunks])
orig = torch.cat([torch.empty(0), *target_chunks])

In [142]:
# Quick look at the collected targets as a flat 1-D NumPy array.
orig.numpy().flatten()

array([484789.3   ,  29068.596 ,  35964.926 , ..., 630765.5   ,
        62283.812 ,   1927.2223], dtype=float32)

In [143]:
# Put each collected target next to its prediction. Both tensors are flattened
# to 1-D first so the two columns line up row by row.
df_results = pd.DataFrame({'Original_Values' : orig.numpy().flatten(), 'Predictions' : out.numpy().flatten()})

In [145]:
# Display the target/prediction table.
# NOTE(review): in the recorded output the predictions are in the millions and
# often negative, while the targets are in the thousands to hundreds of
# thousands.
df_results

Unnamed: 0,Original_Values,Predictions
0,484789.312500,-12277146.00
1,29068.595703,6581229.00
2,35964.925781,-1721630.75
3,136629.531250,7238783.50
4,4010.083740,5038468.50
...,...,...
85608,5309.017090,-13150435.00
85609,12193.708008,7666482.50
85610,630765.500000,2245062.75
85611,62283.812500,7758845.00


In [124]:
# Shape check: the collected predictions are 1-D in the recorded output.
out.shape

torch.Size([85613])

In [125]:
# Shape check: the collected targets are 2-D with a trailing singleton
# dimension in the recorded output.
orig.shape

torch.Size([85613, 1])

In [126]:
# Shape check: the row count of the training labels should equal the number of
# collected predictions and targets.
y_train.shape

(85613, 1)

# Load results from training

In [29]:
# Open the .npz archive of RMSE values (presumably written by the training run).
# NOTE(review): absolute, machine-specific path.
training_results = np.load('/Users/nelvis/Documents/R/GrowNet/Regression/results/datwTestTrainSplit_rmse.npz')

In [39]:
# Show the 'rmse' array stored in the archive.
# NOTE(review): the meaning of the columns is not documented here, and the
# third column is all zeros in the recorded output.
training_results['rmse']

array([[266753.62, 185097.88,      0.  ],
       [266623.97, 185000.1 ,      0.  ],
       [266328.2 , 184752.27,      0.  ],
       [265836.53, 184337.92,      0.  ],
       [264906.4 , 183530.72,      0.  ],
       [264030.3 , 182802.86,      0.  ],
       [262574.4 , 181622.38,      0.  ],
       [261187.27, 180582.42,      0.  ],
       [260015.52, 179691.61,      0.  ],
       [255709.88, 176111.67,      0.  ],
       [250062.03, 171962.92,      0.  ],
       [250889.73, 172617.03,      0.  ],
       [242953.5 , 166382.03,      0.  ],
       [232219.  , 156184.23,      0.  ],
       [224283.4 , 151828.38,      0.  ],
       [213708.12, 144387.7 ,      0.  ],
       [215110.75, 145047.89,      0.  ],
       [208653.31, 140304.14,      0.  ],
       [200601.95, 131433.23,      0.  ],
       [199986.05, 133467.58,      0.  ]], dtype=float32)