In [1]:
import pandas as pd
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler

In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 200)

In [3]:
df = pd.read_pickle('mortage_data.pkl')

In [4]:
float_columns = ['Original Interest Rate', 'Current Interest Rate', 'Original UPB', 'Current Actual UPB', 
'UPB at the Time of Removal', 'Total Principal Current', 'Last Paid Installment Date', 'Foreclosure Date', 'Mortgage Insurance Percentage']

In [5]:
one_hot_columns = [ 'Channel', 'Seller Name', 'Servicer Name', 
'First Time Home Buyer Indicator', 'Loan Purpose', 'Property Type', 'Occupancy Status', 'Property State', 'Amortization Type', 
'Prepayment Penalty Indicator', 'Interest Only Loan Indicator', 'Current Loan Delinquency Status', 'Modification Flag', 'Relocation Mortgage Indicator', 'High Balance Loan Indicator', 
'Borrower Assistance Plan', 'High Loan to Value (HLTV) Refinance Option Indicator', 'Repurchase Make Whole Proceeds Flag', 'Alternative Delinquency Resolution', 
'Alternative Delinquency Resolution Count', 'Borrower Education Level', 'Borrower Employmeny']

In [6]:
drop_columns = ['Loan Payment History', 'Loan Identifier', 'Zero Balance Effective Date', 'Zero Balance Code']

In [7]:
for col in float_columns:
    df[col] = pd.to_numeric(df[col],errors='coerce').fillna(-1)

In [8]:
for col in one_hot_columns:
    df = pd.concat([df.drop(columns=[col]), pd.get_dummies(df[col], prefix=col + ' ')],axis=1)

In [9]:
class Batcher:
  def __init__(self, num_items, batch_size, seed=0):
    self.indices = np.arange(num_items)
    self.num_items = num_items
    self.batch_size = batch_size
    self.rnd = np.random.RandomState(seed)
    self.rnd.shuffle(self.indices)
    self.ptr = 0
  def __iter__(self):
    return self
  def __next__(self):
    if self.ptr + self.batch_size > self.num_items:
      self.rnd.shuffle(self.indices)
      self.ptr = 0
      raise StopIteration  
    else:
      result = self.indices[self.ptr:self.ptr+self.batch_size]
      self.ptr += self.batch_size
      return result

In [10]:
class DataProcessor(torch.utils.data.Dataset):
    def __init__(self, X, y, scale_data=True):
        if not torch.is_tensor(X) and not torch.is_tensor(y):
            if scale_data:
                X = StandardScaler().fit_transform(X)
            self.X = torch.from_numpy(X.astype(float))
            self.y = torch.from_numpy(y.astype(float))
      
    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        return self.X[i], self.y[i]

In [16]:
class MLP(torch.nn.Module):

  def __init__(self):
    super().__init__()
    self.layers = torch.nn.Sequential(
      torch.nn.Linear(370, 200),
      torch.nn.ReLU(),
      torch.nn.Linear(200, 100),
      torch.nn.ReLU(),
      torch.nn.Linear(100, 50),
      torch.nn.ReLU(),
      torch.nn.Linear(50, 20),
      torch.nn.ReLU(),
      torch.nn.Linear(20, 10),
      torch.nn.ReLU(),
      torch.nn.Linear(10, 1)
    )

  def forward(self, x):
    '''
      Forward pass
    '''
    return self.layers(x)

In [17]:
def downsample(data_x, data_y, num):

    ind = np.r_[:len(data_y)][data_y>0]
    idx = np.r_[:len(data_y)][data_y<=0]
    data_x = np.hstack((data_x[np.random.choice(ind, num)], data_x[np.random.choice(idx, num)])) 
    data_y = np.hstack((data_y[np.random.choice(ind, num)], data_y[np.random.choice(idx, num)]))

    return data_x, data_y

In [18]:
torch.manual_seed(42)

<torch._C.Generator at 0x20bfd4a99f0>

In [19]:
data_x = df.drop(columns=drop_columns+['Total Deferral Amount']).values
data_y = df['Total Deferral Amount'].fillna(0).values
original_dataset = DataProcessor(data_x, data_y, scale_data=False)
data_x_rebalanced, data_y_rebalanced = downsample(data_x, data_y, 500000)
dataset = DataProcessor(data_x, data_y, scale_data=False)
net = MLP()
net = net.float()
net = net.train() 
trainloader = torch.utils.data.DataLoader(dataset, batch_size=100000, shuffle=True, num_workers=1)
loss_function = torch.nn.MSELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)

In [None]:

for epoch in range(0, 10): 
    
    # Iterate over the DataLoader for training data
    for i, data in enumerate(trainloader.batch_sampler):
      
      inputs = trainloader.dataset.X[data]
      targets = trainloader.dataset.y[data]
      inputs, targets = inputs.float(), targets.float()
      targets = targets.reshape((targets.shape[0], 1))
      
      # Zero the gradients
      optimizer.zero_grad()
      
      # Perform forward pass
      outputs = net(inputs)
      
      # Compute loss
      loss = loss_function(outputs, targets)
      
      # Perform backward pass
      loss.backward()
      
      # Perform optimization
      optimizer.step()
      
      # Print statistics
      print(f'\r Loss at epoch {epoch + 1}: {loss.item()}', end='')


RuntimeError: mat1 and mat2 shapes cannot be multiplied (100000x370 and 372x200)