In [1]:
import torch
from torch import nn
import numpy as np
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [2]:
class Preprocess:
    def __init__(self):
        self.original_df = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.X_all = None
        self.scaler = None
    def pre_process(self,df,outcome,features, test_size, seed ):
        '''
        encode categorical variables, split training data
        df is the whole df before prediction
        outcome is the outcome variable
        features are the fetires used for prediction
        '''
        self.original_df = df.copy()
        cols = [outcome,*features]
        self.train_df = df[cols]
        y = self.train_df.loc[:,outcome] 
        X = self.train_df.loc[:,self.train_df.columns != outcome]
        self.X_all = X.copy()
        #split data, 30% test data
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=test_size, random_state= seed) 
        #standardize it 
        dummy = ['VC Dummy','Internet dummy' , 'top_tier_uw']
        stand_col = [col for col in features if col not in  dummy]
        self.scaler = preprocessing.StandardScaler().fit(self.X_train[stand_col])
        transformed  = pd.DataFrame(self.scaler.transform(self.X_all[stand_col]), columns = stand_col)
        self.X_all[stand_col] = transformed  
        self.X_train = self.X_all.loc[self.y_train.index]
        self.X_test = self.X_all.loc[self.y_test.index]
        #label test data
        self.original_df['is_test'] = 0 
        self.original_df.loc[self.y_test.index,'is_test'] = 1 
        self.original_df['is_train'] = 0 
        self.original_df.loc[self.y_train.index,'is_train'] = 1 

In [3]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(input_dim, 12),
            nn.ReLU(),
            nn.Linear(12, 12),
            nn.ReLU(),
            nn.Linear(12, 1),
        )
    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

In [4]:
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y.unsqueeze(1))

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 3 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")


In [5]:
def model_eval(model, dataloader):
    y_pred_prob_list = []
    y_pred_tag_list = []
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            y_test_prob = torch.sigmoid(pred)
            y_test_tag = torch.round(y_test_prob)
            y_pred_prob_list.append(y_test_prob.numpy())
            y_pred_tag_list.append(y_test_tag.numpy())
    y_pred_prob_list = [i.item() for i in y_pred_prob_list]
    y_pred_tag_list = [i.item() for i in y_pred_tag_list]
    y_prob = np.asarray(y_pred_prob_list)
    y_tag = np.asarray(y_pred_tag_list)
    dict_me = {}
    dict_me['accuracy'] = accuracy_score(y_test, y_tag)
    dict_me['roc_auc_score'] = roc_auc_score(y_test,y_prob)
    dict_me['F_1'] = f1_score(y_test, y_tag)
    tn, fp, fn, tp = confusion_matrix(y_test, y_tag ).ravel()
    dict_me['FPR'] = fp / (fp+tn)
    dict_me['FNR'] = fn / (fn+tp)
    metric_df = pd.DataFrame.from_dict(dict_me , orient = 'index')
    return metric_df
            
            
    


    

In [6]:
df = pd.read_csv('../Data_clean/Final_Train/IPO_train.csv')

In [7]:
outcome = 'high_return'
seed = 123
features = ['Star_Ratings','VC Dummy','Internet dummy', 'firm_age',
            'top_tier_uw','perc_price_above','ASVI','mean_SVI',
            'week_-8','week_-7','week_-6','week_-5','week_-4','week_-3','week_-2','week_-1']
process = Preprocess()
process.pre_process(df = df,outcome = outcome,features = features,test_size =0.3, seed = seed)
X_train = process.X_train
y_train = process.y_train
X_test  = process.X_test
y_test  = process.y_test
X_all = process.X_all

In [8]:
train_data = TensorDataset(torch.Tensor(np.array(X_train)), torch.Tensor(np.array(y_train)))
test_data =  TensorDataset(torch.Tensor(np.array(X_test)), torch.Tensor(np.array(y_test)))

In [9]:
train_dataloader = DataLoader(train_data, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=1, shuffle=True)

In [10]:
input_dim = X_train.shape[1]
model = NeuralNetwork()
learning_rate = 1e-3
weight_decay = 10
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate , weight_decay=weight_decay)

In [11]:
epochs = 10
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
#     x = test_loop(test_dataloader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 0.646668  [    0/ 1875]
loss: 0.669082  [  192/ 1875]
loss: 0.671367  [  384/ 1875]
loss: 0.648750  [  576/ 1875]
loss: 0.679674  [  768/ 1875]
loss: 0.649451  [  960/ 1875]
loss: 0.677230  [ 1152/ 1875]
loss: 0.647235  [ 1344/ 1875]
loss: 0.666130  [ 1536/ 1875]
loss: 0.655227  [ 1728/ 1875]
Epoch 2
-------------------------------
loss: 0.660984  [    0/ 1875]
loss: 0.677194  [  192/ 1875]
loss: 0.645196  [  384/ 1875]
loss: 0.658067  [  576/ 1875]
loss: 0.674633  [  768/ 1875]
loss: 0.659110  [  960/ 1875]
loss: 0.685981  [ 1152/ 1875]
loss: 0.688555  [ 1344/ 1875]
loss: 0.662378  [ 1536/ 1875]
loss: 0.666424  [ 1728/ 1875]
Epoch 3
-------------------------------
loss: 0.677366  [    0/ 1875]
loss: 0.671227  [  192/ 1875]
loss: 0.682507  [  384/ 1875]
loss: 0.670757  [  576/ 1875]
loss: 0.683280  [  768/ 1875]
loss: 0.685208  [  960/ 1875]
loss: 0.677738  [ 1152/ 1875]
loss: 0.680109  [ 1344/ 1875]
loss: 0.687370  [ 1536/ 1875]
loss: 0.67

In [12]:
metric = model_eval(model, test_dataloader)

In [13]:
metric

Unnamed: 0,0
accuracy,0.656716
roc_auc_score,0.515968
F_1,0.0
FPR,0.0
FNR,1.0
