In [19]:
import torch
from torch import nn
import numpy as np
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [6]:
class Preprocess:
    def __init__(self):
        self.original_df = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.X_all = None
        self.scaler = None
    def pre_process(self,df,outcome,features, test_size, seed ):
        '''
        encode categorical variables, split training data
        df is the whole df before prediction
        outcome is the outcome variable
        features are the fetires used for prediction
        '''
        self.original_df = df.copy()
        cols = [outcome,*features]
        self.train_df = df[cols]
        y = self.train_df.loc[:,outcome] 
        X = self.train_df.loc[:,self.train_df.columns != outcome]
        self.X_all = X.copy()
        #split data, 30% test data
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=test_size, random_state= seed) 
        #standardize it 
        dummy = ['VC Dummy','Internet dummy' , 'top_tier_uw']
        stand_col = [col for col in features if col not in  dummy]
        self.scaler = preprocessing.StandardScaler().fit(self.X_train[stand_col])
        transformed  = pd.DataFrame(self.scaler.transform(self.X_all[stand_col]), columns = stand_col)
        self.X_all[stand_col] = transformed  
        self.X_train = self.X_all.loc[self.y_train.index]
        self.X_test = self.X_all.loc[self.y_test.index]
        #label test data
        self.original_df['is_test'] = 0 
        self.original_df.loc[self.y_test.index,'is_test'] = 1 
        self.original_df['is_train'] = 0 
        self.original_df.loc[self.y_train.index,'is_train'] = 1 

In [7]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(input_dim, 20),
            nn.ReLU(),
            nn.Linear(20, 20),
            nn.ReLU(),
            nn.Linear(20, 1),
        )
    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

In [28]:
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y.unsqueeze(1))

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 3 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")



In [29]:
def model_eval(model, dataloader):
    y_pred_list = []
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            y_pred_list.append(pred.numpy())
    y_pred_list = [i.item() for i in y_pred_list]
    y_pred = np.asarray(y_pred_list)
    dict_me = {}
    dict_me['MSE'] = mean_squared_error(y_test, y_pred)
    dict_me['RMSE'] = np.sqrt(dict_me['MSE'])
    dict_me['L1'] = mean_absolute_error(y_test, y_pred)
    metric_df = pd.DataFrame.from_dict(dict_me , orient = 'index')
    return metric_df

In [30]:
df = pd.read_csv('../Data_clean/Final_Train/IPO_train.csv')

In [31]:
outcome = '1st_Day_Return'
seed = 123
features = ['Star_Ratings','VC Dummy','Internet dummy', 'firm_age',
            'top_tier_uw','perc_price_above','ASVI','mean_SVI',
            'week_-8','week_-7','week_-6','week_-5','week_-4','week_-3','week_-2','week_-1']
process = Preprocess()
process.pre_process(df = df,outcome = outcome,features = features,test_size =0.3, seed = seed)
X_train = process.X_train
y_train = process.y_train
X_test  = process.X_test
y_test  = process.y_test
X_all = process.X_all

In [32]:
train_data = TensorDataset(torch.Tensor(np.array(X_train)), torch.Tensor(np.array(y_train)))
test_data =  TensorDataset(torch.Tensor(np.array(X_test)), torch.Tensor(np.array(y_test)))
train_dataloader = DataLoader(train_data, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=1, shuffle=True)

In [36]:
input_dim = X_train.shape[1]
model = NeuralNetwork()
learning_rate = 1e-3
# loss_fn = nn.MSELoss()
loss_fn = nn.L1Loss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [37]:
epochs = 10
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
#     x = test_loop(test_dataloader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 0.302808  [    0/ 1875]
loss: 0.312887  [  192/ 1875]
loss: 0.313194  [  384/ 1875]
loss: 0.253380  [  576/ 1875]
loss: 0.266612  [  768/ 1875]
loss: 0.249152  [  960/ 1875]
loss: 0.302966  [ 1152/ 1875]
loss: 0.254347  [ 1344/ 1875]
loss: 0.265055  [ 1536/ 1875]
loss: 0.368494  [ 1728/ 1875]
Epoch 2
-------------------------------
loss: 0.297879  [    0/ 1875]
loss: 0.225749  [  192/ 1875]
loss: 0.292447  [  384/ 1875]
loss: 0.279045  [  576/ 1875]
loss: 0.283052  [  768/ 1875]
loss: 0.222547  [  960/ 1875]
loss: 0.270104  [ 1152/ 1875]
loss: 0.217556  [ 1344/ 1875]
loss: 0.237756  [ 1536/ 1875]
loss: 0.258639  [ 1728/ 1875]
Epoch 3
-------------------------------
loss: 0.251476  [    0/ 1875]
loss: 0.217791  [  192/ 1875]
loss: 0.213462  [  384/ 1875]
loss: 0.257765  [  576/ 1875]
loss: 0.195305  [  768/ 1875]
loss: 0.267275  [  960/ 1875]
loss: 0.243659  [ 1152/ 1875]
loss: 0.189252  [ 1344/ 1875]
loss: 0.240347  [ 1536/ 1875]
loss: 0.16

In [38]:
model_eval(model , test_dataloader )

Unnamed: 0,0
MSE,0.084082
RMSE,0.289968
L1,0.170672
