In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import pandas as pd
import math


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# import data
def _load_csv(path):
    """Parse a comma-separated text file with np.genfromtxt and return the array."""
    return np.genfromtxt(path, delimiter=',')


# pretraining set, labelled training set, and the unlabelled prediction features
X_pretrain_import = _load_csv('pretrain_features.csv')
y_pretrain_import = _load_csv('pretrain_labels.csv')
X_train_import = _load_csv('train_features.csv')
y_train_import = _load_csv('train_labels.csv')

X_predict_import = _load_csv('test_features.csv')

In [None]:
# Row 0 is skipped everywhere (presumably the CSV header row -- TODO confirm).
# Feature tables additionally skip their first two columns, label tables their first column.
data_rows = slice(1, None)
feature_cols = slice(2, None)
label_cols = slice(1, None)

X_pretrain = X_pretrain_import[data_rows, feature_cols]
y_pretrain = y_pretrain_import[data_rows, label_cols]
X_train = X_train_import[data_rows, feature_cols]
y_train = y_train_import[data_rows, label_cols]

X_predict = X_predict_import[data_rows, feature_cols]

# Column 0 of the prediction table is kept separately; it is used later as the index of the output file.
X_predict_names = X_predict_import[data_rows, 0]

# Shape overview: names first, then each (features, labels) pair followed by a blank line.
print(X_predict_names.shape)
for features, labels in ((X_pretrain, y_pretrain), (X_train, y_train)):
    print(features.shape)
    print(labels.shape)
    print()
print(X_predict.shape)

In [None]:
# train test split for pretraining data (lumo)
# train_test_split returns (X_train, X_test, y_train, y_test) in that order; the
# held-out fraction given by test_size (20 %) is the *second* array of each pair.
# Unpacking in this order makes the model pretrain on 80 % and validate on 20 %.
X_pretrain_train, X_pretrain_test, y_pretrain_train, y_pretrain_test = train_test_split(
    X_pretrain, y_pretrain, test_size=0.2, random_state=42
)

In [None]:
# train test split for training data (homo-lumo)
# The split is taken from the imported arrays (same slicing that defines X_train / y_train)
# instead of from X_train / y_train themselves, so that re-running this cell does not
# keep halving the data. train_test_split returns (train, test) pairs in that order.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_import[1:, 2:], y_train_import[1:, 1:], test_size=0.5, random_state=42
)

In [None]:
# shapes of the fine-tuning split: test features, train features, test labels, train labels
for split_array in (X_test, X_train, y_test, y_train):
    print(split_array.shape)

In [None]:
# dataloader
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs each feature row with its label row.

    The base class is referenced by its fully qualified name because this class
    reuses the name ``Dataset``; inheriting from the bare name would make a
    re-run of this cell subclass the previous definition of itself.

    Args:
        labels: 2-D array-like of targets, indexed as ``labels[index, :]``.
        data: 2-D array-like of features, indexed as ``data[index, :]``.
    """

    def __init__(self, labels, data):
        self.labels = labels
        self.data = data

    def __len__(self):
        """Number of samples, taken from the first dimension of ``labels``."""
        return self.labels.shape[0]

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at row ``index``."""
        X = self.data[index, :]
        y = self.labels[index, :]

        return X, y

In [None]:
# data generators
# Batching options per loader; all three shuffle and load in the main process.
params_pretrain = dict(batch_size=500, shuffle=True, num_workers=0)
params_validate = dict(batch_size=1000, shuffle=True, num_workers=0)
params_train = dict(batch_size=100, shuffle=True, num_workers=0)

# pretraining: optimisation batches
training_set = Dataset(labels=y_pretrain_train, data=X_pretrain_train)
training_generator = DataLoader(training_set, **params_pretrain)

# pretraining: held-out batches used for monitoring
validation_set = Dataset(labels=y_pretrain_test, data=X_pretrain_test)
validation_generator = DataLoader(validation_set, **params_validate)

# fine-tuning batches
final_set = Dataset(labels=y_train, data=X_train)
final_generator = DataLoader(final_set, **params_train)

In [None]:
# simple neural network
class Net(nn.Module):
    """Fully connected regressor: in_features -> 500 -> 250 -> 100 -> 50 -> 1.

    ReLU follows every layer except the last; dropout is applied after the
    first and third hidden activations only.

    Args:
        in_features: width of the input vectors (default 1000).
        dropout_p: dropout probability (default 0.3).
    """

    def __init__(self, in_features=1000, dropout_p=0.3):
        super().__init__()
        # Layers are created in fc1..fc5 order and keep these attribute names,
        # because other cells address them individually (e.g. net.fc4).
        self.fc1 = nn.Linear(in_features, 500)
        self.fc2 = nn.Linear(500, 250)
        self.fc3 = nn.Linear(250, 100)
        self.fc4 = nn.Linear(100, 50)
        self.fc5 = nn.Linear(50, 1)
        self.activation_fn = nn.ReLU()
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, x):
        """Map a ``(batch, in_features)`` float tensor to ``(batch, 1)`` predictions."""
        x = self.dropout(self.activation_fn(self.fc1(x)))
        x = self.activation_fn(self.fc2(x))
        x = self.dropout(self.activation_fn(self.fc3(x)))
        x = self.activation_fn(self.fc4(x))
        # linear output layer: no activation, the target is an unbounded real value
        return self.fc5(x)

# Seed torch's global RNG before the model is built so that weight initialisation,
# dropout masks and DataLoader shuffling are repeatable when the notebook is run
# top to bottom (42 matches the random_state used for the data splits).
torch.manual_seed(42)

# optimizer Adam
net = Net()
optimizer = optim.Adam(net.parameters(), lr=0.0005, weight_decay=0.3)

In [None]:
# define loss
criterion = nn.MSELoss()  # mean squared error between predictions and labels

# NOTE(review): assigning `requires_grad` on an nn.Module only stores a plain Python
# attribute on the module object; it does not change the `requires_grad` flag of the
# layer's parameters, so fc4 and fc5 still receive gradient updates during pretraining.
# `Module.requires_grad_(False)` would be needed to really freeze them -- confirm
# whether that is intended before changing it.
for head_layer in (net.fc4, net.fc5):
    head_layer.requires_grad = False

In [None]:
# pretraining loop
max_epochs = 100
losses = list()      # training loss, one entry per optimisation step
validation = list()  # loss on one randomly drawn validation batch, one entry per step
for epoch in tqdm(range(max_epochs)):
    for local_batch, local_labels in training_generator:
        # --- optimisation step (dropout active) ---
        net.train()
        optimizer.zero_grad()   # zero the gradient buffers
        output = net(local_batch.float())
        loss = criterion(output, local_labels.float())
        loss.backward()
        optimizer.step()
        losses.append(loss.item())

        # --- monitoring ---
        # Evaluate with dropout disabled and without building an autograd graph,
        # so the curve reflects the model itself rather than a random dropout mask.
        net.eval()
        with torch.no_grad():
            validation_features, validation_labels = next(iter(validation_generator))
            validation_output = net(validation_features.float())
            validation_loss = criterion(validation_output, validation_labels.float())
        validation.append(validation_loss.item())

# leave the model in training mode, which is the state later cells start from
net.train()

print(loss)
print(validation_loss)

In [None]:
# plot pretrain loss
fig, ax = plt.subplots()
ax.plot(losses, color='blue', label='training batch')
ax.plot(validation, color='red', label='validation batch')
ax.set(xlabel='optimisation step', ylabel='MSE loss', title='Pretraining loss')
ax.legend()
plt.show()

In [None]:
# freeze weights of first layers
# Assigning `layer.requires_grad = False` on an nn.Module only creates a plain
# attribute and leaves the parameters trainable. `requires_grad_()` sets the
# flag on every parameter of the layer, which is what actually freezes it.
for frozen_layer in (net.fc1, net.fc2, net.fc3):
    frozen_layer.requires_grad_(False)
    for param in frozen_layer.parameters():
        # drop gradients left over from pretraining so no optimizer can apply them
        param.grad = None

# the last two layers are the ones adapted during fine-tuning
for trainable_layer in (net.fc4, net.fc5):
    trainable_layer.requires_grad_(True)

In [None]:
# training loop
train_losses = list()  # training loss, one entry per optimisation step
test_losses = list()   # loss on the full held-out split, one entry per step
max_iter = 10000
# convert the held-out split to float32 tensors once instead of on every batch
test_features = torch.from_numpy(X_test).float()
test_labels = torch.from_numpy(y_test).float()
optimizer = optim.SGD(net.parameters(), lr=0.002, momentum=0, dampening=0, weight_decay=0, nesterov=False, maximize=False)

# The loop variable is deliberately not called `iter`: that would shadow the builtin
# used by `next(iter(...))` in the pretraining cell and break it on a re-run.
for epoch in tqdm(range(max_iter)):
    for local_batch, local_labels in final_generator:
        # --- optimisation step (dropout active) ---
        net.train()
        optimizer.zero_grad()   # zero the gradient buffers
        output = net(local_batch.float())
        loss = criterion(output, local_labels.float())
        loss.backward()
        optimizer.step()
        train_losses.append(loss.item())

        # --- monitoring: dropout off, no autograd graph ---
        net.eval()
        with torch.no_grad():
            test_output = net(test_features)
            test_loss = criterion(test_output, test_labels)
        test_losses.append(test_loss.item())

    # early stopping on the held-out RMSE measured after the last batch of the epoch
    if math.sqrt(test_loss.item()) < 0.19:
        break

# leave the model in training mode, which is the state it was in before this cell
net.train()

print('MSE of test set:', test_loss.item())
print('RMSE of test set:', math.sqrt(test_loss.item()))

print('MSE of training set:', loss.item())
print('RMSE of training set:', math.sqrt(loss.item()))



In [None]:
# plot train loss
fig, ax = plt.subplots()
ax.plot(train_losses, color='blue', label='training batch')
ax.plot(test_losses, color='red', label='held-out split')
ax.set(xlabel='optimisation step', ylabel='MSE loss', title='Fine-tuning loss')
ax.legend()
plt.show()

In [None]:
# write to file
# Predict in evaluation mode: with dropout still active the written values would
# be randomly perturbed and differ from run to run.
was_training = net.training
net.eval()
with torch.no_grad():
    test_features = torch.from_numpy(X_predict).float()
    test_output = net(test_features)
net.train(was_training)  # restore whichever mode the model was in

output = test_output.numpy()

# index: first column of the prediction table, cast to integer and then to string
out = pd.DataFrame(output, index = X_predict_names.astype(int).astype(str), columns = ['y'])

print(out)
out.to_csv('firstsub.csv', sep=',')