In [2]:
import os
import sys

import numpy as np
import torch

# Location of the openprotein/pnerf directory that is added to the import path.
# NOTE(review): this is an absolute, machine-specific path; adjust it when running elsewhere.
PNERF_DIR = "/home/matthias/DIKU Project Protein Folding/protein-folding-gans/src/openprotein/pnerf"

# Show the import path as it is *before* this cell touches it.
print(sys.path)
# Guard the append so that re-running this cell does not keep adding duplicate entries.
if PNERF_DIR not in sys.path:
    sys.path.append(PNERF_DIR)

# CODE FILES HERE
from model_params import get_model_data_dcgan
from models.dcgan import Dcgan, Generator, Discriminator
from solver import Solver, Testing
from directories import Directories
from dataloader import DataLoader
from plots import plot_losses, plot_z_samples, contact_map_grid
from sampling import dcgan_sampling
from contact_maps import get_contact_maps
import preprocessing

# SETTINGS HERE
os.environ['CUDA_LAUNCH_BLOCKING'] = "1" # to see the CUDA stack
%matplotlib inline
# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2
# supress cluttering warnings in solutions
import warnings
warnings.filterwarnings('ignore')

['/home/matthias/DIKU Project Protein Folding/protein-folding-gans/src', '/home/matthias/.local/share/virtualenvs/openprotein-xj-YAKcS/lib/python37.zip', '/home/matthias/.local/share/virtualenvs/openprotein-xj-YAKcS/lib/python3.7', '/home/matthias/.local/share/virtualenvs/openprotein-xj-YAKcS/lib/python3.7/lib-dynload', '/home/matthias/anaconda3/envs/openprotein/lib/python3.7', '', '/home/matthias/.local/share/virtualenvs/openprotein-xj-YAKcS/lib/python3.7/site-packages', '/home/matthias/.local/share/virtualenvs/openprotein-xj-YAKcS/lib/python3.7/site-packages/IPython/extensions', '/home/matthias/.ipython', '/home/matthias/DIKU Project Protein Folding/protein-folding-gans/src/openprotein/pnerf']


In [3]:
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

# Additional info when using CUDA: device name and memory statistics in GB.
if device.type == 'cuda':
    bytes_per_gb = 1024 ** 3
    # torch.cuda.memory_cached is deprecated in favour of memory_reserved;
    # fall back to the old name only on PyTorch versions that lack the new one.
    reserved_memory = getattr(torch.cuda, 'memory_reserved', None) or torch.cuda.memory_cached
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0) / bytes_per_gb, 1), 'GB')
    print('Cached:   ', round(reserved_memory(0) / bytes_per_gb, 1), 'GB')

Using device: cpu



In [6]:
# Turn the raw files under data_root_folder into preprocessed hdf5 files.
dataset_arg = "proteins"
data_root_folder = "../data/proteins/"
# Used as the folder-name part that follows the max sequence length in the preprocessed path.
prefix = "_max_length/"
data = get_model_data_dcgan(dataset_arg)

# Fragment length and the preprocessing length limit share the same value.
residue_fragments = 256
preprocessing.MAX_SEQUENCE_LENGTH = residue_fragments

# NOTE(review): meaning of the first positional argument is not visible here - confirm against
# preprocessing.process_raw_data. Existing preprocessed files are not overwritten.
preprocessing.process_raw_data(
    False,
    force_pre_processing_overwrite=False,
    prefix=prefix,
    data_root_folder=data_root_folder,
)

Starting pre-processing of raw data...
files ['../data/proteins/raw/training_50.txt', '../data/proteins/raw/training_95.txt', '../data/proteins/raw/training_100.txt', '../data/proteins/raw/validation', '../data/proteins/raw/testing.txt', '../data/proteins/raw/sample.txt', '../data/proteins/raw/training_70.txt', '../data/proteins/raw/training_90.txt', '../data/proteins/raw/training_30.txt']
Preprocessed file for training_50.txt already exists.
Skipping pre-processing for this file...
Preprocessed file for training_95.txt already exists.
Skipping pre-processing for this file...
Preprocessed file for training_100.txt already exists.
Skipping pre-processing for this file...
Preprocessed file for validation already exists.
Skipping pre-processing for this file...
Preprocessed file for testing.txt already exists.
Skipping pre-processing for this file...
Preprocessed file for sample.txt already exists.
Skipping pre-processing for this file...
Preprocessed file for training_70.txt already exis

In [8]:
# Assemble the path of the preprocessed training file from its components.
train_file_name = "training_50"
training_file = "".join([
    data_root_folder,
    "preprocessed/",
    str(preprocessing.MAX_SEQUENCE_LENGTH),
    prefix,
    train_file_name,
    ".txt.hdf5",
])
print("training file: {0}".format(training_file))

training file: ../data/proteins/preprocessed/256_max_length/training_50.txt.hdf5


In [None]:
# Create the result directories and the data loader for the C-alpha training data.
dataset_name = dataset_arg.lower()
directories = Directories("dcgan", dataset_name, data["z_dim"], make_dirs=True)
data_loader = DataLoader(
    directories,
    data["batch_size"],
    dataset_name,
    training_file=training_file,
    residue_fragments=residue_fragments,
    atom="calpha",
)

Creating the contact maps for ../data/proteins/preprocessed/256_max_length/training_50.txt.hdf5 as no cache was found!


In [None]:
# Preview the first 25 contact maps of the training file in a 5x5 grid.
all_contact_maps = get_contact_maps(training_file, fragment_length=residue_fragments)
sample = all_contact_maps.unsqueeze(1)[:25]
contact_map_grid(sample, rows=5, cols=5, fill=True)

In [None]:
# Build the DCGAN, its generator and discriminator, then run training through the solver.
dcgan = Dcgan(data_loader.input_dim, data["z_dim"])
generator = Generator(data["z_dim"], res=residue_fragments)
discriminator = Discriminator(1, 1, res=residue_fragments)
solver = Solver(
    dcgan,
    generator,
    discriminator,
    data["epochs"],
    data_loader,
    data["optimizer_G"],
    data["optimizer_D"],
    data["optim_config_G"],
    data["optim_config_D"],
    preprocessing.MAX_SEQUENCE_LENGTH,
    save_model_state=False,
)
solver.main()

In [None]:
# Assemble the path of the preprocessed testing file from its components.
test_file_name = "testing"
testing_file = "".join([
    data_root_folder,
    "preprocessed/",
    str(preprocessing.MAX_SEQUENCE_LENGTH),
    prefix,
    test_file_name,
    ".txt.hdf5",
])
print("testing file: {0}".format(testing_file))

In [None]:
# test complexity of model as in A.4
# Adam expects a numeric weight_decay; its default of 0 means "no weight decay".
# Passing None here makes the optimizer fail, so 0 is used instead.
optim_config_G = {
    "lr": 1e-2,
    "weight_decay": 0,
    "betas": (0.5, 0.999)
}
optimizer_G = torch.optim.Adam(generator.parameters(), **optim_config_G)
testing = Testing(solver)
# Build a loader over the held-out testing file and run the test with the fresh optimizer.
test_loader = solver.data_loader.get_new_test_data_loader(testing_file=testing_file)
testing.test(optimizer_G, test_loader)

In [None]:
# Set LOAD_MODEL to True and point res_dir at a results folder to restore a saved solver
# instead of using the one trained in this session.
LOAD_MODEL = False
if LOAD_MODEL:
    res_dir = ""  # e.g. "../../results/proteins_z=100_0/"
    # NOTE(security): torch.load unpickles arbitrary Python objects - only load trusted files.
    # os.path.join keeps the path valid whether or not res_dir ends with a slash.
    solver = torch.load(os.path.join(res_dir, "model_state.pt"), map_location="cpu")
    generator = solver.generator
    discriminator = solver.discriminator
    # Switch both networks to evaluation mode.
    generator.eval()
    discriminator.eval()
    solver.data_loader.directories.make_dirs = False
else:
    # Reuse the result directory of the solver created earlier in the notebook.
    res_dir = solver.data_loader.directories.result_dir

In [None]:
# Plotting g and d losses for all epochs
# (a stray `exit(1)` fused onto this call was a syntax error and has been removed).
plot_losses(solver, solver.train_loss_history["g_loss"], solver.train_loss_history["d_loss"])

In [None]:
# Draw 25 samples from the generator. `.cpu()` is a no-op for CPU tensors and is required
# before `.numpy()` when the samples live on a CUDA device.
samples = dcgan_sampling(generator, solver.model.z_dim, 25).detach().cpu().numpy()
imgs, rows, cols = solver.get_sample_stats()
# os.path.join avoids writing to "/plot_grid.png" when res_dir is empty and
# copes with res_dir values that already end with a slash.
contact_map_grid(samples[:imgs], rows=rows, cols=cols, fill=True,
                 file_name=os.path.join(res_dir, "plot_grid.png"), show=True)