In [67]:
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
import matplotlib.pyplot as plt

In [65]:
USE_CUDA = torch.cuda.is_available()

def cuda(obj):
    """Move ``obj`` to the GPU when CUDA is available.

    Tuples and lists are rebuilt element by element (recursively); any other
    object exposing a ``cuda`` attribute is moved by calling ``obj.cuda()``.
    Anything else -- and every input when CUDA is unavailable -- is returned
    unchanged.
    """
    if not USE_CUDA:
        return obj
    if isinstance(obj, tuple):
        return tuple(cuda(item) for item in obj)
    if isinstance(obj, list):
        return [cuda(item) for item in obj]
    if hasattr(obj, 'cuda'):
        return obj.cuda()
    return obj

In [77]:
# Dataset class that returns the stacked DNA/ChIP input array and its label for each CSV row
class DLDataset(Dataset):
    """Dataset that pairs a DNA array with a ChIP array for every CSV row.

    Each row of the annotation CSV supplies a DNA file name (``DNAFileSource``),
    a ChIP file name (``ChIPFileSource``) whose contents expose an ``x`` array,
    and a ``label``.
    """
    # TODO: normalisation to the mean/std of the array values is not implemented yet.

    def __init__(self, csv_file, dna_dir, chip_dir, transform=None):
        """
        Args:
            csv_file (csv): csv file with annotations.
            dna_dir (string): Directory with all the DNA data files.
            chip_dir (string): Directory with all the ChIP data files.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.full_data = pd.read_csv(csv_file)
        self.dna_dir = dna_dir
        self.chip_dir = chip_dir
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the annotation CSV."""
        return len(self.full_data)

    def __getitem__(self, idx):
        """Load the sample at row ``idx``.

        Returns:
            dict: ``{'data': ndarray, 'label': value}`` where ``data`` stacks the
            DNA array and the repeated ChIP array along a new leading axis
            (``(2, 4, 200)`` for the data used in this notebook). If a
            ``transform`` was supplied, its return value is returned instead.

        Raises:
            ValueError: if the DNA and ChIP arrays cannot be stacked; the
                message names both shapes and both files.
        """
        # Look the row up once instead of once per column.
        row = self.full_data.iloc[idx]
        dna_file = os.path.join(self.dna_dir, row["DNAFileSource"])
        chip_file = os.path.join(self.chip_dir, row["ChIPFileSource"])

        dna_data = np.load(dna_file)
        chip_data = np.load(chip_file)
        try:
            chip_signal = chip_data["x"]
        finally:
            # An .npz archive keeps a file handle open until closed; plain
            # arrays have no close() and need nothing here.
            if hasattr(chip_data, "close"):
                chip_data.close()

        # Repeat the ChIP rows 4x along axis 0 so it lines up with the DNA
        # array (4x200 in this notebook's data).
        mod_chip_data = np.repeat(chip_signal, 4, axis=0)
        # Add a leading axis to each so they can be stacked as separate channels.
        dna_data = np.expand_dims(dna_data, axis=0)
        mod_chip_data = np.expand_dims(mod_chip_data, axis=0)

        try:
            input_data = np.concatenate((dna_data, mod_chip_data))
        except ValueError as err:
            # Fail here with the offending shapes/files rather than printing
            # and then crashing later on an unbound variable.
            raise ValueError(
                "cannot stack DNA array {} with ChIP array {} "
                "(dna file: {}, chip file: {})".format(
                    dna_data.shape, mod_chip_data.shape, dna_file, chip_file)
            ) from err

        sample = {'data': input_data, 'label': row["label"]}
        if self.transform is not None:
            sample = self.transform(sample)
        return sample

In [59]:
# Data-loading configuration
BATCH_SIZE = 20
test_batch_size = 20
training_epoches = 20

# Root folder holding the DNA/ and chip_npz/ inputs. Set the DL_DATA_ROOT
# environment variable to run on another machine; the fallback is the path
# originally hard-coded here.
DATA_ROOT = os.environ.get("DL_DATA_ROOT",
                           "/Users/jason/PycharmProjects/Tsirigos/deeplearning")

# NOTE: despite the "testing" names, this is the loader that train() iterates over.
testingData = DLDataset(csv_file='masterDataLoadingCSV.csv',
                        dna_dir=os.path.join(DATA_ROOT, "DNA"),
                        chip_dir=os.path.join(DATA_ROOT, "chip_npz"))
testing_loader = DataLoader(testingData, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)

In [60]:
# Peek at the first batch only: both shapes first, then both tensors.
for sample_batched in testing_loader:
    inputs, labels = sample_batched['data'], sample_batched['label']
    print(inputs.shape, labels.shape, inputs, labels, sep="\n")
    break

torch.Size([20, 2, 4, 200])
torch.Size([20])

( 0 , 0 ,.,.) = 
   0   0   1  ...    1   0   0
   0   1   0  ...    0   0   0
   0   0   0  ...    0   1   1
   1   0   0  ...    0   0   0

( 0 , 1 ,.,.) = 
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
      ⋮  

( 1 , 0 ,.,.) = 
   0   0   0  ...    0   0   0
   0   0   0  ...    0   1   0
   1   1   0  ...    0   0   0
   0   0   1  ...    1   0   1

( 1 , 1 ,.,.) = 
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
   0   0   0  ...    0   0   0
      ⋮  

( 2 , 0 ,.,.) = 
   0   1   0  ...    0   0   0
   1   0   0  ...    0   0   0
   0   0   1  ...    1   1   0
   0   0   0  ...    0   0   1

( 2 , 1 ,.,.) = 
   4   4   4  ...    1   1   1
   4   4   4  ...    1   1   1
   4   4   4  ...    1   1   1
   4   4   4  ...    1   1   1
...     
      ⋮  

(17 , 0 ,.,.) = 
   0   0   0  ...    0   1   0
   0   0   1  .

In [61]:
class ConvModel(nn.Module):
    """Three conv/pool stages followed by two hidden linear layers and a 2-way output.

    The flattened size fed to ``fc1`` (40*1*5) is fixed, so inputs are expected
    to be ``(batch, num_ch, 4, 200)``; the per-layer shape comments below are
    for that input size.

    Args:
        fc1_size (int): Width of the first fully connected layer.
        fc2_size (int): Width of the second fully connected layer.
        num_ch (int): Number of input channels.
    """

    def __init__(self,fc1_size=90,fc2_size=45,num_ch=2):
        super(ConvModel, self).__init__()

        self.conv1 = nn.Sequential(         # input shape (num_ch, 4, 200)
            nn.Conv2d(
                in_channels=num_ch,
                out_channels=20,
                kernel_size=(4,6),          # spans all 4 rows, 6 columns wide
                stride=(1,1),
                padding=0
            ),                              # output shape (20, 1, 195)
            nn.ReLU(),                      # activation
            nn.MaxPool2d(kernel_size=(1,3)),    # max over 1x3 windows, output shape (20, 1, 65)
        )

        self.conv2 = nn.Sequential(
            nn.Conv2d(20, 30, (1,3), (1,1)),    # output shape (30, 1, 63)
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(1,3)),    # output shape (30, 1, 21)
        )

        self.conv3 = nn.Sequential(
            nn.Conv2d(30, 40, (1,2), (1,1)),    # output shape (40, 1, 20)
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(1,4)),    # output shape (40, 1, 5)
        )

        self.fc1 = nn.Linear(40*1*5, fc1_size)
        self.fc2 = nn.Linear(fc1_size, fc2_size)
        self.out = nn.Linear(fc2_size, 2)


    def forward(self, x):
        """Return per-class log-probabilities of shape ``(batch, 2)``."""
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        # Flatten everything except the batch dimension for the linear layers.
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # Normalise over the class dimension explicitly; omitting ``dim`` relies
        # on the deprecated implicit-dimension choice.
        x = F.log_softmax(self.out(x), dim=1)
        return x

# Instantiate with the default sizes and print the layer summary.
model = ConvModel()
print(model)

ConvModel(
  (conv1): Sequential(
    (0): Conv2d (2, 20, kernel_size=(4, 6), stride=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=(1, 3), stride=(1, 3), dilation=(1, 1))
  )
  (conv2): Sequential(
    (0): Conv2d (20, 30, kernel_size=(1, 3), stride=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=(1, 3), stride=(1, 3), dilation=(1, 1))
  )
  (conv3): Sequential(
    (0): Conv2d (30, 40, kernel_size=(1, 2), stride=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=(1, 4), stride=(1, 4), dilation=(1, 1))
  )
  (fc1): Linear(in_features=200, out_features=90)
  (fc2): Linear(in_features=90, out_features=45)
  (out): Linear(in_features=45, out_features=2)
)


In [75]:
# Training Steps
# need defined: testing_loader, optimizer, model
def train(epoch, log_interval=5000):
    """Run one training epoch over ``testing_loader`` and return the mean batch loss.

    Relies on the module-level ``model``, ``optimizer``, ``testing_loader`` and
    ``USE_CUDA``.

    Args:
        epoch (int): Epoch number; used only in the progress message.
        log_interval (int): A progress line is printed whenever the batch index
            is a multiple of this value (batch 0 is always reported).

    Returns:
        The sum of the per-batch losses divided by ``len(testing_loader)``.
    """
    def _loss_value(loss):
        # Newer PyTorch returns a 0-dim loss that cannot be indexed with [0];
        # .item() is the supported accessor there. Older releases without
        # .item() still need the original .data[0].
        return loss.item() if hasattr(loss, 'item') else loss.data[0]

    model.train()
    train_loss = 0
    num_batches = len(testing_loader)
    for batch_idx, sample in enumerate(testing_loader):
        # Conv layers need floating-point input.
        data = sample['data'].float()
        target = sample['label']
        if USE_CUDA:
            data, target = data.cuda(), target.cuda()
        data, target = Variable(data), Variable(target)
        optimizer.zero_grad()
        output = model(data)
        # ConvModel.forward already ends in log_softmax; cross_entropy applies
        # log_softmax again, which leaves normalised log-probabilities
        # unchanged, so this matches nll_loss on the model output.
        loss = F.cross_entropy(output, target)
        loss.backward()
        optimizer.step()
        batch_loss = _loss_value(loss)
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(testing_loader.dataset),
                100. * batch_idx / num_batches, batch_loss))
        train_loss += batch_loss
    train_loss /= num_batches
    return train_loss

In [78]:
# Fresh model, moved to the GPU when one is available
model = ConvModel()
model = model.cuda() if USE_CUDA else model

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Per-epoch loss histories; the test pass is still disabled below
train_losses, test_losses = [], []
for epoch in range(1, training_epoches + 1):
    train_losses.append(train(epoch))
#     test_losses.append(test(epoch, test_loader))





FileNotFoundError: Traceback (most recent call last):
  File "/Users/jason/anaconda/envs/py35/lib/python3.5/site-packages/torch/utils/data/dataloader.py", line 42, in _worker_loop
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "/Users/jason/anaconda/envs/py35/lib/python3.5/site-packages/torch/utils/data/dataloader.py", line 42, in <listcomp>
    samples = collate_fn([dataset[i] for i in batch_indices])
  File "<ipython-input-48-bcab0765cf18>", line 26, in __getitem__
    dna_data = np.load(dna_file)
  File "/Users/jason/anaconda/envs/py35/lib/python3.5/site-packages/numpy/lib/npyio.py", line 372, in load
    fid = open(file, "rb")
FileNotFoundError: [Errno 2] No such file or directory: '/Users/jason/PycharmProjects/Tsirigos/deeplearning/DNA/MYCScreen_037479_DNA.npy'


In [None]:
# Plot the per-epoch loss curves (train in red, test in blue).
epoch_number = list(range(1, training_epoches + 1))
# A series can be shorter than epoch_number: test_losses is never filled while
# the test pass is commented out, and train_losses is partial if training
# stopped early. Plotting a mismatched length against epoch_number raises, so
# empty series are skipped and each one is plotted against its own epochs.
for losses, colour, series_name in ((train_losses, 'r', 'train'),
                                    (test_losses, 'b', 'test')):
    if losses:
        plt.plot(epoch_number[:len(losses)], losses, colour, label=series_name)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.show()