In [1]:
import numpy as np
import pandas as pd
import json
import csv
import torch
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn.functional as F

  from .autonotebook import tqdm as notebook_tqdm


The following two cells are a suggestion on accessing the files. You are free to ignore these!

In [None]:
# Load the JSON dataset. The encoding is given explicitly so the read does not
# depend on the platform's locale default.
with open("Dataset1/data.json", "r", encoding="utf-8") as f:
    data_1 = json.load(f)
# Show only the first few records instead of dumping the whole dataset
# into the cell output.
data_1[:3]

In [None]:
# Preview the first rows of the CSV as dicts (column name -> string value).
# The csv module expects files to be opened with newline="" so that newlines
# embedded in quoted fields are handled correctly.
with open("Dataset2/data.csv", "r", newline="") as file:
    for row_number, row in enumerate(csv.DictReader(file)):
        if row_number >= 5:  # a handful of rows is enough to see the schema
            break
        print(row)

In [2]:
#Implement this!

class TokenDataset(Dataset):
    """Map-style dataset backed by a JSON file of token sequences and labels.

    The JSON document must be an integer-indexable sequence of records, each
    providing a ``'tokens'`` entry and a ``'label'`` entry.

    Args:
        dataset_path: Path of the JSON file to load.
        transform: Optional callable applied to the tokens of each sample.
        target_transform: Optional callable applied to the label of each sample.
    """

    def __init__(self, dataset_path, transform=None, target_transform=None):
        self.data = self._read_json(dataset_path)
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Return the number of records loaded from the file."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(tokens, label)`` pair at position ``idx``."""
        record = self.data[idx]
        tokens = record['tokens']
        label = record['label']

        # Compare against None instead of relying on truthiness, so a callable
        # that happens to evaluate as falsy (e.g. an empty container-like
        # transform) is still applied.
        if self.transform is not None:
            tokens = self.transform(tokens)
        if self.target_transform is not None:
            label = self.target_transform(label)
        return tokens, label

    def _read_json(self, path):
        """Parse and return the JSON document stored at ``path``."""
        # Explicit UTF-8 keeps the read independent of the platform's default
        # text encoding.
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
        
class TransformTokens(object):
    """Callable that forces a token sequence to a fixed length.

    Sequences longer than ``output_len`` are cut off at the end; shorter ones
    are padded at the end with zeros. Non-tensor inputs are converted with
    ``torch.tensor`` first.

    NOTE(review): the length is measured with ``len`` (first dimension) while
    ``F.pad`` pads the last dimension, so this presumably expects 1-D inputs.
    """

    def __init__(self, output_len):
        self.output_len = output_len

    def __call__(self, tokens):
        """Return ``tokens`` as a tensor of length ``self.output_len``."""
        sequence = tokens if isinstance(tokens, torch.Tensor) else torch.tensor(tokens)

        # Positive -> padding needed, negative -> sequence is too long.
        missing = self.output_len - len(sequence)

        if missing < 0:
            return sequence[:self.output_len]
        if missing > 0:
            return F.pad(sequence, (0, missing), "constant", 0)
        return sequence

In [3]:
class TokenDataLoader(DataLoader):
    """``DataLoader`` for token datasets, defaulting to shuffled batches of 32.

    Args:
        dataset: Dataset to draw samples from.
        batch_size: Number of samples per batch.
        shuffle: Whether to reshuffle the data at every epoch.
        **kwargs: Any further ``torch.utils.data.DataLoader`` options
            (e.g. ``num_workers``, ``drop_last``, ``collate_fn``), forwarded
            unchanged so the subclass does not hide them.
    """

    def __init__(self, dataset, batch_size=32, shuffle=True, **kwargs):
        super().__init__(dataset, batch_size=batch_size, shuffle=shuffle, **kwargs)

In [4]:
MAX_TOKENS = 100          # every token sequence is truncated / zero-padded to this length
TOKEN_TRAIN_SIZE = 4000   # number of samples assigned to the training split

padding = TransformTokens(MAX_TOKENS)

tokens_dataset = TokenDataset(dataset_path="Dataset1/data.json",
                              transform=padding)

# Separate the dataset into train and test datasets at random. The test size is
# derived from the dataset length, so the split stays valid (the sizes must sum
# to len(dataset)) even if the number of records in the file changes.
train_tokens, test_tokens = random_split(
    tokens_dataset,
    [TOKEN_TRAIN_SIZE, len(tokens_dataset) - TOKEN_TRAIN_SIZE],
)

train_loader = TokenDataLoader(train_tokens, batch_size=6, shuffle=True) # Dataloader for the training part of the dataset only

Test correctness here (do not change the cell below)

In [5]:
# Fetch a single batch from train_loader and print its inputs (X) and labels (y).
X, y = next(iter(train_loader))
print(X)
print(y)

tensor([[  101,  1045,  1005,  1049,  6069,  2022,  2188,  2574,  1998,  1045,
          2123,  1005,  1056,  2215,  2000,  2831,  2055,  2023,  4933,  4902,
          3892,  1010,  1047,  1029,  1045,  1005,  2310,  6639,  2438,  2651,
          1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0],
        [  101, 19181, 24471,  8840, 12423,  4647, 18163,  4854, 15536,  2094,
          1057,  1010,  1040,  3372,  2202,  2009,  5667,  1012,  1012,  2522,
          2480,  2108,  4854,  2003,  1040,  2087, 

In [6]:
#Implement this!
class CSVDataset(Dataset):
    """Map-style dataset backed by a CSV file loaded with pandas.

    For every row, the last column is returned as the label and the columns
    between the first and the last (exclusive on both ends) are returned as a
    ``float32`` feature tensor. The first column is not part of the features.

    Args:
        dataset_path: Path of the CSV file to load.
        transform: Optional callable applied to the feature tensor.
        target_transform: Optional callable applied to the label.
    """

    def __init__(self, dataset_path, transform=None, target_transform=None):
        self.dataframe = pd.read_csv(dataset_path)
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Return the number of rows in the CSV file."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the ``(symptoms, label)`` pair for row ``idx``."""
        label = self.dataframe.iloc[idx, -1]

        # Convert the row slice to a float32 NumPy array before handing it to
        # torch: building a tensor directly from a pandas Series goes through
        # the Series' integer __getitem__, which pandas warns about.
        features = self.dataframe.iloc[idx, 1:-1].to_numpy(dtype=np.float32)
        symptoms = torch.tensor(features, dtype=torch.float32)

        # Compare against None so a falsy-but-valid callable is still applied.
        if self.transform is not None:
            symptoms = self.transform(symptoms)
        if self.target_transform is not None:
            label = self.target_transform(label)

        return symptoms, label

class ConvertLabeltoInt(object):
    """Callable that maps a disease name to its integer class index.

    String labels are looked up in ``self.map`` (an unknown name raises
    ``KeyError``); labels that are not strings are returned untouched.
    """

    def __init__(self):
        # The position of a name in this tuple is its class index.
        disease_names = (
            'Lyme_disease',
            'Tungiasis',
            'Zika',
            'Rift_Valley_fever',
            'West_Nile_fever',
            'Malaria',
            'Chikungunya',
            'Plague',
            'Dengue',
            'Yellow_Fever',
            'Japanese_encephalitis',
        )
        self.map = {name: index for index, name in enumerate(disease_names)}

    def __call__(self, label):
        """Return the class index for a string label, or ``label`` itself otherwise."""
        return self.map[label] if isinstance(label, str) else label
    
class CreateOnehotEncode(object):
    """Callable that turns an integer class index into a one-hot float vector.

    Args:
        num_classes: Length of the produced vector. Defaults to 11, the value
            that was previously hard-coded.
    """

    def __init__(self, num_classes=11):
        self.num_classes = num_classes

    def __call__(self, label):
        """Return a float tensor of zeros with a single 1 at index ``label``."""
        target = torch.zeros(self.num_classes, dtype=torch.float)
        # scatter_ writes the value 1 at the position given by the index tensor.
        target.scatter_(dim=0, index=torch.tensor([label]), value=1)

        return target

In [7]:
# Label pipeline, applied in order: disease name -> class index -> one-hot vector.
composed_transform = transforms.Compose([
    ConvertLabeltoInt(),
    CreateOnehotEncode(),
])

In [8]:
class CSVDataLoader(DataLoader):
    """``DataLoader`` for CSV-backed datasets, defaulting to shuffled batches of 32.

    Args:
        dataset: Dataset to draw samples from.
        batch_size: Number of samples per batch.
        shuffle: Whether to reshuffle the data at every epoch.
        **kwargs: Any further ``torch.utils.data.DataLoader`` options
            (e.g. ``num_workers``, ``drop_last``, ``collate_fn``), forwarded
            unchanged so the subclass does not hide them.
    """

    def __init__(self, dataset, batch_size=32, shuffle=True, **kwargs):
        super().__init__(dataset, batch_size=batch_size, shuffle=shuffle, **kwargs)

In [9]:
DISEASE_TRAIN_SIZE = 500  # number of samples assigned to the training split

desease_dataset = CSVDataset(dataset_path="Dataset2/data.csv",
                     target_transform=composed_transform)

# Random train/test split. The test size is derived from the dataset length,
# so the split stays valid (the sizes must sum to len(dataset)) even if the
# number of rows in the file changes.
train_desease, test_desease = random_split(
    desease_dataset,
    [DISEASE_TRAIN_SIZE, len(desease_dataset) - DISEASE_TRAIN_SIZE],
)

train_loader_desease = CSVDataLoader(train_desease, batch_size=6, shuffle=True)

Test correctness here (do not change the cell below)

In [10]:
# Fetch a single batch from train_loader_desease and print the shapes of its
# inputs (X) and labels (y).
X, y = next(iter(train_loader_desease))
print(X.shape)
print(y.shape)

torch.Size([6, 64])
torch.Size([6, 11])


  symptoms = torch.tensor(symptoms, dtype=torch.float32)
