Group Members:

Name: `Sayeh Jarollahi` \
Student ID (matriculation number): `7073520` \
Email: `saja00006@stud.uni-saarland.de` 

Name: `Mahsa Amani` \
Student ID (matriculation number): `7064006` \
Email: `maam00002@stud.uni-saarland.de`

In [None]:
import json
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# SMS Spam Data

In [15]:
# Load the SMS spam dataset: a JSON list of records, each holding "tokens" and "label".
with open("Dataset1/data.json", "r") as f:
    data_1 = json.load(f)

# Split the records into two parallel lists (same order as in the file).
tokenized_sequences = []
labels = []
for record in data_1:
    tokenized_sequences.append(record["tokens"])
    labels.append(record["label"])

# Sanity check: both lists must have the same number of entries.
print(f"Numeber of sequences: {len(tokenized_sequences)}")
print(f"Numeber of labels: {len(labels)}")


Numeber of sequences: 5574
Numeber of labels: 5574


In [26]:
# defining dataset class
class TokenDataset(Dataset):
    """Dataset of token-id sequences, right-padded with zeros to a common length.

    Each item is a ``(sequence, label)`` pair. The sequence is a ``float32``
    tensor of length ``max_length`` (the length of the longest sequence given
    to the constructor); shorter sequences are padded with trailing zeros when
    they are fetched.

    Args:
        tokenized_sequences: Iterable of sequences of numeric token ids.
        labels: One label per sequence, in the same order; must be accepted
            by ``torch.tensor``.

    Raises:
        AssertionError: If the number of sequences and labels differ.
    """

    def __init__(self, tokenized_sequences, labels):
        self.tokenized_sequences = [torch.tensor(seq, dtype=torch.float32) for seq in tokenized_sequences]
        self.labels = torch.tensor(labels)
        assert len(self.tokenized_sequences) == len(self.labels), (
            "tokenized_sequences and labels must have the same length"
        )
        # Measure the stored tensors rather than the argument, so a one-shot
        # iterable (e.g. a generator) that was consumed above still works.
        # default=0 keeps an empty dataset constructible instead of raising
        # ValueError from max().
        self.max_length = max((len(sequence) for sequence in self.tokenized_sequences), default=0)

    def __len__(self):
        """Return the number of sequences in the dataset."""
        return len(self.tokenized_sequences)

    def __getitem__(self, index):
        """Return the zero-padded sequence at ``index`` together with its label."""
        sequence = self.tokenized_sequences[index]
        missing = self.max_length - len(sequence)
        if missing > 0:  # check if padding is required
            # new_zeros() inherits dtype and device from the sequence, so the
            # concatenation never mixes dtypes and needs no implicit promotion.
            sequence = torch.cat([sequence, sequence.new_zeros(missing)])
        return sequence, self.labels[index]

In [27]:
# defining dataloader class
class TokenDataLoader(DataLoader):
    """``DataLoader`` subclass used in this notebook for the token dataset.

    It exposes only ``dataset``, ``batch_size`` and ``shuffle`` and forwards
    them unchanged to ``torch.utils.data.DataLoader``.
    """

    def __init__(self, dataset, batch_size=1, shuffle=False):
        """Build the loader; all three arguments go straight to ``DataLoader``."""
        loader_options = {"batch_size": batch_size, "shuffle": shuffle}
        super().__init__(dataset, **loader_options)

    def __iter__(self):
        """Return the batch iterator produced by the parent ``DataLoader``."""
        batch_iterator = super().__iter__()
        return batch_iterator

In [None]:
# Wrap the token sequences and their labels in the padding dataset.
dataset = TokenDataset(tokenized_sequences, labels)

# Random 80/20 train/test split; the fixed seed makes the split repeatable.
generator1 = torch.Generator()
generator1.manual_seed(42)
train_dataset, test_dataset = random_split(dataset, lengths=[0.8, 0.2], generator=generator1)

# Only the training part gets a loader here (batches of 6, shuffle left at its default).
train_loader = TokenDataLoader(dataset=train_dataset, batch_size=6)

In [33]:
# Fetch the first training batch and show its inputs and labels.
first_batch = next(iter(train_loader))
X, y = first_batch
print(X)
print(y)

tensor([[ 101., 2059., 2054.,  ...,    0.,    0.,    0.],
        [ 101., 2026., 2905.,  ...,    0.,    0.,    0.],
        [ 101., 2035., 2122.,  ...,    0.,    0.,    0.],
        [ 101., 8840., 2140.,  ...,    0.,    0.,    0.],
        [ 101., 2024., 2017.,  ...,    0.,    0.,    0.],
        [ 101., 2089., 1045.,  ...,    0.,    0.,    0.]])
tensor([0, 0, 0, 0, 0, 0])


Test correctness here (do not change the cell below)

In [5]:
X, y = next(iter(train_loader))
print(X)
print(y)

tensor([[  101.,  4604.,  2033.,  ...,     0.,     0.,     0.],
        [  101.,  2031.,  2017.,  ...,     0.,     0.,     0.],
        [  101.,  2129.,  1005.,  ...,     0.,     0.,     0.],
        [  101.,  2053.,  4830.,  ...,     0.,     0.,     0.],
        [  101.,  4638., 14166.,  ...,     0.,     0.,     0.],
        [  101.,  2009.,  1005.,  ...,     0.,     0.,     0.]])
tensor([0, 0, 0, 0, 0, 0])


# Disease Data

In [20]:
# Read the disease table; the first CSV column is used as the row index.
df = pd.read_csv(filepath_or_buffer="Dataset2/data.csv", index_col=0)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0_level_0,sudden_fever,headache,mouth_bleed,nose_bleed,muscle_pain,joint_pain,vomiting,rash,diarrhea,hypotension,...,breathing_restriction,toe_inflammation,finger_inflammation,lips_irritation,itchiness,ulcers,toenail_loss,speech_problem,bullseye_rash,prognosis
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Lyme_disease
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tungiasis
2,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,Lyme_disease
3,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Zika
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,Rift_Valley_fever


In [47]:
# defining dataset class
class CSVDataset(Dataset):
    """Dataset over a table whose last column is the class label.

    Every column except the last is treated as a feature; the last column is
    one-hot encoded with scikit-learn's ``OneHotEncoder``.

    Args:
        data_frame: ``pandas.DataFrame`` with numeric feature columns followed
            by a single label column.

    Attributes:
        features: ``int32`` tensor built from all columns but the last.
        labels: ``int32`` one-hot tensor with one column per distinct label value.
        encoder: The fitted ``OneHotEncoder``, kept so one-hot rows can be
            mapped back to label values (``encoder.inverse_transform``).
        classes: Label values in the order of the one-hot columns.
    """

    def __init__(self, data_frame):
        feature_values = data_frame.iloc[:, :-1].values
        # NOTE: the int32 cast discards any fractional part of a feature value.
        self.features = torch.tensor(feature_values, dtype=torch.int32)

        # One-hot encode the label column; the encoder expects a 2-D column vector.
        target_column = data_frame.iloc[:, -1].values.reshape(-1, 1)
        self.encoder = OneHotEncoder(sparse_output=False)
        encoded_labels = self.encoder.fit_transform(target_column)
        self.labels = torch.tensor(encoded_labels, dtype=torch.int32)

        # Keep the column -> label mapping; without it the one-hot vectors
        # could not be translated back into label names.
        self.classes = self.encoder.categories_[0].tolist()

    def __len__(self):
        """Return the number of rows (samples)."""
        return len(self.features)

    def __getitem__(self, index):
        """Return the ``(features, one_hot_label)`` pair for row ``index``."""
        return self.features[index], self.labels[index]

In [48]:
# defining dataloader class
class CSVDataLoader(DataLoader):
    """``DataLoader`` subclass used in this notebook for the CSV dataset.

    Accepts ``dataset``, ``batch_size`` and ``shuffle`` and passes them on
    unchanged to ``torch.utils.data.DataLoader``.
    """

    def __init__(self, dataset, batch_size=1, shuffle=False):
        """Initialise the parent loader with the given batching options."""
        init_kwargs = dict(batch_size=batch_size, shuffle=shuffle)
        super().__init__(dataset, **init_kwargs)

    def __iter__(self):
        """Delegate iteration to the parent ``DataLoader``."""
        parent_iterator = super().__iter__()
        return parent_iterator

In [None]:
# Build the feature / one-hot-label dataset from the loaded DataFrame.
dataset = CSVDataset(df)

# Random 80/20 train/test split; the fixed seed makes the split repeatable.
generator1 = torch.Generator()
generator1.manual_seed(42)
train_dataset, test_dataset = random_split(dataset, lengths=[0.8, 0.2], generator=generator1)

# Only the training part gets a loader here (batches of 6, shuffle left at its default).
train_loader = CSVDataLoader(dataset=train_dataset, batch_size=6)

In [50]:
# Fetch the first training batch and report the shapes of its inputs and labels.
first_batch = next(iter(train_loader))
X, y = first_batch
print(X.shape)
print(y.shape)

torch.Size([6, 64])
torch.Size([6, 11])


Test correctness here (do not change the cell below)

In [8]:
X, y = next(iter(train_loader))
print(X.shape)
print(y.shape)

torch.Size([6, 64])
torch.Size([6, 11])
