## __Statistical and Linguistic Insights for Model Explanation - SLIME__ 
### __Importing dataset and preprocessing__

In [1]:
import sys
# Prepend the parent directory to the module search path.
# NOTE(review): presumably this makes the local `slime_nlp` package importable
# when the notebook is run from its own folder — confirm the repo layout.
sys.path.insert(0, '../')

from slime_nlp.dataset import ImportData, CustomDset

### __1. $\mathtt{ImportData}$:__
<font size=3>
    
Import the dataset (.csv) and split it into train, validation, and test dataframes. \
_Check the ImportData object's doc._

</font>

In [2]:
# Load the CSV dataset and split it into train / validation / test dataframes.
# NOTE(review): the name `id` shadows Python's built-in `id()`; it is kept here
# because later cells read `id.train`, `id.val`, `id.test` and `id.__doc__`.
id = ImportData(
    path_name="../dataset/adress_all.csv",
    n_val=0.15,   # share of the data held out for validation
    n_test=0.10,  # share of the data held out for testing
    group_by=['text', 'group'],  # dataframe columns passed as the grouping keys
)

DataFrame:
                                                 text  group
0  well the little girl is saying to be uiet to h...      0
1  mhm . well the water's running over on the flo...      0
2  look at the picture <unintelligible> . oh okay...      0

Data length: N_total = 156
N-train = 118, N-val = 23, N-test = 15



In [3]:
# Show the docstring available on the ImportData instance (inputs and attributes).
print(id.__doc__)


    # ImportData: import dataframe and split it into train, validation, and test data.
    
    Input: (path_name, n_val=None, n_test=None, group_by=None, verbose=True)
    -----
    - path_name (str): string with the path and data name;
    - n_val (float): quantile of validation data;
    - n_test (float): quantile of test data;
    - group_by (List[str]): list of the dataframe's column names to be grouped.
    - verbose (bool): boolean variable to print dataset info.


    Attributes: 
    ----------
    - train (Dataframe): pandas dataframe of train batch;
    - val (Dataframe): pandas dataframe of validation batch;
    - test (Dataframe): pandas dataframe of test batch.
      
    


In [4]:
# Pull the three splits out of the ImportData object in a single unpacking.
train_data, val_data, test_data = id.train, id.val, id.test

# Report the shape of each split so the partition sizes can be checked at a glance.
print(f"Data shape = train:{train_data.shape} - validation:{val_data.shape} - test:{test_data.shape}")

Data shape = train:(118, 2) - validation:(23, 2) - test:(15, 2)


In [5]:
# Preview the first rows of the training split (columns: text, group).
train_data.head()

Unnamed: 0,text,group
121,well the sink is running over . she's drying t...,1
79,<event> <filler> the woman of the house is dry...,1
29,you mean right now tell you . <filler> the boy...,0
84,well your sink is being run over the water . t...,1
153,okay the water's running outof the sink overfl...,0


### __2. $\mathtt{CustomDset}$:__
<font size=3>
    
Tokenize the text samples and return the model's input tensors (`input_ids`, `token_type_ids`, `attention_mask`) together with the label tensor (_group: Alzheimer's disease or control_). \
_Check the CustomDset object's doc._

</font>

In [6]:
# Wrap the training dataframe in a CustomDset, which (per its docstring) returns
# tokenized input tensors and the label tensor for each item.
train_dset = CustomDset(
    data=train_data,
    pretrained_name="google-bert/bert-base-cased",  # Hugging Face model name
    max_length=512,   # maximum sequence length
    batch_size=1,
    shuffle=True,
    device="cpu",     # device on which the output tensors are placed
)

In [7]:
# Show the docstring available on the CustomDset instance (inputs, methods, outputs).
print(train_dset.__doc__)


    # CustomDset: import the data sentences to return a PyTorch generator of tokenized 
    tensors of batch-size = 1.

    Input: (data, max_length, device='cpu')
    ----- 
    - data (Dataframe): pandas dataframe (ImportData's output) with "text"(str) and 
    "group"(int) columns.
    - max_length (int): the sequence maximum length.
    - batch_size (int): data batch-size value.
    - shuffle (bool): boolean variable for data shuffling.
    - device (str): select CPU or GPU device for output tensors.
    - pretained_name (str): pretrained model name from huggingface.co repository.


    Methods:
    -------
    __len__ (int): returns data size.
    
    __getitem__ (Tuple[Tensor, Tensor, Tensor], Tensor): generator 
    
    
    Output (generator): (input_ids, token_type_ids, attention_mask), label
    ------ 
    - input_ids (Tensor): sequence of special tokens IDs.
    - token_type_ids (Tensor): sequence of token indices to distinguish between 
    sentence pairs.
    - attenti

In [8]:
# len() goes through CustomDset.__len__, which its docstring describes as the data size.
print("Train data size:", len(train_dset))

Train data size: 118


In [9]:
# Fetch the item at index 0 through the subscript protocol rather than calling
# the `__getitem__` dunder by hand; `train_dset[0]` dispatches to the same method.
# The item unpacks into the three model-input tensors plus the label tensor.
(input_ids, token_type_ids, attention_mask), label = train_dset[0]

# Print each tensor's shape, and the label itself, to confirm the structure.
print("input_ids:", input_ids.shape)
print("token_type_ids:", token_type_ids.shape)
print("attention_mask:", attention_mask.shape)
print("label:", label)

input_ids: torch.Size([1, 249])
token_type_ids: torch.Size([1, 249])
attention_mask: torch.Size([1, 249])
label: tensor([[1.]])
