## __Statistical and Linguistic Insights for Model Explanation - SLIME__ 
### __Importing dataset and preprocessing__

In [6]:
import sys
sys.path.insert(0, "../")

In [8]:
from slime_nlp.dataset import ImportData, CustomDset

### __1. $\mathtt{ImportData}$:__
<font size=3>
    
Import dataset (.csv) to split the data into train, validation, and test dataframes. \
_Check the ImportData object's doc._

In [10]:
id = ImportData(path_name="../dataset/adress_sample.csv", 
                n_val=0.15, n_test=0.10, 
                group_by=['text', 'group'])

DataFrame:
                                                 text  group
0  oh there's a cookie jar and a youngster with a...      1
1  cookie jar . <filler> a lad standing on a stoo...      0
2  well the table stool sr-ret <retracing> the se...      1

Data length: N_total = 5
N-train = 5, N-val = 0, N-test = 0



In [12]:
print(id.__doc__)


    # ImportData: import dataframe and split it into train, validation, and test data.
    
    Input: (path_name, n_val=None, n_test=None, group_by=None, verbose=True)
    -----
    - path_name (str): string with path and data name.
    - n_val (float): quantile of validation data.
    - n_test (float): quantile of test data.
    - group_by (List[str]): list of the dataframe's column names to group by.
    - verbose (bool): boolean variable to print dataset info.


    Attributes: 
    ----------
    - train (Dataframe): pandas dataframe of train batch.
    - val (Dataframe): pandas dataframe of validation batch.
    - test (Dataframe): pandas dataframe of test batch.
      
    


In [14]:
train_data = id.train
val_data = id.val
test_data = id.test

print(f"Data shape = train:{train_data.shape} - validation:{val_data.shape} - test:{test_data.shape}")

AttributeError: 'NoneType' object has no attribute 'shape'

In [16]:
train_data.head()

Unnamed: 0,text,group
1,cookie jar . <filler> a lad standing on a stoo...,0
4,okay the water's running outof the sink overfl...,0
0,oh there's a cookie jar and a youngster with a...,1
2,well the table stool sr-ret <retracing> the se...,1
3,mhm . there's a young boy <filler> going in a ...,1


### __2. $\mathtt{CustomDset}$:__
<font size=3>
    
Tokenizing the data sentences to return the model's input tensors (_input_ids, token_type_ids, attention_mask_), and the label tensor (_groups condition and control_). \
_Check the CustomDset object's doc._

In [20]:
train_dset = CustomDset(data=train_data, max_length=512, 
                        batch_size=1, shuffle=True, device="cpu", 
                        pretrained_name="google-bert/bert-base-cased")

In [22]:
print(train_dset.__doc__)


    # CustomDset: import the data sentences to return a PyTorch generator of tokenized 
    tensors.

    Input: (data, max_length, batch_size=1, shuffle=True, device='cpu',
            pretrained_name="google-bert/bert-base-cased")
    ----- 
    - data (Dataframe): pandas dataframe (ImportData's output) with "text"(str) and 
    "group"(int) columns.
    - max_length (int): the sequence maximum length.
    - batch_size (int): data batch-size value.
    - shuffle (bool): boolean variable for data shuffling.
    - device (str): select CPU or GPU device for output tensors.
    - pretained_name (str): pretrained model name from huggingface.co repository.


    Methods:
    -------
    __len__ (int): returns data size.
    
    __getitem__ (Tuple[Tensor, Tensor, Tensor], Tensor): generator 
    
    
    Output (generator): (input_ids, token_type_ids, attention_mask), label
    ------ 
    - input_ids (Tensor[int]): sequence of special tokens IDs.
    - token_type_ids (Tensor[int]): sequ

In [24]:
print("Train data size:", len(train_dset))

Train data size: 5


In [26]:
(input_ids, token_type_ids, attention_mask), label = train_dset.__getitem__(index=0)

print("input_ids:", input_ids.shape)
print("token_type_ids:", token_type_ids.shape)
print("attention_mask:", attention_mask.shape)
print("label:", label)

input_ids: torch.Size([1, 223])
token_type_ids: torch.Size([1, 223])
attention_mask: torch.Size([1, 223])
label: tensor([[1.]])
