In [11]:
from datasets import load_from_disk, load_dataset

In [49]:
# Load the dataset saved on disk and get a torch-formatted view of it in one expression,
# then display its summary (features and row count).
dataset = load_from_disk("../../datasets/etel_adnan_dataset").with_format("torch")
dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 27
})

In [50]:
# Pull the whole `input_ids` column; the dataset is torch-formatted, so it displays as one 2-D tensor.
dataset["input_ids"]

tensor([[    0,     0,     0,  ...,  1924,    30,     2],
        [    0,     0,     0,  ...,  1517,    30,     2],
        [    0,     0,     0,  ...,   346,    47,     2],
        ...,
        [    0,     0,     0,  ..., 35560,    30,     2],
        [    0,     0,     0,  ...,  9740,  1184,     2],
        [    0,     0,     0,  ...,  1194,    30,     2]])

In [51]:
# Slicing rows returns a dict keyed by column name, each value a tensor covering the selected rows.
dataset[:2]

{'input_ids': tensor([[   0,    0,    0,  ..., 1924,   30,    2],
         [   0,    0,    0,  ..., 1517,   30,    2]]),
 'attention_mask': tensor([[False, False, False,  ...,  True,  True,  True],
         [False, False, False,  ...,  True,  True,  True]]),
 'labels': tensor([[   0,    0,    0,  ..., 1924,   30,    2],
         [   0,    0,    0,  ..., 1517,   30,    2]])}

In [12]:
# Fetch the "everyday-conversations" configuration of the smoltalk dataset from the Hub,
# request torch formatting for every split, and display the resulting DatasetDict.
dataset2 = load_dataset(
    "HuggingFaceTB/smoltalk", name="everyday-conversations"
).with_format("torch")
dataset2

DatasetDict({
    train: Dataset({
        features: ['full_topic', 'messages'],
        num_rows: 2260
    })
    test: Dataset({
        features: ['full_topic', 'messages'],
        num_rows: 119
    })
})

In [13]:
# Select the train split of the DatasetDict and show its summary.
dataset2["train"]

Dataset({
    features: ['full_topic', 'messages'],
    num_rows: 2260
})

In [15]:
# Look at one raw example: a `full_topic` string plus `messages`, a list of
# {"content", "role"} dicts alternating between user and assistant turns.
dataset2["train"][0]

{'full_topic': 'Travel/Vacation destinations/Beach resorts',
 'messages': [{'content': 'Hi there', 'role': 'user'},
  {'content': 'Hello! How can I help you today?', 'role': 'assistant'},
  {'content': "I'm looking for a beach resort for my next vacation. Can you recommend some popular ones?",
   'role': 'user'},
  {'content': "Some popular beach resorts include Maui in Hawaii, the Maldives, and the Bahamas. They're known for their beautiful beaches and crystal-clear waters.",
   'role': 'assistant'},
  {'content': 'That sounds great. Are there any resorts in the Caribbean that are good for families?',
   'role': 'user'},
  {'content': 'Yes, the Turks and Caicos Islands and Barbados are excellent choices for family-friendly resorts in the Caribbean. They offer a range of activities and amenities suitable for all ages.',
   'role': 'assistant'},
  {'content': "Okay, I'll look into those. Thanks for the recommendations!",
   'role': 'user'},
  {'content': "You're welcome. I hope you find

In [14]:
# NOTE(review): this import sits mid-notebook; consider moving it to the first cell with the others.
from transformers import AutoTokenizer
# Load the tokenizer associated with the "microsoft/phi-4" checkpoint; it is reused by later cells.
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-4")

In [60]:
# Run the first training conversation through the tokenizer's chat template. Called with default
# arguments it yields a flat list of token ids (not text), which is what gets printed.
# NOTE(review): this prints the entire id list; slicing the result would keep the output short.
print(tokenizer.apply_chat_template(dataset2["train"][0]["messages"]))

[100264, 882, 100266, 13347, 1070, 100265, 100264, 78191, 100266, 9906, 0, 2650, 649, 358, 1520, 499, 3432, 30, 100265, 100264, 882, 100266, 40, 2846, 3411, 369, 264, 11573, 22541, 369, 856, 1828, 20769, 13, 3053, 499, 7079, 1063, 5526, 6305, 30, 100265, 100264, 78191, 100266, 8538, 5526, 11573, 61545, 2997, 93550, 304, 28621, 11, 279, 8560, 91614, 11, 323, 279, 84229, 13, 2435, 2351, 3967, 369, 872, 6366, 35909, 323, 26110, 68127, 21160, 13, 100265, 100264, 882, 100266, 4897, 10578, 2294, 13, 8886, 1070, 904, 61545, 304, 279, 35374, 430, 527, 1695, 369, 8689, 30, 100265, 100264, 78191, 100266, 9642, 11, 279, 72857, 323, 14751, 17038, 23028, 323, 47142, 5670, 527, 9250, 11709, 369, 3070, 22658, 61545, 304, 279, 35374, 13, 2435, 3085, 264, 2134, 315, 7640, 323, 36483, 14791, 369, 682, 17051, 13, 100265, 100264, 882, 100266, 33413, 11, 358, 3358, 1427, 1139, 1884, 13, 11361, 369, 279, 19075, 0, 100265, 100264, 78191, 100266, 2675, 2351, 10788, 13, 358, 3987, 499, 1505, 279, 4832, 22541, 

In [20]:
# Tokenize every training conversation with the tokenizer's chat template.
# batched=True hands the lambda a batch of rows, so `batch["messages"]` holds several
# conversations at once; the resulting ids are stored per row in a new `input_ids` column.
templated_ds2 = dataset2["train"].map(
    function=lambda batch: {"input_ids": tokenizer.apply_chat_template(batch["messages"])},
    batched=True,
)

Map:   0%|          | 0/2260 [00:00<?, ? examples/s]

In [23]:
# Check that the map step added an `input_ids` column next to the original columns.
templated_ds2

Dataset({
    features: ['full_topic', 'messages', 'input_ids'],
    num_rows: 2260
})

In [31]:
# Switch `templated_ds2` to torch output in place so numeric columns are returned as tensors.
# NOTE(review): `dataset2` was already given the torch format before the map; this call may be
# redundant if `map` carries the format over — confirm before removing.
templated_ds2.set_format("torch")

In [55]:
# Peek at the first two tokenized conversations. The column comes back as a list of per-row
# tensors rather than one stacked tensor — presumably because rows differ in length (no padding
# was requested); confirm.
templated_ds2["input_ids"][:2]

[tensor([100264,    882, 100266,  13347,   1070, 100265, 100264,  78191, 100266,
           9906,      0,   2650,    649,    358,   1520,    499,   3432,     30,
         100265, 100264,    882, 100266,     40,   2846,   3411,    369,    264,
          11573,  22541,    369,    856,   1828,  20769,     13,   3053,    499,
           7079,   1063,   5526,   6305,     30, 100265, 100264,  78191, 100266,
           8538,   5526,  11573,  61545,   2997,  93550,    304,  28621,     11,
            279,   8560,  91614,     11,    323,    279,  84229,     13,   2435,
           2351,   3967,    369,    872,   6366,  35909,    323,  26110,  68127,
          21160,     13, 100265, 100264,    882, 100266,   4897,  10578,   2294,
             13,   8886,   1070,    904,  61545,    304,    279,  35374,    430,
            527,   1695,    369,   8689,     30, 100265, 100264,  78191, 100266,
           9642,     11,    279,  72857,    323,  14751,  17038,  23028,    323,
          47142,   5670,    

# Hugging Face `datasets` tutorial

In [40]:
from datasets import load_dataset_builder

# Create a builder for the rotten_tomatoes dataset so its metadata can be inspected
# (see `ds_builder.info` in the next cell).
ds_builder = load_dataset_builder("rotten_tomatoes")

In [45]:
# Show the DatasetInfo record: feature schema, per-split example counts and byte sizes.
ds_builder.info

DatasetInfo(description='', citation='', homepage='', license='', features={'text': Value(dtype='string', id=None), 'label': ClassLabel(names=['neg', 'pos'], id=None)}, post_processed=None, supervised_keys=None, builder_name='parquet', dataset_name='rotten_tomatoes', config_name='default', version=0.0.0, splits={'train': SplitInfo(name='train', num_bytes=1074810, num_examples=8530, shard_lengths=None, dataset_name=None), 'validation': SplitInfo(name='validation', num_bytes=134679, num_examples=1066, shard_lengths=None, dataset_name=None), 'test': SplitInfo(name='test', num_bytes=135972, num_examples=1066, shard_lengths=None, dataset_name=None)}, download_checksums=None, download_size=487770, post_processing_size=None, dataset_size=1345461, size_in_bytes=None)

In [52]:
# NOTE(review): `load_dataset` is already imported in the notebook's first cell; this re-import is redundant.
from datasets import load_dataset

# Load only the train split, which yields a single Dataset rather than a DatasetDict.
rotten_tomatoes = load_dataset("rotten_tomatoes", split="train")

In [53]:
# Display the split summary (columns `text` and `label`, plus the row count).
rotten_tomatoes

Dataset({
    features: ['text', 'label'],
    num_rows: 8530
})

In [58]:
# Tokenize a single review as a sanity check; the result holds `input_ids` and an `attention_mask`.
tokenizer(rotten_tomatoes[0]["text"])

{'input_ids': [1820, 7091, 374, 51687, 311, 387, 279, 220, 1691, 267, 9478, 596, 502, 330, 390, 276, 330, 323, 430, 568, 596, 2133, 311, 1304, 264, 35732, 1524, 7191, 1109, 802, 77, 820, 82928, 5797, 797, 1414, 1174, 97721, 31717, 8039, 5355, 3824, 2727, 477, 4179, 1055, 4915, 278, 662], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [57]:
def tokenization(example):
    """Tokenize the ``text`` field of ``example`` with the notebook-level ``tokenizer``.

    Args:
        example: The mapping that ``Dataset.map`` passes in. When the map is run with
            ``batched=True`` (as in the call below), ``example["text"]`` is a batch of
            strings rather than a single string.

    Returns:
        The tokenizer's output for ``example["text"]``, unchanged.
    """
    return tokenizer(example["text"])


# Pass the named function to map instead of repeating its body in a lambda, so the
# definition above is actually the code that runs.
rotten_tomatoes_tokenized = rotten_tomatoes.map(tokenization, batched=True)

Map:   0%|          | 0/8530 [00:00<?, ? examples/s]