# Pipeline

In [None]:
from transformers import pipeline

classifier = pipeline('sentiment-analysis')
classifier(['Harry potter is one of the highest grossing movie of all time.', 'Daniel Redcliff became billionaire after Harry Potter'])

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9998136162757874},
 {'label': 'POSITIVE', 'score': 0.9923048615455627}]

#### Working behind a pipeline

In [None]:
#The working that happens behind a pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

#Define the text you want to get predictions for
raw_inputs = ['Harry potter is one of the highest grossing movie of all time.', 'Daniel Redcliff became billionaire after Harry Potter']

#Load the tokenizer model
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')
print('Tokenizer: ',tokenizer)

#pass the inputs to tokenizer
#The output will contain 2 keys: input_ids, attention_mask
#return_tensor will make sure that the output is a tensor value. Here, 'pt' means pytorch tensors, 'tf' means tensorflow tensors and 'np' means numpy arrays
inputs = tokenizer(raw_inputs, padding = True, truncation = True, return_tensors = 'pt')
print('inputs: ', inputs)

#Load the model
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')
print('Model: ', model)

#Pass the input to model
output = model(**inputs)
print('Output: ', output)

#The output will contain logits
#Logits are non-normalized scores
print('Logits', output.logits)

#To make sense out of logits we pass a softmax layer to find the probabilities
predictions = torch.nn.functional.softmax(output.logits, dim = -1)
print('Predictions', predictions)

#To get the label
print('Labels', model.config.id2label)

Tokenizer:  DistilBertTokenizerFast(name_or_path='distilbert-base-uncased-finetuned-sst-2-english', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
inputs:  {'input_ids': tensor([[  101,  4302, 10693,  2003,  2028,  1997,  1996

#### Tokenization process inside the tokenizer

In [None]:
from transformers import AutoTokenizer

#Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

sequence = 'Grindewald and Dumbledore had a romantic relationship'

#Tokenize the input
#Note that this line will automatically add [CLS] (starting point of a sentence) and [SEP] (endingpoint of a sequence)
tokens = tokenizer.tokenize(sequence)
print('Tokens: ', tokens)

#Convert categorical data to continuous data
ids = tokenizer.convert_tokens_to_ids(tokens)
print('input_ids: ', ids)

#Decoding: COnverting input_ids to text
decoded_string = tokenizer.decode(ids)
print('String: ', decoded_string)

Tokens:  ['grind', '##ew', '##ald', 'and', 'dumb', '##led', '##ore', 'had', 'a', 'romantic', 'relationship']
input_ids:  [23088, 7974, 19058, 1998, 12873, 3709, 5686, 2018, 1037, 6298, 3276]
String:  grindewald and dumbledore had a romantic relationship


# Loading and preprocessing a dataset from hub (Hugging face hub)

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.16.1 dill-0.3.7 multiprocess-0.70.15


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

#Load the dataset
#Note that the dataset will contain training, validation and test data
raw_datasets = load_dataset('glue', 'mrpc')
print(raw_datasets)

#Preprocessing the dataset
#We apply a function to tokenize the dataset to reduce the storage
def tokenize_function(data):
  return tokenizer(data['sentence1'], data['sentence2'], truncation = True)

#batched = True will preprocess multiple inputs simultaneously
tokenized_dataset = raw_datasets.map(tokenize_function, batched = True)
print(tokenized_dataset)

#Dynamic padding using DataCollatorWithPadding ensures that all the sequences are of same length
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer = tokenizer)
print(data_collator)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})


Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})
DataCollatorWithPadding(tokenizer=BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedTok

# Training the dataset

In [None]:
#Load the model
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = 2)

#Define training argumnets
from transformers import TrainingArguments

training_args = TrainingArguments('test-trainer', evaluation_strategy = 'epoch')

#Predictions function
def compute_metrics(eval_preds):
  metric = evaluate.load('glue', 'mrpc'),
  logits, labels = eval_preds
  predictions = np.argmax(logits, axis = -1)
  return metric.compute(predictions = predictions, references = labels)

#load the trainer
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset = tokenized_dataset['train'],
    eval_dataset = tokenized_dataset['validation'],
    data_collator = data_collator,
    tokenizer = tokenizer
    compute_metrics = compute_metrics
)

#train the model
#It will report the loss for every 500 epochs
trainer.train()

# Dealing with datasets in the hub

In [None]:
#Decompressing the files manually

#Loading the dataset from GitHub
#This is a Squad-it dataset for question-answering in Italian
#The following line of code will download 2 files (train and test) with the extension .json.gz
!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-train.json.gz
!wget https://github.com/crux82/squad-it/raw/master/SQuAD_it-test.json.gz

#Decompress (reduce the size) it wil gzip command
#The extension becomes .json
!gzip -dkv SQuAD_it-*.json.gz

--2024-01-30 07:40:55--  https://github.com/crux82/squad-it/raw/master/SQuAD_it-train.json.gz
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/crux82/squad-it/master/SQuAD_it-train.json.gz [following]
--2024-01-30 07:40:55--  https://raw.githubusercontent.com/crux82/squad-it/master/SQuAD_it-train.json.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7725286 (7.4M) [application/octet-stream]
Saving to: ‘SQuAD_it-train.json.gz’


2024-01-30 07:40:56 (85.5 MB/s) - ‘SQuAD_it-train.json.gz’ saved [7725286/7725286]

--2024-01-30 07:40:56--  https://github.com/crux82/squad-it/raw/master/SQuAD_it-test.

In [None]:
from datasets import load_dataset

#To load both the train and test daatset so that map function can be applied at once
#This will automatically decompress files (no need to manually do it)
data_files = {'train': '/content/SQuAD_it-train.json.gz', 'test':'/content/SQuAD_it-test.json.gz'}
squad_it_dataset = load_dataset('json', data_files = data_files, field = 'data')
squad_it_dataset

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['paragraphs', 'title'],
        num_rows: 442
    })
    test: Dataset({
        features: ['paragraphs', 'title'],
        num_rows: 48
    })
})

In [None]:
#Load a remote file

url = "https://github.com/crux82/squad-it/raw/master/"

data_files = {'train': url + 'SQuAD_it-train.json.gz', 'test': url + 'SQuAD_it-test.json.gz'}
squad_it_dataset = load_dataset('json', data_files = data_files, field = 'data')
squad_it_dataset

Downloading data:   0%|          | 0.00/7.73M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['paragraphs', 'title'],
        num_rows: 442
    })
    test: Dataset({
        features: ['paragraphs', 'title'],
        num_rows: 48
    })
})

# Slicing and dicing a dataset

Here we are using drug review dataset. We will learn how to preprcess the dataset and then train it

In [1]:
#Download the dataset with !wget
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
#Extract the data with !unzip
!unzip drugsCom_raw.zip

--2024-01-31 06:38:54--  https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘drugsCom_raw.zip’

drugsCom_raw.zip        [      <=>           ]  41.00M  38.3MB/s    in 1.1s    

2024-01-31 06:38:55 (38.3 MB/s) - ‘drugsCom_raw.zip’ saved [42989872]

Archive:  drugsCom_raw.zip
  inflating: drugsComTest_raw.tsv    
  inflating: drugsComTrain_raw.tsv   


In [3]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Installing collected p

In [4]:
#Load the dataset
from datasets import load_dataset

data_files = {'train': '/content/drugsComTrain_raw.tsv', 'test':'/content/drugsComTest_raw.tsv'}
#Note that TSV is a variant of CSV file which uses tabs instead of commas as separator
drug_dataset = load_dataset('csv', data_files = data_files, delimiter = '\t')
drug_dataset

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [7]:
#It is always better to generate a random sample to see what the dataset looks like
#Create a random sample by chaining together shuffle and select
drug_sample = drug_dataset['train'].shuffle(seed = 42).select(range(5,10))
drug_sample[:2]

{'Unnamed: 0': [37296, 225540],
 'drugName': ['Vyvanse', 'Bupropion'],
 'condition': ['ADHD', 'Depression'],
 'review': ['"I&#039;ve been taking this medication for many months now. I take it on work days mostly &amp; days when I need to get up and go. I still have depression and sometimes it&#039;s hard to sleep but not always. It helps if you love food because I don&#039;t over eat but if I forget to bring a snack to work I get a little nauseous or I just feel super hungry by the end of my shift because I forget about food for like 7-9 hours. I try to eat food before I take it. Girls at work ask me what helps me be confident and sell stuff and I tell them vyvanse. I have depression and anxiety but this helps so much. I do feel really tired and like I don&#039;t want to do stuff on days I don&#039;t take it and after it wears off. So my doctors trying me on brintellex or something too."',
  '"I was previously taking 150 mg for years and it stopped working. Just started 300 mg XL three

Note that in output we notice few things:


a. HTML codes such as I&#039

b. Unnamed: 0 column to be renamed for proper understanding

c. The review contains mix of special character, small and upper case

In [8]:
for split in drug_dataset.keys():
  assert len(drug_dataset[split]) == len(drug_dataset[split].unique("Unnamed: 0"))

In [9]:
#Renaming the column 'Unnamed: 0' to something relevant
drug_dataset = drug_dataset.rename_column(original_column_name = 'Unnamed: 0', new_column_name = 'patient_id')
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [13]:
#Define a function to convert the strings to lowercase
def lowercase_condition(example):
  return {'condition': example['condition'].lower()}

In [14]:
#Apply the function to the dataset
drug_dataset.map(lowercase_condition)
#Note that our data has None datatype for which lower() can't be applied
#Hence we need to filter it

Map:   0%|          | 0/161297 [00:00<?, ? examples/s]

AttributeError: 'NoneType' object has no attribute 'lower'

In [15]:
#Remove the None values from the review column
#lambda function as small function which can be used once
#Syntax: lambda <arguments>:<condition>
drug_dataset = drug_dataset.filter(lambda x: x['condition'] is not None)
drug_dataset

Filter:   0%|          | 0/161297 [00:00<?, ? examples/s]

Filter:   0%|          | 0/53766 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 160398
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53471
    })
})

In [16]:
#Now apply the lowercase condition
drug_dataset = drug_dataset.map(lowercase_condition)
drug_dataset['train']['condition'][:3]

Map:   0%|          | 0/160398 [00:00<?, ? examples/s]

Map:   0%|          | 0/53471 [00:00<?, ? examples/s]

['left ventricular dysfunction', 'adhd', 'birth control']

In [17]:
#The review can be 1 word or an essay
#Define a function to find length of the reviews
def compute_review_length(example):
  return {'review_length': len(example['review'].split())}

In [18]:
#Apply the function on the dataset
#Note that review_length is not a column in the dataset and hence a new column will be created
drug_dataset = drug_dataset.map(compute_review_length)
drug_dataset['train'][0]

Map:   0%|          | 0/160398 [00:00<?, ? examples/s]

Map:   0%|          | 0/53471 [00:00<?, ? examples/s]

{'patient_id': 206461,
 'drugName': 'Valsartan',
 'condition': 'left ventricular dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27,
 'review_length': 17}

In [19]:
#Sort the dataset to see the length of review
drug_dataset['train'].sort('review_length')[:3]

{'patient_id': [111469, 13653, 53602],
 'drugName': ['Ledipasvir / sofosbuvir',
  'Amphetamine / dextroamphetamine',
  'Alesse'],
 'condition': ['hepatitis c', 'adhd', 'birth control'],
 'review': ['"Headache"', '"Great"', '"Awesome"'],
 'rating': [10.0, 10.0, 10.0],
 'date': ['February 3, 2015', 'October 20, 2009', 'November 23, 2015'],
 'usefulCount': [41, 3, 0],
 'review_length': [1, 1, 1]}

In [20]:
#Check how many rows the dataset has in train and test
drug_dataset.num_rows

{'train': 160398, 'test': 53471}

In [21]:
#Remove the rows with less than 30 words
drug_dataset = drug_dataset.filter(lambda x: x['review_length'] > 30)
drug_dataset.num_rows

Filter:   0%|          | 0/160398 [00:00<?, ? examples/s]

Filter:   0%|          | 0/53471 [00:00<?, ? examples/s]

{'train': 138514, 'test': 46108}

In [22]:
#The following can be used to remove HTML character codes
import html

text = 'I&#039;m a transformer called BERT'
html.unescape(text)

"I'm a transformer called BERT"

In [23]:
#Remove the HTML character codes from the dataset
drug_dataset = drug_dataset.map(lambda x: {'review': html.unescape(x['review'])})
drug_dataset['train']['review'][:3]

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

['"My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get out of bed, was very cranky, and slept for nearly 8 hours on a drive home from school vacation (very unusual for him.) I called his doctor on Monday morning and she said to stick it out a few days. See how he did at school, and with getting up in the morning. The last two days have been problem free. He is MUCH more agreeable than ever. He is less emotional (a good thing), less cranky. He is remembering all the things he should. Overall his behavior is better. \r\nWe have tried many different medications and so far this is the most effective."',
 '"I used to take another oral contraceptive, which had 21 pill cycle, and was very happy- very light periods, max 5 days, no other side effects. But it contained hormone gestodene, which is not available in US, so I switched to Lybrel, because the ing

In [24]:
#batched = True is used to send a batch of example to map fucntion (default 1000)
new_drug_dataset = drug_dataset.map(
    lambda x: {'review': [html.unescape(o) for o in x['review']]}, batched = True
)

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

Note that the above command is executed faster than the previous one. The reason being when batched = True is specified the function receives a dictionary with the fields of the dataset, but each value is a list of values and not just a single value.

The return value of the dataset should be the same: a dictionary with field we want to update or add to our dataset, and a list of values

In [25]:
from transformers import AutoTokenizer

#Load a tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

#Define a function to tokenize review column
def tokenize_function(example):
  return tokenizer(example['review'], truncation = True)

#To time a one line instruction - %time
#To time the whole cell - %%time
#batched = True is required to unlock the fast power of AutoTokenizer
%time tokenized_dataset = drug_dataset.map(tokenize_function, batched = True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

CPU times: user 1min 59s, sys: 1.47 s, total: 2min
Wall time: 1min 27s


In [26]:
#Computing time when slow tokenizer is used
slow_tokenizer = AutoTokenizer.from_pretrained('bert-base-cased', use_fast = False)

def tokenize_function(example):
  return slow_tokenizer(example['review'], truncation = True)

%time slow_tokenized_dataset = drug_dataset.map(tokenize_function, batched = True)

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

CPU times: user 6min 34s, sys: 2.36 s, total: 6min 37s
Wall time: 6min 51s


In [27]:
def tokenize_function(example):
  return tokenizer(example['review'], truncation = True)

#Parallelization - increase in speed
#For parallelization - num_proc
%time tokenized_dataset_num_poc = drug_dataset.map(tokenize_function, batched = True, num_proc = 8)

Map (num_proc=8):   0%|          | 0/138514 [00:00<?, ? examples/s]

Map (num_proc=8):   0%|          | 0/46108 [00:00<?, ? examples/s]

CPU times: user 2.33 s, sys: 755 ms, total: 3.09 s
Wall time: 1min 31s


In [28]:
#dataset.map() can be used to change the number of elements in the dataset
#return_overflowing_tokens = True is used to return all chunks of the tests of just the first one
def tokenize_and_split(example):
  return tokenizer(example['review'], truncation = True, max_length = 128,
                   return_overflowing_tokens = True)

In [29]:
result = tokenize_and_split(drug_dataset['train'][0])
[len(inp) for inp in result['input_ids']]

[128, 49]

In [30]:
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched = True)

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

ArrowInvalid: Column 8 named input_ids expected length 1000 but got length 1463

In [31]:
tokenized_dataset = drug_dataset.map(
    tokenize_and_split, batched = True, remove_columns = drug_dataset['train'].column_names
)

Map:   0%|          | 0/138514 [00:00<?, ? examples/s]

Map:   0%|          | 0/46108 [00:00<?, ? examples/s]

In [32]:
len(tokenized_dataset['train']), len(tokenized_dataset['test'])

(206772, 68876)

In [33]:
#Converting the dataset to dataframe using set_format
drug_dataset.set_format('pandas')

In [34]:
drug_dataset['train'][:2]

Unnamed: 0,patient_id,drugName,condition,review,rating,date,usefulCount,review_length
0,95260,Guanfacine,adhd,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,141
1,92703,Lybrel,birth control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,134


In [35]:
#Taking all the training data
train_df = drug_dataset['train']
len(train_df), len(drug_dataset['train'])

(138514, 138514)

In [36]:
frequencies = train_df['condition'].value_counts().to_frame().reset_index().rename(columns = {'index':'condition', 'condition':'frequency'})
frequencies.head(5)

Unnamed: 0,condition,frequency
0,birth control,27655
1,depression,8023
2,acne,5209
3,anxiety,4991
4,pain,4744


In [37]:
#Create a new dataset
from datasets import Dataset

freq_dataset = Dataset.from_pandas(frequencies)
freq_dataset

Dataset({
    features: ['condition', 'frequency'],
    num_rows: 819
})

In [38]:
#Split the training dataset into train and validation
drug_dataset_clean = drug_dataset['train'].train_test_split(train_size = 0.8, seed = 42)

In [39]:
drug_dataset_clean['validation'] = drug_dataset_clean.pop('test')

In [40]:
drug_dataset_clean['test'] = drug_dataset['test']
drug_dataset_clean

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 110811
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 27703
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})