In [1]:
%%capture
!pip install datasets transformers[sentencepiece]

# Slicing and dicing our data

In [2]:
#download drug review dataset
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"
!unzip drugsCom_raw.zip

--2021-11-28 13:34:18--  https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42989872 (41M) [application/x-httpd-php]
Saving to: ‘drugsCom_raw.zip’


2021-11-28 13:34:20 (38.1 MB/s) - ‘drugsCom_raw.zip’ saved [42989872/42989872]

Archive:  drugsCom_raw.zip
  inflating: drugsComTest_raw.tsv    
  inflating: drugsComTrain_raw.tsv   


In [3]:
from datasets import load_dataset
data_files = {'train': 'drugsComTrain_raw.tsv', 'test': 'drugsComTest_raw.tsv'}

In [4]:
drug_dataset = load_dataset('csv', data_files=data_files, delimiter='\t')

Using custom data configuration default-3761173c276c0a9a


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-3761173c276c0a9a/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a...


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-3761173c276c0a9a/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [6]:
# Create a random sample by chaining shuffle and select funcs
drug_sample = drug_dataset['train'].shuffle(seed=42).select(range(1000))
drug_sample[:3]

{'Unnamed: 0': [87571, 178045, 80482],
 'condition': ['Gout, Acute', 'ibromyalgia', 'Inflammatory Conditions'],
 'date': ['September 2, 2015', 'November 7, 2011', 'June 5, 2013'],
 'drugName': ['Naproxen', 'Duloxetine', 'Mobic'],
 'rating': [9.0, 3.0, 10.0],
 'review': ['"like the previous person mention, I&#039;m a strong believer of aleve, it works faster for my gout than the prescription meds I take. No more going to the doctor for refills.....Aleve works!"',
  '"I have taken Cymbalta for about a year and a half for fibromyalgia pain. It is great\r\nas a pain reducer and an anti-depressant, however, the side effects outweighed \r\nany benefit I got from it. I had trouble with restlessness, being tired constantly,\r\ndizziness, dry mouth, numbness and tingling in my feet, and horrible sweating. I am\r\nbeing weaned off of it now. Went from 60 mg to 30mg and now to 15 mg. I will be\r\noff completely in about a week. The fibro pain is coming back, but I would rather deal with it than t

In [7]:
drug_dataset.keys()

dict_keys(['train', 'test'])

In [8]:
# Check that 'Unnamed: 0' is a unique id in every split.
# The comparison must be asserted: a bare expression inside a for loop is
# evaluated and thrown away, so the cell would pass silently even on duplicates.
for split in drug_dataset.keys():
  n_rows = len(drug_dataset[split])
  n_unique_ids = len(drug_dataset[split].unique('Unnamed: 0'))
  assert n_rows == n_unique_ids, f"'Unnamed: 0' is not unique in the {split} split"

In [9]:
#rename 'Unnamed'
drug_dataset = drug_dataset.rename_column(original_column_name='Unnamed: 0', new_column_name='patient_id')

In [10]:
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

## Try it out
Try it out! Use the Dataset.unique() function to find the number of unique drugs and conditions in the training and test sets.

In [11]:
len(drug_dataset['train'].unique(column='drugName')), len(drug_dataset['test'].unique(column='drugName'))

(3436, 2637)

In [12]:
drug_dataset['train']['condition'][:3]

['Left Ventricular Dysfunction', 'ADHD', 'Birth Control']

In [13]:
def filter_nones(x):
  """Return True when the example's `condition` field is not None."""
  has_condition = x['condition'] is not None
  return has_condition

In [15]:
def lowercase_condition(x):
  """Return a `condition` column update holding the lower-cased condition string."""
  lowered = x['condition'].lower()
  return {'condition': lowered}
#drug_dataset.map(lowercase_condition)

In [16]:
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [17]:
# Drop rows with a missing condition; reuse the named `filter_nones` predicate
# rather than repeating the same test as an inline lambda.
drug_dataset = drug_dataset.filter(filter_nones)

  0%|          | 0/162 [00:00<?, ?ba/s]

  0%|          | 0/54 [00:00<?, ?ba/s]

In [18]:
drug_dataset = drug_dataset.map(lowercase_condition)

  0%|          | 0/160398 [00:00<?, ?ex/s]

  0%|          | 0/53471 [00:00<?, ?ex/s]

In [19]:
drug_dataset['train']['condition'][:3]

['left ventricular dysfunction', 'adhd', 'birth control']

## Creating new columns

In [20]:
def review_len(ex):
  """Count the whitespace-separated words in an example's review.

  Returns a one-key dict, `{'review_length': <count>}`.
  """
  words = ex['review'].split()
  return {'review_length': len(words)}

In [21]:
# adds a new column
drug_dataset = drug_dataset.map(review_len)
drug_dataset['train'][0]

  0%|          | 0/160398 [00:00<?, ?ex/s]

  0%|          | 0/53471 [00:00<?, ?ex/s]

{'condition': 'left ventricular dysfunction',
 'date': 'May 20, 2012',
 'drugName': 'Valsartan',
 'patient_id': 206461,
 'rating': 9.0,
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'review_length': 17,
 'usefulCount': 27}

In [22]:
drug_dataset['train'].sort('review_length')[:3]

{'condition': ['birth control', 'muscle spasm', 'pain'],
 'date': ['November 4, 2008', 'March 24, 2017', 'August 20, 2016'],
 'drugName': ['Loestrin 21 1 / 20', 'Chlorzoxazone', 'Nucynta'],
 'patient_id': [103488, 23627, 20558],
 'rating': [10.0, 1.0, 6.0],
 'review': ['"Excellent."', '"useless"', '"ok"'],
 'review_length': [1, 1, 1],
 'usefulCount': [5, 2, 10]}

🙋 An alternative way to add new columns to a dataset is with the `Dataset.add_column()` function. This allows you to provide the column as a Python list or NumPy array and can be handy in situations where `Dataset.map()` is not well suited for your analysis.

In [23]:
drug_dataset = drug_dataset.filter(lambda x: x['review_length'] > 30)

  0%|          | 0/161 [00:00<?, ?ba/s]

  0%|          | 0/54 [00:00<?, ?ba/s]

In [24]:
print(drug_dataset.num_rows)

{'train': 138514, 'test': 46108}


## Try it out
✏️ Try it out! Use the Dataset.sort() function to inspect the reviews with the largest numbers of words

In [25]:
drug_dataset['train'].sort('review_length', reverse=True)['review'][:3]

['"Two and a half months ago I was prescribed Venlafaxine to help prevent chronic migraines.\r\nIt did help the migraines (reduced them by almost half), but with it came a host of side effects that were far worse than the problem I was trying to get rid of.\r\nHaving now come off of the stuff, I would not recommend anyone ever use Venlafaxine unless they suffer from extreme / suicidal depression. I mean extreme in the most emphatic sense of the word. \r\nBefore trying Venlafaxine, I was a writer. While on Venlafaxine, I could barely write or speak or communicate at all. More than that, I just didn&#039;t want to. Not normal for a usually outgoing extrovert.\r\nNow, I&#039;m beginning to write again - but my ability to speak and converse with others has deteriorated by about 95%. Writing these words is taking forever; keeping up in conversation with even one person is impossible, and I barely see the point of trying either. On Venlafaxine, words pretty much left me - my conversational v

In [26]:
# handle HTML character codes
import html
text = "I&#039;m a transformer called BERT"
html.unescape(text)

"I'm a transformer called BERT"

In [27]:
drug_dataset = drug_dataset.map(lambda x: {'review': html.unescape(x['review'])})

  0%|          | 0/138514 [00:00<?, ?ex/s]

  0%|          | 0/46108 [00:00<?, ?ex/s]

In [28]:
drug_dataset['train'].sort('review_length', reverse=True)['review'][:3]

['"Two and a half months ago I was prescribed Venlafaxine to help prevent chronic migraines.\r\nIt did help the migraines (reduced them by almost half), but with it came a host of side effects that were far worse than the problem I was trying to get rid of.\r\nHaving now come off of the stuff, I would not recommend anyone ever use Venlafaxine unless they suffer from extreme / suicidal depression. I mean extreme in the most emphatic sense of the word. \r\nBefore trying Venlafaxine, I was a writer. While on Venlafaxine, I could barely write or speak or communicate at all. More than that, I just didn\'t want to. Not normal for a usually outgoing extrovert.\r\nNow, I\'m beginning to write again - but my ability to speak and converse with others has deteriorated by about 95%. Writing these words is taking forever; keeping up in conversation with even one person is impossible, and I barely see the point of trying either. On Venlafaxine, words pretty much left me - my conversational vocabular

## The map() method’s superpowers


In [29]:
# when batched=True specified, func receives a Dict with flds of dset but
# as a List of vals & not just a single val
new_drug_dataset = drug_dataset.map(
    lambda x: {'review': [html.unescape(o) for o in x['review']]},
    batched=True 
)

  0%|          | 0/139 [00:00<?, ?ba/s]

  0%|          | 0/47 [00:00<?, ?ba/s]

In [30]:
from transformers import AutoTokenizer

In [31]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

In [32]:
def tokenize_function(examples):
  """Tokenize the `review` field with the notebook-level `tokenizer`, truncation enabled."""
  reviews = examples['review']
  return tokenizer(reviews, truncation=True)

In [33]:
%%time 
tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)

  0%|          | 0/139 [00:00<?, ?ba/s]

  0%|          | 0/47 [00:00<?, ?ba/s]

CPU times: user 1min 26s, sys: 904 ms, total: 1min 27s
Wall time: 53.7 s


In [34]:
%%time 
tokenized_dataset = drug_dataset.map(tokenize_function, batched=False)

  0%|          | 0/138514 [00:00<?, ?ex/s]

  0%|          | 0/46108 [00:00<?, ?ex/s]

CPU times: user 2min, sys: 1.42 s, total: 2min 1s
Wall time: 2min 2s


## Try it out!

✏️ Try it out! Execute the same instruction with and without batched=True, then try it with a slow tokenizer (add use_fast=False in the AutoTokenizer.from_pretrained() method) so you can see what numbers you get on your hardware.

In [35]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased', use_fast=True)

In [36]:
%%time 
#tokenized_dataset = drug_dataset.map(tokenize_function, batched=True)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 9.54 µs


In [37]:
%%time 
#tokenized_dataset = drug_dataset.map(tokenize_function, batched=False)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 9.3 µs


In [38]:
#slow_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', use_fast=False)

In [39]:
def slow_tokenize_func(examples):
  """Tokenize the `review` field with `slow_tokenizer`, truncation enabled.

  NOTE(review): relies on a notebook-level `slow_tokenizer`; the line that
  would create it is currently commented out, so define it before calling this.
  """
  reviews = examples['review']
  return slow_tokenizer(reviews, truncation=True)

In [40]:
%%time
#tokenized_dataset = drug_dataset.map(slow_tokenize_func, batched=True, num_proc=8)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 9.3 µs


 In general, we don’t recommend using Python multiprocessing for fast tokenizers with `batched=True`.

Using `num_proc` to speed up your processing is usually a great idea, as long as the function you are using is not already doing some kind of multiprocessing of its own.

This is super useful in many situations where you want to create several training features from one example, and we will need to do this as part of the preprocessing for several of the NLP tasks we’ll undertake in Chapter 7.

In [41]:
def tokenize_and_split(examples):
  """Tokenize reviews, truncating to 128 tokens but returning all chunks of text.

  `return_overflowing_tokens=True` asks the tokenizer to also return the
  chunks that overflow `max_length`.
  """
  encode_kwargs = dict(truncation=True, max_length=128, return_overflowing_tokens=True)
  return tokenizer(examples['review'], **encode_kwargs)

In [42]:
# testing on a single example
result = tokenize_and_split(drug_dataset['train'][0]);
[len(inp) for inp in result['input_ids']]

[128, 49]

So, our first example in the training set became two features because it was tokenized to more than the maximum number of tokens we specified: the first one of length 128 and the second one of length 49.

In [43]:
# batch_size (Optional[int], default 1000) – Number of examples per batch provided to function if batched=True 
# batch_size <= 0 or batch_size == None: Provide the full dataset as a single batch to function.
# 1463 examples  
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)

  0%|          | 0/139 [00:00<?, ?ba/s]

ArrowInvalid: ignored

***I still do not understand this. Clarification needed❓***

**Reason for the error**
we’re trying to mix two different datasets of different sizes: the drug_dataset columns will have a certain number of examples (the 1,000 in our error), but the tokenized_dataset we are building will have more (the 1,463 in the error message). That doesn’t work for a Dataset, so we need to either a) **remove the columns from the old dataset** or b) **make them the same size as they are in the new dataset**.

Option a: We can do the former with the remove_columns argument:

In [45]:
len(drug_dataset['train']), len(drug_dataset['test'])

(138514, 46108)

In [None]:
# `remove_columns` expects column *names*, not a Dataset object.
# Dropping the original columns lets the mapped output have more rows than the
# input batch (one row per overflowing chunk).
tokenized_dataset = drug_dataset.map(
    tokenize_and_split,
    batched=True,
    remove_columns=drug_dataset['train'].column_names,
)

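❓❓ - The lengths below are identical, which differs from the course chapter. How is this possible?

**Likely answer:** the `remove_columns` cell above has no execution count (`In [None]`), so it did not complete when this output was produced, and `tokenized_dataset` still holds the un-split tokenization from an earlier cell. `remove_columns` expects a list of column names (`drug_dataset['train'].column_names`), not a `Dataset`. After a successful run the train split should contain more rows than `drug_dataset['train']` (206,772, matching the Option b result below).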

In [48]:
len(tokenized_dataset['train']), len(drug_dataset['train'])

(138514, 138514)

Option b: deal with the mismatched length problem by making the old columns the same size as the new ones

In [56]:
result, len(result), result.keys(), len(result['input_ids']), result['overflow_to_sample_mapping']

({'input_ids': [[101, 107, 1422, 1488, 1110, 9079, 1194, 1117, 2223, 1989, 1104, 1130, 19972, 11083, 119, 1284, 1245, 4264, 1165, 1119, 1310, 1142, 1314, 1989, 117, 1165, 1119, 1408, 1781, 1103, 2439, 13753, 1119, 1209, 1129, 1113, 119, 1370, 1160, 1552, 117, 1119, 1180, 6374, 1243, 1149, 1104, 1908, 117, 1108, 1304, 172, 14687, 1183, 117, 1105, 7362, 1111, 2212, 129, 2005, 1113, 170, 2797, 1313, 1121, 1278, 12020, 113, 1304, 5283, 1111, 1140, 119, 114, 146, 1270, 1117, 3995, 1113, 6356, 2106, 1105, 1131, 1163, 1106, 6166, 1122, 1149, 170, 1374, 1552, 119, 3969, 1293, 1119, 1225, 1120, 1278, 117, 1105, 1114, 2033, 1146, 1107, 1103, 2106, 119, 1109, 1314, 1160, 1552, 1138, 1151, 2463, 1714, 119, 1124, 1110, 150, 21986, 3048, 1167, 5340, 1895, 1190, 1518, 102], [101, 119, 1124, 1110, 1750, 6438, 113, 170, 1363, 1645, 114, 117, 1750, 172, 14687, 1183, 119, 1124, 1110, 11566, 1155, 1103, 1614, 1119, 1431, 119, 8007, 1117, 4658, 1110, 1618, 119, 1284, 1138, 1793, 1242, 1472, 23897, 1105, 11

To do this, we will need the **overflow_to_sample_mapping** field the tokenizer returns when we set **return_overflowing_tokens=True**. It gives us a mapping from a new feature index to the index of the sample it originated from. Using this, we can associate each key present in our original dataset with a list of values of the right size by repeating the values of each example as many times as it generates new features:

In [59]:
def tokenize_and_split(examples):
  """Tokenize reviews into chunks of at most 128 tokens, keeping the original columns.

  Each value of the input batch is repeated once per chunk produced from its
  example, so the original columns end up the same length as the tokenizer output.
  """
  encoded = tokenizer(
      examples['review'],
      truncation=True,
      max_length=128,
      return_overflowing_tokens=True,
  )

  # Mapping from each new chunk index to the index of the example it came from
  chunk_to_example = encoded.pop('overflow_to_sample_mapping')
  for column, values in examples.items():
    encoded[column] = [values[idx] for idx in chunk_to_example]
  return encoded

In [60]:
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)
tokenized_dataset

  0%|          | 0/139 [00:00<?, ?ba/s]

  0%|          | 0/47 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'condition', 'date', 'drugName', 'input_ids', 'patient_id', 'rating', 'review', 'review_length', 'token_type_ids', 'usefulCount'],
        num_rows: 206772
    })
    test: Dataset({
        features: ['attention_mask', 'condition', 'date', 'drugName', 'input_ids', 'patient_id', 'rating', 'review', 'review_length', 'token_type_ids', 'usefulCount'],
        num_rows: 68876
    })
})

🔖 Tip: We get the same number of training features as before, but here we’ve kept all the old fields. If you need them for some post-processing after applying your model, you might want to use this approach.

## From Datasets to DataFrames and back

In [61]:
# Changes the __getitem__() method
# Alternatively Dataset.to_pandas()
# remember to reset the format 
drug_dataset.set_format("pandas")

In [62]:
drug_dataset['train'][:3]

Unnamed: 0,patient_id,drugName,condition,review,rating,date,usefulCount,review_length
0,95260,Guanfacine,adhd,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,141
1,92703,Lybrel,birth control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,134
2,138000,Ortho Evra,birth control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10,89


In [64]:
drug_dataset['train'][0]

Unnamed: 0,patient_id,drugName,condition,review,rating,date,usefulCount,review_length
0,95260,Guanfacine,adhd,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,141


In [65]:
train_df = drug_dataset['train'][:]

🚨 Under the hood, Dataset.set_format() changes the return format for the dataset’s __getitem__() dunder method. This means that when we want to create a new object like train_df from a Dataset in the "pandas" format, we need to slice the whole dataset to obtain a pandas.DataFrame. You can verify for yourself that the type of drug_dataset["train"] is Dataset, irrespective of the output format.

In [70]:
# Class distribution of `condition`.
# rename_axis(...) + reset_index(name=...) names both columns explicitly, so the
# result does not depend on how value_counts() labels its output (that labelling
# changed in pandas 2.0 and breaks the older rename(columns={'index': ...}) pattern).
frequencies = (
    train_df['condition']
    .value_counts()
    .rename_axis('condition')
    .reset_index(name='frequency')
)
frequencies.head()

Unnamed: 0,condition,frequency
0,birth control,27655
1,depression,8023
2,acne,5209
3,anxiety,4991
4,pain,4744


In [71]:
from datasets import Dataset
freq_dataset = Dataset.from_pandas(frequencies)
freq_dataset

Dataset({
    features: ['condition', 'frequency'],
    num_rows: 819
})

In [72]:
drug_dataset['train']

Dataset({
    features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
    num_rows: 138514
})

## Try it out
✏️ Try it out! Compute the average rating per drug and store the result in a new Dataset.



In [73]:
# reset the output format from pandas to arrow
drug_dataset.reset_format()

## Creating a validation set

In [78]:
# split the train into train/val
drug_dataset_clean = drug_dataset['train'].train_test_split(train_size=0.8, seed=42)
# rename the test split to val
drug_dataset_clean['validation'] = drug_dataset_clean.pop('test')
# add test set to DatasetDict
drug_dataset_clean['test'] = drug_dataset['test']

Loading cached split indices for dataset at /root/.cache/huggingface/datasets/csv/default-3761173c276c0a9a/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a/cache-7391db8767c64c7c.arrow and /root/.cache/huggingface/datasets/csv/default-3761173c276c0a9a/0.0.0/bf68a4c4aefa545d0712b2fcbb1b327f905bbe2f6425fbc5e8c25234acb9e14a/cache-12855d611aa3c7bb.arrow


In [79]:
drug_dataset_clean

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 110811
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 27703
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})

## Saving and reloading a dataset

- When you download or process a dataset, it is stored in the cache directory
- Arrow is the format used for storing & processing
- Parquet is better suited for long-term storage
- Datasets can be saved to disk and reloaded later

In [80]:
# saving the cleaned dataset in arrow format
drug_dataset_clean.save_to_disk("drug-reviews")

Flattening the indices:   0%|          | 0/111 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/28 [00:00<?, ?ba/s]

```
drug-reviews/
├── dataset_dict.json
├── test
│   ├── dataset.arrow
│   ├── dataset_info.json
│   └── state.json
├── train
│   ├── dataset.arrow
│   ├── dataset_info.json
│   ├── indices.arrow
│   └── state.json
└── validation
    ├── dataset.arrow
    ├── dataset_info.json
    ├── indices.arrow
    └── state.json
```

In [81]:
# load the data from disk
from datasets import load_from_disk

drug_dataset_reloaded = load_from_disk("drug-reviews")
drug_dataset_reloaded

DatasetDict({
    train: Dataset({
        features: ['condition', 'date', 'drugName', 'patient_id', 'rating', 'review', 'review_length', 'usefulCount'],
        num_rows: 110811
    })
    validation: Dataset({
        features: ['condition', 'date', 'drugName', 'patient_id', 'rating', 'review', 'review_length', 'usefulCount'],
        num_rows: 27703
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})

To store the dataset in CSV or JSON format, each split has to be saved as a separate file.

In [83]:
for split, dataset in drug_dataset_clean.items():
  dataset.to_json(f'drug-reviews-{split}.jsonl')

Creating json from Arrow format:   0%|          | 0/12 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

In [84]:
!head -n 1 drug-reviews-train.jsonl

{"patient_id":89879,"drugName":"Cyclosporine","condition":"keratoconjunctivitis sicca","review":"\"I have used Restasis for about a year now and have seen almost no progress.  For most of my life I've had red and bothersome eyes. After trying various eye drops, my doctor recommended Restasis.  He said it typically takes 3 to 6 months for it to really kick in but it never did kick in.  When I put the drops in it burns my eyes for the first 30 - 40 minutes.  I've talked with my doctor about this and he said it is normal but should go away after some time, but it hasn't. Every year around spring time my eyes get terrible irritated  and this year has been the same (maybe even worse than other years) even though I've been using Restasis for a year now. The only difference I notice was for the first couple weeks, but now I'm ready to move on.\"","rating":2.0,"date":"April 20, 2013","usefulCount":69,"review_length":147}


In [85]:
data_files = {
    'train': 'drug-reviews-train.jsonl',
    'validation': 'drug-reviews-validation.jsonl',
    'test': 'drug-reviews-test.jsonl',
}

drug_dataset_reloaded = load_dataset('json', data_files=data_files)

Using custom data configuration default-585febeeab40942b


Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-585febeeab40942b/0.0.0/c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426...


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-585febeeab40942b/0.0.0/c2d554c3377ea79c7664b93dc65d0803b45e3279000f993c7bfd18937fd7f426. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

## Ideas

- Use the techniques from Chapter 3 to train a classifier that can predict the patient condition based on the drug review.
- Use the summarization pipeline from Chapter 1 to generate summaries of the reviews.