In [1]:
from datasets import load_dataset

In [2]:
ds = load_dataset('csv', data_files='data_sets/datascience_q_and_a.csv')
ds

DatasetDict({
    train: Dataset({
        features: ['Question', 'Answer'],
        num_rows: 158
    })
})

In [3]:
ds['train'][0]

{'Question': 'What is the purpose of feature engineering in machine learning?',
 'Answer': 'Feature engineering involves selecting, transforming, or creating new features from the raw data to improve the performance of machine learning models by making them more expressive, informative, and suitable for the task at hand.'}

### Handling remote datasets

In [5]:
url = "https://github.com/crux82/squad-it/raw/master/"
data_files = {
    "train": url + "SQuAD_it-train.json.gz",
    "test": url + "SQuAD_it-test.json.gz",
}
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")
squad_it_dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 442
    })
    test: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 48
    })
})

In [6]:
data_files = {"train": "data_sets/drugs_train.csv", "test": "data_sets/drugs_test.csv"}
drug_dataset = load_dataset("csv", data_files=data_files)
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['uniqueID', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['uniqueID', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [7]:
sample_ds = drug_dataset['train'].shuffle(seed=42).select(range(100))
sample_ds[:10]

{'uniqueID': [87571,
  178045,
  80482,
  159268,
  205477,
  37296,
  225540,
  194581,
  47908,
  28843],
 'drugName': ['Naproxen',
  'Duloxetine',
  'Mobic',
  'TriNessa',
  'Pristiq',
  'Vyvanse',
  'Bupropion',
  'Atorvastatin',
  'Adapalene',
  'Lexapro'],
 'condition': ['Gout, Acute',
  'ibromyalgia',
  'Inflammatory Conditions',
  'Birth Control',
  'Depression',
  'ADHD',
  'Depression',
  'High Cholesterol, Familial Homozygous',
  'Acne',
  'Anxiety'],
 'review': ['"like the previous person mention, I&#039;m a strong believer of aleve, it works faster for my gout than the prescription meds I take. No more going to the doctor for refills.....Aleve works!"',
  '"I have taken Cymbalta for about a year and a half for fibromyalgia pain. It is great\r\nas a pain reducer and an anti-depressant, however, the side effects outweighed \r\nany benefit I got from it. I had trouble with restlessness, being tired constantly,\r\ndizziness, dry mouth, numbness and tingling in my feet, and hor

In [8]:
for split in drug_dataset.keys():
    assert len(drug_dataset[split]) == len(drug_dataset[split].unique("uniqueID"))

In [9]:
drug_dataset = drug_dataset.filter(lambda x: x["condition"] is not None)
len(drug_dataset)

2

In [10]:
def lowercase_condition(sample):
    return {"condition": sample["condition"].lower()}

In [11]:
drug_dataset = drug_dataset.map(lowercase_condition)
# Check that lowercasing worked
drug_dataset["train"]["condition"][:3]

['left ventricular dysfunction', 'adhd', 'birth control']

In [12]:
from transformers import AutoTokenizer

In [13]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)

def tokenize_and_split(sample):
    return tokenizer(
        sample["review"],
        truncation=True,
        max_length=256,
        return_overflowing_tokens=True,
        padding=True
    )

result = tokenize_and_split(drug_dataset["train"][0])
result

{'overflowing_tokens': [], 'num_truncated_tokens': -231, 'input_ids': [101, 107, 1135, 1144, 1185, 1334, 2629, 117, 146, 1321, 1122, 1107, 4612, 1104, 1650, 12223, 8031, 126, 150, 1403, 1105, 9425, 9105, 107, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [14]:
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True, batch_size=2000)

In [15]:
### From dataset to dataframe
drug_dataset.set_format("pandas")
drug_dataset["train"][:3]

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,left ventricular dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27
1,95260,Guanfacine,adhd,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192
2,92703,Lybrel,birth control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17


In [16]:
train_df = drug_dataset["train"][:]
frequencies = (
    train_df["condition"]
    .value_counts()
    .to_frame()
    .reset_index()
    .rename(columns={"index": "condition", "condition": "frequency"})
)
frequencies.head()

Unnamed: 0,frequency,count
0,birth control,28788
1,depression,9069
2,pain,6145
3,anxiety,5904
4,acne,5588


Data   | format	Function
-----------------------------
Arrow  | Dataset.save_to_disk(),
CSV	   | Dataset.to_csv(),
JSON   | Dataset.to_json(),

In [18]:
drug_dataset_clean = drug_dataset["train"].train_test_split(train_size=0.8, seed=42)
# Rename the default "test" split to "validation"
drug_dataset_clean["validation"] = drug_dataset_clean.pop("test")
# Add the "test" set to our `DatasetDict`
drug_dataset_clean["test"] = drug_dataset["test"]
drug_dataset_clean

DatasetDict({
    train: Dataset({
        features: ['uniqueID', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 128318
    })
    validation: Dataset({
        features: ['uniqueID', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 32080
    })
    test: Dataset({
        features: ['uniqueID', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53471
    })
})

In [19]:
drug_dataset_clean.save_to_disk("drug-reviews")

Saving the dataset (0/1 shards):   0%|          | 0/128318 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/32080 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/53471 [00:00<?, ? examples/s]

In [37]:
from datasets import load_from_disk

drug_dataset_reloaded = load_from_disk("drug-reviews")
drug_dataset_reloaded

DatasetDict({
    train: Dataset({
        features: ['uniqueID', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 128318
    })
    validation: Dataset({
        features: ['uniqueID', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 32080
    })
    test: Dataset({
        features: ['uniqueID', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53471
    })
})