In [2]:
from transformers import pipeline

# Build a "fill-mask" pipeline backed by the "camembert-base" checkpoint.
camembert_fill_mask = pipeline("fill-mask", model="camembert-base")
# Ask the pipeline to fill the <mask> placeholder of a French sentence.
results = camembert_fill_mask("Le camembert est <mask> :)")
# Bare last expression so the notebook displays the predictions.
results

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing CamembertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


[{'score': 0.49091637134552,
  'token': 7200,
  'token_str': 'délicieux',
  'sequence': 'Le camembert est délicieux :)'},
 {'score': 0.1055707037448883,
  'token': 2183,
  'token_str': 'excellent',
  'sequence': 'Le camembert est excellent :)'},
 {'score': 0.03453364223241806,
  'token': 26202,
  'token_str': 'succulent',
  'sequence': 'Le camembert est succulent :)'},
 {'score': 0.033031709492206573,
  'token': 528,
  'token_str': 'meilleur',
  'sequence': 'Le camembert est meilleur :)'},
 {'score': 0.030076919123530388,
  'token': 1654,
  'token_str': 'parfait',
  'sequence': 'Le camembert est parfait :)'}]

In [3]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Load the tokenizer and the masked-LM model from the same checkpoint name.
checkpoint = "camembert-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForMaskedLM.from_pretrained(checkpoint)
 
# Tokenize the prompt, asking for tensors via return_tensors="pt".
input_txt = tokenizer("Le camembert est <mask> :)", return_tensors="pt")

# Forward pass; the result is stored but neither displayed nor used in the cells shown here.
output = model(**input_txt)

Some weights of the model checkpoint at camembert-base were not used when initializing CamembertForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing CamembertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
# Load the dataset from local JSON files
from datasets import load_dataset

# Map each split name to its JSON file (relative paths, one directory up).
data_files = {"train": "../SQuAD_it-train.json", "test": "../SQuAD_it-test.json"}
# field="data" tells the JSON loader which field of each file holds the records.
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")
# Display the resulting splits.
squad_it_dataset

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


DatasetDict({
    train: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 442
    })
    test: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 48
    })
})

In [5]:
# Load the dataset from remote, gzip-compressed JSON files
# Base URL that both split files are appended to.
url = "https://github.com/crux82/squad-it/raw/master/"
# Map each split name to the URL of its .json.gz file.
data_files = {
    "train": url + "SQuAD_it-train.json.gz",
    "test": url + "SQuAD_it-test.json.gz",
}
# Same loader call as for the local files; this rebinds squad_it_dataset.
squad_it_dataset = load_dataset("json", data_files=data_files, field="data")
squad_it_dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 442
    })
    test: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 48
    })
})

In [41]:
# Map each split name to its TSV file (relative paths, one directory up).
data_files = {
    'train': '../drugsComTrain_raw.tsv', 
    'test': '../drugsComTest_raw.tsv'
}

# "\t" is the tab character: the files are read as tab-separated values
# through the 'csv' loader.
drug_dataset = load_dataset('csv', data_files=data_files, delimiter='\t')
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [42]:
# Shuffle the training split with a fixed seed, then keep the first 1000 rows as a sample.
drug_samples = drug_dataset['train'].shuffle(seed=42).select(range(1000))
# Uncomment to preview the first 8 sampled rows.
#drug_samples[:8]

In [43]:
# Check, for every split, that "Unnamed: 0" holds as many distinct values as
# there are rows (i.e. no value is repeated).
for split in drug_dataset.keys():
    assert len(drug_dataset[split]) == len(drug_dataset[split].unique("Unnamed: 0"))


# NOTE(review): this references the `unique` method without calling it, so the
# cell only displays the bound-method repr. Presumably a leftover inspection; confirm.
drug_dataset['train'].unique

<bound method Dataset.unique of Dataset({
    features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
    num_rows: 161297
})>

In [44]:
# Give the unnamed first column (checked above to hold no repeated value) an
# explicit name in every split.
drug_dataset = drug_dataset.rename_column(
    original_column_name="Unnamed: 0", new_column_name="patient_id"
)

# Display the splits to confirm the new column name.
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [45]:
# Drop rows whose condition is None so the .lower() call applied afterwards cannot fail on them.
drug_dataset = drug_dataset.filter(lambda x: x['condition'] is not None)

def lowercase_condition(example):
    """Return a one-key dict holding the lower-cased ``condition`` of ``example``."""
    condition = example['condition']
    return dict(condition=condition.lower())

# Lower-case the condition of every row in every split.
# (The original line assigned to `drug_dataset` twice in a chained assignment;
# the duplicate target was a typo and has been removed.)
drug_dataset = drug_dataset.map(lowercase_condition)

In [46]:
# Display the condition column of the training split to check the lower-casing.
drug_dataset['train']['condition']

Column(['left ventricular dysfunction', 'adhd', 'birth control', 'birth control', 'opiate dependence'])

In [47]:
def compute_review_length(x):
    """Count the whitespace-separated words of ``x['review']``.

    Returns a dict with the single key ``review_length``.
    """
    words = x['review'].split()
    return {'review_length': len(words)}

# Add a review_length column (word count of the review) to every split.
drug_dataset = drug_dataset.map(compute_review_length)

In [48]:
# Sort the training split by review_length and display the first three rows.
drug_dataset['train'].sort('review_length')[:3]

{'patient_id': [111469, 13653, 53602],
 'drugName': ['Ledipasvir / sofosbuvir',
  'Amphetamine / dextroamphetamine',
  'Alesse'],
 'condition': ['hepatitis c', 'adhd', 'birth control'],
 'review': ['"Headache"', '"Great"', '"Awesome"'],
 'rating': [10.0, 10.0, 10.0],
 'date': ['February 3, 2015', 'October 20, 2009', 'November 23, 2015'],
 'usefulCount': [41, 3, 0],
 'review_length': [1, 1, 1]}

In [49]:
# Keep only reviews with strictly more than 30 words; this rebinds drug_dataset,
# so re-running the cell filters the already-filtered data (a no-op the second time).
drug_dataset = drug_dataset.filter(lambda x: x['review_length'] > 30)

In [50]:
# Row count of each split after the length filter.
drug_dataset.num_rows

{'train': 138514, 'test': 46108}

In [51]:
# Sort every split by review_length in descending order (reverse=True).
sorted_dec = drug_dataset.sort('review_length', reverse=True)
# Display the three largest review lengths of the training split.
sorted_dec['train']['review_length'][:3]

[1894, 1162, 1107]

In [52]:
import html

# Decode HTML character references in every review. With batched=True the
# lambda receives a batch, so batch['review'] is a list of review strings.
drug_dataset = drug_dataset.map(
    lambda batch: {'review': [html.unescape(text) for text in batch['review']]},
    batched=True,
)

In [53]:
from transformers import AutoTokenizer

# Rebind `tokenizer` to the "bert-base-cased" tokenizer used by the tokenization functions below.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenizer_function(example):
    """Tokenize the ``review`` field of ``example`` with truncation enabled.

    Relies on the module-level ``tokenizer``.
    """
    reviews = example['review']
    return tokenizer(reviews, truncation=True)

In [22]:
# Time the tokenization of every split with batched=True.
%time tokenized_dataset = drug_dataset.map(tokenizer_function, batched=True)

Map: 100%|██████████| 138514/138514 [00:07<00:00, 19534.68 examples/s]
Map: 100%|██████████| 46108/46108 [00:02<00:00, 18915.42 examples/s]

CPU times: user 1min 3s, sys: 1.36 s, total: 1min 4s
Wall time: 9.54 s





In [23]:
# Time the same tokenization with batched=False, for comparison with the batched run.
%time tokenized_dataset2 = drug_dataset.map(tokenizer_function, batched=False)

Map: 100%|██████████| 138514/138514 [00:24<00:00, 5631.18 examples/s]
Map: 100%|██████████| 46108/46108 [00:08<00:00, 5616.28 examples/s]

CPU times: user 32.4 s, sys: 294 ms, total: 32.7 s
Wall time: 32.8 s





In [29]:
# Same checkpoint as `tokenizer`, but loaded with use_fast=False to compare against the default tokenizer.
tokenizer_without_use_fast = AutoTokenizer.from_pretrained('bert-base-cased', use_fast=False)

def tokenized_function_slow(example):
    """Tokenize the ``review`` field of ``example`` with truncation enabled.

    Uses the module-level ``tokenizer_without_use_fast`` instead of ``tokenizer``.
    """
    reviews = example['review']
    return tokenizer_without_use_fast(reviews, truncation=True)

# %time tokenized_dataset3 = drug_dataset.map(tokenized_function_slow, batched=False)  # took too long; interrupted before it finished
# %time tokenized_dataset4 = drug_dataset.map(tokenized_function_slow, batched=True)  # also took too long; interrupted as well

In [54]:
def tokenize_and_split(example):
    """Tokenize a batch of reviews with ``max_length=128``, keeping the overflow.

    The tokenizer is called with ``return_overflowing_tokens=True``, so the
    result can hold more rows than the input batch. Every input column is
    then repeated according to ``overflow_to_sample_mapping`` so that each
    produced row carries the values of the review it came from.

    Relies on the module-level ``tokenizer``.
    """
    encoded = tokenizer(
        example['review'],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True
    )
    # Extract the mapping between the new row indices and the original ones.
    source_rows = encoded.pop("overflow_to_sample_mapping")
    for column, column_values in example.items():
        encoded[column] = [column_values[row] for row in source_rows]
    return encoded

In [57]:
# Apply tokenize_and_split batch by batch; the row count can grow because one
# review may yield several tokenized rows.
tokenized_dataset = drug_dataset.map(tokenize_and_split, batched=True)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 206772
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 68876
    })
})

In [75]:
# Switch the output format of drug_dataset to 'pandas' (changes the dataset
# object in place), then display the first three training rows.
drug_dataset.set_format('pandas')
drug_dataset["train"][:3]

Unnamed: 0,patient_id,drugName,condition,review,rating,date,usefulCount,review_length
0,95260,Guanfacine,adhd,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,141
1,92703,Lybrel,birth control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,134
2,138000,Ortho Evra,birth control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10,89


In [62]:
# Materialize the whole training split; this assumes the 'pandas' format has
# been set on drug_dataset so that the slice is a DataFrame.
train_df = drug_dataset["train"][:]

In [69]:
# Count the rows per condition and expose the result as a two-column frame
# (condition, frequency), most frequent first.
# The index and the counts are named explicitly instead of renaming whatever
# columns reset_index() happens to produce: value_counts() labels its result
# differently before and after pandas 2.0, and the previous rename mapping
# produced two "condition" columns on the older versions.
frequencies = (
    train_df['condition']
    .value_counts()
    .rename_axis('condition')
    .reset_index(name='frequency')
)

frequencies.head()

Unnamed: 0,condition,frequency
0,birth control,27655
1,depression,8023
2,acne,5209
3,anxiety,4991
4,pain,4744


In [71]:
from datasets import Dataset

# Convert the frequency DataFrame back into a Dataset object.
freq_dataset = Dataset.from_pandas(frequencies)
freq_dataset

Dataset({
    features: ['condition', 'frequency'],
    num_rows: 819
})

In [81]:
# Average rating per drug as a flat two-column frame (drugName, mean_rating).
mean_rating = (
    train_df.groupby('drugName')['rating']
    .mean()
    .reset_index(name='mean_rating')
)

mean_rating

Unnamed: 0,drugName,mean_rating
0,A + D Cracked Skin Relief,10.000000
1,A / B Otic,10.000000
2,Abacavir / dolutegravir / lamivudine,7.953488
3,Abacavir / lamivudine / zidovudine,9.000000
4,Abatacept,7.312500
...,...,...
3047,Zyvox,9.200000
3048,ZzzQuil,4.000000
3049,depo-subQ provera 104,1.000000
3050,ella,6.847826


In [83]:
# Undo the earlier set_format('pandas') call on drug_dataset.
drug_dataset.reset_format()

In [85]:
# Split the training data 80/20 with a fixed seed.
drug_dataset_clean = drug_dataset['train'].train_test_split(train_size=0.8, seed=42)
# The held-out 20% is stored under 'validation' instead of 'test'...
drug_dataset_clean['validation'] = drug_dataset_clean.pop('test')
# ...so that 'test' can hold the original test split.
drug_dataset_clean['test'] = drug_dataset['test']
drug_dataset_clean

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 110811
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 27703
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})

In [87]:
# Persist all three splits under the relative path "drug-reviews".
drug_dataset_clean.save_to_disk("drug-reviews")

Saving the dataset (1/1 shards): 100%|██████████| 110811/110811 [00:00<00:00, 312081.19 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 27703/27703 [00:00<00:00, 301571.26 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 46108/46108 [00:00<00:00, 3387772.07 examples/s]


In [88]:
from datasets import load_from_disk

# Reload from the same "drug-reviews" path to check the round trip.
drug_dataset_reloaded = load_from_disk("drug-reviews") 
drug_dataset_reloaded

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 110811
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 27703
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})