In [None]:
%matplotlib inline


# Train and Validate a Diabetes Classification XGBoost Model

Watch a demo of sandbox creation and a sample execution of the pima diabetes pipeline below.

..  youtube:: YEvs0MHXZnY


In [None]:
import typing
from collections import OrderedDict
from dataclasses import dataclass
from typing import Tuple

import joblib
import pandas as pd
from dataclasses_json import dataclass_json
from flytekit import Resources, task, workflow
from flytekit.types.file import FlyteFile
from flytekit.types.schema import FlyteSchema
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

Since we are working with a specific dataset, we will create a strictly typed schema for the dataset.
If we wanted a generic data splitter we could use a Generic schema without any column type and name information
[Example file](https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv)
CSV Columns

#. Number of times pregnant
#. Plasma glucose concentration a 2 hours in an oral glucose tolerance test
#. Diastolic blood pressure (mm Hg)
#. Triceps skin fold thickness (mm)
#. 2-Hour serum insulin (mu U/ml)
#. Body mass index (weight in kg/(height in m)^2)
#. Diabetes pedigree function
#. Age (years)
#. Class variable (0 or 1)

Example Row: 6,148,72,35,0,33.6,0.627,50,1
the input dataset schema



In [None]:
DATASET_COLUMNS = OrderedDict(
    {
        "#preg": int,
        "pgc_2h": int,
        "diastolic_bp": int,
        "tricep_skin_fold_mm": int,
        "serum_insulin_2h": int,
        "bmi": float,
        "diabetes_pedigree": float,
        "age": int,
        "class": int,
    }
)

The first 8 columns are features



In [None]:
FEATURE_COLUMNS = OrderedDict(
    {k: v for k, v in DATASET_COLUMNS.items() if k != "class"}
)

The last column is the class



In [None]:
CLASSES_COLUMNS = OrderedDict({"class": int})

Let us declare a task that accepts a CSV file with the previously defined
columns and converts it to a typed schema.
An example CSV file is available at
[here](https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv)_



In [None]:
@task(cache_version="1.0", cache=True, limits=Resources(mem="200Mi"))
def split_traintest_dataset(
    dataset: FlyteFile[typing.TypeVar("csv")], seed: int, test_split_ratio: float
) -> Tuple[
    FlyteSchema[FEATURE_COLUMNS],
    FlyteSchema[FEATURE_COLUMNS],
    FlyteSchema[CLASSES_COLUMNS],
    FlyteSchema[CLASSES_COLUMNS],
]:
    """
    Retrieves the training dataset from the given blob location and then splits it using the split ratio and returns the result
    This splitter is only for the dataset that has the format as specified in the example csv. The last column is assumed to be
    the class and all other columns 0-8 the features.

    The data is returned as a schema, which gets converted to a parquet file in the back.
    """
    column_names = [k for k in DATASET_COLUMNS.keys()]
    df = pd.read_csv(dataset, names=column_names)

    # Select all features
    x = df[column_names[:8]]
    # Select only the classes
    y = df[[column_names[-1]]]

    # split data into train and test sets
    return train_test_split(x, y, test_size=test_split_ratio, random_state=seed)

It is also possible to defined the output file type. This is useful in
combining tasks, where one task may only accept models serialized in ``.joblib.dat``



In [None]:
MODELSER_JOBLIB = typing.TypeVar("joblib.dat")

It is also possible in Flyte to pass custom objects, as long as they are
declared as ``dataclass``es and also decorated with ``@dataclass_json``.



In [None]:
@dataclass_json
@dataclass
class XGBoostModelHyperparams(object):
    """
    These are the xgboost hyper parameters available in scikit-learn library.
    """

    max_depth: int = 3
    learning_rate: float = 0.1
    n_estimators: int = 100
    objective: str = "binary:logistic"
    booster: str = "gbtree"
    n_jobs: int = 1


model_file = typing.NamedTuple("Model", model=FlyteFile[MODELSER_JOBLIB])
workflow_outputs = typing.NamedTuple(
    "WorkflowOutputs", model=FlyteFile[MODELSER_JOBLIB], accuracy=float
)


@task(cache_version="1.0", cache=True, limits=Resources(mem="200Mi"))
def fit(
    x: FlyteSchema[FEATURE_COLUMNS],
    y: FlyteSchema[CLASSES_COLUMNS],
    hyperparams: XGBoostModelHyperparams,
) -> model_file:
    """
    This function takes the given input features and their corresponding classes to train a XGBClassifier.
    NOTE: We have simplified the number of hyper parameters we take for demo purposes
    """
    x_df = x.open().all()
    y_df = y.open().all()

    # fit model no training data
    m = XGBClassifier(
        n_jobs=hyperparams.n_jobs,
        max_depth=hyperparams.max_depth,
        n_estimators=hyperparams.n_estimators,
        booster=hyperparams.booster,
        objective=hyperparams.objective,
        learning_rate=hyperparams.learning_rate,
    )
    m.fit(x_df, y_df)

    # TODO model Blob should be a file like object
    fname = "model.joblib.dat"
    joblib.dump(m, fname)
    return (fname,)


@task(cache_version="1.0", cache=True, limits=Resources(mem="200Mi"))
def predict(
    x: FlyteSchema[FEATURE_COLUMNS],
    model_ser: FlyteFile[MODELSER_JOBLIB],
) -> FlyteSchema[CLASSES_COLUMNS]:
    """
    Given a any trained model, serialized using joblib (this method can be shared!) and features, this method returns
    predictions.
    """
    model = joblib.load(model_ser)
    # make predictions for test data
    x_df = x.open().all()
    y_pred = model.predict(x_df)

    col = [k for k in CLASSES_COLUMNS.keys()]
    y_pred_df = pd.DataFrame(y_pred, columns=col, dtype="int64")
    y_pred_df.round(0)
    return y_pred_df


@task(cache_version="1.0", cache=True, limits=Resources(mem="200Mi"))
def score(
    predictions: FlyteSchema[CLASSES_COLUMNS], y: FlyteSchema[CLASSES_COLUMNS]
) -> float:
    """
    Compares the predictions with the actuals and returns the accuracy score.
    """
    pred_df = predictions.open().all()
    y_df = y.open().all()
    # evaluate predictions
    acc = accuracy_score(y_df, pred_df)
    print("Accuracy: %.2f%%" % (acc * 100.0))
    return float(acc)

Workflow sample here



In [None]:
@workflow
def diabetes_xgboost_model(
    dataset: FlyteFile[
        typing.TypeVar("csv")
    ] = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv",
    test_split_ratio: float = 0.33,
    seed: int = 7,
) -> workflow_outputs:
    """
    This pipeline trains an XGBoost mode for any given dataset that matches the schema as specified in
    https://github.com/jbrownlee/Datasets/blob/master/pima-indians-diabetes.names.
    """
    x_train, x_test, y_train, y_test = split_traintest_dataset(
        dataset=dataset, seed=seed, test_split_ratio=test_split_ratio
    )
    model = fit(
        x=x_train,
        y=y_train,
        hyperparams=XGBoostModelHyperparams(max_depth=4),
    )
    predictions = predict(x=x_test, model_ser=model.model)
    return model.model, score(predictions=predictions, y=y_test)

The entire workflow can be executed locally as follows.



In [None]:
if __name__ == "__main__":
    print(f"Running {__file__} main...")
    print(diabetes_xgboost_model())

In [None]:
!pip install torch==1.9.0
!pip install transformers==4.10.0
!pip install sentencepiece

In [None]:
import pandas as pd

diseases = ['cancer', 'diabetes', 'heart disease', 'asthma', 'Alzheimer\'s']
df = pd.DataFrame({'disease': diseases})

In [None]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [None]:
# Load pre-trained T5 model and tokenizer
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

In [None]:
print(tokenizer)

None


In [None]:
# Loop through each disease and generate two drug names
for index, row in df.iterrows():
    disease = row['disease']
    input_str = f"generate 2 drugs for {disease}"
    input_ids = tokenizer.encode(input_str, return_tensors='pt')
    outputs = model.generate(input_ids=input_ids, max_length=100, num_beams=5, early_stopping=True)
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    drugs = generated.split(', ')
    df.at[index, 'drug1'] = drugs[0]
    df.at[index, 'drug2'] = drugs[1]

In [None]:
!pip install transformers

In [None]:
# Import necessary libraries
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load pre-trained BioBERT model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained('dmis-lab/biobert-large-cased-v1.1')
tokenizer = AutoTokenizer.from_pretrained('dmis-lab/biobert-large-cased-v1.1')

# Set up device for running the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Define function to generate drug names for a given disease
def generate_drug_names(disease):
    input_str = f"generate 2 drugs for {disease}"
    input_ids = tokenizer.encode(input_str, return_tensors='pt').to(device)
    outputs = model.generate(input_ids=input_ids, max_length=50, num_beams=5, early_stopping=True)
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated.split(' and ')




In [None]:
# Create a dummy dataframe with a column of diseases
import pandas as pd
df = pd.DataFrame({'disease': ['cancer', 'diabetes', 'asthma']})


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load pre-trained BioBERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('dmis-lab/biobert-large-cased-v1.1')
model = AutoModelForCausalLM.from_pretrained('dmis-lab/biobert-large-cased-v1.1')

If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
Some weights of the model checkpoint at dmis-lab/biobert-large-cased-v1.1 were not used when initializing BertLMHeadModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertLMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertLMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Loop through each disease and generate two drug names
for index, row in df.iterrows():
    disease = row['disease']
    input_str = f"generate 2 drugs for {disease}"
    input_ids = tokenizer.encode(input_str, return_tensors='pt')
    outputs = model.generate(input_ids=input_ids, max_length=10, num_beams=5, early_stopping=True)
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    drug_names = generated.split(' | ')
    if len(drug_names) >= 2:
        print(f"For {disease}, the generated drug names are {drug_names[0]} and {drug_names[1]}")
    else:
        print(f"No drug names generated for {disease}")


No drug names generated for cancer
No drug names generated for diabetes
No drug names generated for asthma


In [None]:
from multiprocessing import Pool

# Define a function to generate drug names for a given disease
def generate_drugs(disease):
    input_str = f"generate 2 drugs for {disease}"
    input_ids = tokenizer.encode(input_str, return_tensors='pt')
    outputs = model.generate(input_ids=input_ids, max_length=100, num_beams=5, early_stopping=True)
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    drug_names = generated.split(' | ')
    return (disease, drug_names)

# Define the number of worker processes to use
num_workers = 10

# Create a multiprocessing Pool
with Pool(num_workers) as pool:
    # Map the generate_drugs function to each disease in the dataframe
    results = pool.map(generate_drugs, df['disease'])

# Print the results
for disease, drug_names in results:
    if len(drug_names) > 0:
        print(f"For {disease}, the generated drug names are {drug_names[0]} and {drug_names[1]}")
    else:
        print(f"No drug names generated for {disease}")


IndexError: ignored

In [1]:
!pip install transformers
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m58.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m84.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.1-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.1 tokenizers-0.13.2 transformers-4.26.1
Looking in indexes: https://pypi.org/simple, http

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model

In [7]:
def suggest_drugs(diseases):
    drugs = []
    for disease in diseases:
        # format the disease as input to the model
        input_text = f"treat {disease} with"
        input_ids = tokenizer.encode(input_text, return_tensors="pt")
        
        # make predictions using the model
        with torch.no_grad():
            logits = model(input_ids)[0]
        probabilities = torch.softmax(logits, dim=1)[0]
        
        # map the predicted label to a drug
        label_map = {
            0: "aspirin",
            1: "ibuprofen",
            2: "acetaminophen",
            3: "naproxen",
            4: "prednisone"
        }
        predicted_label = int(torch.argmax(probabilities))
        drug = label_map[predicted_label]
        drugs.append(drug)
    
    return drugs



In [8]:
diseases = ["hypertension", "diabetes", "asthma"]
drugs = suggest_drugs(diseases)
print(drugs)


['ibuprofen', 'ibuprofen', 'ibuprofen']


In [11]:
diseases_df = pd.DataFrame({"Disease": ["hypertension", "diabetes", "asthma"]})

In [None]:
import pandas as pd
import torch

# load the pre-trained model and tokenizer
model = torch.load("drug_suggester_model.pt")
tokenizer = torch.load("drug_suggester_tokenizer.pt")

def suggest_drugs(df):
    drugs = []
    for disease in df["Disease"]:
        # format the disease as input to the model
        input_text = f"treat {disease} with"
        input_ids = tokenizer.encode(input_text, return_tensors="pt")
        
        # make predictions using the model
        with torch.no_grad():
            logits = model(input_ids)[0]
        probabilities = torch.softmax(logits, dim=1)[0]
        
        # map the predicted label to a drug
        label_map = pd.DataFrame({
            "Label": [0, 1, 2, 3, 4],
            "Drug": ["aspirin", "ibuprofen", "acetaminophen", "naproxen", "prednisone"]
        })
        predicted_label = int(torch.argmax(probabilities))
        drug = label_map[label_map["Label"] == predicted_label]["Drug"].iloc[0]
        drugs.append(drug)
    
    # create a DataFrame from the list of drugs
    drugs_df = pd.DataFrame({"Disease": df["Disease"], "Drug": drugs})
    
    return drugs_df

# example usage
diseases_df = pd.DataFrame({"Disease": ["hypertension", "diabetes", "asthma"]})
drugs_df = suggest_drugs(diseases_df)

# display the resulting DataFrame
print(drugs_df)


In [26]:
model_name = "l3vr0n/clinical_BERT_for_drug_recommendation"

In [27]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the pre-trained model and tokenizer
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Define a function to suggest drugs for a list of diseases
def suggest_drugs(diseases):
    drugs = []
    for disease in diseases:
        # Tokenize the disease name and generate input IDs and attention masks
        inputs = tokenizer.encode_plus(
            disease,
            max_length=64,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        
        # Feed the input IDs and attention mask to the model to get probabilities
        with torch.no_grad():
            logits = model(input_ids, attention_mask=attention_mask)
            probabilities = torch.softmax(logits[0], dim=1)[0]
        
        # Map the predicted label to a drug
        label_map = {
            0: "aspirin",
            1: "ibuprofen",
            2: "tamsulosin",
            3: "naproxen",
            4: "prednisone"
        }
        predicted_label = int(torch.argmax(probabilities))
        drug = label_map[predicted_label]
        drugs.append(drug)
    
    # Create a DataFrame of the suggested drugs
    drug_df = pd.DataFrame({"Disease": diseases, "Drug": drugs})
    return drug_df

# Example usage
diseases = ["hypertension", "kidney failure", "asthma"]
drugs = suggest_drugs(diseases)
print(drugs)


Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model

          Disease       Drug
0    hypertension  ibuprofen
1  kidney failure  ibuprofen
2          asthma  ibuprofen


In [15]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the pre-trained model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("allenai/scibert_scivocab_cased")
tokenizer = AutoTokenizer.from_pretrained("allenai/scibert_scivocab_cased")

# Load the diseases and drugs dataframes
diseases_df = pd.read_excel("/content/diseases.xlsx")
drugs_df = pd.read_excel("/content/drugs.xlsx")

Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of the model checkpoint at allenai/scibert_scivocab_cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

In [16]:
diseases_df

Unnamed: 0,disease
0,cancer
1,kidney stone


In [17]:
drugs_df

Unnamed: 0,DRUG
0,aspirin
1,ibuprofen
2,acetaminophen


In [23]:
# Define the function to suggest drugs for a list of diseases
def suggest_drugs(diseases):
    drugs = []
    for disease in diseases:
        # Preprocess the input text
        inputs = tokenizer.encode(disease, return_tensors="pt")
        # Get the model's predictions
        outputs = model(inputs)[0]
        probabilities = torch.softmax(outputs, dim=1)[0]
        # Map the predicted label to a drug
        predicted_label = int(torch.argmax(probabilities))
        drug = drugs_df.loc[drugs_df['predicted_label'] == predicted_label, 'drug'].iloc[0]
        drugs.append(drug)
    return drugs


In [24]:
# Test the function
diseases = ["hypertension", "diabetes", "asthma"]
drugs = suggest_drugs(diseases)
print(drugs)

KeyError: ignored