In [None]:
# Experiment identifier; the path-setup cell appends it to the Drive base path
# to derive the output path and the log file name.
experiment_name = "Inferenza_su_output_global"

In [None]:
from getpass import getpass

from IPython.display import clear_output

# Read the Hugging Face access token without echoing it: input() would print
# the typed secret into the cell output, where it can end up in the saved notebook.
huggingface_cli_token = getpass("INSERT YOUR huggingface-cli_token:")
# Remove the prompt widget from the cell output once the token has been read.
clear_output()

In [None]:
!huggingface-cli login --token $huggingface_cli_token

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `tirocinio2` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `tirocinio2`


In [None]:
from google.colab import drive
from IPython.display import Image, display

# Location where Google Drive is mounted, and the user's Drive root below it.
mount_point = "/content/drive"
base_path = mount_point + "/MyDrive"

# CSV with the merged patient data read by the loading cell.
input_data_path = base_path + "/Z2_DDD_merged.csv"

# Per-experiment output path and log file. The "/" separator is required:
# without it the experiment name is glued onto "MyDrive" and the resulting
# path points outside the Drive folder.
output_data_path = base_path + "/" + experiment_name
log_filename = output_data_path + ".log"

# force_remount=True remounts even if the Drive is already mounted.
drive.mount(mount_point, force_remount=True)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import pickle

In [None]:
def summarize_adherence_columns(row):
    """
    Collapse the two adherence flag columns of a row into a single code.

    :param row: dataframe row (or any mapping) exposing "BASSA ADERENZA" and "ALTA ADERENZA"
    :return: 0 when "BASSA ADERENZA" equals 1, otherwise 1 when "ALTA ADERENZA" equals 1,
             otherwise -1
    """
    # Flags are tested in priority order; the first one equal to 1 decides the code.
    flag_codes = (("BASSA ADERENZA", 0), ("ALTA ADERENZA", 1))
    for column, code in flag_codes:
        if row[column] == 1:
            return code

    # Neither flag is set to 1.
    return -1

In [None]:
# Load only the columns used by this notebook (patient code, sex, date of birth,
# city of birth, city of residence, first drug, shift and the two adherence flags),
# add the single ADERENZA code, then drop the two flag columns it was derived from.
data = (
    pd.read_csv(
        input_data_path,
        usecols=[
            "CODICE PAZIENTE UNIVOCO",
            "SESSO",
            "DT_NAS",
            "COMUNE NASCITA",
            "COMUNE_RESIDENZA",
            "PRIMO_PROD",
            "SHIFT",
            "BASSA ADERENZA",
            "ALTA ADERENZA",
        ],
        # Dates of birth are stored as day/month/year.
        parse_dates=["DT_NAS"],
        date_format="%d/%m/%Y",
    )
    .assign(ADERENZA=lambda frame: frame.apply(summarize_adherence_columns, axis=1))
    .drop(columns=["BASSA ADERENZA", "ALTA ADERENZA"])
)

data.head(3)

Unnamed: 0,CODICE PAZIENTE UNIVOCO,SESSO,DT_NAS,COMUNE NASCITA,COMUNE_RESIDENZA,SHIFT,PRIMO_PROD,ADERENZA
0,A1003098,F,1966-10-01,ASSEMINI,ASSEMINI,0,1470395,1
1,A1018737,F,1963-02-11,CAGLIARI,QUARTUCCIU,0,1470395,1
2,A1030052,F,1970-10-03,CAGLIARI,QUARTU SANT'ELENA,0,1470395,1


In [None]:
def apply_mistral_format(data):
  """Render the adherence-prediction prompt for one patient record.

  :param data: dataframe row (or mapping) providing 'SESSO', 'DT_NAS', 'COMUNE NASCITA',
      'COMUNE_RESIDENZA', 'PRIMO_PROD' and 'SHIFT'
  :return: the prompt text, with the patient fields listed as 'label: value' lines
      between triple single quotes
  """
  # Look the fields up first, in the same order they appear in the prompt.
  sex = data['SESSO']
  birth_date = data['DT_NAS']
  birth_city = data['COMUNE NASCITA']
  residence_city = data['COMUNE_RESIDENZA']
  first_drug = data['PRIMO_PROD']
  shift = data['SHIFT']

  # The indentation inside the literal is part of the prompt text; keep it unchanged.
  return f"""Given the sex, birth date, birth city, residence city, first drug to take and shift of a patient,
    all information given line by line and formatted as 'label: value', the whole block of lines being enclosed
    by triple single quotes, predict the value of the adherence for this patient.
    Do not consider any information than those provided enclosed by triple single quotes. Your task is to predict one value for adherence based on the patient's information and to output the predicted
    value in the same format.
    Do not absolutely include for any reason any other content, especially input information, in the output.
    '''
    sex: {sex}
    birth date: {birth_date}
    birth city: {birth_city}
    residence city: {residence_city}
    first drug to take: {first_drug}
    shift: {shift}
    '''
    """

In [None]:
# Build one prompt per patient row and store it next to the source columns.
# NOTE(review): the CSV-generation cell further down builds its own prompt text
# from each row instead of reading this column; confirm which one is intended.
data['formatted_prompt'] = data.apply(apply_mistral_format, axis=1)

In [None]:
# Show the first prompt in full. A bare expression that is not the last line of
# a cell is evaluated but never displayed, so it has to be printed explicitly.
print(data.iloc[0]['formatted_prompt'])
data.head(3)

Unnamed: 0,CODICE PAZIENTE UNIVOCO,SESSO,DT_NAS,COMUNE NASCITA,COMUNE_RESIDENZA,SHIFT,PRIMO_PROD,ADERENZA,formatted_prompt
0,A1003098,F,1966-10-01,ASSEMINI,ASSEMINI,0,1470395,1,"Given the sex, birth date, birth city, residen..."
1,A1018737,F,1963-02-11,CAGLIARI,QUARTUCCIU,0,1470395,1,"Given the sex, birth date, birth city, residen..."
2,A1030052,F,1970-10-03,CAGLIARI,QUARTU SANT'ELENA,0,1470395,1,"Given the sex, birth date, birth city, residen..."


In [None]:
!pip install -q datasets --progress-bar off

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.[0m[31m
[0m

In [None]:
%%capture
%pip install -U bitsandbytes
%pip install -U transformers
%pip install -U peft
%pip install -U accelerate
%pip install -U trl

In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch #, wandb
# from datasets import load_dataset
# from trl import SFTTrainer

In [None]:
from peft import PeftModel, PeftConfig

# Hub identifier of the base checkpoint the adapter is applied to.
base_model = "mistralai/Mistral-7B-Instruct-v0.3"

# 4-bit NF4 quantisation with bfloat16 compute; double quantisation disabled.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

# Step 1: quantised base model.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
# Step 2: wrap it with the PEFT adapter stored on the mounted Drive.
model = PeftModel.from_pretrained(
    model,
    f"{base_path}/Mistral-7B-Instruct-v0.3-fine-tuning-global-DDD-adherenceBased",
    device_map="auto",
)

# Tokenizer of the base checkpoint; the EOS token doubles as padding token.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_eos_token = True

# Echo the BOS/EOS flags as the cell output.
(tokenizer.add_bos_token, tokenizer.add_eos_token)

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

(True, True)

In [None]:
from transformers import pipeline

CODICE PER TESTARE IL PAZIENTE SINGOLO:

In [None]:

# Single-patient check: send one hand-written prompt through the model.
# max_new_tokens caps the reply at 10 generated tokens (20 / 25 were also tried).
chatbot = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=10) #20 / 25
# Chat-style input: a single user turn holding the instruction and the patient block.
# NOTE(review): the values here are wrapped in double quotes and the date is written
# as YYYY/MM/DD, whereas the prompts built from the dataframe interpolate the raw
# column values; confirm which format the adapter expects.
messages = [
    {"role": "user", "content": """Given the sex, birth date, birth city, residence city, first drug to take and shift of a patient,
    all information given line by line and formatted as 'label: value', the whole block of lines being enclosed
    by triple single quotes, predict the values of the adherence for this patient.
    Do not consider any information than those provided enclosed by triple single quotes. Your task is to predict one value for adherence based on the patient's information and to output the predicted
    value in the same format.
    Do not absolutely include for any reason any other content, especially input information, in the output.
    '''
    sex: "F"
    birth date: "1966/10/01"
    birth city: "ASSEMINI"
    residence city: "ASSEMINI"
    first drug to take: "1480002"
    shift: "2"
    '''
    """ },
]

# The pipeline result is the cell output.
chatbot(messages)



CODICE PER CREARE UN FILE CSV CON I RISULTATI CONFRONTABILI

In [None]:
import csv
import re

# Text-generation pipeline around the adapted model; replies are capped at 10 new tokens.
chatbot = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=10)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
# Raw model replies, one per processed patient, kept for later inspection.
outputs = []

nome_file = '/content/drive/My Drive/inferenza_global_DDD_adherence_only.csv'
headers = ["CODICE PAZIENTE UNIVOCO","SESSO","DT_NAS", "COMUNE NASCITA", "COMUNE_RESIDENZA", "PRIMO_PROD", "SHIFT", "ADERENZA PREVISTA","ADERENZA REALE"]

# Matches the 'adherence: <value>' fragment of a model reply.
adherence_pattern = re.compile(r"adherence: (\d+|-1)")


def _build_prompt(row):
    """Return the user prompt for one patient row.

    The text, including the indentation inside the literal, is kept exactly as
    it was so the model receives the same input as before.
    NOTE(review): this prompt asks for adherence *and* follow-up persistence,
    while apply_mistral_format and the output file name refer to adherence
    only; confirm which prompt the adapter was trained with.
    """
    return f"""Given the sex, birth date, birth city, residence city, first drug to take and shift of a patient,
              all information given line by line and formatted as 'label: value', the whole block of lines being enclosed
              by triple single quotes, predict the values for adherence and follow-up persistence for this patient.
              Do not consider any information than those provided enclosed by triple single quotes. Your task is to predict one value for adherence and one for follow-up persistence based on the patient's information and to output the predicted
              values in the same format.
              Do not absolutely include for any reason any other content, especially input information, in the output.
              '''
              sex: {row['SESSO']}
              birth date: {row['DT_NAS']}
              birth city: {row['COMUNE NASCITA']}
              residence city: {row['COMUNE_RESIDENZA']}
              first drug to take: {row['PRIMO_PROD']}
              shift: {row['SHIFT']}
              '''
              """


def _parse_adherence(reply):
    """Return the adherence value found in a model reply as an int, or None when absent."""
    match = adherence_pattern.search(reply)
    return int(match.group(1)) if match else None


# Explicit UTF-8 so the city names are written the same way on every platform.
with open(nome_file, mode='w', newline='', encoding='utf-8') as file:
    print("Generazione file in corso...")
    writer = csv.writer(file)
    writer.writerow(headers)

    for index, row in data.iterrows():
        prediction = chatbot([{"role": "user", "content": _build_prompt(row)}])
        # For chat input the pipeline returns the conversation; the last
        # message is the assistant reply.
        reply = prediction[0]['generated_text'][-1]['content']
        outputs.append(reply)

        # Parse only the reply of the current patient. Scanning every stored
        # reply would reuse an older patient's prediction (or an unrelated cell
        # value) whenever the current reply has no match; an unparsable reply
        # now yields an empty cell (csv writes None as '').
        writer.writerow([
            row['CODICE PAZIENTE UNIVOCO'],
            row['SESSO'],
            row['DT_NAS'],
            row['COMUNE NASCITA'],
            row['COMUNE_RESIDENZA'],
            row['PRIMO_PROD'],
            row['SHIFT'],
            _parse_adherence(reply),
            row['ADERENZA'],
        ])


print("File CSV scritto e salvato con successo!")




Device set to use cuda:0
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'GraniteForCausalLM', 'GraniteMoeForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianFor

Generazione file in corso...
File CSV scritto e salvato con successo!
