In [None]:
%pip install datasets

In [None]:
%pip install sacremoses

In [None]:
# Import necessary libraries
import pandas as pd
import torch
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, pipeline
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling


In [None]:
# Project configuration. All paths assume a Colab-style /content directory.
ENABLE_FINE_TUNING = False  # fine_tune_model() does nothing unless this is True
FINE_TUNING_DATA_PATH = '/content/healthcare_fine_tuning_data.txt'  # Text file used as the 'train' split by fine_tune_model()
FINE_TUNED_MODEL_DIR = '/content/fine_tuned_healthcare_model'  # Model/tokenizer save directory; load_model() builds its pipeline from here
model_name = 'microsoft/BioGPT-Large'  # Checkpoint id passed to from_pretrained() for the tokenizer and model
INPUT_FILE_PATH = '/content/input.csv'  # Input CSV location (needs 'table_name' and 'column_name' columns)
OUTPUT_FILE_PATH = '/content/output.csv'  # Output CSV location (input rows plus a 'description' column)

In [None]:
# Fine-tuning function
def fine_tune_model():
    """Fine-tune the base causal language model on the text corpus.

    Returns immediately when ``ENABLE_FINE_TUNING`` is false. Otherwise the
    tokenizer and model identified by ``model_name`` are loaded, the text file
    at ``FINE_TUNING_DATA_PATH`` is tokenized (each example truncated to 512
    tokens), and a ``Trainer`` is run for 3 epochs with a per-device batch
    size of 1. The trained model and its tokenizer are then written to
    ``FINE_TUNED_MODEL_DIR``.
    """
    if not ENABLE_FINE_TUNING:
        return

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    def tokenize_batch(batch):
        # Applied to batches of rows; only the 'text' column is consumed.
        return tokenizer(batch['text'], truncation=True, max_length=512)

    raw_splits = load_dataset('text', data_files={'train': FINE_TUNING_DATA_PATH})
    tokenized_splits = raw_splits.map(
        tokenize_batch,
        batched=True,
        num_proc=4,
        remove_columns=['text'],
    )

    # mlm=False selects the causal (next-token) language-modeling collator mode.
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    args = TrainingArguments(
        output_dir=FINE_TUNED_MODEL_DIR,
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=1,
        save_steps=500,
        save_total_limit=2,
        prediction_loss_only=True,
        logging_steps=100,
    )

    trainer = Trainer(
        model=model,
        args=args,
        data_collator=collator,
        train_dataset=tokenized_splits['train'],
    )
    trainer.train()

    # Persist both artifacts side by side so the directory can be reloaded later.
    trainer.save_model(FINE_TUNED_MODEL_DIR)
    tokenizer.save_pretrained(FINE_TUNED_MODEL_DIR)


In [None]:
# Model loading and generation pipeline setup
def load_model():
    """Build the text-generation pipeline used to describe columns.

    The pipeline is always loaded from ``FINE_TUNED_MODEL_DIR``. If that
    directory does not yet contain a saved model (detected by a missing
    ``config.json``), the base checkpoint ``model_name`` is downloaded and
    saved there first.

    The pipeline is placed on the first CUDA GPU when one is available and on
    the CPU otherwise.

    Returns:
        The ``transformers`` text-generation pipeline.
    """
    config_path = os.path.join(FINE_TUNED_MODEL_DIR, 'config.json')

    # isfile() is already False when the directory itself is missing, so a
    # separate os.path.exists() check on the directory is unnecessary.
    if not os.path.isfile(config_path):
        print("Loading pre-trained model...")
        os.makedirs(FINE_TUNED_MODEL_DIR, exist_ok=True)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name)

        # model.save_pretrained() writes config.json together with the
        # weights, so no extra config dump is required afterwards.
        tokenizer.save_pretrained(FINE_TUNED_MODEL_DIR)
        model.save_pretrained(FINE_TUNED_MODEL_DIR)

    # Pick the GPU automatically instead of requiring a manual code edit;
    # -1 is the pipeline's value for "run on CPU".
    device = 0 if torch.cuda.is_available() else -1
    generator = pipeline(
        'text-generation',
        model=FINE_TUNED_MODEL_DIR,
        tokenizer=FINE_TUNED_MODEL_DIR,
        device=device,
    )
    return generator

In [None]:
# Function to clean up generated descriptions
def clean_generated_description(text):
    """Reduce raw generated text to a single tidy sentence.

    Steps, in order:
      1. Drop any ``<|endoftext|>`` marker and surrounding whitespace.
      2. Keep only the first sentence. A sentence ends at a period that is
         followed by whitespace or by the end of the text, so periods inside
         numbers such as ``7.5`` do not cut the description short.
      3. Remove stray markup-like characters.
      4. Collapse every run of whitespace (including newlines) to one space.

    Args:
        text: The raw text produced by the generator.

    Returns:
        The cleaned description. Text without a sentence-ending period is
        returned whole (after steps 1, 3 and 4).
    """
    text = text.replace('<|endoftext|>', '').strip()

    # Cut after the first sentence-ending period. Splitting on every '.'
    # would turn "below 5.7 percent." into "below 5.".
    for index, char in enumerate(text):
        if char == '.' and (index + 1 == len(text) or text[index + 1].isspace()):
            text = text[:index + 1]
            break

    # Remove any unwanted characters that may appear
    unwanted_tokens = ['<', '>', '[', ']', '/', '▃', '≤', '≥']
    for token in unwanted_tokens:
        text = text.replace(token, '')

    # split()/join normalizes newlines, tabs and repeated spaces in one pass.
    return ' '.join(text.split())

In [None]:
# Function to generate description for healthcare dataset
def generate_description(generator, table_name, column_name, max_new_tokens=60):
    """Generate a one-sentence description for a database column.

    A few-shot prompt with three worked examples is completed by the
    generator, and the continuation is cleaned with
    ``clean_generated_description``.

    Args:
        generator: Callable text-generation pipeline. It is called with the
            prompt plus generation keyword arguments and must return a list
            whose first item has a ``'generated_text'`` entry.
        table_name: Name of the table the column belongs to.
        column_name: Name of the column to describe.
        max_new_tokens: Maximum number of tokens to generate after the prompt.

    Returns:
        The cleaned description string.
    """
    prompt = f"""Generate a brief description for the healthcare database column.

Examples:
Table: patient_records
Column: heart_rate
Description: The patient's heart rate measured in beats per minute.

Table: lab_results
Column: hba1c_level
Description: The patient's HbA1c level indicating average blood sugar over the past 3 months.

Table: medications
Column: dosage
Description: The prescribed dosage of the medication.

Table: {table_name}
Column: {column_name}
Description:"""
    outputs = generator(
        prompt,
        # Budget the continuation only. A total-length cap (max_length) also
        # counts the long few-shot prompt above, which can leave little or no
        # room for the description itself.
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        early_stopping=True,
        # Return just the continuation rather than prompt + continuation, so
        # the prompt never has to be stripped out by string matching.
        return_full_text=False,
    )
    generated_text = outputs[0]['generated_text']

    # Clean up the generated description
    return clean_generated_description(generated_text.strip())


In [None]:
# Main function to process the input CSV file
def process_input_csv(input_path=None, output_path=None):
    """Add a generated ``description`` column to a CSV of table/column names.

    Args:
        input_path: CSV to read. Defaults to ``INPUT_FILE_PATH``. The file
            must contain ``table_name`` and ``column_name`` columns.
        output_path: CSV to write. Defaults to ``OUTPUT_FILE_PATH``.

    Raises:
        KeyError: If a required column is missing from the input CSV.
    """
    # Resolve defaults at call time so edits to the config constants are honored.
    if input_path is None:
        input_path = INPUT_FILE_PATH
    if output_path is None:
        output_path = OUTPUT_FILE_PATH

    df = pd.read_csv(input_path)

    # Validate before loading the model, which is by far the most expensive step.
    missing = [col for col in ('table_name', 'column_name') if col not in df.columns]
    if missing:
        raise KeyError(f"Input CSV is missing required column(s): {missing}")

    generator = load_model()

    # A plain list keeps this correct for an empty frame, where
    # DataFrame.apply(axis=1) returns a DataFrame rather than a Series.
    df['description'] = [
        generate_description(generator, table, column)
        for table, column in zip(df['table_name'], df['column_name'])
    ]
    df.to_csv(output_path, index=False)
    print(f'Descriptions generated and saved to {output_path}')


In [None]:
# Run the workflow. fine_tune_model() returns immediately unless
# ENABLE_FINE_TUNING is True, so with the default configuration this only
# generates descriptions; without this call the flag would have no effect.
fine_tune_model()
process_input_csv()
