In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import roc_auc_score

# Read the competition's train and test CSV files (in that order) into DataFrames
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./kaggle/input/ml-olympiad-detect-hallucinations-in-llms/train.csv",
        "./kaggle/input/ml-olympiad-detect-hallucinations-in-llms/test.csv",
    )
)

In [None]:
# Vertex AI SDK
import vertexai

# Project settings -- replace the placeholder project id with your own
PROJECT_ID = "<your_project_id>"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# Point the SDK at the configured project and location
vertexai.init(location=LOCATION, project=PROJECT_ID)

In [None]:
# Vertex AI model classes: one for text generation, one for text embeddings
from vertexai.generative_models import GenerativeModel
from vertexai.language_models import TextEmbeddingModel

# Embedding model used to turn answers into vectors
embedding_model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")

# Generative model used to answer the prompts
model = GenerativeModel("gemini-1.0-pro")

In [None]:
def get_distance(prompt, target_answer, model, embedding_model):
    """Calculate the cosine distance between a generated answer and a target answer.

    The prompt is sent to ``model`` with temperature 0. The generated text and
    ``target_answer`` are then embedded with ``embedding_model`` and compared.

    Args:
        prompt: Prompt passed to ``model.generate_content``.
        target_answer: Answer to compare against; converted with ``str()``
            before it is embedded.
        model: Object exposing ``generate_content(prompt, generation_config=...)``
            whose response provides ``candidates[0].content.parts[0].text``.
        embedding_model: Object exposing ``get_embeddings(list_of_str)`` that
            returns a sequence of items carrying a ``values`` vector.

    Returns:
        float: ``1 - cosine_similarity`` of the two embeddings. If either
        embedding has zero norm the similarity is taken as 0, so the distance
        is 1.0.

    Raises:
        Exception: Errors raised while fetching the embeddings are not caught
            here and propagate to the caller.
    """

    try:
        # Generate an answer from the prompt using the model
        response = model.generate_content(
            prompt,
            generation_config={"temperature": 0},
        )
        # Extract the generated answer from the response
        answer = response.candidates[0].content.parts[0].text
    except Exception:
        # Best-effort fallback when generation fails or the response has no
        # usable text. `Exception` (not a bare `except`) is caught so that a
        # kernel interrupt (KeyboardInterrupt) is never swallowed here.
        answer = "no answer"

    # Get embeddings for the generated answer and the target answer
    embedded_answer = np.asarray(
        embedding_model.get_embeddings([str(answer)])[0].values, dtype=float
    )
    embedded_target = np.asarray(
        embedding_model.get_embeddings([str(target_answer)])[0].values, dtype=float
    )

    # Normalize by the vector lengths so the result is a true cosine
    # similarity even if the embeddings are not unit-length.
    norm_product = np.linalg.norm(embedded_answer) * np.linalg.norm(embedded_target)
    if norm_product == 0:
        # A zero vector has no direction: treat it as "no similarity" instead
        # of dividing by zero and producing NaN.
        return 1.0

    cosine_similarity = np.dot(embedded_answer, embedded_target) / norm_product

    # Calculate the distance between the embeddings
    return float(1.0 - cosine_similarity)


In [None]:
def calculate_scores(data, model, embedding_model, max_retries=10, print_interval=None,
                     default_score=0.25):
    """Calculate a distance score for each row in the provided data.

    Args:
        data: DataFrame with ``Prompt`` and ``Answer`` columns. A ``Target``
            column is also required when ``print_interval`` is set.
        model: Generative model forwarded to ``get_distance``.
        embedding_model: Embedding model forwarded to ``get_distance``.
        max_retries: Number of retries after the first failed attempt, so each
            row is tried at most ``max_retries + 1`` times.
        print_interval: If not ``None``, print the running ROC AUC every
            ``print_interval`` rows.
        default_score: Score recorded for a row when every attempt failed.

    Returns:
        list: One score per row of ``data``, in row order.
    """

    scores = []

    # Iterate over the two needed columns directly instead of looking the row
    # up twice per iteration
    rows = zip(data.Prompt, data.Answer)
    for row, (prompt, target_answer) in enumerate(tqdm(rows, total=len(data))):
        score = None
        last_error = None

        # Attempt to calculate the score with retries. A raised exception is
        # treated like a `None` result so that a transient API error triggers
        # a retry instead of aborting the whole run.
        for _attempt in range(max_retries + 1):
            try:
                score = get_distance(prompt, target_answer, model, embedding_model)
            except Exception as exc:
                last_error = exc
                score = None
            if score is not None:
                break

        # If there is still no score after all attempts, fall back to the default
        if score is None:
            print(
                f"Row {row}: no score after {max_retries + 1} attempts "
                f"(last error: {last_error!r}); using {default_score}"
            )
            score = default_score

        scores.append(score)

        # Print progress and ROC AUC score if print_interval is set
        if print_interval is not None and row > 0 and row % print_interval == 0:
            seen_targets = data.Target.iloc[:len(scores)]
            # ROC AUC is undefined until both classes have been seen
            if seen_targets.nunique() > 1:
                roc_auc = roc_auc_score(y_true=seen_targets, y_score=scores)
                print(f"{row}/{len(data)} ROC AUC: {roc_auc}")
            else:
                print(f"{row}/{len(data)} ROC AUC: undefined (only one class seen so far)")

    return scores

In [None]:
# Score every row of the train data, printing the running ROC AUC every 500 rows
train_scores = calculate_scores(
    data=train,
    model=model,
    embedding_model=embedding_model,
    max_retries=10,
    print_interval=500,
)

In [None]:
# Attach the computed scores to the train frame as a new "score" column
train["score"] = train_scores

# Write the scored train data to disk without the index column
train.to_csv(path_or_buf="scored_train.csv", index=False)

In [None]:
# Score every row of the test data (no ROC AUC progress output)
test_scores = calculate_scores(
    data=test,
    model=model,
    embedding_model=embedding_model,
    max_retries=10,
    print_interval=None,
)

In [None]:
# Build the submission from the sample file and fill in the test scores
submission = pd.read_csv("./kaggle/input/ml-olympiad-detect-hallucinations-in-llms/sample_submission.csv")

# Use item assignment rather than attribute assignment: if the frame had no
# "Target" column, `submission.Target = ...` would only set a Python attribute
# and the scores would be missing from the saved CSV.
submission["Target"] = test_scores

# Save your submission
submission.to_csv("submission.csv", index=False)