In [None]:
import ollama
import pandas as pd
import json
import re


In [None]:
# Base directory of the PetFinder adoption-prediction dataset
ruta_dataset = '/app/input/petfinder-adoption-prediction/'

# Read the training table followed by the breed, color and state lookup
# tables, in that order, all relative to the base directory.
train_data, breed_labels, color_labels, state_labels = (
    pd.read_csv(ruta_dataset + relative_csv)
    for relative_csv in (
        'train/train.csv',
        'breed_labels.csv',
        'color_labels.csv',
        'state_labels.csv',
    )
)


In [None]:
# Attach the human-readable labels to the training rows.
# Every join is a left join so no training row is dropped; the left-hand
# suffix is always '' so existing column names are kept, and only the
# clashing columns coming from the lookup table receive the right-hand suffix.
train_data_ext = (
    train_data
    # Primary and secondary breed names
    .merge(breed_labels, how='left', left_on='Breed1', right_on='BreedID',
           suffixes=('', '_PrimaryBreed'))
    .merge(breed_labels, how='left', left_on='Breed2', right_on='BreedID',
           suffixes=('', '_SecondaryBreed'))
    # Up to three color names
    .merge(color_labels, how='left', left_on='Color1', right_on='ColorID',
           suffixes=('', '_Color1'))
    .merge(color_labels, how='left', left_on='Color2', right_on='ColorID',
           suffixes=('', '_Color2'))
    .merge(color_labels, how='left', left_on='Color3', right_on='ColorID',
           suffixes=('', '_Color3'))
    # State name
    .merge(state_labels, how='left', left_on='State', right_on='StateID')
)


In [None]:
# Lookup tables translating the numeric codes of the categorical columns
# into the text labels that are written into the prompt.
gender_dict = {
    1: 'Male',
    2: 'Female',
    3: 'Mixed',
}

maturity_size_dict = {
    1: 'Small',
    2: 'Medium',
    3: 'Large',
    4: 'Extra Large',
    0: 'Not Specified',
}

fur_length_dict = {
    1: 'Short',
    2: 'Medium',
    3: 'Long',
    0: 'Not Specified',
}

# Vaccinated, dewormed and sterilized share the same yes/no/unsure coding;
# each name gets its own independent dict object.
vaccinated_dict = {
    1: 'Yes',
    2: 'No',
    3: 'Not Sure',
}
dewormed_dict = dict(vaccinated_dict)
sterilized_dict = dict(vaccinated_dict)

health_dict = {
    1: 'Healthy',
    2: 'Minor Injury',
    3: 'Serious Injury',
    0: 'Not Specified',
}


In [None]:
# Print a summary of the merged frame so the columns added by the label
# joins (and how many of their values are non-null) can be checked.
train_data_ext.info()

In [None]:
# Save the extended dataset to CSV as a temporary checkpoint (row index is not written)
train_data_ext.to_csv('/app/input/petfinder-adoption-prediction/train/train_data_ext.csv', index=False)

In [54]:
def _value_or_default(value, default='Not Specified'):
    """Return ``value`` unless it is missing (None/NaN/NA), in which case return ``default``."""
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return default
    return value


def _parse_first_json_object(text):
    """Decode the first JSON object found in ``text`` and ignore anything after it.

    Raises ``json.JSONDecodeError`` when there is no object or it is malformed.
    """
    start = text.find('{')
    if start == -1:
        raise json.JSONDecodeError("No JSON object found", text, 0)
    # raw_decode stops at the end of the first complete value, so a '}' inside
    # a string value or trailing text after the object does not break parsing.
    parsed, _ = json.JSONDecoder().raw_decode(text, start)
    return parsed


# Build the prompt for one pet profile and return the model's coherence assessment
def analyze_coherence(description, structured_data, client=None, model='phi3'):
    """Ask the LLM whether a pet's free-text description agrees with its tabular profile.

    Parameters
    ----------
    description : str or missing value
        Free-text profile description. A missing value (None/NaN) is sent as
        an empty description instead of the literal text "nan".
    structured_data : mapping
        Must provide the keys 'Type', 'Gender', 'Age', 'BreedName',
        'BreedName_Secondary', 'ColorName', 'MaturitySize', 'FurLength',
        'Vaccinated', 'Dewormed', 'Sterilized', 'Health' and 'PhotoAmt'.
        Coded fields are translated through the module-level ``*_dict`` tables.
    client : optional
        Object exposing ``generate(...)`` like ``ollama.Client``. A new
        ``ollama.Client()`` is created when omitted.
    model : str, optional
        Name of the model to query; defaults to 'phi3'.

    Returns
    -------
    dict
        The JSON object returned by the model. If the response cannot be
        decoded, a fallback dict with ``"coherence": "error"``, all flags set
        to 0 and an empty ``"inconsistencies"`` string is returned.

    Raises
    ------
    KeyError
        If ``structured_data`` lacks a required key or holds a code that is
        not present in the corresponding lookup dict.
    """
    # Missing text fields would otherwise be interpolated as "nan".
    description_text = _value_or_default(description, '')
    primary_breed = _value_or_default(structured_data['BreedName'])
    secondary_breed = _value_or_default(structured_data['BreedName_Secondary'])
    color_name = _value_or_default(structured_data['ColorName'])

    # NOTE(review): the JSON example at the end of the prompt contains a `//`
    # comment and a trailing comma, which are not valid JSON. It is left
    # unchanged here so the model's behavior is not altered; consider cleaning it.
    prompt = f"""
    Analyze the pet profile description and determine if the provided information is coherent with the tabular data profile. Consider aspects like age, size, health, behavior, and any other relevant details. If you find inconsistencies, briefly describe them.

    Description: "{description_text}"

    Structured Data:
    - Type: {'Dog' if structured_data['Type'] == 1 else 'Cat'}
    - Gender: {gender_dict[structured_data['Gender']]}
    - Age: {structured_data['Age']} months
    - Primary Breed: {primary_breed}
    - Secondary Breed: {secondary_breed}
    - Color: {color_name}
    - Maturity Size: {maturity_size_dict[structured_data['MaturitySize']]}
    - Fur Length: {fur_length_dict[structured_data['FurLength']]}
    - Vaccinated: {vaccinated_dict[structured_data['Vaccinated']]}
    - Dewormed: {dewormed_dict[structured_data['Dewormed']]}
    - Sterilized: {sterilized_dict[structured_data['Sterilized']]}
    - Health: {health_dict[structured_data['Health']]}
    - Photo Amount: {structured_data['PhotoAmt']}

    Answer the following questions with 0 or 1:
    - Does the pet require space to run? (requires_running_space)
    - Is the pet friendly with children? (friendly_with_children)
    - Is the pet friendly with other animals? (friendly_with_other_pets)
    - Is the pet described as calm? (is_calm)
    - Is the pet described as energetic? (is_energetic)

    Provide the answer in the following JSON format without any additional explanation:
    {{
        "coherence": "high",  // Possible values: "high", "medium", "low"
        "requires_running_space": 0,
        "friendly_with_children": 0,
        "friendly_with_other_pets": 0,
        "is_calm": 0,
        "is_energetic": 0,
        "inconsistencies": "brief description of inconsistency, max 10 words",
    }}
    """

    if client is None:
        client = ollama.Client()
    response = client.generate(
        model=model,
        prompt=prompt,
        format="json",
        stream=False,
        options={
            "temperature": 0.0  # deterministic decoding
        }
    )

    response_text = response['response']
    print(response_text)  # Show the raw model answer so it can be checked

    try:
        # Apostrophes need no escaping in JSON, so the text is parsed as-is;
        # only the first JSON object in the answer is used.
        return _parse_first_json_object(response_text)
    except json.JSONDecodeError as e:
        print(f"Error decodificando JSON: {e}")
        # Neutral placeholder so one bad answer does not abort a batch run.
        return {
            "coherence": "error",
            "requires_running_space": 0,
            "friendly_with_children": 0,
            "friendly_with_other_pets": 0,
            "is_calm": 0,
            "is_energetic": 0,
            "inconsistencies": ""
        }


In [55]:
# Run the coherence analysis on every row of a dataset
def apply_coherence_analysis(df):
    """Call ``analyze_coherence`` once per row of ``df`` and collect the answers.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Description', 'PetID' and the profile columns listed
        in ``prompt_field_sources`` below.

    Returns
    -------
    pandas.DataFrame
        One row per input row, built from the dicts returned by
        ``analyze_coherence``, plus a 'PetID' column copied positionally
        from ``df`` so the result can be joined back.
    """
    # Key expected by analyze_coherence -> column of ``df`` that supplies it
    prompt_field_sources = {
        'Type': 'Type',
        'Gender': 'Gender',
        'Age': 'Age',
        'BreedName': 'BreedName',
        'BreedName_Secondary': 'BreedName_SecondaryBreed',
        'ColorName': 'ColorName',
        'MaturitySize': 'MaturitySize',
        'FurLength': 'FurLength',
        'Vaccinated': 'Vaccinated',
        'Dewormed': 'Dewormed',
        'Sterilized': 'Sterilized',
        'Health': 'Health',
        'PhotoAmt': 'PhotoAmt',
    }

    answers = []
    for row_label, pet in df.iterrows():
        profile = {
            field: pet[column]
            for field, column in prompt_field_sources.items()
        }
        answers.append(analyze_coherence(pet['Description'], profile))
        # Progress trace: index label of the row just processed
        print(row_label)

    coherence_df = pd.DataFrame(answers)
    # Add PetID so the result can be joined back to the source rows
    coherence_df['PetID'] = df['PetID'].values
    return coherence_df


In [56]:
# Apply the coherence analysis to the extended training dataset.
# apply_coherence_analysis queries the model once per row and prints each
# answer followed by the row index, so this cell produces a lot of output.
coherence_df = apply_coherence_analysis(train_data_ext)


{"coherence": "high", 
"requires_running_space": 1, 
"friendly_with_children": 0, 
"friendly_with_other_pets": 0, 
"is_calm": 0, 
"is_energetic": 1, 
"inconsistencies": "No inconsistencies found."}
0
{
    "coherence": "medium",
    "requires_running_space": 1,
    "friendly_with_children": 0,
    "friendly_with_other_pets": 0,
    "is_calm": 0,
    "is_energetic": 1,
    "inconsistencies": "Vaccination status unknown"
}
1
{
    "coherence": "medium",
    "requires_running_space": 1,
    "friendly_with_children": 1,
    "friendly_with_other_pets": 1,
    "is_calm": 0,
    "is_energetic": 1,
    "inconsistencies": "No information on behavior or temperament"
}
2
{"coherence": "high", 
"requires_running_space": 1, 
"friendly_with_children": 0, 
"friendly_with_other_pets": 0, 
"is_calm": 0, 
"is_energetic": 1, 
"inconsistencies": "No inconsistencies found."}
3
{
    "coherence": "high",
    "requires_running_space": 1,
    "friendly_with_children": 1,
    "friendly_with_other_pets": 0,
   

In [57]:
# Save the new dataset with the coherence features (row index is not written)
coherence_df.to_csv('/app/input/petfinder-adoption-prediction/train/coherence_analysis.csv', index=False)
# Confirmation message shown once the file has been written
print("Dataset de coherencia guardado exitosamente en 'coherence_analysis.csv'")

Dataset de coherencia guardado exitosamente en 'coherence_analysis.csv'
