# Imports

In [None]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util

# Lift pandas' display limits so long statements and wide frames print in full
for _display_option in ('display.max_colwidth', 'display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)

# Load & format data

## Validation DF

In [64]:
# Original data dictionary: survey validation statements grouped by category.
# Keys are the category labels; each value is a list of strings of the form
# "<Qid> <statement text>" (e.g. "Q10_1 Climate variation is normal, ...").
# The first digit after "Q" identifies the category (Q10_* -> 1, ..., Q90_* -> 9).
# NOTE(review): key spellings (e.g. "RESPONSABILITY") are runtime strings that end
# up in the 'category' column, so they are left untouched here.
data = {
    "REALITY OF CHANGE": [
        "Q10_1 Climate variation is normal, so why should this be a problem?",
        "Q10_2 Climate change will not be a problem because there will be technological solutions available.",
        "Q10_3 I think it is safe to say climate change is here.",
        "Q10_4 The negative effects of climate change on local commerce should be taken into account.",
        "Q10_5 I do not trust what scientists say about climate change.",
        "Q10_6 I do not trust what I hear about climate change from the government.",
        "Q10_7 It is difficult to trust what comes out in the media on the issue of climate change.",
        "Q10_8 There is not enough information to definitively say that climate change is real."
    ],
    "RESPONSABILITY OF ACTION": [
        "Q20_1 There is not much point in me doing anything to make agriculture more sustainable. No one else is going to.",
        "Q20_2 I do not know what to do. I am very concerned and would like to do something, but I don’t have a realistic shortlist of things that would really make a difference.",
        "Q20_3 I believe that the difference we can have as an individual, in Switzerland, is so minimal that our actions are worthless.",
        "Q20_4 I am not going to do anything to address the impacts of the agri-food system because it is not a major issue."
    ],
    "ECONOMY AND MARKETS": [
        "Q30_1 I am happy to pay a higher price for local or organic food products",
        "Q30_2 Eating sustainably is too expensive, and only wealthy people can afford it",
        "Q30_3 Agricultural subsidies should promote sustainable, pesticide and antibiotic-free production.",
        "Q30_4 Sustainable agriculture increases prices and harms consumer freedom. The production should be adapted to what people demand.",
        "Q30_5 Food accessibility (price) and security (availability) is NOT COMPATIBLE with a sustainable food system",
        "Q30_6 Strong regulation of food production will eventually damage our economic relationship with other countries",
        "Q30_7 We should develop short distribution channels and support local production instead of supporting big agrifood corporations",
        "Q30_8 We need to focus on adaptation actions that are concerned with the economic future of the country."
    ],
    "POLLUTION AND BIODIVERSITY": [
        "Q40_1 Actions on the food system in Switzerland will lead to increased imports and a shift of pollution abroad",
        "Q40_2 We need actions to protect inhabitants and biodiversity from harmful pesticides.",
        "Q40_3 As long as we use animals in agriculture, it cannot be sustainable",
        "Q40_4 We need to deal with the water pollution issue through measures such as reducing fertilizer and pesticide use."
    ],
    "SUSTAINABLE AGRICULTURE": [
        "Q50_1 Banning pesticides might help the local biodiversity but decreases our food production capacity and autonomy",
        "Q50_2 Agriculture is already very sustainable. Swiss farmers are on the right track",
        "Q50_3 Intensive food production can go along with the protection of the environment and health",
        "Q50_4 Farmers already suffer enough from strict rules and paperwork. Family farmers should have more liberties",
        "Q50_5 The main focus of planning for adaptation should be to prepare the food system for extreme weather events, such as drought and flooding.",
        "Q50_6 Products that use environmentally harmful ingredients, such as palm oil, should be taxed in the same way as tobacco products.",
        "Q50_7 If Switzerland reduces greenhouse gas emissions in its food system, it won’t make a difference."
    ],
    "GLOBAL PERSPECTIVE": [
        "Q60_1 It is important to plan for migration by people displaced by malnutrition and famine.",
        "Q60_2 We should focus on protecting human systems rather than environmental ecosystems.",
        "Q60_3 The Swiss food system depends on imports from other countries, and it is in our interest to help find an effective global solution.",
        "Q60_4 Some communities are more affected than others by the global agri-food system, but this is not a priority issue.",
        "Q60_5 The Swiss food system is particularly vulnerable to climate change, and it is in our interest to help find an effective global solution.",
        "Q60_6 Climate warming will ultimately benefit food production globally as plants better growth in higher temperature"
    ],
    "ACTION AND RESPONSES": [
        "Q70_1 It is NOT necessary to engage with the community when deciding what are the most important issues for planning for food system changes.",
        "Q70_2 Planning for food system adaptation needs a holistic approach, considering a whole range of issues, instead of focusing on individual issues.",
        "Q70_3 There is no urgent need to prepare for an increase in illness and death caused by intensive use of pesticides/antibiotics",
        "Q70_4 The community should be involved in discussions about food system adaptation.",
        "Q70_5 It is important to support small-land local farming as they have the fewest resources to deal with the impacts of climate change."
    ],
    "PREFERENCES": [
        "Q90_1 Leave the policy settings as they are.",
        "Q90_2 Policies that emphasise fewer regulations (e.g., policies that promote free trade agreements by reducing food regulation)",
        "Q90_3 Policies that involve moderate sustainable regulations (e.g., banning certain types of pesticides, incentivising organic agriculture)",
        "Q90_4 Policies that involve a radical sustainable food regulation (e.g., banning all pesticides, banning the importation of food-related products that employ non-sustainable approaches, exclusively incentivising organic agriculture)",
        "Q90_5 Policies that create incentives for local producers to produce sustainable food (e.g., subsidies or tax breaks for local farmers who produce organic products)",
        "Q90_6 Adaptation policies and expenditure. Planning controls and emergency response programs.",
        "Q90_7 Preparing for climate risk for food production through the development of new approaches and technologies that enhance resilience to the impacts of climate variability or change."
    ]
}

# Reshaping the data into a new DataFrame: one row per survey statement
reshaped_data = []

for category, questions in data.items():
    for question in questions:
        if question:  # Skip empty or None placeholders
            # Entries look like "<Qid> <statement text>"; split on the first space only
            question_id, question_text = question.split(" ", 1)
            reshaped_data.append({
                "Qid": question_id,
                "category": category,
                "statement": question_text
            })

validation_df = pd.DataFrame(reshaped_data)

# Adding a new column 'category #' by extracting the first digit after 'Q'
# (expand=False yields a Series instead of a one-column DataFrame)
validation_df['category #'] = validation_df['Qid'].str.extract(r'Q(\d)', expand=False).astype(int)

# Keep rows whose 'category #' is between 1 and 9 (inclusive). The explicit
# .copy() makes the result an independent frame, so columns added to it later
# do not trigger pandas' SettingWithCopyWarning.
validation_df = validation_df[validation_df['category #'].between(1, 9)].copy()

## Preferences DF

In [None]:
def transform_preferences(preferences_df):
    """
    Reshape a wide preferences table into long format.

    Every column header of the input becomes a value in the
    'political leaning' column and every cell becomes a row in the
    'statement' column. Rows whose statement is NaN are discarded and the
    result gets a fresh 0..n-1 index.

    Parameters:
        preferences_df (pd.DataFrame): Wide-format input DataFrame.

    Returns:
        pd.DataFrame: Long-format DataFrame with the columns
        'political leaning' and 'statement'.
    """
    return (
        preferences_df
        .melt(
            var_name="political leaning",
            value_name="statement",
            ignore_index=False,
        )
        .dropna(subset=["statement"])   # discard empty cells of the wide table
        .reset_index(drop=True)         # replace the repeated source index
    )

In [None]:
# Read the wide-format policy preferences and reshape them to long format
preferences_df = transform_preferences(
    pd.read_csv('data/05_LLM_data/outputs/policy_options/policy_preferences.csv')
)

## Statements DF

In [None]:
def transform_statements(input_csv, political_leaning):
    """
    Load a wide CSV of statements and reshape it into long format.

    Each column header of the CSV is expected to contain a number; that
    number is stored in 'similarity score id' for every statement taken
    from the column. Rows whose statement is NaN are discarded.

    Parameters:
        input_csv (str): Path to the input CSV file.
        political_leaning (str): A single political leaning label for all rows.

    Returns:
        pd.DataFrame: Long-format DataFrame with the columns 'statement',
        'similarity score id' and 'political leaning'.
    """
    wide_df = pd.read_csv(input_csv)

    # One row per (column header, cell) pair
    long_df = wide_df.melt(
        var_name="similarity_score",
        value_name="statement",
        ignore_index=False,
    )

    # First run of digits in the original column header becomes the id
    score_ids = long_df["similarity_score"].str.extract(r'(\d+)').astype(int)
    long_df["similarity score id"] = score_ids

    # Same label on every row of this file
    long_df["political leaning"] = political_leaning

    return (
        long_df
        .dropna(subset=["statement"])          # discard empty cells
        .drop(columns=["similarity_score"])    # header text is no longer needed
        .reset_index(drop=True)                # clean 0..n-1 index
    )

In [None]:
# Political leanings and, in the same order, the CSV holding each leaning's statements
# NOTE(review): these paths start with 'data/LLM_data/' while the preferences CSV is
# read from 'data/05_LLM_data/' — confirm which directory name is correct.
political_leanings = ["left", "left-liberal", "centrist", "right-liberal", "right"]
file_paths = [
    'data/LLM_data/outputs/consideration_statements/left_consideration_statements.csv',
    'data/LLM_data/outputs/consideration_statements/left-liberal_consideration_statements.csv',
    'data/LLM_data/outputs/consideration_statements/centrist_consideration_statements.csv',
    'data/LLM_data/outputs/consideration_statements/right-liberal_consideration_statements.csv',
    'data/LLM_data/outputs/consideration_statements/right_consideration_statements.csv'
]

# One long-format DataFrame per leaning, built pairwise from the two lists above
all_statements = [
    transform_statements(file_path, political_leaning)
    for political_leaning, file_path in zip(political_leanings, file_paths)
]

# Stack the per-leaning frames into a single DataFrame with a fresh index
statements_df = pd.concat(all_statements, ignore_index=True)

# Embedding

In [None]:
# Function to embed statements
def embed_statements(df, model, column_name='statement'):
    """
    Embeds the text in the specified column using the given model.

    The DataFrame is modified in place: the text column is normalised to
    strings (missing values become empty strings) and an 'embedding' column
    holding one list of floats per row is added.

    Parameters:
        df (pd.DataFrame): The input DataFrame containing the text data.
        model (SentenceTransformer): The preloaded SentenceTransformer model;
            only its ``encode`` method is used.
        column_name (str): The column containing the text to embed.

    Returns:
        pd.DataFrame: The same DataFrame object, with the added 'embedding' column.
    """
    print(f"Embedding {len(df)} statements from column '{column_name}'...")

    # Replace missing values BEFORE casting to str. Casting first would turn
    # NaN/None into the literal texts "nan"/"None", which would then be embedded
    # as if they were real statements. The object cast keeps fillna('') valid
    # for non-string columns as well.
    df[column_name] = df[column_name].astype(object).fillna('').astype(str)

    # Encode the text column into embeddings
    embeddings = model.encode(df[column_name].tolist(), batch_size=32, convert_to_tensor=True)
    # Store plain Python lists so each cell holds one embedding vector
    df['embedding'] = embeddings.tolist()

    print("Embedding completed for this DataFrame.")
    return df

# Model name and initialization.
# The model is loaded once and shared by all three embedding calls below.
model_name = 'paraphrase-multilingual-MiniLM-L12-v2'
print("Loading embeddings model...")
model = SentenceTransformer(model_name, device='cpu')  # Use 'cuda' if GPU is available

# Embed statements in each DataFrame.
# Each call uses the default 'statement' column and rebinds the name to the
# returned frame, which carries the new 'embedding' column.
preferences_df = embed_statements(preferences_df, model)
statements_df = embed_statements(statements_df, model)
validation_df = embed_statements(validation_df, model)

print("Embedding process completed for all DataFrames.")

# Similarity Scoring

In [None]:
# Function to calculate similarity scores between two sets of embeddings
def calculate_similarity(source_df, target_df, source_column, target_column):
    """
    Compute cosine similarities between every embedding in
    source_df[source_column] and every embedding in target_df[target_column].

    Parameters:
        source_df (pd.DataFrame): DataFrame containing source embeddings.
        target_df (pd.DataFrame): DataFrame containing target embeddings.
        source_column (str): Column name in source_df with embeddings.
        target_column (str): Column name in target_df with embeddings.

    Returns:
        The result of ``util.cos_sim`` on the two embedding tensors
        (presumably a torch.Tensor with one row per source embedding and one
        column per target embedding — confirm against the library docs).
    """
    print(f"Calculating similarity between {len(source_df)} source embeddings and {len(target_df)} target embeddings...")

    def _column_as_tensor(frame, column):
        # Embeddings are stored as per-row lists; stack them into one tensor
        return torch.tensor(frame[column].tolist())

    return util.cos_sim(
        _column_as_tensor(source_df, source_column),
        _column_as_tensor(target_df, target_column),
    )


# Split the validation set by category number: category 9 (the Q90_* items)
# goes to validation_preferences, every lower category to validation_statements
_category_numbers = validation_df['category #']
validation_preferences = validation_df[_category_numbers == 9]
validation_statements = validation_df[_category_numbers < 9]

# Cosine similarities: generated preferences vs. validation preference items
similarity_prefs_validation = calculate_similarity(
    preferences_df,
    validation_preferences,
    source_column='embedding',
    target_column='embedding',
)

# Cosine similarities: generated statements vs. validation statements
similarity_statements_validation = calculate_similarity(
    statements_df,
    validation_statements,
    source_column='embedding',
    target_column='embedding',
)


In [None]:
def rank_by_similarity(single_validation, source_df, source_column='embedding', target_column='embedding', text_column='statement', extra_validation_columns=None, extra_source_columns=None):
    """
    Ranks all elements of source_df by cosine similarity to a single validation row.

    Prints the validation statement, then scores every row of source_df against
    the validation embedding and sorts the rows from most to least similar.

    Parameters:
        single_validation (pd.Series): A single row from validation_df with the target embedding.
        source_df (pd.DataFrame): DataFrame containing source embeddings to rank.
        source_column (str): Column name in source_df with embeddings.
        target_column (str): Column name in single_validation containing the embedding.
        text_column (str): Column name in source_df and single_validation containing the statements.
        extra_validation_columns (list | None): Additional columns from single_validation
            to return; None (the default) means no extra columns.
        extra_source_columns (list | None): Additional columns from source_df to copy
            into the result; None (the default) means no extra columns.

    Returns:
        tuple: ``(validation_statement, validation_data, result_df)`` where
            validation_statement is the text of the validation row,
            validation_data is a dict of the requested extra validation columns
            that exist on the row, and
            result_df is a pd.DataFrame with 'source_index', 'source_statement',
            'similarity_score' and any requested extra source columns, sorted by
            similarity in descending order with a fresh 0..n-1 index.
    """
    print("Calculating similarity for a single validation element...")

    # Extract the validation statement
    validation_statement = single_validation[text_column]
    print(f"Validation Statement: {validation_statement}\n")

    # Extract additional validation columns if provided. The None default must
    # behave like "no extra columns" instead of raising TypeError on iteration.
    validation_data = {
        col: single_validation[col]
        for col in (extra_validation_columns or [])
        if col in single_validation
    }

    # Convert the single validation embedding to a tensor
    target_embedding = torch.tensor(single_validation[target_column])

    # Convert source embeddings to tensors
    source_embeddings = torch.tensor(source_df[source_column].tolist())

    # Compute cosine similarity; unsqueeze turns the single vector into a 1-row batch
    similarity_scores = util.cos_sim(target_embedding.unsqueeze(0), source_embeddings)

    # Flatten similarity scores and create a DataFrame
    result_df = pd.DataFrame({
        'source_index': source_df.index.tolist(),  # Use the index as the identifier
        'source_statement': source_df[text_column].tolist(),  # Include the source statements
        'similarity_score': similarity_scores.squeeze(0).tolist()
    })

    # Add additional source columns if provided; names missing from source_df are skipped
    for col in (extra_source_columns or []):
        if col in source_df:
            result_df[col] = source_df[col].tolist()

    # Sort by similarity score in descending order
    result_df = result_df.sort_values(by='similarity_score', ascending=False).reset_index(drop=True)

    print("Ranking completed.")
    return validation_statement, validation_data, result_df


def create_similarity_summary(validation_df, source_df, top_n=5, source_column='embedding', target_column='embedding', text_column='statement', extra_validation_columns=None, extra_source_columns=None):
    """
    Creates a summary DataFrame containing the top N closest statements and their metadata for each validation statement.

    For every row of validation_df the source rows are ranked with
    rank_by_similarity, and the best top_n matches are flattened into a single
    summary row.

    Parameters:
        validation_df (pd.DataFrame): DataFrame containing validation statements and embeddings.
        source_df (pd.DataFrame): DataFrame containing source statements and embeddings.
        top_n (int): Number of closest statements to include.
        source_column (str): Column name in source_df with embeddings.
        target_column (str): Column name in validation_df with embeddings.
        text_column (str): Column name in source_df and validation_df containing the statements.
        extra_validation_columns (list | None): Additional columns from validation_df to read;
            'Qid' must be listed here for the 'Qid' output column to be filled.
        extra_source_columns (list | None): Additional columns from source_df to include in the result.

    Returns:
        pd.DataFrame: One row per validation statement with 'Qid',
        'validation_statement' and, for each rank n in 1..top_n, the columns
        '<n>st_closest_statement', '<n>st_closest_category' (the source row's
        'similarity score id', or None when absent) and
        '<n>st_closest_political_leaning' (or None when absent).
    """
    # Normalise the optional column lists so a None default never reaches the
    # ranking helper as a non-iterable.
    extra_validation_columns = list(extra_validation_columns or [])
    extra_source_columns = list(extra_source_columns or [])

    summary_data = []

    for _, single_validation in validation_df.iterrows():
        validation_statement, validation_data, ranked_statements = rank_by_similarity(
            single_validation,
            source_df,
            source_column=source_column,
            target_column=target_column,
            text_column=text_column,
            extra_validation_columns=extra_validation_columns,
            extra_source_columns=extra_source_columns
        )

        summary_row = {
            'Qid': validation_data.get('Qid', None),
            'validation_statement': validation_statement
        }

        # Number the matches by their position in the ranking rather than by
        # whatever index labels the ranked frame carries.
        # Column names keep the '<n>st_' prefix for every rank (e.g.
        # '2st_closest_statement') so the output schema is unchanged.
        top_results = ranked_statements.head(top_n)
        for rank, (_, row) in enumerate(top_results.iterrows(), start=1):
            summary_row[f'{rank}st_closest_statement'] = row['source_statement']
            summary_row[f'{rank}st_closest_category'] = row.get('similarity score id', None)
            summary_row[f'{rank}st_closest_political_leaning'] = row.get('political leaning', None)

        summary_data.append(summary_row)

    summary_df = pd.DataFrame(summary_data)
    return summary_df



# Generate summary DataFrame for the validation statements against statements_df.
# Rows are selected by category number instead of by position: categories 1-7
# contain 42 statements, so the former positional slice `validation_df[:41]`
# silently left out the last one (Q70_5).
statements_similarity_summary = create_similarity_summary(
    validation_df[validation_df['category #'] < 9],
    statements_df,
    top_n=5,
    source_column='embedding',
    target_column='embedding',
    text_column='statement',
    extra_validation_columns=['category #', 'Qid'],
    extra_source_columns=['similarity score id', 'political leaning']
)

# Display the summary
print("Similarity Summary:")
statements_similarity_summary

# Saving

In [None]:
# Generate summary DataFrame for the validation preference items against preferences_df.
# Rows are selected by category number (9 = the Q90_* preference items) instead
# of by position: the former slice `validation_df[41:]` also swept in Q70_5,
# which is a category-7 statement rather than a preference item.
# preferences_df has no 'similarity score id' column; that extra source column
# is skipped and the corresponding summary fields stay empty.

preferences_similarity_summary = create_similarity_summary(
    validation_df[validation_df['category #'] == 9],
    preferences_df,
    top_n=5,
    source_column='embedding',
    target_column='embedding',
    text_column='statement',
    extra_validation_columns=['category #', 'Qid'],
    extra_source_columns=['similarity score id', 'political leaning']
)

In [None]:
# Write both summaries to CSV without the DataFrame index column
for _summary_df, _csv_path in (
    (statements_similarity_summary, 'data/06_comparison/statements_similarity_summary.csv'),
    (preferences_similarity_summary, 'data/06_comparison/preferences_similarity_summary.csv'),
):
    _summary_df.to_csv(_csv_path, index=False)