Author: Naomi Baes
Aim: Compute change scores (injection ratio 100 minus injection ratio 0) for bootstrap sampling, and rate-of-change (ROC) scores (100 minus 0) for stratified random sampling.

# Bootstrap sampling (all-year)

In [63]:
import pandas as pd
import numpy as np

def load_data(filepath):
    """Read a CSV file into a DataFrame, or return None if the file is missing.

    Rows in which every cell is empty are discarded. If the parsed frame has
    MultiIndex columns, each tuple label is joined with underscores into a
    single string. A missing file is reported on stdout instead of raising.
    """
    try:
        frame = pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"File not found: {filepath}")
        return None

    # Remove rows that carry no data at all.
    frame = frame.dropna(how='all')

    # Collapse tuple column labels into plain strings when present.
    if isinstance(frame.columns, pd.MultiIndex):
        flat_names = []
        for label in frame.columns:
            if isinstance(label, tuple):
                label = '_'.join(label).strip()
            flat_names.append(label)
        frame.columns = flat_names

    return frame

def calculate_diff(df, value_columns, dimension, measure_tag="", measure_type=""):
    """
    Compute, per term, the value at injection ratio 100 minus the value at ratio 0.

    Args:
        df: Long-format DataFrame (or None). It must have an 'injection_ratio'
            column (an 'inj_ratio' column is accepted and renamed) and a
            'term' column, or otherwise a 'target' column, identifying each row.
        value_columns: Names of the numeric columns to difference.
        dimension: Value written to the 'dimension' column of every output row.
        measure_tag: Kept for backward compatibility. It only ever named an
            intermediate column and does not affect the returned frame.
        measure_type: Value written to the 'measure' column of every output row.

    Returns:
        A DataFrame with flat columns 'target', 'change_score', 'condition',
        'dimension' and 'measure', holding one row per term for each entry of
        value_columns (stacked in that order). An empty DataFrame is returned,
        after printing a message, when the input is None, lacks an injection
        ratio column, or does not contain both ratios 0 and 100.

    'condition' is "Increase" when change_score > 0 and "Decrease" otherwise,
    so zero and missing (NaN) differences are labelled "Decrease".
    """
    if df is None:
        print("Dataframe is empty, skipping processing.")
        return pd.DataFrame()

    if 'inj_ratio' in df.columns:
        df = df.rename(columns={'inj_ratio': 'injection_ratio'})

    if 'injection_ratio' not in df.columns:
        print("Missing 'injection_ratio' column, skipping processing.")
        return pd.DataFrame()

    df_filtered = df[df['injection_ratio'].isin([0, 100])]
    if df_filtered.empty:
        print("No relevant injection ratios found, skipping processing.")
        return pd.DataFrame()

    # Both endpoints are needed to form a difference; without this check the
    # column lookups below would fail with a KeyError.
    ratios = df_filtered['injection_ratio']
    if not ((ratios == 0).any() and (ratios == 100).any()):
        print("Both injection ratios 0 and 100 are required, skipping processing.")
        return pd.DataFrame()

    index_col = 'term' if 'term' in df.columns else 'target'
    # Passing a list as `values` makes the pivoted columns a MultiIndex of
    # (value column, injection ratio); the row index holds the terms.
    pivoted = df_filtered.pivot(index=index_col, columns='injection_ratio', values=value_columns)

    results = []
    for col in value_columns:
        change = pivoted[(col, 100)] - pivoted[(col, 0)]

        # Build the result with plain string column labels. Slicing the pivoted
        # frame instead would keep tuple labels, which do not line up with
        # flat-column frames when concatenated later.
        result = pd.DataFrame({
            'target': pivoted.index.to_numpy(),
            'change_score': change.to_numpy(),
        })
        result['condition'] = np.where(result['change_score'] > 0, "Increase", "Decrease")
        result['dimension'] = dimension
        result['measure'] = measure_type

        results.append(result)

    return pd.concat(results, ignore_index=True) if results else pd.DataFrame()

# Load the all-year input tables. load_data returns None for a missing file.
df_valence = load_data("../1_sentiment/output/baseline_averaged_valence_index_all-year_normalized.csv")
df_arousal = load_data("../3_intensity/output/baseline_averaged_arousal_index_all-year_normalized.csv")
df_cosine = load_data("../2_breadth/output/baseline_final_combined.all-year.cds_mpnet.csv")
df_valence_absa = load_data("../1_sentiment/output/absa_averaged_sentiment_index_all-year_with_se.csv")
df_cosine_lexeme = load_data("../2_breadth/output/baseline_final_combined.all-year.cds_lexeme.csv")
df_xl_lexeme = load_data("../xl_lexeme_results.csv")

# Compute the 100-minus-0 change scores for each table.
valence_diff = calculate_diff(df_valence, ['avg_valence_index_negative', 'avg_valence_index_positive'], "Sentiment", "", "SIB")
arousal_diff = calculate_diff(df_arousal, ['avg_arousal_index_high', 'avg_arousal_index_low'], "Intensity", "", "SIB")
cosine_diff = calculate_diff(df_cosine, ['cosine_dissim_mean'], "Breadth", "", "SIB")
valence_absa_diff = calculate_diff(df_valence_absa, ['avg_valence_index_negative', 'avg_valence_index_positive'], "Sentiment", "_absa", "absa")
cosine_lexeme_diff = calculate_diff(df_cosine_lexeme, ['cosine_dissim_mean'], "Breadth", "_lexeme", "Breadth (lexeme)")

# The xl_lexeme table already carries a precomputed '100_0' column, so it is
# reshaped to the common layout instead of going through calculate_diff.
if df_xl_lexeme is not None:
    df_xl_lexeme = df_xl_lexeme[['target', 'dimension', 'condition', '100_0']].copy()
    df_xl_lexeme.rename(columns={'100_0': 'change_score'}, inplace=True)

    # Normalise casing so dimension labels match the other frames.
    df_xl_lexeme['dimension'] = df_xl_lexeme['dimension'].str.capitalize()

    # Translate condition labels to the Increase/Decrease/Neutral vocabulary;
    # values not listed here are left unchanged.
    df_xl_lexeme['condition'] = df_xl_lexeme['condition'].replace({
        'positive': 'Increase',
        'negative': 'Decrease',
        'high': 'Increase',
        'low': 'Decrease',
        'neutral': 'Neutral'
    })

    # (A former step here assigned 'condition' to itself for Breadth and
    # Intensity rows; it changed nothing and has been removed.)

    df_xl_lexeme['measure'] = 'LSC'

    # Match the column order of the calculate_diff outputs.
    df_xl_lexeme = df_xl_lexeme[['target', 'change_score', 'condition', 'dimension', 'measure']]

# Combine all results, explicitly leaving out the LSC table if its file was missing.
frames_to_combine = [
    frame
    for frame in (
        valence_diff,
        arousal_diff,
        cosine_diff,
        valence_absa_diff,
        cosine_lexeme_diff,
        df_xl_lexeme,
    )
    if frame is not None
]
final_results = pd.concat(frames_to_combine, ignore_index=True)

# Keep only the first occurrence of any repeated column label.
final_results = final_results.loc[:, ~final_results.columns.duplicated()].copy()

# Write the combined table to disk.
final_results.to_csv("final_change_scores_all-year.csv", index=False)
print("All differences calculated and saved to final_change_scores_all-year.csv")

All differences calculated and saved to final_change_scores_all-year.csv


# Fix misaligned rows and target labels

In [69]:
import pandas as pd

# Read every cell as a string so shifted values are not coerced to a numeric
# dtype before they are moved into the right column.
df = pd.read_csv("final_change_scores_all-year.csv", dtype=str)

# Column layout the cleaned file must end up with.
expected_columns = ["target", "change_score", "condition", "dimension", "measure"]

# Rows with more than two missing cells are treated as misaligned.
misaligned_rows = df[df.isna().sum(axis=1) > 2].index

# For each such row, gather its non-missing values in column order and, only
# when there are exactly as many as expected columns, write them into those columns.
for idx in misaligned_rows:
    row_values = df.loc[idx].dropna().tolist()
    if len(row_values) == len(expected_columns):
        df.loc[idx, expected_columns] = row_values

# Keep just the expected columns; copy so the assignments below act on an
# independent frame rather than a slice of the original.
df = df[expected_columns].copy()

# Expand the shortened target names.
df["target"] = df["target"].replace({
    "health": "mental_health",
    "illness": "mental_illness"
})

# Convert scores to numbers; unparseable entries become NaN.
df["change_score"] = pd.to_numeric(df["change_score"], errors="coerce")

# Force 'decrease' rows to carry a non-positive score. Using -abs() rather
# than multiplying by -1 keeps already-negative scores negative, and makes this
# cell safe to re-run even though it overwrites the file it reads.
is_decrease = df["condition"].str.lower() == "decrease"
df.loc[is_decrease, "change_score"] = -df.loc[is_decrease, "change_score"].abs()

# Save the cleaned DataFrame over the input file.
df.to_csv("final_change_scores_all-year.csv", index=False, encoding="utf-8")
print("✅ Misaligned rows fixed, target names corrected, and negative values added to 'decrease' condition. Saved to 'final_change_scores_all-year.csv'")


✅ Misaligned rows fixed, target names corrected, and negative values added to 'decrease' condition. Saved to 'final_change_scores_all-year.csv'


# Stratified Random Sampling (5-year)

## Get rate of change score for SIB measures

## Get rate of change score for SIB measures

## Get rate of change score for SIB measures