Author: Naomi Baes
Aim: Compute change scores (injection ratio 100 minus 0) for bootstrap sampling, and rate-of-change (ROC) scores (100 minus 0) for stratified random sampling.

# Bootstrap sampling (all-year)

## raw

In [113]:
import pandas as pd
import numpy as np

def load_data(filepath):
    """Read a CSV into a DataFrame and discard rows that are entirely empty.

    If the parsed frame carries MultiIndex columns, each tuple label is
    collapsed into a single underscore-joined string.

    Returns the cleaned DataFrame, or None (after printing a notice) when
    *filepath* does not exist.
    """
    try:
        frame = pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"File not found: {filepath}")
        return None

    # Discard rows in which every cell is missing
    frame = frame.dropna(how='all')

    # Collapse tuple column labels into plain strings when present
    if isinstance(frame.columns, pd.MultiIndex):
        frame.columns = [
            '_'.join(label).strip() if isinstance(label, tuple) else label
            for label in frame.columns
        ]

    return frame

def calculate_diff(df, value_columns, dimension, measure_tag="", measure_type=""):
    """
    Compute the change score (value at injection_ratio 100 minus value at 0)
    for each requested value column and label its direction.

    Parameters
    ----------
    df : pandas.DataFrame or None
        Long-format data with one row per target and injection ratio. The
        target identifier is read from 'term' when that column exists,
        otherwise from 'target'. The ratio is read from 'injection_ratio'
        ('inj_ratio' is accepted as an alias).
    value_columns : list of str
        Columns for which the 100 - 0 difference is computed.
    dimension : str
        Label written to the 'dimension' column of the result.
    measure_tag : str, optional
        Kept for backward compatibility. It only ever named an intermediate
        column and has no effect on the returned frame.
    measure_type : str, optional
        Label written to the 'measure' column of the result.

    Returns
    -------
    pandas.DataFrame
        Single-level columns 'target', 'change_score', 'condition',
        'dimension', 'measure', with one row per target per value column.
        'condition' is "Increase" when the change score is > 0 and
        "Decrease" otherwise. An empty DataFrame is returned (after printing
        the reason) when the input is None, has no injection-ratio column,
        or lacks rows for ratio 0 or ratio 100.
    """
    if df is None:
        print("Dataframe is empty, skipping processing.")
        return pd.DataFrame()

    if 'inj_ratio' in df.columns:
        df = df.rename(columns={'inj_ratio': 'injection_ratio'})

    if 'injection_ratio' not in df.columns:
        print("Missing 'injection_ratio' column, skipping processing.")
        return pd.DataFrame()

    df_filtered = df[df['injection_ratio'].isin([0, 100])]
    if df_filtered.empty:
        print("No relevant injection ratios found, skipping processing.")
        return pd.DataFrame()

    # Both endpoints are needed for a difference; without this guard the
    # column lookup below would fail with a KeyError.
    present_ratios = set(df_filtered['injection_ratio'].unique())
    if not {0, 100}.issubset(present_ratios):
        print("Both injection ratios 0 and 100 are required, skipping processing.")
        return pd.DataFrame()

    index_col = 'term' if 'term' in df.columns else 'target'
    # Columns of the pivot are (value_column, injection_ratio) pairs
    df_pivoted = df_filtered.pivot(index=index_col, columns='injection_ratio', values=value_columns)

    results = []
    for col in value_columns:
        change = df_pivoted[(col, 100)] - df_pivoted[(col, 0)]

        # Build a fresh frame with plain string column labels. Slicing the
        # pivoted frame instead would keep its two-level column index, which
        # does not line up with flat-column frames when concatenated.
        temp_df = pd.DataFrame({
            'target': change.index.to_numpy(),
            'change_score': change.to_numpy(),
        })

        # Direction of change: strictly positive is an increase
        temp_df['condition'] = np.where(temp_df['change_score'] > 0, "Increase", "Decrease")
        temp_df['dimension'] = dimension
        temp_df['measure'] = measure_type

        results.append(temp_df)

    return pd.concat(results, ignore_index=True) if results else pd.DataFrame()

# **Load datasets**
df_valence = load_data("../1_sentiment/output/baseline_averaged_valence_index_all-year_normalized.csv")
df_arousal = load_data("../3_intensity/output/baseline_averaged_arousal_index_all-year_normalized.csv")
df_cosine = load_data("../2_breadth/output/baseline_final_combined.all-year.cds_mpnet.csv")
df_valence_absa = load_data("../1_sentiment/output/absa_averaged_sentiment_index_all-year_with_se.csv")
df_cosine_lexeme = load_data("../2_breadth/output/baseline_final_combined.all-year.cds_lexeme.csv")
df_xl_lexeme = load_data("../xl_lexeme_results.csv")  # New dataset

# **Calculate differences for original datasets**
valence_diff = calculate_diff(df_valence, ['avg_valence_index_negative', 'avg_valence_index_positive'], "Sentiment", "", "SIB")
arousal_diff = calculate_diff(df_arousal, ['avg_arousal_index_high', 'avg_arousal_index_low'], "Intensity", "", "SIB")
cosine_diff = calculate_diff(df_cosine, ['cosine_dissim_mean'], "Breadth", "", "SIB")
valence_absa_diff = calculate_diff(df_valence_absa, ['avg_valence_index_negative', 'avg_valence_index_positive'], "Sentiment", "_absa", "absa")
cosine_lexeme_diff = calculate_diff(df_cosine_lexeme, ['cosine_dissim_mean'], "Breadth", "_lexeme", "Breadth (lexeme)")

# **Process xl_lexeme_results.csv to extract 100_0 column**
if df_xl_lexeme is not None:
    df_xl_lexeme = df_xl_lexeme[['target', 'dimension', 'condition', '100_0']].copy()
    df_xl_lexeme.rename(columns={'100_0': 'change_score'}, inplace=True)

    # **Fix inconsistent casing for 'dimension' values**
    df_xl_lexeme['dimension'] = df_xl_lexeme['dimension'].str.capitalize()

    # **Map polarity/level labels onto the Increase/Decrease/Neutral vocabulary**
    # Values not listed in the mapping pass through unchanged, so no further
    # per-dimension handling of 'condition' is needed.
    df_xl_lexeme['condition'] = df_xl_lexeme['condition'].replace({
        'positive': 'Increase',
        'negative': 'Decrease',
        'high': 'Increase',
        'low': 'Decrease',
        'neutral': 'Neutral'
    })

    # **Add measure label**
    df_xl_lexeme['measure'] = 'LSC'

    # **Ensure proper column ordering**
    df_xl_lexeme = df_xl_lexeme[['target', 'change_score', 'condition', 'dimension', 'measure']]

# **Combine all results into a single DataFrame**
# Inputs that failed to load (None) or produced no rows are skipped, so they
# contribute nothing to the combined table.
result_frames = [
    frame
    for frame in [valence_diff, arousal_diff, cosine_diff, valence_absa_diff, cosine_lexeme_diff, df_xl_lexeme]
    if frame is not None and not frame.empty
]
if result_frames:
    final_results = pd.concat(result_frames, ignore_index=True)
else:
    # Nothing to combine: keep the expected header so the CSV stays readable
    final_results = pd.DataFrame(columns=['target', 'change_score', 'condition', 'dimension', 'measure'])

# **Drop duplicate columns & ensure uniform headers**
final_results = final_results.loc[:, ~final_results.columns.duplicated()].copy()

# **Save the final DataFrame to a CSV file**
final_results.to_csv("final_change_scores_all-year.csv", index=False)
print("All differences calculated and saved to final_change_scores_all-year.csv")

All differences calculated and saved to final_change_scores_all-year.csv


### fix misalignment and labels

In [114]:
import pandas as pd

# Post-process the CSV written by the previous cell: realign shifted rows,
# rename two targets, make change_score numeric, and sign the LSC decreases.

# Load the misaligned dataset (dtype=str reads every cell as text,
# presumably so values can be moved between columns without type coercion)
df = pd.read_csv("final_change_scores_all-year.csv", dtype=str)

# Column layout every row should end up with
expected_columns = ["target", "change_score", "condition", "dimension", "measure"]

# Treat rows with more than two missing cells as misaligned
misaligned_rows = df[df.isna().sum(axis=1) > 2].index

# Realign each flagged row by compacting its non-missing values into the expected columns
for idx in misaligned_rows:
    row_values = df.loc[idx].dropna().tolist()  # Remove NaNs and convert to list
    if len(row_values) == len(expected_columns):  # Only realign when exactly one value per expected column remains
        df.loc[idx, expected_columns] = row_values

# Keep only the expected columns, discarding any others
df = df[expected_columns]

# **Fix 'health' and 'illness' in the 'target' column**
df["target"] = df["target"].replace({
    "health": "mental_health",
    "illness": "mental_illness"
})

# **Ensure 'change_score' is numeric** (unparseable values become NaN)
df["change_score"] = pd.to_numeric(df["change_score"], errors="coerce")

# **Flip the sign of 'change_score' for LSC rows whose condition is 'decrease'**
# (case-insensitive match; rows of other measures are left untouched)
df.loc[(df["condition"].str.lower() == "decrease") & (df["measure"].str.lower() == "lsc"), "change_score"] *= -1

# Save the cleaned DataFrame, overwriting the input file
df.to_csv("final_change_scores_all-year.csv", index=False, encoding="utf-8")
print("✅ Misaligned rows fixed, target names corrected, and negative values added to 'decrease' condition. Saved to 'final_change_scores_all-year.csv'")


✅ Misaligned rows fixed, target names corrected, and negative values added to 'decrease' condition. Saved to 'final_change_scores_all-year.csv'


# Stratified Random Sampling (5-year)

## Get rate of change score for SIB measures

In [None]:
# **Load datasets** (all-year inputs, read one after another in the order listed)
(
    df_valence,
    df_arousal,
    df_cosine,
    df_valence_absa,
    df_cosine_lexeme,
    df_xl_lexeme,  # New dataset
) = (
    load_data(csv_path)
    for csv_path in (
        "../1_sentiment/output/baseline_averaged_valence_index_all-year_normalized.csv",
        "../3_intensity/output/baseline_averaged_arousal_index_all-year_normalized.csv",
        "../2_breadth/output/baseline_final_combined.all-year.cds_mpnet.csv",
        "../1_sentiment/output/absa_averaged_sentiment_index_all-year_with_se.csv",
        "../2_breadth/output/baseline_final_combined.all-year.cds_lexeme.csv",
        "../xl_lexeme_results.csv",
    )
)

In [None]:
# Rebind df_valence to the 5-year normalized valence index (None if the file is missing)
df_valence = load_data("../1_sentiment/output/baseline_averaged_valence_index_5-year_normalized.csv")


In [149]:
import pandas as pd

# Input locations for the 5-year valence, arousal and breadth files
file_path_valence = "../1_sentiment/output/baseline_averaged_valence_index_5-year_normalized.csv"
file_path_arousal = "../3_intensity/output/baseline_averaged_arousal_index_5-year_normalized.csv"
file_path_breadth = "../2_breadth/output/baseline_final_combined.5-year.cds_mpnet.csv"

# Read each file into its own frame
df_valence, df_arousal, df_breadth = map(
    pd.read_csv,
    (file_path_valence, file_path_arousal, file_path_breadth),
)

# Derive an integer start year from the text before the first '-' in 'epoch'
for df in (df_valence, df_arousal, df_breadth):
    df['epoch_start'] = df['epoch'].str.split('-').str.get(0).astype(int)

# Give the breadth frame the same column names as the other two
df_breadth.rename(columns={'term': 'target', 'inj_ratio': 'injection_ratio'}, inplace=True)

# Function to compute total change and average rate of change per epoch
def compute_change_metrics(df, metric_high, metric_low, measure, dimension, condition_high, condition_low):
    """Summarise the 100-vs-0 injection change of one or two metrics per target.

    Only rows with injection_ratio 0 or 100 are used. For every
    (target, epoch_start) pair the change is the metric at ratio 100 minus
    the metric at ratio 0; pairs with a missing endpoint are dropped.

    When *metric_high* and *metric_low* differ, the result holds one row per
    target labelled *condition_high* followed by one row per target labelled
    *condition_low*. When they are the same column, only the
    *condition_high* rows are produced.

    Returns a DataFrame with columns 'target', 'total_change' (sum of the
    per-epoch changes), 'n_epochs' (distinct epoch_start values),
    'avg_rate_of_change' (total_change / n_epochs), 'measure', 'dimension'
    and 'condition'.
    """
    paired = metric_high != metric_low
    metrics = [metric_high, metric_low] if paired else [metric_high]

    # Restrict to the two endpoint ratios and spread them into columns
    endpoints = df.loc[df['injection_ratio'].isin([0, 100])]
    wide = endpoints.pivot(index=['target', 'epoch_start'], columns='injection_ratio', values=metrics)

    # Turn (metric, ratio) column pairs into "<metric>_<ratio>" names
    wide.columns = ['_'.join(str(part) for part in label) for label in wide.columns]
    wide = wide.reset_index()

    # An epoch only counts when every endpoint value is present
    wide = wide.dropna(subset=[f'{name}_{ratio}' for name in metrics for ratio in (0, 100)])

    # Number of distinct epochs remaining for each target
    epoch_counts = (
        wide.groupby('target')['epoch_start']
        .nunique()
        .rename('n_epochs')
        .reset_index()
    )

    def summarise(metric, condition):
        # Per-target total and per-epoch average of the 100 - 0 change
        delta = wide[f'{metric}_100'] - wide[f'{metric}_0']
        summary = (
            delta.groupby(wide['target'])
            .sum()
            .rename('total_change')
            .reset_index()
            .merge(epoch_counts, on='target')
        )
        summary['avg_rate_of_change'] = summary['total_change'] / summary['n_epochs']
        summary['measure'] = measure
        summary['dimension'] = dimension
        summary['condition'] = condition
        return summary

    high_summary = summarise(metric_high, condition_high)
    if not paired:
        # Single-metric case (Breadth): only the "high" rows exist
        return high_summary

    low_summary = summarise(metric_low, condition_low)
    return pd.concat([high_summary, low_summary], ignore_index=True)

# Summarise each SIB dimension: valence (Sentiment), arousal (Intensity), cosine dissimilarity (Breadth)
df_valence_final = compute_change_metrics(
    df_valence,
    metric_high='avg_valence_index_positive',
    metric_low='avg_valence_index_negative',
    measure='SIB',
    dimension='Sentiment',
    condition_high='positive',
    condition_low='negative',
)
df_arousal_final = compute_change_metrics(
    df_arousal,
    metric_high='avg_arousal_index_high',
    metric_low='avg_arousal_index_low',
    measure='SIB',
    dimension='Intensity',
    condition_high='high',
    condition_low='low',
)
# Breadth has a single metric, so the same column is passed for both roles
df_breadth_final = compute_change_metrics(
    df_breadth,
    metric_high='cosine_dissim_mean',
    metric_low='cosine_dissim_mean',
    measure='SIB',
    dimension='Breadth',
    condition_high='increase',
    condition_low='increase',
)

# Stack the three summaries into one table
df_final_output = pd.concat(
    [df_valence_final, df_arousal_final, df_breadth_final],
    ignore_index=True,
)

# Write the combined table to disk
output_file_path = "final_change_scores_5-year_SIB.csv"
df_final_output.to_csv(output_file_path, index=False)

# Report where the result was written
print(f"Final output saved to: {output_file_path}")


Final output saved to: final_change_scores_5-year_SIB.csv


In [150]:
import pandas as pd

# Location of the 5-year ABSA sentiment index
file_path_absa = "../1_sentiment/output/absa_averaged_sentiment_index_5-year.csv"  # Change this to the actual file path

# Read the file, then add an integer start year taken from the text before
# the first '-' in 'epoch' (used for sorting)
df_absa = pd.read_csv(file_path_absa)
df_absa = df_absa.assign(
    epoch_start=df_absa['epoch'].str.split('-').str.get(0).astype(int)
)

# Function to compute total change and average rate of change per epoch
def compute_change_metrics(df, metric_high, metric_low, measure, dimension, condition_high, condition_low):
    """Summarise the 100-vs-0 injection change of two metrics per target.

    Only rows with injection_ratio 0 or 100 are used. For every
    (target, epoch_start) pair the change of each metric is its value at
    ratio 100 minus its value at ratio 0; pairs missing any of the four
    endpoint values are dropped.

    Returns a DataFrame with columns 'target', 'total_change' (sum of the
    per-epoch changes), 'n_epochs' (distinct epoch_start values),
    'avg_rate_of_change' (total_change / n_epochs), 'measure', 'dimension'
    and 'condition'. Rows labelled *condition_high* (from *metric_high*)
    come first, followed by rows labelled *condition_low* (from
    *metric_low*).
    """
    # Restrict to the two endpoint ratios and spread them into columns
    endpoints = df.loc[df['injection_ratio'].isin([0, 100])]
    wide = endpoints.pivot(
        index=['target', 'epoch_start'],
        columns='injection_ratio',
        values=[metric_high, metric_low],
    )

    # Turn (metric, ratio) column pairs into "<metric>_<ratio>" names
    wide.columns = ['_'.join(str(part) for part in label) for label in wide.columns]

    # An epoch only counts when all four endpoint values are present
    needed = [f'{name}_{ratio}' for name in (metric_high, metric_low) for ratio in (0, 100)]
    wide = wide.reset_index().dropna(subset=needed)

    # Number of distinct epochs remaining for each target
    epoch_counts = (
        wide.groupby('target')['epoch_start']
        .nunique()
        .rename('n_epochs')
        .reset_index()
    )

    summaries = []
    for metric, condition in ((metric_high, condition_high), (metric_low, condition_low)):
        # Per-target total of the 100 - 0 change for this metric
        delta = wide[f'{metric}_100'] - wide[f'{metric}_0']
        summary = delta.groupby(wide['target']).sum().rename('total_change').reset_index()

        # Attach epoch counts and derive the per-epoch average
        summary = summary.merge(epoch_counts, on='target')
        summary['avg_rate_of_change'] = summary['total_change'] / summary['n_epochs']

        summaries.append(
            summary.assign(measure=measure, dimension=dimension, condition=condition)
        )

    return pd.concat(summaries, ignore_index=True)

# Summarise the ABSA sentiment index (positive vs. negative valence)
df_absa_final = compute_change_metrics(
    df_absa,
    metric_high='avg_valence_index_positive',
    metric_low='avg_valence_index_negative',
    measure='ABSA',
    dimension='Sentiment',
    condition_high='positive',
    condition_low='negative',
)

# Write the summary to disk
output_file_path = "final_change_scores_5-year_ABSA.csv"
df_absa_final.to_csv(output_file_path, index=False)

# Report where the result was written
print(f"Final output saved to: {output_file_path}")


Final output saved to: final_change_scores_5-year_ABSA.csv


In [154]:
import pandas as pd

# Location of the 5-year lexeme-based breadth file
file_path_breadth = "../2_breadth/output/baseline_final_combined.5-year.cds_lexeme.csv"  # Update this to your actual file path

# Read the file, add an integer start year (text before the first '-' in
# 'epoch') for sorting, and rename columns for consistency
df_breadth = (
    pd.read_csv(file_path_breadth)
    .assign(epoch_start=lambda frame: frame['epoch'].str.split('-').str.get(0).astype(int))
    .rename(columns={'term': 'target', 'inj_ratio': 'injection_ratio'})
)

# Function to compute total change and average rate of change per epoch
def compute_change_metrics(df, metric, measure, dimension, condition):
    """Summarise the 100-vs-0 injection change of a single metric per target.

    Only rows with injection_ratio 0 or 100 are used. For every
    (target, epoch_start) pair the change is *metric* at ratio 100 minus
    *metric* at ratio 0; pairs with a missing endpoint are dropped.

    Returns a DataFrame with columns 'target', 'total_change' (sum of the
    per-epoch changes), 'n_epochs' (distinct epoch_start values),
    'avg_rate_of_change' (total_change / n_epochs), 'measure', 'dimension'
    and 'condition'.
    """
    # Restrict to the two endpoint ratios and spread them into columns
    endpoints = df.loc[df['injection_ratio'].isin([0, 100])]
    wide = endpoints.pivot(index=['target', 'epoch_start'], columns='injection_ratio', values=metric)

    # Ratio labels become the string column names '0' and '100'
    wide.columns = list(map(str, wide.columns))

    # An epoch only counts when both endpoint values are present
    wide = wide.reset_index().dropna(subset=['0', '100'])

    # Per-target total of the 100 - 0 change
    delta = wide['100'] - wide['0']
    totals = delta.groupby(wide['target']).sum().rename('total_change').reset_index()

    # Number of distinct epochs remaining for each target
    counts = wide.groupby('target')['epoch_start'].nunique().rename('n_epochs').reset_index()

    summary = totals.merge(counts, on='target')
    summary['avg_rate_of_change'] = summary['total_change'] / summary['n_epochs']

    return summary.assign(measure=measure, dimension=dimension, condition=condition)

# Summarise Breadth from the cosine dissimilarity, labelled with the Lexeme measure
df_breadth_final = compute_change_metrics(
    df_breadth,
    metric='cosine_dissim_mean',
    measure='Lexeme',
    dimension='Breadth',
    condition='increase',
)

# Write the summary to disk
output_file_path = "final_change_scores_5-year_Lexeme.csv"
df_breadth_final.to_csv(output_file_path, index=False)

# Report where the result was written
print(f"Final output saved to: {output_file_path}")

Final output saved to: final_change_scores_5-year_Lexeme.csv


In [160]:
import pandas as pd
import os

# Input CSVs to process. extract_metadata() derives the dimension/condition
# labels from the "breadth" / "negative_sentiment" / "positive_sentiment"
# part of each name.
file_paths = [
    "../xl-lexeme_dissimilarity_5-year_breadth.csv",
    "../xl-lexeme_dissimilarity_5-year_negative_sentiment.csv",
    "../xl-lexeme_dissimilarity_5-year_positive_sentiment.csv"
]

# Extract dimension and condition from file names
def extract_metadata(file_name):
    """Infer the (dimension, condition) labels encoded in a result file's name.

    Only the final path component is inspected, so a directory whose name
    happens to contain one of the keywords cannot cause a wrong label.

    Parameters
    ----------
    file_name : str
        File name or full path of the input CSV.

    Returns
    -------
    tuple
        ("Breadth", "increase"), ("Sentiment", "negative") or
        ("Sentiment", "positive") according to the keyword found in the
        name, or (None, None) when no keyword matches.
    """
    base_name = os.path.basename(file_name)

    if "breadth" in base_name:
        return "Breadth", "increase"
    if "negative_sentiment" in base_name:
        return "Sentiment", "negative"
    if "positive_sentiment" in base_name:
        return "Sentiment", "positive"

    # Fallback case (shouldn't happen if files are named correctly)
    return None, None

# Function to process each file
def process_file(file_path):
    """Summarise one dissimilarity CSV into per-target change metrics.

    The file is read, its dimension/condition labels are taken from
    extract_metadata(file_path), and only rows whose 'injection_level' is
    "0_100" are kept.

    Returns a DataFrame with columns 'target', 'total_change' (sum of
    'avg_dissimilarity'), 'n_epochs' (distinct start years),
    'avg_rate_of_change' (total_change / n_epochs), 'measure' (always
    "LSC"), 'dimension' and 'condition'.
    """
    records = pd.read_csv(file_path)

    # Labels are encoded in the file name
    dimension, condition = extract_metadata(file_path)

    # Integer start year: the text before the first '-' in 'year'
    records['epoch_start'] = records['year'].astype(str).str.split('-').str.get(0).astype(int)

    # Only the 0-vs-100 comparison is summarised
    selected = records.loc[records['injection_level'] == "0_100"]
    per_target = selected.groupby('target')

    # Per-target total dissimilarity and number of distinct epochs
    totals = per_target['avg_dissimilarity'].sum().rename('total_change').reset_index()
    counts = per_target['epoch_start'].nunique().rename('n_epochs').reset_index()

    summary = totals.merge(counts, on='target')
    summary['avg_rate_of_change'] = summary['total_change'] / summary['n_epochs']

    return summary.assign(measure="LSC", dimension=dimension, condition=condition)

# Run every input through process_file and stack the per-file summaries
df_final_output = pd.concat(list(map(process_file, file_paths)), ignore_index=True)

# Write the stacked table to disk
output_file_path = "final_change_scores_5-year_LSC.csv"
df_final_output.to_csv(output_file_path, index=False)

# Report where the result was written
print(f"Final output saved to: {output_file_path}")


Final output saved to: final_change_scores_5-year_LSC.csv


## combine plots

In [161]:
import pandas as pd

# Per-measure result files produced by the cells above
file_paths = [
    "final_change_scores_5-year_ABSA.csv",
    "final_change_scores_5-year_Lexeme.csv",
    "final_change_scores_5-year_LSC.csv",
    "final_change_scores_5-year_SIB.csv"
]

# Read each file and stack the rows into one table
df_combined = pd.concat(list(map(pd.read_csv, file_paths)), ignore_index=True)

# Expand the short target names: 'health' → 'mental_health', 'illness' → 'mental_illness'
df_combined['target'] = df_combined['target'].replace({
    'health': 'mental_health',
    'illness': 'mental_illness'
})

# Flip the sign of both change columns for LSC rows whose condition is 'negative' or 'low'
df_combined.loc[
    df_combined['measure'].eq('LSC') & df_combined['condition'].isin(['negative', 'low']),
    ['total_change', 'avg_rate_of_change'],
] *= -1

# Write the combined table to disk
output_file_path = "final_combined_change_scores_5-year.csv"
df_combined.to_csv(output_file_path, index=False)

# Report where the result was written
print(f"Final combined output saved to: {output_file_path}")

Final combined output saved to: final_combined_change_scores_5-year.csv


# End of script