In [3]:
import pandas as pd
import os
from sklearn.metrics import roc_auc_score
import numpy as np

In [26]:
# Input location: folder holding the per-model inference CSVs (edit to match your setup)
directory = "../data/csv_performance_all_models/"

# Choose one model output to evaluate. Alternatives noted by the original author:
#   labse_inference_test_set_sample_with_country.csv
#   small_labse_inference_test_set_sample_with_country.csv
#   xlmt_inference_test_set_sample_with_country.csv
filename = "xlmt_inference_test_set_sample_with_country.csv"

# Full path to the chosen file; the bare name below echoes it as the cell output
file_path = os.path.join(directory, filename)
file_path

'../data/csv_performance_all_models/xlmt_inference_test_set_sample_with_country.csv'

In [27]:
# Read the selected CSV into a DataFrame.
# on_bad_lines='skip' drops rows the parser cannot handle instead of raising,
# so the loaded frame may contain fewer rows than the file has lines.
df = pd.read_csv(
    file_path,
    encoding='utf-8',
    on_bad_lines='skip',
    engine='python',
)

In [28]:
# Drop every row whose country is the literal placeholder 'Unknown'
df_filtered = df.loc[df['country'].ne('Unknown')]

In [29]:
# Step 2: Accumulate one summary record per country
results = []

# Ground-truth columns and the prediction columns they are scored against,
# listed in matching order so the flattened arrays line up element by element.
label_columns = ['post7geo10_true', 'post7geo30_true', 'post7geo50_true',
                 'pre7geo10_true', 'pre7geo30_true', 'pre7geo50_true']
score_columns = ['post7geo10', 'post7geo30', 'post7geo50',
                 'pre7geo10', 'pre7geo30', 'pre7geo50']

# Step 3: Score each country on its own
for country, group in df_filtered.groupby('country'):
    # Pool all six targets into one long vector (micro-style AUC over every label)
    true_labels = group[label_columns].to_numpy().ravel()
    predictions = group[score_columns].to_numpy().ravel()

    # AUC stays undefined (None) unless both classes are present and
    # neither the labels nor the predictions contain NaN.
    auc = None
    if len(set(true_labels)) > 1:
        if not np.isnan(true_labels).any() and not np.isnan(predictions).any():
            auc = roc_auc_score(true_labels, predictions)

    # num_obs counts rows of the group, not the flattened label count
    results.append(dict(country=country, num_obs=len(group), AUC=auc))

In [30]:
# Step 4: Turn the per-country records into a table (one row per country)
country_auc_df = pd.DataFrame(data=results)
country_auc_df

Unnamed: 0,country,num_obs,AUC
0,Afghanistan,16,0.720899
1,Algeria,4,
2,Brazil,176,0.591264
3,Colombia,27,0.814882
4,Congo-Brazzaville,1,0.222222
5,Democratic Republic of the Congo,2,0.259259
6,Egypt,439,0.614387
7,Honduras,2,0.472222
8,India,5,0.267081
9,Iraq,86,0.561336


In [31]:
# Keep only countries that have a defined (non-NaN) AUC and at least 10 observations.
# Note: the variable name mentions "labse", but the rows come from whichever file
# `filename` selects; the name is kept so the save cell keeps working.
has_auc = country_auc_df['AUC'].notna()
enough_obs = country_auc_df['num_obs'].ge(10)
filtered_country_auc_labse_df = country_auc_df.loc[has_auc & enough_obs]
filtered_country_auc_labse_df

Unnamed: 0,country,num_obs,AUC
0,Afghanistan,16,0.720899
2,Brazil,176,0.591264
3,Colombia,27,0.814882
6,Egypt,439,0.614387
9,Iraq,86,0.561336
10,Israel,43,0.732199
11,Jordan,74,0.808695
12,Kenya,60,0.662178
13,Lebanon,192,0.649516
14,Libya,31,0.550147


In [32]:
# Step 5: Save the filtered per-country summary next to the input file.
# The model tag is derived from `filename` (the text before "_inference_") so that
# pointing the notebook at another model's CSV does not overwrite the xlmt summary.
# For the xlmt input this yields exactly the path that was previously hard-coded.
if "_inference_" in filename:
    model_tag = filename.split("_inference_", 1)[0]
else:
    # Fallback for unexpected names: use the file name without its extension
    model_tag = os.path.splitext(filename)[0]
output_path = os.path.join(directory, f"country_auc_summary_{model_tag}_sample.csv")
filtered_country_auc_labse_df.to_csv(output_path, index=False)