In [1]:
import pandas as pd
from collections import Counter
import os
from google.colab import files

def perform_majority_voting(file_paths, ensemble_model_name='xlm_large+mdeb-v3+murili_base'):
    """
    Performs majority voting on predictions from multiple TSV files.

    Every readable file contributes one vote per row: the value in its
    'label' column is recorded for the value in its 'id' column. For each id
    the label with the most votes wins. Ties go to the label that was seen
    first, so the order of ``file_paths`` decides ties.

    Files that do not exist, lack an 'id' or 'label' column, or raise while
    being read are reported with a printed message and skipped; they never
    abort the whole run.

    Args:
        file_paths (list): A list of file paths (filenames) to the TSV files.
        ensemble_model_name (str): Value written to the 'model' column of
            every output row. Defaults to the original ensemble name.

    Returns:
        pandas.DataFrame or None: A DataFrame with the columns
        ('id', 'label', 'model'), one row per unique id in first-seen order.
        The frame is empty (but still has those columns) when no file
        contributed any rows. Returns None when ``file_paths`` is empty.
    """
    output_columns = ['id', 'label', 'model']

    if not file_paths:
        print("Error: No file paths provided.")
        return None

    # Maps each id to the list of labels voted for it, in file order.
    all_labels = {}

    print(f"Reading {len(file_paths)} files and collecting labels...")
    for file_path in file_paths:
        if not os.path.exists(file_path):
            print(f"Warning: The file {file_path} was not found. Please ensure it was uploaded correctly.")
            continue

        try:
            df = pd.read_csv(file_path, sep='\t')

            if 'id' not in df.columns or 'label' not in df.columns:
                print(f"Warning: Skipping {file_path}. It does not contain 'id' and 'label' columns.")
                continue

            # Missing labels are voted as the string 'None'. Depending on the
            # pandas version, read_csv may itself parse the literal text
            # "None" as missing, so this also restores that class name.
            labels = df['label'].fillna('None')

            # Iterate column-wise rather than with iterrows(): iterrows builds
            # a Series per row, which is slow and can upcast an integer id to
            # float when the other columns are numeric.
            for id_val, label in zip(df['id'], labels):
                all_labels.setdefault(id_val, []).append(label)

        except Exception as e:
            # Best effort: a single unreadable file must not lose the others.
            print(f"An error occurred while processing {file_path}: {e}")

    print("Performing majority voting...")
    # most_common(1) returns the first-encountered label among equal counts.
    voted_predictions = [
        {
            'id': id_val,
            'label': Counter(labels).most_common(1)[0][0],
            'model': ensemble_model_name,
        }
        for id_val, labels in all_labels.items()
    ]

    # Passing columns explicitly keeps the schema stable even with zero rows.
    return pd.DataFrame(voted_predictions, columns=output_columns)


if __name__ == '__main__':
    # Prediction files to ensemble. The order matters: when the models
    # disagree with no majority, the label from the earliest file listed
    # here is the one that wins.
    input_files = [
        'xlm_large.tsv',
        'mdeb.tsv',
        'indic.tsv'
    ]

    # Output file name
    output_file = 'subtask_1B.tsv'

    # --- STEP 1: UPLOAD FILES ---
    # The prompt is built from input_files so it always names the files the
    # voting step actually reads.
    print(f"Please upload your prediction files ({', '.join(input_files)}).")
    try:
        # Opens the Colab upload widget; the result is kept as a notebook
        # global in case a later cell needs it.
        uploaded = files.upload()
        print("Files uploaded successfully!")
    except Exception as e:
        # Not fatal: the files may already be present from an earlier run,
        # and the voting step reports any that are missing.
        print(f"File upload failed: {e}")

    # --- STEP 2: PERFORM VOTING ---
    final_results = perform_majority_voting(input_files)

    # Write the TSV only when voting produced at least one row.
    if final_results is not None and not final_results.empty:
        final_results.to_csv(output_file, sep='\t', index=False)
        print(f"\nVoting complete! Results saved to '{output_file}'.")
        print("Here are the first 5 voted predictions:")
        print(final_results.head())
    else:
        print("\nNo results to save. Please check your input files.")

Please upload your prediction files (subtask_1B1.tsv, subtask_1B_2.tsv, subtask_1B_3.tsv).


Saving indic.tsv to indic.tsv
Saving mdeb.tsv to mdeb.tsv
Saving xlm_large.tsv to xlm_large.tsv
Files uploaded successfully!
Reading 3 files and collecting labels...
Performing majority voting...

Voting complete! Results saved to 'subtask_1B.tsv'.
Here are the first 5 voted predictions:
       id       label                          model
0   12764     Society  xlm_large+mdeb-v3+murili_base
1  202933        None  xlm_large+mdeb-v3+murili_base
2  165894   Community  xlm_large+mdeb-v3+murili_base
3  124999   Community  xlm_large+mdeb-v3+murili_base
4  535301  Individual  xlm_large+mdeb-v3+murili_base


In [2]:
# Shell command: add the voted predictions file subtask_1B.tsv to the archive subtask_1B.zip.
!zip subtask_1B.zip subtask_1B.tsv

  adding: subtask_1B.tsv (deflated 89%)
