In [None]:
import pandas as pd
from collections import Counter
import os
from google.colab import files

def perform_majority_voting(file_paths, ensemble_model_name='MajorityVotingEnsemble'):
    """
    Performs majority voting on predictions from multiple TSV files.

    Each readable file contributes one vote per row: the row's 'label' is added
    to the list of votes for the row's 'id'. The label with the most votes wins.

    Behaviour worth knowing:
      * Files that do not exist, lack an 'id' or 'label' column, or fail to
        parse are reported with a printed message and skipped; voting then
        proceeds with the remaining files.
      * Missing labels (NaN after parsing) are counted as the string 'None'.
        NOTE(review): depending on the pandas version, the literal text 'None'
        in a file may itself be parsed as NaN; either way it is voted as 'None'.
      * Ties are broken in favour of the label that was seen first, i.e. the
        label coming from the earliest file in `file_paths`.
      * If an id occurs several times in one file, every occurrence counts as
        a separate vote.

    Args:
        file_paths (list): A list of file paths (filenames) to the TSV files.
        ensemble_model_name (str): Value written to the 'model' column of every
            output row. Defaults to 'MajorityVotingEnsemble'.

    Returns:
        pandas.DataFrame | None: A DataFrame with the final voted predictions
        (id, label, model), one row per distinct id in first-seen order.
        Returns None when `file_paths` is empty, and an empty DataFrame when no
        file contributed any rows.
    """
    if not file_paths:
        print("Error: No file paths provided.")
        return None

    # Maps each id to the list of labels voted for it, in file order.
    all_labels = {}

    print(f"Reading {len(file_paths)} files and collecting labels...")
    for file_path in file_paths:
        if not os.path.exists(file_path):
            print(f"Warning: The file {file_path} was not found. Please ensure it was uploaded correctly.")
            continue

        try:
            df = pd.read_csv(file_path, sep='\t')

            if 'id' not in df.columns or 'label' not in df.columns:
                print(f"Warning: Skipping {file_path}. It does not contain 'id' and 'label' columns.")
                continue

            # Blank label cells are read as NaN; vote them as the string 'None'.
            labels = df['label'].fillna('None')

            # Iterate column-wise instead of with iterrows(): iterrows() builds one
            # Series per row and upcasts mixed numeric columns to a common dtype,
            # which can turn integer ids/labels into floats (and is much slower).
            for id_val, label in zip(df['id'].tolist(), labels.tolist()):
                all_labels.setdefault(id_val, []).append(label)

        except Exception as e:
            # Best-effort: report the problem and carry on with the other files.
            print(f"An error occurred while processing {file_path}: {e}")

    print("Performing majority voting...")
    # most_common(1) returns the top (label, count) pair; on equal counts the
    # label encountered first wins.
    voted_predictions = [
        {
            'id': id_val,
            'label': Counter(votes).most_common(1)[0][0],
            'model': ensemble_model_name,
        }
        for id_val, votes in all_labels.items()
    ]

    return pd.DataFrame(voted_predictions)

if __name__ == '__main__':
    # --- STEP 1: CHOOSE INPUTS AND OUTPUT ---
    # Prediction files to combine. Order matters: on a tie the label from the
    # earliest file in this list wins.
    input_files = [
        'model1.tsv',
        'model2.tsv',
        'model3.tsv'
    ]

    # Output file name
    output_file = 'subtask_1A.tsv'

    # --- STEP 2: UPLOAD FILES ---
    # The prompt is built from input_files so it always names the files that
    # will actually be read (the previous hard-coded names did not match them).
    print(f"Please upload your prediction files ({', '.join(input_files)}).")
    try:
        uploaded = files.upload()
        print("Files uploaded successfully!")
    except Exception as e:
        print(f"File upload failed: {e}")
        # Deliberately continue: the voting step skips any file that is not
        # present and reports it, so a failed upload surfaces there.

    # --- STEP 3: PERFORM VOTING ---
    final_results = perform_majority_voting(input_files)

    if final_results is not None and not final_results.empty:
        # Save the results to a new TSV file, without the index column
        final_results.to_csv(output_file, sep='\t', index=False)
        print(f"\nVoting complete! Results saved to '{output_file}'.")
        print("Here are the first 5 voted predictions:")
        print(final_results.head())
    else:
        print("\nNo results to save. Please check your input files.")

Please upload your prediction files (subtask_1A1.tsv, subtask_1A_2.tsv, subtask_1A_3.tsv).


Saving model1.tsv to model1.tsv
Saving model2.tsv to model2.tsv
Saving model3.tsv to model3.tsv
Files uploaded successfully!
Reading 3 files and collecting labels...
Performing majority voting...

Voting complete! Results saved to 'subtask_1A.tsv'.
Here are the first 5 voted predictions:
       id    label                   model
0  879187     None  MajorityVotingEnsemble
1  316919  Profane  MajorityVotingEnsemble
2  916242     None  MajorityVotingEnsemble
3  786824     None  MajorityVotingEnsemble
4   47284     None  MajorityVotingEnsemble


In [None]:
# Pack the voted predictions file into subtask_1A.zip (presumably the archive to submit — confirm).
!zip subtask_1A.zip subtask_1A.tsv

  adding: subtask_1A.tsv (deflated 87%)


### Ensemble of three models: xlm_large + mdeb-v3 + murili_base


### 72.69

In [1]:
import pandas as pd
from collections import Counter
import os
from google.colab import files

def perform_majority_voting(file_paths, ensemble_model_name='xlm_large+mdeb-v3+murili_base'):
    """
    Performs majority voting on predictions from multiple TSV files.

    Each readable file contributes one vote per row: the row's 'label' is added
    to the list of votes for the row's 'id'. The label with the most votes wins.

    Behaviour worth knowing:
      * Files that do not exist, lack an 'id' or 'label' column, or fail to
        parse are reported with a printed message and skipped; voting then
        proceeds with the remaining files.
      * Missing labels (NaN after parsing) are counted as the string 'None'.
        NOTE(review): depending on the pandas version, the literal text 'None'
        in a file may itself be parsed as NaN; either way it is voted as 'None'.
      * Ties are broken in favour of the label that was seen first, i.e. the
        label coming from the earliest file in `file_paths`.
      * If an id occurs several times in one file, every occurrence counts as
        a separate vote.

    Args:
        file_paths (list): A list of file paths (filenames) to the TSV files.
        ensemble_model_name (str): Value written to the 'model' column of every
            output row. Defaults to 'xlm_large+mdeb-v3+murili_base'.

    Returns:
        pandas.DataFrame | None: A DataFrame with the final voted predictions
        (id, label, model), one row per distinct id in first-seen order.
        Returns None when `file_paths` is empty, and an empty DataFrame when no
        file contributed any rows.
    """
    if not file_paths:
        print("Error: No file paths provided.")
        return None

    # Maps each id to the list of labels voted for it, in file order.
    all_labels = {}

    print(f"Reading {len(file_paths)} files and collecting labels...")
    for file_path in file_paths:
        if not os.path.exists(file_path):
            print(f"Warning: The file {file_path} was not found. Please ensure it was uploaded correctly.")
            continue

        try:
            df = pd.read_csv(file_path, sep='\t')

            if 'id' not in df.columns or 'label' not in df.columns:
                print(f"Warning: Skipping {file_path}. It does not contain 'id' and 'label' columns.")
                continue

            # Blank label cells are read as NaN; vote them as the string 'None'.
            labels = df['label'].fillna('None')

            # Iterate column-wise instead of with iterrows(): iterrows() builds one
            # Series per row and upcasts mixed numeric columns to a common dtype,
            # which can turn integer ids/labels into floats (and is much slower).
            for id_val, label in zip(df['id'].tolist(), labels.tolist()):
                all_labels.setdefault(id_val, []).append(label)

        except Exception as e:
            # Best-effort: report the problem and carry on with the other files.
            print(f"An error occurred while processing {file_path}: {e}")

    print("Performing majority voting...")
    # most_common(1) returns the top (label, count) pair; on equal counts the
    # label encountered first wins.
    voted_predictions = [
        {
            'id': id_val,
            'label': Counter(votes).most_common(1)[0][0],
            'model': ensemble_model_name,
        }
        for id_val, votes in all_labels.items()
    ]

    return pd.DataFrame(voted_predictions)

if __name__ == '__main__':
    # --- STEP 1: CHOOSE INPUTS AND OUTPUT ---
    # Prediction files to combine. Order matters: on a tie the label from the
    # earliest file in this list wins.
    input_files = [
        'xlm_large.tsv',
        'mdeb.tsv',
        'murili.tsv'
    ]

    # Output file name
    output_file = 'subtask_1A.tsv'

    # --- STEP 2: UPLOAD FILES ---
    # The prompt is built from input_files so it always names the files that
    # will actually be read (the previous hard-coded names did not match them).
    print(f"Please upload your prediction files ({', '.join(input_files)}).")
    try:
        uploaded = files.upload()
        print("Files uploaded successfully!")
    except Exception as e:
        print(f"File upload failed: {e}")
        # Deliberately continue: the voting step skips any file that is not
        # present and reports it, so a failed upload surfaces there.

    # --- STEP 3: PERFORM VOTING ---
    final_results = perform_majority_voting(input_files)

    if final_results is not None and not final_results.empty:
        # Save the results to a new TSV file, without the index column
        final_results.to_csv(output_file, sep='\t', index=False)
        print(f"\nVoting complete! Results saved to '{output_file}'.")
        print("Here are the first 5 voted predictions:")
        print(final_results.head())
    else:
        print("\nNo results to save. Please check your input files.")

Please upload your prediction files (subtask_1A1.tsv, subtask_1A_2.tsv, subtask_1A_3.tsv).


Saving mdeb.tsv to mdeb.tsv
Saving murili.tsv to murili.tsv
Saving xlm_large.tsv to xlm_large.tsv
Files uploaded successfully!
Reading 3 files and collecting labels...
Performing majority voting...

Voting complete! Results saved to 'subtask_1A.tsv'.
Here are the first 5 voted predictions:
       id           label                          model
0   12764         Abusive  xlm_large+mdeb-v3+murili_base
1  202933            None  xlm_large+mdeb-v3+murili_base
2  165894         Abusive  xlm_large+mdeb-v3+murili_base
3  124999         Profane  xlm_large+mdeb-v3+murili_base
4  535301  Religious Hate  xlm_large+mdeb-v3+murili_base


In [2]:
# Pack the voted predictions file into subtask_1A.zip (presumably the archive to submit — confirm).
!zip subtask_1A.zip subtask_1A.tsv

  adding: subtask_1A.tsv (deflated 91%)


### Ensemble of four models: xlm_large + mdeb-v3 + murili_base + indica

In [1]:
import pandas as pd
from collections import Counter
import os
from google.colab import files

def perform_majority_voting(file_paths, ensemble_model_name='xlm_large+mdeb-v3+murili_base+indica'):
    """
    Performs majority voting on predictions from multiple TSV files.

    Each readable file contributes one vote per row: the row's 'label' is added
    to the list of votes for the row's 'id'. The label with the most votes wins.

    Behaviour worth knowing:
      * Files that do not exist, lack an 'id' or 'label' column, or fail to
        parse are reported with a printed message and skipped; voting then
        proceeds with the remaining files.
      * Missing labels (NaN after parsing) are counted as the string 'None'.
        NOTE(review): depending on the pandas version, the literal text 'None'
        in a file may itself be parsed as NaN; either way it is voted as 'None'.
      * Ties are broken in favour of the label that was seen first, i.e. the
        label coming from the earliest file in `file_paths`. With an even
        number of files (e.g. four) a 2-2 split is therefore decided by file
        order.
      * If an id occurs several times in one file, every occurrence counts as
        a separate vote.

    Args:
        file_paths (list): A list of file paths (filenames) to the TSV files.
        ensemble_model_name (str): Value written to the 'model' column of every
            output row. Defaults to 'xlm_large+mdeb-v3+murili_base+indica'.

    Returns:
        pandas.DataFrame | None: A DataFrame with the final voted predictions
        (id, label, model), one row per distinct id in first-seen order.
        Returns None when `file_paths` is empty, and an empty DataFrame when no
        file contributed any rows.
    """
    if not file_paths:
        print("Error: No file paths provided.")
        return None

    # Maps each id to the list of labels voted for it, in file order.
    all_labels = {}

    print(f"Reading {len(file_paths)} files and collecting labels...")
    for file_path in file_paths:
        if not os.path.exists(file_path):
            print(f"Warning: The file {file_path} was not found. Please ensure it was uploaded correctly.")
            continue

        try:
            df = pd.read_csv(file_path, sep='\t')

            if 'id' not in df.columns or 'label' not in df.columns:
                print(f"Warning: Skipping {file_path}. It does not contain 'id' and 'label' columns.")
                continue

            # Blank label cells are read as NaN; vote them as the string 'None'.
            labels = df['label'].fillna('None')

            # Iterate column-wise instead of with iterrows(): iterrows() builds one
            # Series per row and upcasts mixed numeric columns to a common dtype,
            # which can turn integer ids/labels into floats (and is much slower).
            for id_val, label in zip(df['id'].tolist(), labels.tolist()):
                all_labels.setdefault(id_val, []).append(label)

        except Exception as e:
            # Best-effort: report the problem and carry on with the other files.
            print(f"An error occurred while processing {file_path}: {e}")

    print("Performing majority voting...")
    # most_common(1) returns the top (label, count) pair; on equal counts the
    # label encountered first wins.
    voted_predictions = [
        {
            'id': id_val,
            'label': Counter(votes).most_common(1)[0][0],
            'model': ensemble_model_name,
        }
        for id_val, votes in all_labels.items()
    ]

    return pd.DataFrame(voted_predictions)

if __name__ == '__main__':
    # --- STEP 1: CHOOSE INPUTS AND OUTPUT ---
    # Prediction files to combine. Order matters: with four voters a 2-2 split
    # is decided in favour of the label from the earliest file in this list.
    input_files = [
        'xlm_large.tsv',
        'mdeb.tsv',
        'murili.tsv',
        'indica.tsv'
    ]

    # Output file name
    output_file = 'subtask_1A.tsv'

    # --- STEP 2: UPLOAD FILES ---
    # The prompt is built from input_files so it always names the files that
    # will actually be read (the previous hard-coded text listed three
    # differently named files while four are expected here).
    print(f"Please upload your prediction files ({', '.join(input_files)}).")
    try:
        uploaded = files.upload()
        print("Files uploaded successfully!")
    except Exception as e:
        print(f"File upload failed: {e}")
        # Deliberately continue: the voting step skips any file that is not
        # present and reports it, so a failed upload surfaces there.

    # --- STEP 3: PERFORM VOTING ---
    final_results = perform_majority_voting(input_files)

    if final_results is not None and not final_results.empty:
        # Save the results to a new TSV file, without the index column
        final_results.to_csv(output_file, sep='\t', index=False)
        print(f"\nVoting complete! Results saved to '{output_file}'.")
        print("Here are the first 5 voted predictions:")
        print(final_results.head())
    else:
        print("\nNo results to save. Please check your input files.")

Please upload your prediction files (subtask_1A1.tsv, subtask_1A_2.tsv, subtask_1A_3.tsv).


Saving indica.tsv to indica.tsv
Saving mdeb.tsv to mdeb.tsv
Saving murili.tsv to murili.tsv
Saving xlm_large.tsv to xlm_large.tsv
Files uploaded successfully!
Reading 4 files and collecting labels...
Performing majority voting...

Voting complete! Results saved to 'subtask_1A.tsv'.
Here are the first 5 voted predictions:
       id           label                                 model
0   12764         Abusive  xlm_large+mdeb-v3+murili_base+indica
1  202933            None  xlm_large+mdeb-v3+murili_base+indica
2  165894         Abusive  xlm_large+mdeb-v3+murili_base+indica
3  124999         Profane  xlm_large+mdeb-v3+murili_base+indica
4  535301  Religious Hate  xlm_large+mdeb-v3+murili_base+indica
