In [4]:
import os
import re
from datetime import datetime
import pandas as pd

In [23]:
# List the submitted classifiers folders
submitted_classifiers = [folder for folder in os.listdir(".") if os.path.isdir(folder) and re.search(r"\d", folder)]
classifier_conf = [float(s.split("-")[1]) for s in submitted_classifiers]

# remove the cls tha conf < threshold
THRESHOLD_CONF = 0.6
submitted_classifiers = [c for c, conf in zip(submitted_classifiers, classifier_conf) if conf >= THRESHOLD_CONF]
classifier_conf = [conf for conf in classifier_conf if conf >= THRESHOLD_CONF]

# Remove duplicated classifier with non-max conf in group (keep the max for each classifier)
# Remove duplicated classifiers and keep the one with the maximum confidence
classifier_dict = {}
for c, conf in zip(submitted_classifiers, classifier_conf):
    classifier_name = c.split("-")[0]  # Extract classifier name (assumes format "name-conf")
    if classifier_name not in classifier_dict or conf > classifier_dict[classifier_name][1]:
        classifier_dict[classifier_name] = (c, conf)
submitted_classifiers = [item[0] for item in classifier_dict.values()]
classifier_conf = [item[1] for item in classifier_dict.values()]

for c, conf in zip(submitted_classifiers, classifier_conf):
    print(f"{c.split('-')[0]} - {conf}")

CNNBinaryClassifier - 0.64843
LogisticRegression - 0.63281
MLPClassifier - 0.72656
BaggingClassifier - 0.71875


In [24]:
# 1 level subfolder is the parameter
submitted_classifiers_params = [os.listdir(classifier)[0] for classifier in submitted_classifiers]

# csv in the subfolder is the predictions
predictions_csvs = []
for classifier, param in zip(submitted_classifiers, submitted_classifiers_params):
    # Append file in the subfolder if is .csv
    for file in os.listdir(os.path.join(classifier, param)):
        if file.endswith(".csv"):
            predictions_csvs.append(os.path.join(classifier, param, file))

predictions_dfs = [pd.read_csv(csv).sort_index() for csv in predictions_csvs]

# Add conf data to each prediction
for df, name, conf in zip(predictions_dfs, submitted_classifiers, classifier_conf):
    # turn df["EventType"] 0/1 -> -1/1
    df["EventType"] = df["EventType"].apply(lambda x: 2 * x - 1)
    df[name] = df["EventType"]
    df["Confidence"] = conf
    df["Confidence"] = df["Confidence"].astype(float)
    df["SoftEventType"] = df["EventType"] * df["Confidence"]

predictions_dfs[0].head()

Unnamed: 0,ID,EventType,CNNBinaryClassifier-0.64843,Confidence,SoftEventType
0,6_0,-1,-1,0.64843,-0.64843
1,6_1,-1,-1,0.64843,-0.64843
2,6_2,1,1,0.64843,0.64843
3,6_3,1,1,0.64843,0.64843
4,6_4,1,1,0.64843,0.64843


In [34]:
import math

# Sum the predictions
summed_predictions = pd.concat(predictions_dfs).groupby("ID").sum()
# summed_predictions["AvgVotes"] = round(summed_predictions["EventType"] / len(predictions_dfs))
# summed_predictions["ConfVotes"] = round(summed_predictions["SoftEventType"] / summed_predictions["Confidence"])
summed_predictions["AvgVotes"] = (summed_predictions["EventType"] / len(predictions_dfs) > 0)
summed_predictions["ConfVotes"] = (summed_predictions["SoftEventType"] / summed_predictions["Confidence"] >= 0)
# avg_conf = sum(classifier_conf) / len(predictions_dfs)
# summed_predictions["DistVotes"] = (summed_predictions["SoftEventType"] > 0.2)
summed_predictions = summed_predictions.drop(columns=["EventType", "Confidence", "SoftEventType"])
summed_predictions

Unnamed: 0_level_0,EventType,CNNBinaryClassifier-0.64843,Confidence,SoftEventType,LogisticRegression-0.63281,MLPClassifier-0.72656,BaggingClassifier-0.71875,AvgVotes,ConfVotes,DistVotes
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
15_0,0.0,-1.0,2.72655,0.16407,-1.0,1.0,1.0,False,True,False
15_1,0.0,-1.0,2.72655,0.16407,-1.0,1.0,1.0,False,True,False
15_10,-2.0,-1.0,2.72655,-1.27343,-1.0,1.0,-1.0,False,False,False
15_100,0.0,-1.0,2.72655,0.16407,-1.0,1.0,1.0,False,True,False
15_101,0.0,-1.0,2.72655,0.16407,-1.0,1.0,1.0,False,True,False
...,...,...,...,...,...,...,...,...,...,...
9_95,-2.0,1.0,2.72655,-1.42969,-1.0,-1.0,-1.0,False,False,False
9_96,-2.0,1.0,2.72655,-1.42969,-1.0,-1.0,-1.0,False,False,False
9_97,-2.0,1.0,2.72655,-1.42969,-1.0,-1.0,-1.0,False,False,False
9_98,-2.0,1.0,2.72655,-1.42969,-1.0,-1.0,-1.0,False,False,False


In [35]:
# For all col -1/1 -> 0/1
summed_predictions = summed_predictions.map(lambda x: 1.0 if x > 0 else 0.0)
summed_predictions

Unnamed: 0_level_0,EventType,CNNBinaryClassifier-0.64843,Confidence,SoftEventType,LogisticRegression-0.63281,MLPClassifier-0.72656,BaggingClassifier-0.71875,AvgVotes,ConfVotes,DistVotes
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
15_0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
15_1,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
15_10,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
15_100,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
15_101,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
9_95,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9_96,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9_97,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9_98,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
save_path = f"voting-{datetime.today().strftime('%Y%m%d')}-{THRESHOLD_CONF}-non-duplicated.csv"
save_df = summed_predictions.copy()
save_df["EventType"] = summed_predictions["ConfVotes"]
save_df = save_df[["EventType"]]
save_df.to_csv(save_path)