# Post-Process Dataframe for Submission

In this notebook, the final dataframe containing the predicted labels for each annotator specified in the test file is post-processed, i.e. it is transformed into the variables required for the submission.

In [2]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import jensenshannon

In [8]:
# Dataframe columns that hold one label per annotator (the list has no A006 or A011).
ANNOTATOR_COLUMNS = [
    'A001', 'A002', 'A003', 'A004', 'A005',
    'A007', 'A008', 'A009', 'A010', 'A012',
]

In [10]:
# Read the prediction file and discard the 'Unnamed: 0' column
# (presumably the index written by an earlier to_csv call; verify against the producer).
df = pd.read_csv('df_comp_multi_2.csv').drop(columns='Unnamed: 0')

In [11]:
def get_distr(df, bin, annotator_columns=None):
    """
    Adds the label distribution columns for subtask 2 to ``df``.

    For every row, the share of each label among the valid annotator
    predictions is computed. Entries equal to -1 and missing values are
    ignored. Rows without any valid prediction get NaN in every
    distribution column, because no distribution can be derived for them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one column per annotator. It is modified in place.
    bin : bool
        If truthy, writes ``dist_bin_0`` (share of label 0) and
        ``dist_bin_1`` (share of every other valid label). Otherwise writes
        ``dist_multi_1`` .. ``dist_multi_4`` (share of labels 1-4) and
        ``dist_multi_0`` (share of every other valid label).
        The name shadows the builtin ``bin`` but is kept for existing callers.
    annotator_columns : list of str, optional
        Columns containing the annotator predictions. Defaults to the
        module-level ``ANNOTATOR_COLUMNS``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the distribution columns added or
        overwritten.
    """
    if annotator_columns is None:
        annotator_columns = ANNOTATOR_COLUMNS

    predictions = df[list(annotator_columns)]
    is_valid = predictions.notna() & predictions.ne(-1)
    n_valid = is_valid.sum(axis=1)
    # Rows without a valid prediction become NaN instead of dividing by zero.
    denominator = n_valid.where(n_valid > 0)

    # Shares are derived from integer counts, so the "remaining" class does
    # not pick up floating point drift from subtracting a sum of fractions.
    if bin:
        zero_count = predictions.eq(0).sum(axis=1)
        df['dist_bin_0'] = zero_count / denominator
        df['dist_bin_1'] = (n_valid - zero_count) / denominator
    else:
        label_counts = {label: predictions.eq(label).sum(axis=1) for label in range(1, 5)}
        for label, count in label_counts.items():
            df[f'dist_multi_{label}'] = count / denominator
        df['dist_multi_0'] = (n_valid - sum(label_counts.values())) / denominator
    return df

In [13]:
# Add the binary distribution columns first, then the multi-class ones.
df = df.pipe(get_distr, bin=True).pipe(get_distr, bin=False)

In [14]:
# Preview the first two rows to check the new distribution columns.
df.head(n=2)

Unnamed: 0,id,text,A008,A007,A003,A005,A004,A012,A009,A002,A001,A010,dist_bin_0,dist_bin_1,dist_multi_1,dist_multi_2,dist_multi_3,dist_multi_4,dist_multi_0
0,f3b81af2f6852bf1b9896629525d2f41,"Ja, Frauen können krankhaft eifersüchtig werde...",3,2,3,2,3,2,1,2,2,2,0.0,1.0,0.1,0.6,0.3,0.0,0.0
1,cf8b8bac7165144bb62b399a98843366,"Ich hau' auf jede Religion gern drauf, aber de...",0,0,0,0,0,0,0,0,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
def calculate_js_distance(df1, df2, columns):
    """
    Calculates the mean Jensen-Shannon distance between corresponding columns
    of two dataframes.

    Each column is passed to ``jensenshannon`` as one vector, which
    normalizes both inputs to sum to 1. Columns whose values sum to 0 in
    either frame, or whose distance is NaN, are left out of the mean.

    NOTE(review): distances are taken column-wise (across rows), not per row;
    confirm that this matches the intended evaluation metric.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames that both contain every column in ``columns``.
    columns : iterable of str
        Names of the columns to compare.

    Returns
    -------
    float
        Mean of the valid per-column distances, or NaN if there is none.
        The mean is also printed.

    Raises
    ------
    ValueError
        If a column has a different length in the two frames.
    """
    js_distances = []
    for column in columns:
        p = np.asarray(df1[column], dtype=float)
        q = np.asarray(df2[column], dtype=float)

        # Fail clearly instead of letting a length-1 column broadcast silently.
        if p.shape != q.shape:
            raise ValueError(
                f"Column {column!r} has different lengths in the two dataframes: "
                f"{p.shape[0]} vs {q.shape[0]}"
            )

        # An all-zero vector cannot be normalized into a distribution.
        if np.sum(p) == 0 or np.sum(q) == 0:
            js_dist = np.nan
        else:
            js_dist = jensenshannon(p, q)

        js_distances.append(js_dist)

    valid_distances = [dist for dist in js_distances if not np.isnan(dist)]
    mean_js_distance = np.mean(valid_distances) if valid_distances else np.nan

    print("Jensen-Shannon Distanz:", mean_js_distance)
    return mean_js_distance

In [None]:
def increment_values(df, columns):
    """
    Increments every value other than -1 in the specified columns by 1.

    Values equal to -1 are left unchanged. All other values, including
    other negative numbers, are incremented.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update. It is modified in place.
    columns : list of str
        Columns whose values are incremented.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the updated columns.
    """
    selected = df[columns]
    # Vectorized replacement for the deprecated element-wise DataFrame.applymap.
    df[columns] = selected.mask(selected.ne(-1), selected + 1)
    return df

def decrement_values(df, columns):
    """
    Decrements every value in the specified columns by 2.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update. It is modified in place.
    columns : list of str
        Columns whose values are decremented.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the updated columns.
    """
    # Vectorized replacement for the deprecated element-wise DataFrame.applymap.
    df[columns] = df[columns] - 2
    return df

In [16]:
def apply_rules_to_expand(df):
    """
    Assigns the categorical features for subtask 1 based on the distribution
    values computed for subtask 2.

    The following columns are added (or overwritten if they already exist, so
    the function can be re-run on its own output):

    - ``bin_maj``: 1 if ``dist_bin_1`` is strictly greater than
      ``dist_bin_0``, otherwise 0 (ties resolve to 0).
    - ``bin_one``: 1 if ``dist_bin_1`` is non-zero, otherwise 0.
    - ``bin_all``: 1 if ``dist_bin_1`` equals 1, otherwise 0.
    - ``multi_maj``: index (0-4) of the largest ``dist_multi_*`` value; ties
      resolve to the lowest index.
    - ``disagree_bin``: 1 if both ``dist_bin_0`` and ``dist_bin_1`` are
      non-zero, otherwise 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``dist_bin_0``, ``dist_bin_1`` and
        ``dist_multi_0`` .. ``dist_multi_4``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the five label columns.

    Raises
    ------
    ValueError
        If any distribution value is NaN, since no label can be derived
        from an undefined distribution.
    """
    multi_columns = [f'dist_multi_{i}' for i in range(5)]
    dist_columns = ['dist_bin_0', 'dist_bin_1'] + multi_columns

    # NaN would silently produce wrong labels in the comparisons below.
    undefined_rows = df[dist_columns].isna().any(axis=1)
    if undefined_rows.any():
        raise ValueError(
            f"{int(undefined_rows.sum())} row(s) contain undefined (NaN) distribution "
            "values; labels cannot be derived for them"
        )

    dist_bin_0 = df['dist_bin_0']
    dist_bin_1 = df['dist_bin_1']
    labels = {
        'bin_maj': (dist_bin_1 > dist_bin_0).astype(int),
        'bin_one': dist_bin_1.ne(0).astype(int),
        'bin_all': dist_bin_1.eq(1).astype(int),
        # argmax returns the first maximum, i.e. the lowest label on ties.
        'multi_maj': df[multi_columns].to_numpy().argmax(axis=1),
        'disagree_bin': (dist_bin_0.ne(0) & dist_bin_1.ne(0)).astype(int),
    }
    # assign() overwrites existing label columns instead of duplicating them.
    return df.assign(**labels)

In [17]:
# Derive the subtask 1 label columns, then preview the first two rows.
df = df.pipe(apply_rules_to_expand)
df.head(n=2)

Unnamed: 0,id,text,A008,A007,A003,A005,A004,A012,A009,A002,...,dist_multi_1,dist_multi_2,dist_multi_3,dist_multi_4,dist_multi_0,bin_maj,bin_one,bin_all,multi_maj,disagree_bin
0,f3b81af2f6852bf1b9896629525d2f41,"Ja, Frauen können krankhaft eifersüchtig werde...",3,2,3,2,3,2,1,2,...,0.1,0.6,0.3,0.0,0.0,1,1,1,2,0
1,cf8b8bac7165144bb62b399a98843366,"Ich hau' auf jede Religion gern drauf, aber de...",0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0,0,0,0,0
