# Calculating cross entropy loss values for different overlap percentages

### In probability and information theory, the cross-entropy loss $H(p, q)$ between two probability distributions $p$ and $q$ is defined as:
## $$H(p, q) = -\sum_{i} p(i) \ln(q(i))$$
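> ### where
> ### $p(i)$ is the true probability of class $i$ (the distribution $p$ is often represented as a vector or array).
> ### $q(i)$ is the predicted probability of class $i$ (the distribution $q$ is often represented as a vector or array).
> ### Each row below has a single true label, so $p$ is one-hot and the loss for that row reduces to $-\ln(q(\text{true class}))$.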

In [155]:
import pandas as pd
import numpy as np

In [156]:
# Read the three filtered prediction tables (zero / fifty / seventy overlap) from CSV.
zero_overlap, fifty_overlap, seventy_overlap = map(
    pd.read_csv,
    (
        'filtered_zero_overlap.csv',
        'filtered_fifty_overlap.csv',
        'filtered_seventy_overlap.csv',
    ),
)

In [157]:
# Preview the zero-overlap table as the cell output.
zero_overlap

Unnamed: 0,unique_id,p_laugh,p_drum,p_pik,overlap,truth,more than one?,talking or other sounds?
0,273072621,2.512462e-01,0.000000e+00,7.487538e-01,0,pik,0,0
1,561220301,0.000000e+00,0.000000e+00,1.000000e+00,0,pik,0,0
2,309454251,1.000000e+00,0.000000e+00,0.000000e+00,0,laugh,0,0
3,239810171,2.820000e-14,3.844195e-02,9.615581e-01,0,pik,0,1
4,347229281,1.000000e+00,0.000000e+00,0.000000e+00,0,laugh,0,0
...,...,...,...,...,...,...,...,...
62,616587785,2.705461e-02,7.022236e-01,2.707218e-01,0,drum,0,1
63,376246401,1.000000e+00,0.000000e+00,1.180000e-24,0,laugh,0,0
64,462975991,1.000000e+00,0.000000e+00,4.090000e-36,0,laugh,0,0
65,222959451,5.529619e-02,4.690000e-13,9.447038e-01,0,pik,0,1


In [158]:
# Floor applied to the true-class probability so that log(0) is never taken.
# NOTE: the literal 10e-8 evaluates to 1e-7 (not 1e-8); the value is kept as written.
threshold = 10e-8


def cross_entropy_loss(df):
    """Add a per-row cross-entropy loss column ``CE_loss`` to ``df``.

    For every row the loss is ``-ln(q)``, where ``q`` is the predicted
    probability of the row's true class: column ``p_laugh``, ``p_drum`` or
    ``p_pik``, selected by the value in the ``truth`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``truth`` column and the probability column for every
        label that actually occurs in ``truth``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, modified in place with the new ``CE_loss``
        column (callers may rely on either the mutation or the return value).

    Notes
    -----
    * Probabilities below ``threshold`` (including exact zeros) are raised to
      ``threshold`` before the logarithm, so the loss is capped at
      ``-ln(threshold)`` and never decreases as the probability increases.
    * Rows whose ``truth`` is not one of ``laugh`` / ``drum`` / ``pik`` get
      ``NaN`` instead of raising or silently reusing the previous row's loss.
    """
    label_to_column = {'laugh': 'p_laugh', 'drum': 'p_drum', 'pik': 'p_pik'}

    # Start from NaN so rows with an unrecognised label are visibly unscored.
    losses = np.full(len(df), np.nan, dtype=float)

    for label, column in label_to_column.items():
        mask = df['truth'].eq(label).to_numpy()
        if not mask.any():
            # Skip labels that never occur, so their probability column is
            # only required when it is actually needed.
            continue
        probs = df[column].to_numpy(dtype=float)[mask]
        floored = np.clip(probs, threshold, None)
        # ``0.0 - log`` rather than ``-log`` so that p == 1 gives 0.0, not -0.0.
        losses[mask] = 0.0 - np.log(floored)

    # Assign the losses to a new column 'CE_loss' on the caller's DataFrame.
    df.loc[:, 'CE_loss'] = losses

    return df


In [138]:
# Preview the fifty-overlap table as the cell output.
fifty_overlap

Unnamed: 0,unique_id,p_laugh,p_drum,p_pik,overlap,truth,more than one?,talking or other sounds?
0,273072621,0.167592,0.000000e+00,0.832408,0.5,pik,0,0
1,561220301,0.000000,0.000000e+00,1.000000,0.5,pik,0,0
2,309454251,1.000000,0.000000e+00,0.000000,0.5,laugh,0,0
3,239810171,0.037080,1.853224e-02,0.944388,0.5,pik,0,1
4,347229281,1.000000,0.000000e+00,0.000000,0.5,laugh,0,0
...,...,...,...,...,...,...,...,...
67,616587785,0.026624,7.049360e-01,0.270000,0.5,drum,0,1
68,376246401,1.000000,0.000000e+00,0.000000,0.5,laugh,0,0
69,462975991,1.000000,0.000000e+00,0.000000,0.5,laugh,0,0
70,222959451,0.027961,2.370000e-13,0.970000,0.5,pik,0,1


In [159]:
# Inner-join the zero- and fifty-overlap tables on unique_id; the ids that
# survive the join are the ones present in both tables.
merged = zero_overlap.merge(fifty_overlap, how='inner', on='unique_id')
in_common_nums = merged['unique_id'].tolist()

In [160]:
# Keep only the fifty-overlap rows whose unique_id is in in_common_nums,
# renumber the index from 0, and print the whole table.
fifty_in_common = fifty_overlap['unique_id'].isin(in_common_nums)
fifty_filtered = fifty_overlap.loc[fifty_in_common].reset_index(drop=True)
print(fifty_filtered.to_string(index=True))

    unique_id       p_laugh        p_drum         p_pik  overlap  truth  more than one?  talking or other sounds?
0   273072621  1.675920e-01  0.000000e+00  8.324080e-01      0.5    pik               0                         0
1   561220301  0.000000e+00  0.000000e+00  1.000000e+00      0.5    pik               0                         0
2   309454251  1.000000e+00  0.000000e+00  0.000000e+00      0.5  laugh               0                         0
3   239810171  3.708016e-02  1.853224e-02  9.443876e-01      0.5    pik               0                         1
4   347229281  1.000000e+00  0.000000e+00  0.000000e+00      0.5  laugh               0                         0
5   533570501  5.891212e-02  8.063332e-01  1.347547e-01      0.5   drum               0                         0
6   150354041  5.530000e-24  9.999923e-01  7.700000e-06      0.5   drum               0                         0
7   570770731  0.000000e+00  1.000000e+00  0.000000e+00      0.5   drum               0 

In [161]:
# Keep only the seventy-overlap rows whose unique_id is in in_common_nums,
# renumber the index from 0, and print the whole table.
seventy_in_common = seventy_overlap['unique_id'].isin(in_common_nums)
seventy_filtered = seventy_overlap.loc[seventy_in_common].reset_index(drop=True)
print(seventy_filtered.to_string(index=True))

    unique_id       p_laugh        p_drum         p_pik  overlap  truth  more than one?  talking or other sounds?
0   273072621  1.900872e-01  0.000000e+00  8.099128e-01      0.7    pik               0                         0
1   561220301  0.000000e+00  0.000000e+00  1.000000e+00      0.7    pik               0                         0
2   309454251  1.000000e+00  0.000000e+00  0.000000e+00      0.7  laugh               0                         0
3   239810171  6.600755e-02  1.341555e-03  9.326509e-01      0.7    pik               0                         1
4   347229281  1.000000e+00  0.000000e+00  1.080000e-36      0.7  laugh               0                         0
5   533570501  3.542149e-02  8.240607e-01  1.405178e-01      0.7   drum               0                         0
6   150354041  3.350000e-24  1.000000e+00  4.680000e-06      0.7   drum               0                         0
7   570770731  0.000000e+00  1.000000e+00  0.000000e+00      0.7   drum               0 

In [162]:
# Adds the CE_loss column to zero_overlap in place; the returned frame is shown as the cell output.
# NOTE(review): zero_overlap is scored unfiltered, while the fifty/seventy tables are first
# restricted to in_common_nums - confirm all three cover the same ids before comparing totals.
cross_entropy_loss(zero_overlap)

Unnamed: 0,unique_id,p_laugh,p_drum,p_pik,overlap,truth,more than one?,talking or other sounds?,CE_loss
0,273072621,2.512462e-01,0.000000e+00,7.487538e-01,0,pik,0,0,0.289345
1,561220301,0.000000e+00,0.000000e+00,1.000000e+00,0,pik,0,0,-0.000000
2,309454251,1.000000e+00,0.000000e+00,0.000000e+00,0,laugh,0,0,-0.000000
3,239810171,2.820000e-14,3.844195e-02,9.615581e-01,0,pik,0,1,0.039200
4,347229281,1.000000e+00,0.000000e+00,0.000000e+00,0,laugh,0,0,-0.000000
...,...,...,...,...,...,...,...,...,...
62,616587785,2.705461e-02,7.022236e-01,2.707218e-01,0,drum,0,1,0.353503
63,376246401,1.000000e+00,0.000000e+00,1.180000e-24,0,laugh,0,0,-0.000000
64,462975991,1.000000e+00,0.000000e+00,4.090000e-36,0,laugh,0,0,-0.000000
65,222959451,5.529619e-02,4.690000e-13,9.447038e-01,0,pik,0,1,0.056884


In [163]:
# Adds the CE_loss column to fifty_filtered in place; the returned frame is shown as the cell output.
cross_entropy_loss(fifty_filtered)

Unnamed: 0,unique_id,p_laugh,p_drum,p_pik,overlap,truth,more than one?,talking or other sounds?,CE_loss
0,273072621,0.167592,0.000000e+00,0.832408,0.5,pik,0,0,0.183433
1,561220301,0.000000,0.000000e+00,1.000000,0.5,pik,0,0,-0.000000
2,309454251,1.000000,0.000000e+00,0.000000,0.5,laugh,0,0,-0.000000
3,239810171,0.037080,1.853224e-02,0.944388,0.5,pik,0,1,0.057219
4,347229281,1.000000,0.000000e+00,0.000000,0.5,laugh,0,0,-0.000000
...,...,...,...,...,...,...,...,...,...
62,616587785,0.026624,7.049360e-01,0.270000,0.5,drum,0,1,0.349648
63,376246401,1.000000,0.000000e+00,0.000000,0.5,laugh,0,0,-0.000000
64,462975991,1.000000,0.000000e+00,0.000000,0.5,laugh,0,0,-0.000000
65,222959451,0.027961,2.370000e-13,0.970000,0.5,pik,0,1,0.030459


In [164]:
# Adds the CE_loss column to seventy_filtered in place; the returned frame is shown as the cell output.
cross_entropy_loss(seventy_filtered)

Unnamed: 0,unique_id,p_laugh,p_drum,p_pik,overlap,truth,more than one?,talking or other sounds?,CE_loss
0,273072621,0.190087,0.000000e+00,8.099128e-01,0.7,pik,0,0,0.210829
1,561220301,0.000000,0.000000e+00,1.000000e+00,0.7,pik,0,0,-0.000000
2,309454251,1.000000,0.000000e+00,0.000000e+00,0.7,laugh,0,0,-0.000000
3,239810171,0.066008,1.341555e-03,9.326509e-01,0.7,pik,0,1,0.069724
4,347229281,1.000000,0.000000e+00,1.080000e-36,0.7,laugh,0,0,-0.000000
...,...,...,...,...,...,...,...,...,...
62,616587785,0.038725,6.890807e-01,2.720000e-01,0.7,drum,0,1,0.372397
63,376246401,0.857145,0.000000e+00,1.428548e-01,0.7,laugh,0,0,0.154148
64,462975991,1.000000,0.000000e+00,1.360000e-36,0.7,laugh,0,0,-0.000000
65,222959451,0.051586,6.050000e-09,9.480000e-01,0.7,pik,0,1,0.053401


In [165]:
# Total (summed, not averaged) cross-entropy loss for each overlap table.
total_ce_zero = sum(zero_overlap["CE_loss"])
total_ce_fifty = sum(fifty_filtered["CE_loss"])
total_ce_seventy = sum(seventy_filtered["CE_loss"])

print(f'Cross entropy loss for overlap of 0% = {total_ce_zero}')
print(f'Cross entropy loss for overlap of 50% = {total_ce_fifty}')
print(f'Cross entropy loss for overlap of 70% = {total_ce_seventy}')

Cross entropy loss for overlap of 0% = 26.369330713665903
Cross entropy loss for overlap of 50% = 42.46368418085971
Cross entropy loss for overlap of 70% = 68.5883902376048


In [153]:
# Smallest of the three summed losses (lower cross-entropy is better).
ce_totals = [
    sum(frame["CE_loss"])
    for frame in (zero_overlap, fifty_filtered, seventy_filtered)
]
print(f'best ce = {min(ce_totals)}')

best ce = 42.46368418085971


In [151]:
# Collect the unique_id values of seventy_filtered as a plain Python list and show it.
unique_ids = seventy_filtered['unique_id'].tolist()
unique_ids

[273072621,
 561220301,
 309454251,
 239810171,
 347229281,
 533570501,
 150354041,
 570770731,
 347858241,
 617342193,
 618172651,
 104357441,
 311688651,
 601652861,
 323641781,
 434372741,
 318540001,
 616122737,
 490910681,
 565879421,
 520350001,
 220759221,
 443490761,
 323237621,
 401144841,
 392495311,
 181737771,
 324141031,
 279260991,
 303711261,
 614563468,
 429950981,
 147890801,
 436170191,
 618030099,
 464790091,
 350330041,
 616774892,
 155599071,
 505559401,
 408109331,
 356095091,
 445344591,
 349695741,
 532778151,
 366742991,
 473274261,
 612950835,
 611406572,
 211006261,
 613574951,
 462696961,
 334233671,
 90534411,
 609014861,
 367317641,
 82665391,
 613508440,
 599433361,
 618381365,
 369261521,
 224559801,
 616587785,
 376246401,
 462975991,
 222959451,
 228466501]