In [1]:
import pandas as pd

# search thresholds for imbalanced classification
from numpy import arange, argmax, mean, std
from sklearn.metrics import f1_score, precision_score, recall_score, fbeta_score
import json

def preprocess(df):
    df = pd.DataFrame.from_dict(df).sort_values('Index')
    df.index = df['Index']
    df.drop('Index', axis=1, inplace=True)
    df.index.name = None
    return df

In [11]:
def to_labels(probs, threshold):
 return (probs >= threshold).astype('int')

thresholds_final = []
f1_scores_binary = []
f1_scores_macro = []
precisions = []
recalls = []

for i in range(10):
    with open('../data/train_dev_test/split_' + str(i) + '.json', "r") as json_file:
        data = json.load(json_file)

    train = preprocess(data["train"])
    test = preprocess(data['test'])

    df = pd.read_csv('../data/toxicity_score_jigsaw.csv', index_col='index')['toxicity_score_jigsaw']  # it's a pandas series now
    train = train.merge(df, left_index=True, right_index=True)
    test = test.merge(df, left_index=True, right_index=True)

    thresholds = arange(0, 1, 0.001)

    # evaluate each threshold
    scores = [f1_score(train.Label, to_labels(train.toxicity_score_jigsaw, t), average='binary') for t in thresholds]
    print(scores)
    
    # get best threshold
    ix = argmax(scores)
    print('Threshold=%.3f, F-Score=%.5f' % (thresholds[ix], scores[ix]))

    label_pred = to_labels(test.toxicity_score_jigsaw, thresholds[ix])
    precisions.append(precision_score(test.Label, label_pred, average='macro'))
    recalls.append(recall_score(test.Label, label_pred, average='macro'))
    thresholds_final.append(thresholds[ix])
    f1_scores_binary.append(f1_score(test.Label, label_pred, average='binary'))
    f1_scores_macro.append(f1_score(test.Label, label_pred, average='macro'))

t_final = mean(thresholds_final)
print(round(t_final, 6), 
      'precision:', round(mean(precisions), 6), round(std(precisions), 6), 
      'recall:', round(mean(recalls), 6), round(std(recalls), 6), 
      'f1 binary:', round(mean(f1_scores_binary), 6), round(std(f1_scores_binary), 6), 
      'f1 macro:', round(mean(f1_scores_binary), 6), round(std(f1_scores_binary), 6)
      )


[0.5760350318471338, 0.5760350318471338, 0.5760350318471338, 0.5760350318471338, 0.5760350318471338, 0.5760350318471338, 0.5760350318471338, 0.5760350318471338, 0.5760350318471338, 0.5760350318471338, 0.5760350318471338, 0.5760350318471338, 0.5760350318471338, 0.5760350318471338, 0.5760350318471338, 0.5760350318471338, 0.5760350318471338, 0.5760350318471338, 0.5760350318471338, 0.5760350318471338, 0.57501494321578, 0.5753588516746412, 0.5753588516746412, 0.5753588516746412, 0.5753588516746412, 0.575074775672981, 0.575074775672981, 0.574790586358197, 0.5745062836624776, 0.5742218675179569, 0.5742218675179569, 0.5742218675179569, 0.5740518962075849, 0.5738817891373802, 0.5738817891373802, 0.5710823909531503, 0.5711977378307411, 0.5709090909090909, 0.5709090909090909, 0.5709090909090909, 0.5712553062462098, 0.5709664375252729, 0.5706774519716885, 0.5706774519716885, 0.5707928802588996, 0.5706191825171996, 0.5706191825171996, 0.5706191825171996, 0.5708502024291497, 0.5710814094775213, 0.57

  _warn_prf(average, modifier, msg_start, len(result))


[0.5771223595057793, 0.5771223595057793, 0.5771223595057793, 0.5771223595057793, 0.5771223595057793, 0.5771223595057793, 0.5771223595057793, 0.5771223595057793, 0.5771223595057793, 0.5771223595057793, 0.5771223595057793, 0.5771223595057793, 0.5771223595057793, 0.5771223595057793, 0.5771223595057793, 0.5771223595057793, 0.5771223595057793, 0.5771223595057793, 0.5771223595057793, 0.5771223595057793, 0.5761021344504289, 0.5764471057884232, 0.5764471057884232, 0.5764471057884232, 0.5764471057884232, 0.5762779552715656, 0.5762779552715656, 0.5759936089474735, 0.5757091490211745, 0.5754245754245754, 0.5754245754245754, 0.5754245754245754, 0.5751398880895284, 0.5749700119952019, 0.5749700119952019, 0.5726426547956293, 0.5727585509006274, 0.5724696356275303, 0.5724696356275303, 0.5724696356275303, 0.5729335494327391, 0.572644376899696, 0.5723550871503851, 0.5723550871503851, 0.5724711129130347, 0.572297708375583, 0.572297708375583, 0.572297708375583, 0.5724137931034482, 0.5727623300182667, 0.5

  _warn_prf(average, modifier, msg_start, len(result))


[0.5818974054268172, 0.5818974054268172, 0.5818974054268172, 0.5818974054268172, 0.5818974054268172, 0.5818974054268172, 0.5818974054268172, 0.5818974054268172, 0.5818974054268172, 0.5818974054268172, 0.5818974054268172, 0.5818974054268172, 0.5818974054268172, 0.5818974054268172, 0.5818974054268172, 0.5818974054268172, 0.5818974054268172, 0.5818974054268172, 0.5818974054268172, 0.5818974054268172, 0.5810033710093199, 0.5813492063492064, 0.5813492063492064, 0.5813492063492064, 0.5813492063492064, 0.5811830091306074, 0.5811830091306074, 0.5809013301568394, 0.5806195393169182, 0.5803376365441907, 0.5803376365441907, 0.5803376365441907, 0.5800556217719507, 0.5798887122416534, 0.5798887122416534, 0.577643504531722, 0.5777598710717163, 0.5774733024380415, 0.5774733024380415, 0.5771866182990729, 0.5776522791448165, 0.5773653419406899, 0.5770782889426957, 0.5770782889426957, 0.5771947527749748, 0.5770240258429236, 0.5770240258429236, 0.5767366720516963, 0.5768531609775802, 0.5772029102667745, 

  _warn_prf(average, modifier, msg_start, len(result))


[0.5832839359810315, 0.5832839359810315, 0.5832839359810315, 0.5832839359810315, 0.5832839359810315, 0.5832839359810315, 0.5832839359810315, 0.5832839359810315, 0.5832839359810315, 0.5832839359810315, 0.5832839359810315, 0.5832839359810315, 0.5832839359810315, 0.5832839359810315, 0.5832839359810315, 0.5832839359810315, 0.5832839359810315, 0.5832839359810315, 0.5832839359810315, 0.5832839359810315, 0.5827893175074185, 0.5831353919239904, 0.5831353919239904, 0.5831353919239904, 0.5831353919239904, 0.5829702970297029, 0.5829702970297029, 0.5826896415131709, 0.5824088748019017, 0.5821279968297999, 0.5821279968297999, 0.5821279968297999, 0.5822433610780816, 0.5820777160983347, 0.5820777160983347, 0.579983922829582, 0.5801005025125628, 0.5798150381986329, 0.5798150381986329, 0.5795294590790266, 0.5799959750452808, 0.5797101449275363, 0.5794241997181397, 0.5794241997181397, 0.5795408779701973, 0.5796576032225579, 0.5796576032225579, 0.5793714746172441, 0.5796049979846836, 0.5798387096774194, 

  _warn_prf(average, modifier, msg_start, len(result))


[0.5860023724792408, 0.5860023724792408, 0.5860023724792408, 0.5860023724792408, 0.5860023724792408, 0.5860023724792408, 0.5860023724792408, 0.5860023724792408, 0.5860023724792408, 0.5860023724792408, 0.5860023724792408, 0.5860023724792408, 0.5860023724792408, 0.5860023724792408, 0.5860023724792408, 0.5860023724792408, 0.5860023724792408, 0.5860023724792408, 0.5860023724792408, 0.5860023724792408, 0.5855106888361046, 0.5857425742574258, 0.5857425742574258, 0.5857425742574258, 0.5857425742574258, 0.5855784469096671, 0.5855784469096671, 0.5852981969486825, 0.5850178359096314, 0.5847373637264618, 0.5847373637264618, 0.5847373637264618, 0.5848532910388581, 0.5846886156287188, 0.5846886156287188, 0.5836513356095602, 0.5837685817597429, 0.5834840265220013, 0.5834840265220013, 0.5831993569131833, 0.5837859585596459, 0.5837859585596459, 0.5837859585596459, 0.5837859585596459, 0.5839034205231388, 0.58402092976454, 0.58402092976454, 0.5837359098228664, 0.5838534326555266, 0.5839710028191704, 0.5

  _warn_prf(average, modifier, msg_start, len(result))


[0.5851632047477745, 0.5851632047477745, 0.5851632047477745, 0.5851632047477745, 0.5851632047477745, 0.5851632047477745, 0.5851632047477745, 0.5851632047477745, 0.5851632047477745, 0.5851632047477745, 0.5851632047477745, 0.5851632047477745, 0.5851632047477745, 0.5851632047477745, 0.5851632047477745, 0.5851632047477745, 0.5851632047477745, 0.5851632047477745, 0.5851632047477745, 0.5851632047477745, 0.5846702317290552, 0.5849019219338221, 0.5849019219338221, 0.5849019219338221, 0.5849019219338221, 0.5850178359096314, 0.5850178359096314, 0.5847373637264619, 0.584456780333069, 0.584456780333069, 0.584456780333069, 0.584456780333069, 0.5842919476398255, 0.5841269841269842, 0.5841269841269842, 0.5840530226953203, 0.584170349537967, 0.5838858750251156, 0.5838858750251156, 0.5836012861736334, 0.584070796460177, 0.584070796460177, 0.584070796460177, 0.584070796460177, 0.584188292094146, 0.5840209297645401, 0.5840209297645401, 0.5837359098228663, 0.5838534326555265, 0.5840886203423968, 0.5840886

  _warn_prf(average, modifier, msg_start, len(result))


[0.5816812053925456, 0.5816812053925456, 0.5816812053925456, 0.5816812053925456, 0.5816812053925456, 0.5816812053925456, 0.5816812053925456, 0.5816812053925456, 0.5816812053925456, 0.5816812053925456, 0.5816812053925456, 0.5816812053925456, 0.5816812053925456, 0.5816812053925456, 0.5816812053925456, 0.5816812053925456, 0.5816812053925456, 0.5816812053925456, 0.5816812053925456, 0.5816812053925456, 0.5811830091306074, 0.5814138204924543, 0.5814138204924543, 0.5814138204924543, 0.5814138204924543, 0.5815292949354518, 0.5815292949354518, 0.5815292949354518, 0.5812475168851808, 0.5812475168851808, 0.5812475168851808, 0.5812475168851808, 0.581081081081081, 0.5809145129224652, 0.5809145129224652, 0.5788519637462236, 0.5789685737308623, 0.5786822486399356, 0.5786822486399356, 0.5783958081418783, 0.5788624445340863, 0.5785757514625782, 0.5782889426957224, 0.5782889426957224, 0.5782889426957224, 0.5781186919660881, 0.5781186919660881, 0.577831617201696, 0.5780650373661885, 0.5784155214227971, 0

  _warn_prf(average, modifier, msg_start, len(result))


[0.5836468026133439, 0.5836468026133439, 0.5836468026133439, 0.5836468026133439, 0.5836468026133439, 0.5836468026133439, 0.5836468026133439, 0.5836468026133439, 0.5836468026133439, 0.5836468026133439, 0.5836468026133439, 0.5836468026133439, 0.5836468026133439, 0.5836468026133439, 0.5836468026133439, 0.5836468026133439, 0.5836468026133439, 0.5836468026133439, 0.5836468026133439, 0.5836468026133439, 0.5834324217201744, 0.5835480673934589, 0.5835480673934589, 0.5835480673934589, 0.5835480673934589, 0.5833829069998017, 0.5833829069998017, 0.5833829069998017, 0.5831019436731456, 0.5828208688752231, 0.5828208688752231, 0.5828208688752231, 0.5826552887477674, 0.5823739579198094, 0.5823739579198094, 0.5792437650844731, 0.5793602896801449, 0.5790744466800806, 0.5790744466800806, 0.5787884886295029, 0.5793714746172441, 0.5790852307072335, 0.5787988714228134, 0.5787988714228134, 0.5787988714228134, 0.5786290322580645, 0.5786290322580645, 0.5783424077434968, 0.5785757514625782, 0.5789261203068227,

  _warn_prf(average, modifier, msg_start, len(result))


[0.5855873642645607, 0.5855873642645607, 0.5855873642645607, 0.5855873642645607, 0.5855873642645607, 0.5855873642645607, 0.5855873642645607, 0.5855873642645607, 0.5855873642645607, 0.5855873642645607, 0.5855873642645607, 0.5855873642645607, 0.5855873642645607, 0.5855873642645607, 0.5855873642645607, 0.5855873642645607, 0.5855873642645607, 0.5855873642645607, 0.5855873642645607, 0.5855873642645607, 0.5853754940711462, 0.5856069592724398, 0.5856069592724398, 0.5856069592724398, 0.5856069592724398, 0.5854430379746834, 0.5854430379746834, 0.5851632047477745, 0.5851632047477745, 0.5848832607835378, 0.5848832607835378, 0.5848832607835378, 0.5847189231987332, 0.5847189231987332, 0.5847189231987332, 0.5826487677820077, 0.5826487677820077, 0.5826487677820077, 0.5826487677820077, 0.5823647294589178, 0.5828319294023265, 0.5825476429287864, 0.5822632423756019, 0.5822632423756019, 0.5823800923138671, 0.5820955439582497, 0.5820955439582497, 0.5818108813491266, 0.5820445872665194, 0.582395498392283, 

  _warn_prf(average, modifier, msg_start, len(result))


[0.5819980217606331, 0.5819980217606331, 0.5819980217606331, 0.5819980217606331, 0.5819980217606331, 0.5819980217606331, 0.5819980217606331, 0.5819980217606331, 0.5819980217606331, 0.5819980217606331, 0.5819980217606331, 0.5819980217606331, 0.5819980217606331, 0.5819980217606331, 0.5819980217606331, 0.5819980217606331, 0.5819980217606331, 0.5819980217606331, 0.5819980217606331, 0.5819980217606331, 0.5811051693404634, 0.5814506539833532, 0.5814506539833532, 0.5814506539833532, 0.5814506539833532, 0.581169474727453, 0.581169474727453, 0.5808881839809674, 0.5808881839809674, 0.580606781677573, 0.580606781677573, 0.580606781677573, 0.5804403888117436, 0.5805555555555556, 0.5805555555555556, 0.5779650812763395, 0.5779650812763395, 0.5779650812763395, 0.5779650812763395, 0.5776796467282217, 0.5780277164089175, 0.57774206508638, 0.5774562989752863, 0.5774562989752863, 0.577572347266881, 0.5772864321608041, 0.5772864321608041, 0.5770004020908724, 0.5772325020112631, 0.5774647887323944, 0.57746

  _warn_prf(average, modifier, msg_start, len(result))


In [4]:
# Set the maximum number of rows and columns to be displayed
pd.set_option('display.max_rows', None)  # To display all rows
pd.set_option('display.max_columns', None)  # To display all columns

for i in range(10):
    with open('data/train_dev_test_cleaned/split_' + str(i) + '.json', "r") as json_file:
        data = json.load(json_file)

    test = preprocess(data['test'])

    df = pd.read_csv('data/toxicity_score_jigsaw.csv', index_col='index')['toxicity_score_jigsaw']  # it's a pandas series now
    df = test.merge(df, left_index=True, right_index=True)
    df = df[df['toxicity_score_jigsaw'] < 0.5]

    # Calculate the absolute difference between two columns
    df['Absolute_Difference'] = abs(df['Label'] - df['toxicity_score_jigsaw'])

    # Sort the DataFrame by the absolute difference in descending order
    df = df.sort_values(by='Absolute_Difference', ascending=False)

    # Get the top n rows with the highest difference
    n = 100  # Replace with the desired number of rows
    top_n = df.iloc[:n]

    # Remove the 'Absolute_Difference' column if needed
    top_n = top_n.drop('Absolute_Difference', axis=1)

    for i, row in top_n.iterrows():
        print(row.Article_title)
        print(row.Comment)
        print(row.Tags)
        print('Annotated:', row.Label)
        print('Toxicity Score:', row.toxicity_score_jigsaw)

Dänemark führt 3G-Regel wieder ein; 8.178 Neuinfektionen
Das Kickl und Menschenwürde? Irgendwie passt das nicht recht zusammen.
[{'Tag': 'Target_Individual', 'Token': 'Kickl'}]
Annotated: 1
Toxicity Score: 0.2152329
Moderna beantragte EU-Zulassung für jüngere Kinder, über 400 Covid-Patienten auf Intensivstationen
Jetzt les ichs erst, die Gewerkschaft WARNT vor Personalmangel? Die Gewerkschaft? Wie wärs wenn man damit höhere Löhne durchsetzt ihr *+%34§$
[{'Tag': 'Target_Other', 'Token': 'höhere Löhne'}, {'Tag': 'Target_Group', 'Token': 'die Gewerkschaft'}, {'Tag': 'Target_Other', 'Token': 'Personalmangel'}]
Annotated: 1
Toxicity Score: 0.2565272
Wer steht hinter dem Medium "Exxpress", das gegen Staatsanwälte und Journalisten ausreitet?
Konkurrenz vom Hals schaffen. Offenbar ist der Standard am Weg zum Boulevard Medium, und da muss man die anderen anpatzen. Als Boulevard Medium ist das auch einfacher, Floskeln wie "die nicht genannt werden wollen" oder "angeblich" zu Papier zu bringen di