In [1]:
import pandas as pd
import numpy as np
import os
import torchmetrics as tm
import torch
from tqdm.auto import tqdm

In [2]:
# Dataset
DATA_DIR_PATH = os.path.abspath("../../data")
SESSION_DIR_PATH = os.path.abspath("../../session")
TRAIN_DATASET_PATH = os.path.join(DATA_DIR_PATH, "jigsaw2019-train.csv")
TEST_DATASET_PATH = os.path.join(DATA_DIR_PATH, "jigsaw2019-test.csv")
LABEL_LIST = ['toxicity', 'obscene', 'sexual_explicit',
            'identity_attack', 'insult', 'threat']
IDENTITY_LIST = ['male', 'female', 'transgender', 'other_gender', 'heterosexual',
                'homosexual_gay_or_lesbian', 'bisexual','other_sexual_orientation',
                'christian', 'jewish', 'muslim', 'hindu','buddhist', 'atheist',
                'other_religion', 'black', 'white', 'asian', 'latino',
                'other_race_or_ethnicity', 'physical_disability',
                'intellectual_or_learning_disability',
                'psychiatric_or_mental_illness','other_disability']
SELECTED_IDENTITY_LIST = ['male', 'female', 'black', 'white', 'homosexual_gay_or_lesbian',
                    'christian', 'jewish', 'muslim', 'psychiatric_or_mental_illness']


In [29]:
train_df = pd.read_csv(TRAIN_DATASET_PATH, index_col=0)
train_df[LABEL_LIST] = (train_df[LABEL_LIST]>=0.5).astype(int)

In [30]:
train_df_0 = train_df[(train_df['toxicity'] == 0) &
                      (train_df['obscene'] == 0) &
                      (train_df['identity_attack'] == 0) &
                      (train_df['insult'] == 0) &
                      (train_df['threat'] == 0) &
                      (train_df['sexual_explicit'] == 0)]

train_df_1 = train_df[(train_df['toxicity'] == 1) |
                      (train_df['obscene'] == 1) |
                      (train_df['identity_attack'] == 1) |
                      (train_df['insult'] == 1) |
                      (train_df['threat'] == 1) |
                      (train_df['sexual_explicit'] == 1)]

nb_0 = len(train_df_0)
n_sampling = 0.1
nb_1 = int(nb_0 * n_sampling)
print("NB 0: {}".format(nb_0))
print("NB 1: {}".format(nb_1))
ids_0 = np.random.randint(0, high=nb_0, size=nb_1)

train_df_0 = train_df_0.iloc[ids_0]
train_df_2 = pd.concat([train_df_0, train_df_1])

print("Train size: {}".format(len(train_df_2)))

NB 0: 1653158
NB 1: 165315
Train size: 313859


In [23]:
ids_0 = np.random.randint(0, high=nb_0, size=nb_1)
ids_0

array([1188,  321,  672, 1130, 1181,  495, 1193,  406,  823, 1033,  459,
        314,  782, 1205, 1049,  689, 1196,  297,  612,  704, 1045,  510,
        933,  635, 1037,  483,  651,  692, 1113,  470,   78,  333,  988,
         95, 1192,  290, 1015,  624,   78,   68,  282,  922, 1012,   63,
        956,  791,  247,  948,  222, 1119,  933,  926,  474,  441,  436,
        803, 1189,  246,  853,  128,  119,  320, 1077,  954,  404,  584,
        128, 1018,  857, 1030, 1070, 1034, 1196,  569,  353, 1081,  111,
         47,  559, 1217, 1190,  822,  169,  799,  535,  579,  939,  348,
        728, 1144,  906,  862,  235,  122,  880,  995,  666, 1194,  621,
       1153, 1123,  856, 1217,  607,  435,  450,  386,   26, 1012, 1181,
       1157,  273, 1164,  517,  720,  720,  746, 1123,  136,  185, 1160,
        929,  846])

In [18]:
train_df.sample(10)

Unnamed: 0,id,comment_text,split,created_date,publication_id,parent_id,article_id,rating,funny,wow,...,white,asian,latino,other_race_or_ethnicity,physical_disability,intellectual_or_learning_disability,psychiatric_or_mental_illness,other_disability,identity_annotator_count,toxicity_annotator_count
1292804,1090644,sad story feel bad young daughters losing fath...,train,2017-03-06 20:44:17.983327+00,55,,317165,approved,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,4
1218193,588237,right wingers famous talking sides mouths,train,2016-11-10 01:01:13.338050+00,54,587197.0,151108,approved,0,0,...,,,,,,,,,0,4
451092,379405,perfect opportunity ohio dan stop pretending a...,train,2016-07-08 07:58:42.674424+00,21,,140851,approved,0,0,...,,,,,,,,,0,4
1226995,5651812,experienced healthcare systems say never trade...,train,2017-07-25 00:35:23.187865+00,54,,358623,approved,0,0,...,,,,,,,,,0,4
642131,893003,republicans books written die eisenhower mistr...,train,2017-01-25 00:30:19.339997+00,54,892872.0,163841,approved,1,0,...,,,,,,,,,0,4
47557,6075864,ms harrop absolutely nothing wrong revamping m...,train,2017-10-03 17:34:47.825827+00,66,,384854,approved,0,0,...,,,,,,,,,0,4
444336,704522,since canada private healthcare faith based ho...,train,2016-12-13 19:37:45.590436+00,54,702646.0,156260,approved,0,0,...,,,,,,,,,0,4
1185247,5193695,rainfall eugene vary location location officia...,train,2017-04-29 06:21:03.983073+00,13,,330162,approved,0,0,...,,,,,,,,,0,4
1138896,570659,like leap maifesto eco marxists longer aberrat...,train,2016-11-02 18:54:33.082731+00,54,,150353,approved,0,0,...,,,,,,,,,0,4
472177,5221594,us true last years months dropping due jobs pe...,train,2017-05-05 23:39:28.988265+00,54,5220192.0,332226,approved,0,0,...,,,,,,,,,0,4


In [14]:
train_df_0 = train_df[(train_df[LABEL_LIST] == 0).all(axis=1)]

Unnamed: 0,id,comment_text,split,created_date,publication_id,parent_id,article_id,rating,funny,wow,...,white,asian,latino,other_race_or_ethnicity,physical_disability,intellectual_or_learning_disability,psychiatric_or_mental_illness,other_disability,identity_annotator_count,toxicity_annotator_count
356383,5596248,vino veritas,train,2017-07-15 02:08:40.357362+00,67,,355555,approved,0,0,...,,,,,,,,,0,10
1062775,528774,handy timeline http www washingtonpost com sf ...,train,2016-10-14 19:00:42.721837+00,21,,148532,approved,0,0,...,,,,,,,,,0,4
1478111,6030567,obama longer president,train,2017-09-26 23:21:46.825182+00,102,6030472.0,382311,approved,3,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10,4
446346,841303,remember page scandal nothing putin foreigners...,train,2017-01-14 22:46:12.179569+00,54,841203.0,161946,approved,0,0,...,,,,,,,,,0,4
1019380,6057449,perhaps gov might reconsider reopening toronto...,train,2017-09-30 17:17:55.980104+00,54,,383720,approved,0,0,...,,,,,,,,,0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715217,4967787,president trying protect american people slew ...,train,2017-03-07 16:47:32.316232+00,22,,317419,approved,0,0,...,,,,,,,,,0,4
603450,5215804,ach reviewing post came either great minds thi...,train,2017-05-04 19:56:34.387189+00,54,5215790.0,331969,approved,2,0,...,,,,,,,,,0,4
580564,5220966,look international maritime center see linkage...,train,2017-05-05 20:52:00.307930+00,54,5220946.0,332332,approved,0,0,...,,,,,,,,,0,4
806110,5420732,lower deficit work together open purse strings...,train,2017-06-15 15:23:56.789674+00,54,5419798.0,344602,approved,0,0,...,,,,,,,,,0,4


In [11]:
train_df_0 = train_df[(train_df['toxicity'] == 0) &
                      (train_df['obscene'] == 0) &
                      (train_df['identity_attack'] == 0) &
                      (train_df['insult'] == 0) &
                      (train_df['threat'] == 0) &
                      (train_df['sexual_explicit'] == 0)]
train_df_0

Unnamed: 0,id,comment_text,split,created_date,publication_id,parent_id,article_id,rating,funny,wow,...,white,asian,latino,other_race_or_ethnicity,physical_disability,intellectual_or_learning_disability,psychiatric_or_mental_illness,other_disability,identity_annotator_count,toxicity_annotator_count
356383,5596248,vino veritas,train,2017-07-15 02:08:40.357362+00,67,,355555,approved,0,0,...,,,,,,,,,0,10
1062775,528774,handy timeline http www washingtonpost com sf ...,train,2016-10-14 19:00:42.721837+00,21,,148532,approved,0,0,...,,,,,,,,,0,4
1478111,6030567,obama longer president,train,2017-09-26 23:21:46.825182+00,102,6030472.0,382311,approved,3,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10,4
446346,841303,remember page scandal nothing putin foreigners...,train,2017-01-14 22:46:12.179569+00,54,841203.0,161946,approved,0,0,...,,,,,,,,,0,4
1019380,6057449,perhaps gov might reconsider reopening toronto...,train,2017-09-30 17:17:55.980104+00,54,,383720,approved,0,0,...,,,,,,,,,0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
715217,4967787,president trying protect american people slew ...,train,2017-03-07 16:47:32.316232+00,22,,317419,approved,0,0,...,,,,,,,,,0,4
603450,5215804,ach reviewing post came either great minds thi...,train,2017-05-04 19:56:34.387189+00,54,5215790.0,331969,approved,2,0,...,,,,,,,,,0,4
580564,5220966,look international maritime center see linkage...,train,2017-05-05 20:52:00.307930+00,54,5220946.0,332332,approved,0,0,...,,,,,,,,,0,4
806110,5420732,lower deficit work together open purse strings...,train,2017-06-15 15:23:56.789674+00,54,5419798.0,344602,approved,0,0,...,,,,,,,,,0,4


In [8]:
len(train_df_1)

4611

In [9]:
len(train_df_0)

123946

In [5]:
train_df

Unnamed: 0,id,comment_text,split,created_date,publication_id,parent_id,article_id,rating,funny,wow,...,white,asian,latino,other_race_or_ethnicity,physical_disability,intellectual_or_learning_disability,psychiatric_or_mental_illness,other_disability,identity_annotator_count,toxicity_annotator_count
356383,5596248,vino veritas,train,2017-07-15 02:08:40.357362+00,67,,355555,approved,0,0,...,,,,,,,,,0,10
1062775,528774,handy timeline http www washingtonpost com sf ...,train,2016-10-14 19:00:42.721837+00,21,,148532,approved,0,0,...,,,,,,,,,0,4
1478111,6030567,obama longer president,train,2017-09-26 23:21:46.825182+00,102,6030472.0,382311,approved,3,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10,4
446346,841303,remember page scandal nothing putin foreigners...,train,2017-01-14 22:46:12.179569+00,54,841203.0,161946,approved,0,0,...,,,,,,,,,0,4
1019380,6057449,perhaps gov might reconsider reopening toronto...,train,2017-09-30 17:17:55.980104+00,54,,383720,approved,0,0,...,,,,,,,,,0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
602200,5376113,speculate late showing wonder woman let cynical,train,2017-06-08 00:18:21.554950+00,21,5375806.0,341771,approved,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10,4
602235,6264639,god bless guys let islamist loving teachers de...,train,2017-11-01 12:26:39.845479+00,85,,395771,approved,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10,6
602237,600362,looks like political rally chris lee maybe wan...,train,2016-11-14 20:02:10.436585+00,22,,151598,approved,0,0,...,0.0,0.3,0.0,0.1,0.0,0.0,0.0,0.0,10,10
1786967,6152063,previous post sorry misread sex intend pregnan...,train,2017-10-15 18:59:20.271630+00,102,6151852.0,388716,approved,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10,4
