In [None]:
!pip install transformers

In [1]:
import tensorflow as tf
import transformers

In [2]:
max_length = 40  # Maximum length of input sentence to the model.
batch_size = 128
epochs = 2

In [3]:
from transformers import BertTokenizer, TFBertModel

In [4]:

max_length = 40  # Maximum length of input sentence to the model.
batch_size = 64
epochs = 2

class BertSemanticDataGenerator(tf.keras.utils.Sequence):
    """Generates batches of data.

    Args:
        sentences: Array of input sentences.
        labels: Array of labels.
        batch_size: Integer batch size.
        shuffle: boolean, whether to shuffle the data.
        include_targets: boolean, whether to include the labels.

    Returns:
        Tuples `([input_ids, attention_mask, `token_type_ids], labels)`
        (or just `[input_ids, attention_mask, `token_type_ids]`
         if `include_targets=False`)
    """

    def __init__(
        self,
        sentences,
        labels,
        batch_size=batch_size,
        shuffle=True,
        include_targets=True,
    ):
        self.sentences = sentences
        self.labels = labels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets
        # Load our BERT Tokenizer to encode the text.
        # We will use base-base-uncased pretrained model.
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

        self.indexes = np.arange(len(self.sentences))
        self.on_epoch_end()

    def __len__(self):
        # Denotes the number of batches per epoch.
        return len(self.sentences) // self.batch_size

    def __getitem__(self, idx):
        # Retrieves the batch of index.
        indexes = self.indexes[idx * self.batch_size: (idx + 1) * self.batch_size]
        sentences = [self.sentences[i] for i in indexes]

        # Single sentence input.
        encoded = self.tokenizer(
            sentences,
            max_length=max_length,
            return_attention_mask=True,
            return_token_type_ids=True,
            pad_to_max_length=True,
            return_tensors="tf",
        )

        # Rest of the code remains unchanged.
        input_ids = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")

        if self.include_targets:
            labels = np.array(self.labels[indexes], dtype="int32")
            return [input_ids, attention_masks, token_type_ids], labels
        else:
            return [input_ids, attention_masks, token_type_ids]

    def on_epoch_end(self):
        # Shuffle indexes after each epoch if shuffle is set to True.
        if self.shuffle:
            np.random.RandomState(42).shuffle(self.indexes)

In [5]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:            15G        1.5G        8.0G        676K        5.8G         13G
Swap:            0B          0B          0B


In [6]:
# Create the model under a distribution strategy scope.
strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    # Encoded token ids from xlm roberta base tokenizer.
    input_ids = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name="input_ids"
    )
    # Attention masks indicates to the model which tokens should be attended to.
    attention_masks = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name="attention_masks"
    )
    # Token type ids are binary masks identifying different sequences in the model.
    token_type_ids = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name="token_type_ids"
    )
    
    # Loading pretrained xlm roberta base model.
    bert_model = TFBertModel.from_pretrained("bert-base-multilingual-cased") 
    # Freeze the xlm roberta base model model to reuse the pretrained features without modifying them.
    bert_model.trainable = True

    bert_output = bert_model.bert(
        input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids
    )
    sequence_output = bert_output.last_hidden_state
    pooled_output = bert_output.pooler_output
     
    # Add trainable layers on top of frozen layers to adapt the pretrained features on the new data.
    bi_lstm = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, return_sequences=True)
    )(sequence_output)
    # Applying hybrid pooling approach to bi_lstm sequence output.
    avg_pool = tf.keras.layers.GlobalAveragePooling1D()(bi_lstm)
    max_pool = tf.keras.layers.GlobalMaxPooling1D()(bi_lstm)
    concat = tf.keras.layers.concatenate([avg_pool, max_pool])

    dropout = tf.keras.layers.Dropout(0.3)(concat)
    
    # Adjust output layer for regression (remove softmax activation)
    output = tf.keras.layers.Dense(7, activation="linear")(dropout)
    
    model = tf.keras.models.Model(
        inputs=[input_ids, attention_masks, token_type_ids], outputs=output
    )

    # Change loss function to mean squared error for regression
    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss="mean_squared_error",
        metrics=["mae"],  # You can use other metrics like "mse" or custom metrics as needed
    )

print(f"Strategy: {strategy}")
model.summary()


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Strategy: <tensorflow.python.distribute.mirrored_strategy.MirroredStrategy object at 0x7fb8cfd0bf70>
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 40)]                 0         []                            
                                                                                                  
 attention_masks (InputLaye  [(None, 40)]                 0         []                            
 r)                                                                                               
                                                                                                  
 token_type_ids (InputLayer  [(None, 40)]                 0         []                            
 )                                                                                          

In [7]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:            15G        2.9G        6.6G         12M        5.9G         12G
Swap:            0B          0B          0B


In [8]:
import pandas as pd 
df=pd.read_csv('reduced_dataframe_content_moderation.csv')
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,comment_text,toxic,severe_toxicity,obscene,identity_attack,insult,threat,sexual_explicit,rating,length
0,0,1270034,940723,Ze lieten de banden zien van het bedrijf dat g...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,approved,18
1,1,1856731,697213,"Inget att säga, förutom att du lutar dig själv!",0.200000,0.000000,0.000000,0.000000,0.100000,0.000000,0.0,approved,8
2,2,3148137,6212108,"যখন গনতন ্ ত ্ রেররা যেমন আপনার ""দেহের ছেলেমেয...",0.671642,0.000000,0.104478,0.059701,0.656716,0.000000,0.0,approved,17
3,3,3462920,5260044,ทรัมพ ์ ได ้ พิสูจน ์ ครั ้ งแล ้ วครั ้ งเล ่...,0.500000,0.000000,0.000000,0.000000,0.400000,0.000000,0.0,approved,17
4,4,713376,6316762,"En el caso de los Estados Unidos de América, l...",0.166667,0.000000,0.000000,0.166667,0.000000,0.000000,0.0,approved,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2068941,2068941,924503,668417,அவர்கள் தங்கள் சந்தேகங்களை ஒரு தேவாலயத்தின் கத...,0.200000,0.000000,0.000000,0.000000,0.200000,0.000000,0.0,approved,18
2068942,2068942,2913891,1007823,کسی بھی طرح ایک خانہ جنگی شروع کرنے کے لئے ایک...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,approved,11
2068943,2068943,573825,832766,말도 안되는 사소한 것들. 이 나라에 중요한 문제가 있습니다. 좀 더 진행해 보시겠어요?,0.833333,0.000000,0.000000,0.000000,0.833333,0.000000,0.0,approved,15
2068944,2068944,2641491,5120314,"我希望人们会选择抵制UA,他们应该破产。",0.300000,0.000000,0.000000,0.000000,0.100000,0.200000,0.0,approved,19


In [9]:
df[["toxic", "severe_toxicity", "obscene", "identity_attack", "insult", "threat", "sexual_explicit"]] = df[["toxic", "severe_toxicity", "obscene", "identity_attack", "insult", "threat", "sexual_explicit"]].round(2)

In [10]:
df=df.dropna()

In [11]:
num=0.2

In [12]:
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,comment_text,toxic,severe_toxicity,obscene,identity_attack,insult,threat,sexual_explicit,rating,length
0,0,1270034,940723,Ze lieten de banden zien van het bedrijf dat g...,0.00,0.00,0.00,0.00,0.00,0.00,0.0,approved,18
1,1,1856731,697213,"Inget att säga, förutom att du lutar dig själv!",0.20,0.00,0.00,0.00,0.10,0.00,0.0,approved,8
2,2,3148137,6212108,"যখন গনতন ্ ত ্ রেররা যেমন আপনার ""দেহের ছেলেমেয...",0.67,0.00,0.10,0.06,0.66,0.00,0.0,approved,17
3,3,3462920,5260044,ทรัมพ ์ ได ้ พิสูจน ์ ครั ้ งแล ้ วครั ้ งเล ่...,0.50,0.00,0.00,0.00,0.40,0.00,0.0,approved,17
4,4,713376,6316762,"En el caso de los Estados Unidos de América, l...",0.17,0.00,0.00,0.17,0.00,0.00,0.0,approved,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2068941,2068941,924503,668417,அவர்கள் தங்கள் சந்தேகங்களை ஒரு தேவாலயத்தின் கத...,0.20,0.00,0.00,0.00,0.20,0.00,0.0,approved,18
2068942,2068942,2913891,1007823,کسی بھی طرح ایک خانہ جنگی شروع کرنے کے لئے ایک...,0.00,0.00,0.00,0.00,0.00,0.00,0.0,approved,11
2068943,2068943,573825,832766,말도 안되는 사소한 것들. 이 나라에 중요한 문제가 있습니다. 좀 더 진행해 보시겠어요?,0.83,0.00,0.00,0.00,0.83,0.00,0.0,approved,15
2068944,2068944,2641491,5120314,"我希望人们会选择抵制UA,他们应该破产。",0.30,0.00,0.00,0.00,0.10,0.20,0.0,approved,19


In [13]:
df_toxic_total=df[(df["toxic"]>num) | (df["severe_toxicity"]>num) | (df["obscene"]>num) | (df["identity_attack"]>num) | (df["insult"]>num) | (df["threat"]>num) | (df["sexual_explicit"]>num)]
df_toxic_total

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,comment_text,toxic,severe_toxicity,obscene,identity_attack,insult,threat,sexual_explicit,rating,length
2,2,3148137,6212108,"যখন গনতন ্ ত ্ রেররা যেমন আপনার ""দেহের ছেলেমেয...",0.67,0.00,0.10,0.06,0.66,0.00,0.0,approved,17
3,3,3462920,5260044,ทรัมพ ์ ได ้ พิสูจน ์ ครั ้ งแล ้ วครั ้ งเล ่...,0.50,0.00,0.00,0.00,0.40,0.00,0.0,approved,17
5,5,1750750,5492047,"Tak, jestem. Dlatego jestem dziedzicem ""trony""...",0.70,0.00,0.00,0.00,0.70,0.00,0.0,approved,15
11,11,461001,6136471,Such is BS. IF such is true Americans are not...,0.36,0.03,0.05,0.27,0.16,0.09,0.0,approved,12
12,12,2231151,5733617,Kim Jong Saddam tem armas químicas. Precisamos...,0.30,0.00,0.00,0.00,0.00,0.30,0.0,approved,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2068930,2068930,3687569,7160721,"Thanks, Tom. You'll be pleased to know that yo...",0.30,0.00,0.00,0.00,0.30,0.00,0.0,approved,17
2068935,2068935,4065892,5304438,"Tidak, kriteria itu saja akan bodoh. Saya memb...",0.67,0.03,0.12,0.00,0.62,0.00,0.0,approved,18
2068943,2068943,573825,832766,말도 안되는 사소한 것들. 이 나라에 중요한 문제가 있습니다. 좀 더 진행해 보시겠어요?,0.83,0.00,0.00,0.00,0.83,0.00,0.0,approved,15
2068944,2068944,2641491,5120314,"我希望人们会选择抵制UA,他们应该破产。",0.30,0.00,0.00,0.00,0.10,0.20,0.0,approved,19


In [14]:
shuffled_df = df

In [15]:
#shuffled_df=shuffled_df.iloc[:100]

In [16]:
from sklearn.model_selection import train_test_split

# Split the DataFrame into train, validation, and test sets
train_ratio = 0.8  # 70% for training
valid_ratio = 0.1  # 15% for validation
test_ratio = 0.1   # 15% for testing

train_df, test_valid_df = train_test_split(shuffled_df, test_size=(valid_ratio + test_ratio), random_state=42)
valid_df, test_df = train_test_split(test_valid_df, test_size=test_ratio/(valid_ratio + test_ratio), random_state=42)

In [17]:
test_valid_df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,id,comment_text,toxic,severe_toxicity,obscene,identity_attack,insult,threat,sexual_explicit,rating,length
1297347,1297347,1688027,5219065,"Nic z tego nie jest napisane w ustawodawstwie,...",0.00,0.0,0.0,0.0,0.0,0.0,0.0,approved,11
489777,489777,2099854,6031077,"Hvala vam, Ron za ovu sliku onoga što je bilo ...",0.30,0.0,0.0,0.0,0.3,0.0,0.0,approved,12
787763,787763,1848578,571346,"socialism, du får skuld, mismanagement och en ...",0.00,0.0,0.0,0.0,0.0,0.0,0.0,approved,19
860413,860413,3119655,5759829,"ক ্ রিস, তুমি পারো! শুধু এই নেতিবাচকতা বন ্ ধ ...",0.17,0.0,0.0,0.0,0.0,0.0,0.0,approved,9
185554,185554,202922,5005232,carefully consider the company you keep.,0.00,0.0,0.0,0.0,0.0,0.0,0.0,approved,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1161891,1161891,3047704,761904,ক ্ যান ্ ডিয়ান ্ ট এর বিধ ্ বস ্ তিগ ্ রস ্ ...,0.50,0.0,0.1,0.0,0.5,0.0,0.0,approved,15
1768622,1768622,2822503,658180,من الواضح جداً أن جاستين تردو ليس ذكياً جداً و...,0.50,0.0,0.0,0.0,0.5,0.0,0.0,approved,20
779200,779200,118176,754869,It makes NO sense to have Israel surrounded by...,0.20,0.0,0.0,0.0,0.0,0.0,0.0,approved,17
354631,354631,3975998,6240139,"""यह तभी अवैध है जब डेमोक्रेट इसे करते हैं"".",0.30,0.1,0.0,0.1,0.3,0.0,0.0,approved,10


In [18]:
y_train=train_df[["toxic","severe_toxicity","obscene","identity_attack","insult","threat","sexual_explicit"]]
train_df=train_df[['comment_text']]

In [19]:
y_val=valid_df[["toxic","severe_toxicity","obscene","identity_attack","insult","threat","sexual_explicit"]]
valid_df=valid_df[['comment_text']]

In [20]:
y_val.values

array([[0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.64, 0.  , 0.06, ..., 0.64, 0.  , 0.01],
       ...,
       [0.17, 0.  , 0.  , ..., 0.17, 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ]])

In [21]:
train_df

Unnamed: 0,comment_text
2028164,"""O PGO é composto maioritariamente de lixo bra..."
1596592,"Tree Huggers不会快乐,除非他们通过杀死所有人类来拯救地球。"
679244,Guns don't kill people; people kill people.
1814448,ไอ ้ โง ่ ที ่ ชื ่ อว ่ า ฮิลลารี ่ เกิดขึ ้ น
247256,Seguite il denaro alle proprietà di Trump e ag...
...,...
259178,"comrade - మీరు ""మరింత లేదా తక్కువ"" భాగంగా వదిలి."
1414414,سرلیک یقینی طور پر اس دو طرفدار کہانی کو ایک ج...
131932,Ha ha ha. Các nhà báo động rất hài hước! Các b...
671155,Nije li obećao ukinuti Agenciju?


In [22]:
train_df['comment_text']

2028164    "O PGO é composto maioritariamente de lixo bra...
1596592                  Tree Huggers不会快乐,除非他们通过杀死所有人类来拯救地球。
679244           Guns don't kill people; people kill people.
1814448      ไอ ้ โง ่ ที ่ ชื ่ อว ่ า ฮิลลารี ่ เกิดขึ ้ น
247256     Seguite il denaro alle proprietà di Trump e ag...
                                 ...                        
259178      comrade - మీరు "మరింత లేదా తక్కువ" భాగంగా వదిలి.
1414414    سرلیک یقینی طور پر اس دو طرفدار کہانی کو ایک ج...
131932     Ha ha ha. Các nhà báo động rất hài hước! Các b...
671155                      Nije li obećao ukinuti Agenciju?
121958                               そしてあなたの米国市民の詐欺師は、より良いか?
Name: comment_text, Length: 1655156, dtype: object

In [23]:
y_test=test_df[["toxic","severe_toxicity","obscene","identity_attack","insult","threat","sexual_explicit"]]
test_df=test_df[['comment_text']]

In [24]:
import numpy as np
sentence_pairs = np.array(["My name is Vikram Pal CEO of AI Intelli Inc."])
test_data = BertSemanticDataGenerator(
    sentence_pairs, labels=None, batch_size=1, shuffle=False, include_targets=False,
)

proba = model.predict(test_data[0])

print(proba)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[[ 0.01576383 -0.30640525  0.2880426  -0.0074704  -0.6280422  -0.10152701
   0.40612826]]


In [25]:
y_val

Unnamed: 0,toxic,severe_toxicity,obscene,identity_attack,insult,threat,sexual_explicit
1814594,0.00,0.00,0.00,0.00,0.00,0.0,0.00
227506,0.00,0.00,0.00,0.00,0.00,0.0,0.00
1710425,0.64,0.00,0.06,0.01,0.64,0.0,0.01
1300093,0.40,0.00,0.10,0.00,0.40,0.0,0.00
650426,0.65,0.04,0.26,0.01,0.19,0.0,0.61
...,...,...,...,...,...,...,...
1381259,0.20,0.00,0.00,0.20,0.20,0.0,0.00
777919,0.30,0.00,0.00,0.00,0.30,0.0,0.00
238532,0.17,0.00,0.00,0.17,0.17,0.0,0.00
1666000,0.00,0.00,0.00,0.00,0.00,0.0,0.00


In [26]:
import numpy as np

train_data = BertSemanticDataGenerator(
    train_df['comment_text'].tolist(),
    y_train.values,
    batch_size=batch_size,
    shuffle=True,
)

valid_data = BertSemanticDataGenerator(
    valid_df['comment_text'].tolist(),
    y_val.values,
    batch_size=batch_size,
    shuffle=False,
)

In [27]:
y_train

Unnamed: 0,toxic,severe_toxicity,obscene,identity_attack,insult,threat,sexual_explicit
2028164,0.64,0.09,0.13,0.54,0.62,0.03,0.0
1596592,0.55,0.05,0.00,0.00,0.08,0.49,0.0
679244,0.30,0.00,0.10,0.00,0.20,0.20,0.0
1814448,0.94,0.04,0.18,0.00,0.94,0.00,0.0
247256,0.17,0.00,0.00,0.00,0.00,0.00,0.0
...,...,...,...,...,...,...,...
259178,0.00,0.00,0.00,0.00,0.00,0.00,0.0
1414414,0.00,0.00,0.00,0.00,0.00,0.00,0.0
131932,0.17,0.17,0.00,0.00,0.17,0.00,0.0
671155,0.20,0.20,0.00,0.00,0.00,0.00,0.0


In [28]:
train_data.indexes

array([1413085,  857583, 1267066, ...,  131932,  671155,  121958])

In [None]:
history = model.fit(
    train_data,
    validation_data=valid_data,
    epochs=epochs,
    use_multiprocessing=True,
    workers=-1,
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch 1/2




INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Reduce to /job:localhost/replica:0/task:0/device:CPU:0 then broadcast to ('/job:localhost/replica:0/task:0/device:CPU:0',).
INFO:tensorflow:Redu

# Unfreeze the bert_model.
bert_model.trainable = True

# Recompile the model to make the change effective.
# Change loss function to mean squared error for regression
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss="mean_squared_error",
    metrics=["mae"],  # You can use other metrics like "mse" or custom metrics as needed
)
model.summary()

history = model.fit(
    train_data,
    validation_data=valid_data,
    epochs=epochs,
    use_multiprocessing=True,
    workers=-1,
)

In [None]:
model.save("content_moderation_multilingual_direct_fine_tuning")

In [None]:
y_test=test_df[["toxic","severe_toxicity","obscene","identity_attack","insult","threat","sexual_explicit"]]
test_df=test_df[['comment_text']]

In [None]:
test_data = BertSemanticDataGenerator(
    test_df['comment_text'].tolist(),
    y_test.values,
    batch_size=batch_size,
    shuffle=False,
)
model.evaluate(test_data, verbose=1)

In [None]:
input_data=[train_data[0][0][0][0],train_data[0][0][1][0]]

In [None]:
test_valid_df[test_valid_df['obscene']>0.5]

In [None]:
sentence_pairs = ["fucking gonna kill you I am gonna rape your sister", "I am going there"]
test_data = BertSemanticDataGenerator(
        sentence_pairs, labels=None, batch_size=1, shuffle=False, include_targets=False,
    )

proba = model.predict(test_data[0])

print(proba)

In [None]:
sentence_pairs = ["fucking gonna kill you", "I am gonna rape your sister"]
test_data = BertSemanticDataGenerator(
        sentence_pairs, labels=None, batch_size=1, shuffle=False, include_targets=False,
    )

test_data[0]

In [30]:
sentence_pairs = ["fucking gonna kill you I am going chop his head off"]
test_data = BertSemanticDataGenerator(
        sentence_pairs, labels=None, batch_size=1, shuffle=False, include_targets=False,
    )

test_data[0]

model.predict(test_data[0])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.




array([[ 7.1624140e-03,  2.5746314e-04, -3.1638111e-04, -2.5033794e-04,
         6.0482211e-03, -3.0591059e-04, -4.9833936e-05]], dtype=float32)

In [31]:
df=pd.read_csv("multilingual_content_moderation_text_dataset.csv")

In [32]:
num=0.1

In [33]:
df_toxic_total=df[(df["toxic"]>num) | (df["severe_toxicity"]>num) | (df["obscene"]>num) | (df["identity_attack"]>num) | (df["insult"]>num) | (df["threat"]>num) | (df["sexual_explicit"]>num)]
df_toxic_total

Unnamed: 0.1,Unnamed: 0,id,comment_text,toxic,severe_toxicity,obscene,identity_attack,insult,threat,sexual_explicit,rating,length
3,3,239583,It's ridiculous that these guys are being call...,0.600000,0.000000,0.100000,0.000000,0.600000,0.100000,0.000000,approved,20
13,13,239635,I'm crazy about that illustration! I never wou...,0.200000,0.000000,0.000000,0.000000,0.200000,0.000000,0.000000,approved,17
16,16,239653,Mormons have had a complicated relationship wi...,0.300000,0.000000,0.000000,0.300000,0.000000,0.000000,0.000000,approved,9
17,17,239655,"Send more dildos, STAT.",0.361111,0.041667,0.083333,0.000000,0.041667,0.000000,0.388889,approved,4
22,22,239698,AND Left wing trolls!\n\nJust because someone ...,0.400000,0.000000,0.000000,0.100000,0.400000,0.000000,0.000000,approved,17
...,...,...,...,...,...,...,...,...,...,...,...,...
4182036,4182036,7194001,"Larry, ar trebui să luaţi aceleaşi droguri pe ...",0.500000,0.000000,0.000000,0.000000,0.400000,0.000000,0.000000,approved,13
4182037,4182037,7194007,Trump doreşte să ucidă America. El este pudul ...,0.750000,0.078947,0.013158,0.065789,0.460526,0.355263,0.000000,approved,13
4182044,4182044,7194094,"Nu vă îngrijoraţi de ""liberalii"" Pare că trebu...",0.000000,0.000000,0.000000,0.166667,0.000000,0.000000,0.000000,approved,15
4182045,4182045,7194098,Cred că analiza dvs. este inexactă. Cred că a ...,0.200000,0.000000,0.000000,0.000000,0.200000,0.000000,0.000000,approved,20


In [34]:
df_normal=df[(df["toxic"]==0) & (df["severe_toxicity"]==0) & (df["obscene"]==0) & (df["identity_attack"]==0) & (df["insult"]==0) & (df["threat"]==0) & (df["sexual_explicit"]==0)]

In [35]:
df[["toxic", "severe_toxicity", "obscene", "identity_attack", "insult", "threat", "sexual_explicit"]] = df[["toxic", "severe_toxicity", "obscene", "identity_attack", "insult", "threat", "sexual_explicit"]].round(2)

In [36]:
fraction = 1/6
df_sample = df_normal.sample(frac=fraction, random_state=42)  # Set a random seed for reproducibility
df_sample

Unnamed: 0.1,Unnamed: 0,id,comment_text,toxic,severe_toxicity,obscene,identity_attack,insult,threat,sexual_explicit,rating,length
1807446,1807446,5733651,"Kto dokładnie decyduje, kto jest 100% kobietą,...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,approved,11
2160053,2160053,747589,LOVE IT!,0.0,0.0,0.0,0.0,0.0,0.0,0.0,approved,2
3615857,3615857,5303814,裁判所の判決は、米国憲法と法の支配は、トランプにもかかわらず、まだ何かのために立つことを示し...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,approved,19
2221715,2221715,5590498,Apenas um simples diploma de licenciatura é co...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,approved,11
3994651,3994651,312290,Bukankah Atwood?,0.0,0.0,0.0,0.0,0.0,0.0,0.0,approved,4
...,...,...,...,...,...,...,...,...,...,...,...,...
2932642,2932642,5190263,ارے،金正恩، صرف ڈونالڈ ایک فون کال دے اور وہ واپس...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,approved,14
3147710,3147710,6205158,স ্ বীকার করছি আমি এমন এক সময় মনে করি যখন আপন...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,approved,15
3815753,3815753,6079051,Entrambe le parti sono colpevoli di privare i ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,approved,15
562330,562330,656568,우리는 절대 알지 못할 겁니다. 하지만 애완견을 돌볼 수 있도록 적절한 조치를 취했...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,approved,19


In [37]:
df_combined_total = pd.concat([df_toxic_total, df_sample], axis=0, ignore_index=True)
df_combined_total

Unnamed: 0.1,Unnamed: 0,id,comment_text,toxic,severe_toxicity,obscene,identity_attack,insult,threat,sexual_explicit,rating,length
0,3,239583,It's ridiculous that these guys are being call...,0.600000,0.000000,0.100000,0.0,0.600000,0.1,0.000000,approved,20
1,13,239635,I'm crazy about that illustration! I never wou...,0.200000,0.000000,0.000000,0.0,0.200000,0.0,0.000000,approved,17
2,16,239653,Mormons have had a complicated relationship wi...,0.300000,0.000000,0.000000,0.3,0.000000,0.0,0.000000,approved,9
3,17,239655,"Send more dildos, STAT.",0.361111,0.041667,0.083333,0.0,0.041667,0.0,0.388889,approved,4
4,22,239698,AND Left wing trolls!\n\nJust because someone ...,0.400000,0.000000,0.000000,0.1,0.400000,0.0,0.000000,approved,17
...,...,...,...,...,...,...,...,...,...,...,...,...
1548055,2932642,5190263,ارے،金正恩، صرف ڈونالڈ ایک فون کال دے اور وہ واپس...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,approved,14
1548056,3147710,6205158,স ্ বীকার করছি আমি এমন এক সময় মনে করি যখন আপন...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,approved,15
1548057,3815753,6079051,Entrambe le parti sono colpevoli di privare i ...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,approved,15
1548058,562330,656568,우리는 절대 알지 못할 겁니다. 하지만 애완견을 돌볼 수 있도록 적절한 조치를 취했...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,approved,19


In [45]:
### <.1 ---> 0  normal
### 0.1-0.45 ---> 1 low risk 
### 0.45 --> 0.70 --> 2 medium risk 
### >.70 --> 3 --> High risk 

def risk_calculation(x):
    if x<=0.1:
        return 0
    elif x>0.1 and x<=0.45: 
        return 1
    elif x>0.45 and x<=0.70:
        return 2 
    else:
        return 3 
    
    
# Specify the columns to apply the risk calculation function
columns_to_apply = ['toxic', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat', 'sexual_explicit']

# Apply the risk calculation function to the specified columns
df_combined_total[columns_to_apply] = df_combined_total[columns_to_apply].applymap(risk_calculation)

In [54]:
df_combined_total.drop(["severe_toxicity"], axis=1, inplace=True)

In [55]:
df_combined_total

Unnamed: 0.1,Unnamed: 0,id,comment_text,toxic,obscene,identity_attack,insult,threat,sexual_explicit,rating,length
0,3,239583,It's ridiculous that these guys are being call...,2,0,0,2,0,0,approved,20
1,13,239635,I'm crazy about that illustration! I never wou...,1,0,0,1,0,0,approved,17
2,16,239653,Mormons have had a complicated relationship wi...,1,0,1,0,0,0,approved,9
3,17,239655,"Send more dildos, STAT.",1,0,0,0,0,1,approved,4
4,22,239698,AND Left wing trolls!\n\nJust because someone ...,1,0,0,1,0,0,approved,17
...,...,...,...,...,...,...,...,...,...,...,...
1548055,2932642,5190263,ارے،金正恩، صرف ڈونالڈ ایک فون کال دے اور وہ واپس...,0,0,0,0,0,0,approved,14
1548056,3147710,6205158,স ্ বীকার করছি আমি এমন এক সময় মনে করি যখন আপন...,0,0,0,0,0,0,approved,15
1548057,3815753,6079051,Entrambe le parti sono colpevoli di privare i ...,0,0,0,0,0,0,approved,15
1548058,562330,656568,우리는 절대 알지 못할 겁니다. 하지만 애완견을 돌볼 수 있도록 적절한 조치를 취했...,0,0,0,0,0,0,approved,19


In [56]:
# Shuffle the DataFrame
df_combined_total = df_combined_total.sample(frac=1, random_state=42)  # Set a random seed for reproducibility

# Reset the index after shuffling
df_combined_total = df_combined_total.reset_index(drop=True)

df_combined_total.to_csv("classificaiton_dataframe_content_moderation_v02.csv")

In [59]:
t=df_combined_total[[columns_to_apply]]
t

KeyError: "None of [Index([('toxic', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat', 'sexual_explicit')], dtype='object')] are in the [columns]"