In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.36.2-py3-none-any.whl.metadata (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.8/126.8 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Downloading huggingface_hub-0.20.1-py3-none-any.whl.metadata (12 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Downloading transformers-4.36.2-py3-none-any.whl (8.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m57.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hDownloading huggingface_hub-0.20.1-py3-none-any.whl (330 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m330.1/330.

In [3]:
import tensorflow as tf
import transformers

In [4]:
max_length = 40  # Maximum length of input sentence to the model.
batch_size = 128
epochs = 2

In [5]:
from transformers import BertTokenizer, TFBertModel

In [6]:

max_length = 40  # Maximum length of input sentence to the model.
batch_size = 64
epochs = 2

class BertSemanticDataGenerator(tf.keras.utils.Sequence):
    """Generates batches of data.

    Args:
        sentences: Array of input sentences.
        labels: Array of labels.
        batch_size: Integer batch size.
        shuffle: boolean, whether to shuffle the data.
        include_targets: boolean, whether to include the labels.

    Returns:
        Tuples `([input_ids, attention_mask, `token_type_ids], labels)`
        (or just `[input_ids, attention_mask, `token_type_ids]`
         if `include_targets=False`)
    """

    def __init__(
        self,
        sentences,
        labels,
        batch_size=batch_size,
        shuffle=True,
        include_targets=True,
    ):
        self.sentences = sentences
        self.labels = labels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets
        # Load our BERT Tokenizer to encode the text.
        # We will use base-base-uncased pretrained model.
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

        self.indexes = np.arange(len(self.sentences))
        self.on_epoch_end()

    def __len__(self):
        # Denotes the number of batches per epoch.
        return len(self.sentences) // self.batch_size

    def __getitem__(self, idx):
        # Retrieves the batch of index.
        indexes = self.indexes[idx * self.batch_size: (idx + 1) * self.batch_size]
        sentences = [self.sentences[i] for i in indexes]

        # Single sentence input.
        encoded = self.tokenizer(
            sentences,
            max_length=max_length,
            return_attention_mask=True,
            return_token_type_ids=True,
            pad_to_max_length=True,
            return_tensors="tf",
        )

        # Rest of the code remains unchanged.
        input_ids = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")

        if self.include_targets:
            labels = np.array(self.labels[indexes], dtype="int32")
            
            # Ensure that labels have the expected shape
            # Modify this part based on your actual label structure
            target = {
                'output_layer_1': labels[:, 0],
                'output_layer_2': labels[:, 1],
                'output_layer_3': labels[:, 2],
                'output_layer_4': labels[:, 3],
                'output_layer_5': labels[:, 4],
                'output_layer_6': labels[:, 5],
            }

            return [input_ids, attention_masks, token_type_ids], target
        else:
            return [input_ids, attention_masks, token_type_ids]

    def on_epoch_end(self):
        # Shuffle indexes after each epoch if shuffle is set to True.
        if self.shuffle:
            np.random.RandomState(42).shuffle(self.indexes)

In [7]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:            15G        1.5G         10G        676K        3.5G         13G
Swap:            0B          0B          0B


In [8]:
# Create the model under a distribution strategy scope.
strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    # Encoded token ids from xlm roberta base tokenizer.
    input_ids = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name="input_ids"
    )
    # Attention masks indicates to the model which tokens should be attended to.
    attention_masks = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name="attention_masks"
    )
    # Token type ids are binary masks identifying different sequences in the model.
    token_type_ids = tf.keras.layers.Input(
        shape=(max_length,), dtype=tf.int32, name="token_type_ids"
    )
    
    # Loading pretrained xlm roberta base model.
    bert_model = TFBertModel.from_pretrained("bert-base-multilingual-cased") 
    # Freeze the xlm roberta base model model to reuse the pretrained features without modifying them.
    bert_model.trainable = True

    bert_output = bert_model.bert(
        input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids
    )
    sequence_output = bert_output.last_hidden_state
    pooled_output = bert_output.pooler_output
     
    # Add trainable layers on top of frozen layers to adapt the pretrained features on the new data.
    bi_lstm = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, return_sequences=True)
    )(sequence_output)
    # Applying hybrid pooling approach to bi_lstm sequence output.
    avg_pool = tf.keras.layers.GlobalAveragePooling1D()(bi_lstm)
    max_pool = tf.keras.layers.GlobalMaxPooling1D()(bi_lstm)
    concat = tf.keras.layers.concatenate([avg_pool, max_pool])

    dropout = tf.keras.layers.Dropout(0.3)(concat)

    # Add Dense layers for each classification task with 4 units (assuming 4 classes) and softmax activation.
    output_layer_1 = tf.keras.layers.Dense(4, activation='softmax', name='output_layer_1')(dropout)
    output_layer_2 = tf.keras.layers.Dense(4, activation='softmax', name='output_layer_2')(dropout)
    output_layer_3 = tf.keras.layers.Dense(4, activation='softmax', name='output_layer_3')(dropout)
    output_layer_4 = tf.keras.layers.Dense(4, activation='softmax', name='output_layer_4')(dropout)
    output_layer_5 = tf.keras.layers.Dense(4, activation='softmax', name='output_layer_5')(dropout)
    output_layer_6 = tf.keras.layers.Dense(4, activation='softmax', name='output_layer_6')(dropout)

    # Create the model
    model = tf.keras.Model(
        inputs=[input_ids, attention_masks, token_type_ids],
        outputs=[output_layer_1, output_layer_2,output_layer_3,output_layer_4,output_layer_5, output_layer_6]
    )
    
    losses = {
        'output_layer_1': 'sparse_categorical_crossentropy',
        'output_layer_2': 'sparse_categorical_crossentropy',
        'output_layer_3': 'sparse_categorical_crossentropy',
        'output_layer_4': 'sparse_categorical_crossentropy',
        'output_layer_5': 'sparse_categorical_crossentropy',
        'output_layer_6': 'sparse_categorical_crossentropy',
    }

    # Compile the model with an appropriate loss function and optimizer
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss=losses,  
        metrics=['accuracy']
    )

# Print the model summary
model.summary()


INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0',)


config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 40)]                 0         []                            
                                                                                                  
 attention_masks (InputLaye  [(None, 40)]                 0         []                            
 r)                                                                                               
                                                                                                  
 token_type_ids (InputLayer  [(None, 40)]                 0         []                            
 )                                                                                                
                                                                                              

In [9]:
model = tf.keras.models.load_model("content_moderation_multilingual_direct_fine_tuning")

In [10]:
import pandas as pd 
df=pd.read_csv('combined_enhaced_multiligual_dataset.csv')
df

Unnamed: 0.3,Unnamed: 0.2,comment_text,toxic,obscene,identity_attack,insult,threat,sexual_explicit,Unnamed: 0.1,Unnamed: 0,id,rating,length
0,0,강간,3,3,0,2,0,3,,,,,
1,1,개새끼,3,3,0,2,0,3,,,,,
2,2,개자식,3,3,0,2,0,3,,,,,
3,3,개좆,3,3,0,2,0,3,,,,,
4,4,개차반,3,3,0,2,0,3,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1536690,1551883,whore,3,3,0,0,0,3,,,,,
1536691,1551884,whores,3,3,0,0,0,3,,,,,
1536692,1551885,willy-whacker,3,3,0,0,0,3,,,,,
1536693,1551886,wise ass,1,1,0,0,0,1,,,,,


In [11]:
df = df.dropna(subset=['comment_text'])

In [12]:
shuffled_df = df

In [13]:
from sklearn.model_selection import train_test_split

# Split the DataFrame into train, validation, and test sets
train_ratio = 0.8  # 70% for training
valid_ratio = 0.1  # 15% for validation
test_ratio = 0.1   # 15% for testing

train_df, test_valid_df = train_test_split(shuffled_df, test_size=(valid_ratio + test_ratio), random_state=42)
valid_df, test_df = train_test_split(test_valid_df, test_size=test_ratio/(valid_ratio + test_ratio), random_state=42)

In [14]:
test_valid_df

Unnamed: 0.3,Unnamed: 0.2,comment_text,toxic,obscene,identity_attack,insult,threat,sexual_explicit,Unnamed: 0.1,Unnamed: 0,id,rating,length
515977,523189,ไม ่ น ่ าแปลกใจเลยที ่ ผู ้ ชมของเขาส ่ วนใหญ...,0,0,0,0,0,0,520523.0,3452812.0,5097197.0,approved,14.0
1133286,1146653,"""Copying my comments and using them is nothing...",0,0,0,0,0,0,1143987.0,274497.0,5325394.0,approved,14.0
1416438,1431393,"BB é Burrito Boy! Lol! Vá lá, homem. """" Ei, va...",1,0,0,0,0,1,1428727.0,2182963.0,4983591.0,approved,15.0
1523727,1538918,"Psihopatija i genetika, bacanje kockica u tko ...",0,0,0,0,0,0,1536252.0,2103152.0,6082030.0,approved,11.0
1072074,1084967,"Tell that guy, Don Young, to put Native Land i...",0,0,0,0,0,0,1082301.0,23820.0,346666.0,approved,19.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
531559,538952,Bedankt voor dat stukje informatie!,0,0,0,0,0,0,536286.0,1127534.0,329169.0,approved,6.0
400595,406322,Правильно... изучение всего. Genius вывод.,0,0,0,0,0,0,403656.0,1323212.0,792024.0,approved,6.0
61491,62476,그렇죠. 이 괴물은 이미 D I E가 되도록 합시다!,2,0,0,2,2,0,59810.0,627379.0,5551166.0,approved,10.0
500436,507478,http://i.onionstatic.com/avclub/5807/95/animat...,0,0,0,0,0,0,504812.0,1367049.0,5360184.0,approved,1.0


In [15]:
y_train=train_df[["toxic","obscene","identity_attack","insult","threat","sexual_explicit"]]
train_df=train_df[['comment_text']]

In [16]:
y_val=valid_df[["toxic","obscene","identity_attack","insult","threat","sexual_explicit"]]
valid_df=valid_df[['comment_text']]

In [17]:
y_val.values

array([[1, 0, 0, 0, 0, 0],
       [1, 0, 0, 1, 0, 0],
       [1, 0, 0, 1, 0, 0],
       ...,
       [2, 0, 0, 2, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [3, 0, 1, 1, 3, 0]])

In [18]:
train_df

Unnamed: 0,comment_text
1206043,Clayton Ruby quá già để được bổ nhiệm vào toà ...
530061,১২ বছর বয়সী মেয়ে? ওখানে কট ্ টরটা খুব কষ ্ ট
1121022,I agree he is a carpet bagger only using Alaskans
1486133,"So, it would appear, is attacking Coptic Chris..."
1458026,"Guns dont kill people, Never-Trumpers kill peo..."
...,...
259178,Putting Chief in front of Harrison is like put...
1414414,"Ainda gostam de bater nas mulheres, não é?"
131932,Miten pääsette näin naurettavaan johtopäätökseen?
671155,"Samo to dodaj na svoj popis """" bojkota """", i n..."


In [19]:
train_df['comment_text']

1206043    Clayton Ruby quá già để được bổ nhiệm vào toà ...
530061        ১২ বছর বয়সী মেয়ে? ওখানে কট ্ টরটা খুব কষ ্ ট
1121022    I agree he is a carpet bagger only using Alaskans
1486133    So, it would appear, is attacking Coptic Chris...
1458026    Guns dont kill people, Never-Trumpers kill peo...
                                 ...                        
259178     Putting Chief in front of Harrison is like put...
1414414           Ainda gostam de bater nas mulheres, não é?
131932     Miten pääsette näin naurettavaan johtopäätökseen?
671155     Samo to dodaj na svoj popis "" bojkota "", i n...
121958     Låt mig säga att varenda västerländsk land som...
Name: comment_text, Length: 1229355, dtype: object

In [20]:
y_test=test_df[["toxic","obscene","identity_attack","insult","threat","sexual_explicit"]]
test_df=test_df[['comment_text']]

In [21]:
#test_data[0]

In [22]:
import numpy as np
sentence_pairs = np.array(["My name is Vikram Pal CEO of AI Intelli Inc."])
test_data = BertSemanticDataGenerator(
    sentence_pairs, labels=None, batch_size=1, shuffle=False, include_targets=False,
)

proba = model.predict(test_data[0])

print(proba)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[array([[9.2707801e-01, 7.0355073e-02, 1.8183644e-03, 7.4845157e-04]],
      dtype=float32), array([[9.7906303e-01, 2.0244712e-02, 4.8491333e-04, 2.0733858e-04]],
      dtype=float32), array([[9.8480761e-01, 1.4902597e-02, 2.1750614e-04, 7.2212883e-05]],
      dtype=float32), array([[9.5497638e-01, 4.3522153e-02, 9.4604929e-04, 5.5536482e-04]],
      dtype=float32), array([[9.8920751e-01, 1.0605867e-02, 1.4993074e-04, 3.6730999e-05]],
      dtype=float32), array([[9.8955101e-01, 1.0162842e-02, 2.1426505e-04, 7.1917173e-05]],
      dtype=float32)]


In [23]:
y_val

Unnamed: 0,toxic,obscene,identity_attack,insult,threat,sexual_explicit
386491,1,0,0,0,0,0
1388016,1,0,0,1,0,0
1454119,1,0,0,1,0,0
523035,1,1,0,0,1,1
1115128,0,0,0,0,0,0
...,...,...,...,...,...,...
1431347,0,0,0,0,0,0
811294,0,0,1,0,0,0
755672,2,0,0,2,0,0
21619,1,0,0,0,0,0


In [24]:
import numpy as np

train_data = BertSemanticDataGenerator(
    train_df['comment_text'].tolist(),
    y_train.values,
    batch_size=batch_size,
    shuffle=True,
)

valid_data = BertSemanticDataGenerator(
    valid_df['comment_text'].tolist(),
    y_val.values,
    batch_size=batch_size,
    shuffle=False,
)

In [25]:
train_data.indexes

array([1190590, 1038285, 1028391, ...,  131932,  671155,  121958])

In [26]:
history = model.fit(
    train_data,
    validation_data=valid_data,
    epochs=epochs,
    use_multiprocessing=True,
    workers=-1,
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch 1/2






Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch 2/2
  660/19208 [>.............................] - ETA: 2:24:25 - loss: 2.2222 - output_layer_1_loss: 0.7483 - output_layer_2_loss: 0.2268 - output_layer_3_loss: 0.2606 - output_layer_4_loss: 0.6670 - output_layer_5_loss: 0.1999 - output_layer_6_loss: 0.1196 - output_layer_1_accuracy: 0.6791 - output_layer_2_accuracy: 0.9345 - output_layer_3_accuracy: 0.9088 - output_layer_4_accuracy: 0.7149 - output_layer_5_accuracy: 0.9343 - output_layer_6_accuracy: 0.9701

KeyboardInterrupt: 

In [None]:
8356

7468

# Unfreeze the bert_model.
bert_model.trainable = True

# Recompile the model to make the change effective.
# Change loss function to mean squared error for regression
model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss="mean_squared_error",
    metrics=["mae"],  # You can use other metrics like "mse" or custom metrics as needed
)
model.summary()

history = model.fit(
    train_data,
    validation_data=valid_data,
    epochs=epochs,
    use_multiprocessing=True,
    workers=-1,
)

In [27]:
model.save("content_moderation_multilingual_direct_fine_tuningv02")

INFO:tensorflow:Assets written to: content_moderation_multilingual_direct_fine_tuningv02/assets


INFO:tensorflow:Assets written to: content_moderation_multilingual_direct_fine_tuningv02/assets


In [30]:
test_data = BertSemanticDataGenerator(
    test_df['comment_text'].tolist(),
    y_test.values,
    batch_size=batch_size,
    shuffle=False,
)
model.evaluate(test_data, verbose=1)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.




[2.2643024921417236,
 0.7714725732803345,
 0.22849474847316742,
 0.2566157877445221,
 0.6838600635528564,
 0.20531369745731354,
 0.1185464933514595,
 0.6658098101615906,
 0.9332244396209717,
 0.909783661365509,
 0.7080187797546387,
 0.9330357313156128,
 0.9703834056854248]

In [None]:
input_data=[train_data[0][0][0][0],train_data[0][0][1][0]]

In [None]:
test_valid_df[test_valid_df['obscene']>0.5]

In [52]:
sentence_pairs = ["sei una sorella stronza"]
test_data = BertSemanticDataGenerator(
        sentence_pairs, labels=None, batch_size=1, shuffle=False, include_targets=False,
    )

proba = model.predict(test_data[0])

### "toxic","obscene","identity_attack","insult","threat","sexual_explicit"
for prob in proba:
    idx = np.argmax(prob[0])
    prob = f"{prob[0][idx]: .2f}%"
    print(idx)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


2
0
0
2
0
0


In [None]:
### Data cleaning and preparation

In [None]:
df=pd.read_csv("multilingual_content_moderation_text_dataset.csv")

In [None]:
num=0.1

In [None]:
df_toxic_total=df[(df["toxic"]>num) | (df["severe_toxicity"]>num) | (df["obscene"]>num) | (df["identity_attack"]>num) | (df["insult"]>num) | (df["threat"]>num) | (df["sexual_explicit"]>num)]
df_toxic_total

In [None]:
df_normal=df[(df["toxic"]==0) & (df["severe_toxicity"]==0) & (df["obscene"]==0) & (df["identity_attack"]==0) & (df["insult"]==0) & (df["threat"]==0) & (df["sexual_explicit"]==0)]

In [None]:
df[["toxic", "severe_toxicity", "obscene", "identity_attack", "insult", "threat", "sexual_explicit"]] = df[["toxic", "severe_toxicity", "obscene", "identity_attack", "insult", "threat", "sexual_explicit"]].round(2)

In [None]:
fraction = 1/6
df_sample = df_normal.sample(frac=fraction, random_state=42)  # Set a random seed for reproducibility
df_sample

In [None]:
df_combined_total = pd.concat([df_toxic_total, df_sample], axis=0, ignore_index=True)
df_combined_total

In [None]:
### <.1 ---> 0  normal
### 0.1-0.45 ---> 1 low risk 
### 0.45 --> 0.70 --> 2 medium risk 
### >.70 --> 3 --> High risk 

def risk_calculation(x):
    if x<=0.1:
        return 0
    elif x>0.1 and x<=0.45: 
        return 1
    elif x>0.45 and x<=0.70:
        return 2 
    else:
        return 3 
    
    
# Specify the columns to apply the risk calculation function
columns_to_apply = ['toxic', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat', 'sexual_explicit']

# Apply the risk calculation function to the specified columns
df_combined_total[columns_to_apply] = df_combined_total[columns_to_apply].applymap(risk_calculation)

In [None]:
df_combined_total.drop(["severe_toxicity"], axis=1, inplace=True)

In [None]:
df_combined_total

In [None]:
# Shuffle the DataFrame
df_combined_total = df_combined_total.sample(frac=1, random_state=42)  # Set a random seed for reproducibility

# Reset the index after shuffling
df_combined_total = df_combined_total.reset_index(drop=True)

df_combined_total.to_csv("classificaiton_dataframe_content_moderation_v02.csv")

In [None]:
!unzip "List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words-master (1).zip"

In [None]:
import pandas as pd
import os

# Set the directory path where your text files are located
directory_path = "./List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words-master/"

# Create an empty DataFrame to store the data
df = pd.DataFrame(columns=['text', 'language_code'])

# Iterate through each file in the directory
for filename in os.listdir(directory_path):
    file_path = os.path.join(directory_path, filename)
    
    # Check if the file is a text file (you can add more conditions based on your naming conventions)
    if os.path.isfile(file_path) and not filename.startswith('.'):
        # Read the content of the text file
        with open(file_path, 'r', encoding='utf-8') as file:
            text_lines = file.readlines()

        # Extract language code from the filename
        language_code = filename

        # Append each line as a separate row to the DataFrame
        for line in text_lines:
            df = df.append({'text': line.strip(), 'language_code': language_code}, ignore_index=True)


In [None]:
df['comment_text']=df['text']

df.drop(['text', 'language_code'], axis=1, inplace=True)

In [None]:
df_profanity=pd.read_csv("profanity_en.csv")
df_profanity

In [None]:
df_profanity["severity_description"].value_counts()

In [None]:
# Assuming df is your DataFrame
df_profanity['severity_description'] = df_profanity['severity_description'].replace({'Strong': 3, 'Severe': 2, 'Mild': 1})
df_profanity

In [None]:
df_profanity=df_profanity[(df_profanity["category_1"]=="sexual anatomy / sexual acts") | (df_profanity["category_1"]=="sexual orientation / gender")]

In [None]:
df_profanity

In [None]:
df_profanity["sexual_explicit"]=df_profanity["severity_description"]
df_profanity

In [None]:
df_profanity['comment_text']=df_profanity['text']

In [None]:
df_profanity=df_profanity[["comment_text","sexual_explicit"]]
df_profanity

In [None]:
df_profanity['toxic']=df_profanity['sexual_explicit']
df_profanity['obscene']=df_profanity['sexual_explicit']

df_profanity

In [None]:
df_profanity['identity_attack']=0
df_profanity['insult']=0
df_profanity['threat']=0

In [None]:
df_profanity

In [None]:
df['toxic']=3
df[ 'obscene']=3
df['identity_attack']=0
df['insult']=2
df['threat']=0
df['sexual_explicit']=3

In [None]:
df

In [None]:
df_c=pd.read_csv("classificaiton_dataframe_content_moderation_v02.csv")

df_c

In [None]:
df_combined=pd.concat([df, df_c, df_profanity], axis=0, ignore_index=True)
df_combined

In [None]:
df_combined['obscene'].value_counts()

In [None]:
df_combined = df_combined.drop_duplicates(subset=['comment_text'], keep='last')
df_combined

In [None]:
df_combined.to_csv("combined_enhaced_multiligual_dataset.csv")

In [None]:
!unzip Suicide_Detection.csv.zip

In [None]:
df_self_harm=pd.read_csv("Suicide_Detection.csv")
df_self_harm

In [None]:
df_self_harm['length']=df_self_harm["text"].apply(lambda x: len(x.split()))

In [None]:
df_self_harm['length'].value_counts()

In [None]:
df_self_harm[df_self_harm['class']=="suicide"]