In [6]:
import gc
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import tensorflow as tf
from typing import Tuple
from tqdm import tqdm

In [7]:
from transformers import (TFGPT2Model,TFMBartModel,TFBertForSequenceClassification,
                         TFDistilBertForSequenceClassification,TFXLMRobertaForSequenceClassification,
                         TFMT5ForConditionalGeneration,TFT5ForConditionalGeneration,T5Tokenizer,AutoTokenizer,AutoConfig)

In [8]:
tpu = None
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)
except ValueError:
    strategy = tf.distribute.get_strategy()

def set_seeds(seed:int)->None:
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

MAX_LEN = 512
TEST_SIZE = 0.2
LR = 0.00002
VERBOSE = 1
SEED = 1000
set_seeds(SEED)

AUTOTUNE = tf.data.experimental.AUTOTUNE

BASE_BATCH_SIZE = 4

if tpu is not None:
    BASE_BATCH_SIZE = 8

BATCH_SIZE = BASE_BATCH_SIZE+strategy.num_replicas_in_sync

In [10]:
df = pd.read_csv('train.csv', on_bad_lines = 'warn',index_col=False, encoding='iso-8859-1', nrows=1000)
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [11]:
df.isnull().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
train_df, val_df = train_test_split(df,
                                    test_size = TEST_SIZE,
                                    random_state = SEED)

In [14]:
def create_dataset(df, max_len, tokenizer, batch_size, shuffle=False):
    total_samples = df.shape[0]

    # Placeholders for inputs
    input_ids, input_masks = [], []

    # Placeholder for output labels
    labels = []

    # Tokenize
    for _, row in tqdm(df.iterrows(), total=total_samples):

        # Get comment text
        comment_text = row['comment_text']

        # Encode
        input_encoded = tokenizer.encode_plus(comment_text, add_special_tokens=True, max_length=max_len, truncation=True, padding='max_length')
        input_ids.append(input_encoded['input_ids'])
        input_masks.append(input_encoded['attention_mask'])

        # Determine label
        # Here, assuming 'toxic' column represents any form of toxicity
        toxic_labels = [row['toxic'], row['severe_toxic'], row['obscene'], row['threat'], row['insult'], row['identity_hate']]
        label = 1 if any(toxic_labels) else 0
        labels.append(label)

    # Convert lists to numpy arrays
    input_ids = np.array(input_ids)
    input_masks = np.array(input_masks)
    labels = np.array(labels, dtype=np.int32)

    # Create TensorFlow dataset
    dataset = tf.data.Dataset.from_tensor_slices((
        {'input_ids': input_ids, 'attention_mask': input_masks},
        labels
    ))

    # Shuffle dataset if required
    if shuffle:
        dataset = dataset.shuffle(total_samples, reshuffle_each_iteration=True)

    # Batch and prefetch dataset
    dataset = dataset.batch(batch_size, drop_remainder=True).prefetch(tf.data.experimental.AUTOTUNE)

    return dataset


In [15]:
from transformers import TFAutoModelForSequenceClassification, AutoConfig, AutoTokenizer
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
import tensorflow as tf
import gc

EPOCHS = 3
model_type = 'distilbert-base-multilingual-cased'
LR = 0.001

config = AutoConfig.from_pretrained(model_type, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_type, add_prefix_space=False, do_lower_case=False)

train_dataset = create_dataset(train_df, MAX_LEN, tokenizer, BATCH_SIZE)
validation_dataset = create_dataset(val_df, MAX_LEN, tokenizer, BATCH_SIZE)

train_steps = train_df.shape[0] // BATCH_SIZE
val_steps = val_df.shape[0] // BATCH_SIZE

print(f'Train Steps: {train_steps}')
print(f'Validation Steps: {val_steps}')

def create_mbert_model(model_type, config, learning_rate):
    model = TFAutoModelForSequenceClassification.from_pretrained(model_type, config=config)

    optimizer = Adam(learning_rate=learning_rate)
    loss = SparseCategoricalCrossentropy(from_logits=True)
    metric = SparseCategoricalAccuracy('accuracy')
    model.compile(optimizer='adam', loss=loss, metrics=[metric])

    return model

model = create_mbert_model(model_type, config, LR)
print(model.summary())

history = model.fit(train_dataset, steps_per_epoch=train_steps, validation_data=validation_dataset,
                    validation_steps=val_steps, epochs=EPOCHS, verbose=1)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

100%|██████████| 800/800 [00:00<00:00, 1189.25it/s]
100%|██████████| 200/200 [00:00<00:00, 1538.29it/s]


Train Steps: 160
Validation Steps: 40


model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMa  multiple                  134734080 
 inLayer)                                                        
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
 dropout_19 (Dropout)        multiple                  0 (unused)
                                                                 
Total params: 135326210 (516.23 MB)
Trainable params: 135326210 (516.23 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/3


Cause: for/else statement not yet supported


Cause: for/else statement not yet supported
Epoch 2/3
Epoch 3/3


In [16]:
  print(f'MultiLingual DistilBERT Classification Model Accuracy: {np.max(history.history["val_accuracy"])*100:.3f}%')

MultiLingual DistilBERT Classification Model Accuracy: 88.890%


In [18]:
model.save('multilingual_distilbert_model',save_format='tf')



In [19]:
model.save_weights('multilingual_distilbert_weights.h5')

In [20]:
from google.colab import drive
drive.mount('/content/drive')

# Save model to Google Drive
model.save('/content/drive/My Drive/Multilingual_DistilBERT')

Mounted at /content/drive




In [33]:
import gradio as gr
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer
from tqdm import tqdm

def predict_text_labels(text):
    # Encode input text using tokenizer
    input_encoded = tokenizer.encode_plus(text, add_special_tokens=True, max_length=128, truncation=True, padding='max_length', return_tensors='tf')

    # Make prediction using model
    prediction = model.predict({'input_ids': input_encoded['input_ids'], 'attention_mask': input_encoded['attention_mask']})

    predicted_labels = [label for label, prob in enumerate(prediction) if prob > 0.5]

    # Map predicted label indices to actual label names
    label_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]  # Adjust based on your label names
    predicted_label_names = [label_names[label] for label in predicted_labels]

    return predicted_label_names

# Create Gradio interface
input_text = gr.Textbox(lines=5, label="Input Text")
output_labels = gr.Textbox(label="Predicted Labels")

gr.Interface(fn=predict_text_labels, inputs=input_text, outputs=output_labels).launch()


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://fb71d108bcaf5cb7fc.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


