In [None]:
from google.colab import drive

# Mount Google Drive under /content/gdrive; the file paths used in this notebook
# all live below that mount point.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import pandas as pd


In [None]:
# Load the dataset parquet file from the mounted Drive and preview its first rows.
df = pd.read_parquet('/content/gdrive/MyDrive/Common_Crawl/data/dataset.parquet')
df.head()

Unnamed: 0,url,html,text,encoding,lang,language,num_words,title
0,https://www.pulse.ng/sports/opinion/3-things-l...,"\n\n\n\n\n<!DOCTYPE html>\n<html lang=""en"">\n\...",CDATA ringDataLayer context variant accelerato...,utf-8,en,en,4797,3 things learnt from Jose Peseiro's second con...
1,https://sportspyder.com/cf/florida-gators-foot...,"<!doctype html><html lang=""en""><head><meta cha...",Sports News Tweets Rumors and Articles SportSp...,UTF-8,en,en,1611,"Sports News, Tweets, Rumors and Articles | Spo..."
2,https://www.homify.hk/professionals/kitchen-pl...,"<!DOCTYPE html>\n<html class=""-logged-out- -lo...",function w d s l i wl wl wl push gtm start ne...,utf-8,en,en,1334,Find the right Kitchen Planners in İstanbul | ...
3,"https://wyborcza.pl/7,75399,28495755,100-dni-w...",\n<!DOCTYPE html>\n<html>\n<head>\n<meta chars...,Wyborcza pl body font family Arial sans serif ...,UTF-8,,en,689,Wyborcza.pl
4,https://www.st.nu/2022-06-03/byggforetagen-vil...,"<!DOCTYPE html><html lang=""sv"" id=""root-elemen...",window hdsconfig androidAppPackage se mittmedi...,utf-8,sv,sv,2956,Byggföretagen vill mörka lönedumpning – Sundsv...


In [None]:
# Show the column names of the loaded frame.
df.columns

Index(['url', 'html', 'text', 'encoding', 'lang', 'language', 'num_words',
       'title'],
      dtype='object')

In [None]:
import gc
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from transformers import ElectraTokenizer, TFElectraForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import XLMRobertaTokenizer, TFXLMRobertaForSequenceClassification

In [None]:


# Import data
def import_data(csv_file):
    """Read a labelled news CSV and return a frame with `label` and `text` columns.

    The CSV must provide the columns 'Class Index', 'Title' and 'Description'.

    Args:
        csv_file: Path (or any source accepted by `pd.read_csv`) of the CSV file.

    Returns:
        pandas.DataFrame in which
        - `text` is the title and the description joined by a single space,
        - `label` is 'Class Index' with the value 4 remapped to 0,
        - the original 'Title' and 'Description' columns are removed.
    """
    df = pd.read_csv(csv_file)
    df['text'] = df['Title'] + ' ' + df['Description']
    df = df.rename(columns={'Class Index': 'label'})
    # Assign the result back instead of `df['label'].replace(..., inplace=True)`:
    # the chained in-place form acts on a temporary selection, which warns on
    # recent pandas and leaves `df` unchanged once Copy-on-Write is enabled.
    df['label'] = df['label'].replace({4: 0})
    df = df.drop(columns=['Title', 'Description'])
    return df

# Load the labelled CSVs, then carve a seeded 20% validation subset out of the
# training file; the test file is kept whole.
train_val_df = import_data('/content/gdrive/MyDrive/Common_Crawl/data/train.csv')
test_df = import_data('/content/gdrive/MyDrive/Common_Crawl/data/test.csv')

train_df, val_df = train_test_split(
    train_val_df[['text', 'label']],
    test_size=0.2,
    random_state=42,
)

# Turn a dataframe into a tokenized tf.data.Dataset for a given tokenizer
def pipeline(dataframe, pretrained_model, tokenizer):
    """Tokenize the `text` column and pair the encodings with the `label` column.

    Args:
        dataframe: Frame providing the 'text' and 'label' columns.
        pretrained_model: Not used by this function; kept in the signature for
            existing callers.
        tokenizer: Callable tokenizer, invoked with truncation=True,
            padding=True and max_length=128.

    Returns:
        A `tf.data.Dataset` of (encodings dict, label) slices, not yet batched.
    """
    texts = list(dataframe['text'])
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=128)
    features = dict(encodings)
    labels = dataframe['label']
    return tf.data.Dataset.from_tensor_slices((features, labels))

# Checkpoints to fine-tune, each paired with the tokenizer class used to load it
models = [
    ('google/electra-small-generator', ElectraTokenizer),
    ('facebook/bart-base', AutoTokenizer),
    ('xlm-roberta-base', XLMRobertaTokenizer),
]

# Collects one metrics dict per fine-tuned model
results = list()

# Root folder on the mounted Drive under which fine-tuned models are saved
model_dir = "/content/gdrive/MyDrive/Common_Crawl"

# Fine-tune, evaluate and save every (checkpoint, tokenizer class) pair in `models`
for model_name, tokenizer_class in tqdm(models):
    # Release the previous iteration's model before loading the next checkpoint.
    # The History object keeps a reference to the model, so both names are
    # dropped; clear_session() resets Keras' global state and gc.collect()
    # reclaims the now-unreferenced objects. Without this, consecutive models
    # accumulate in memory and later checkpoints can run out of resources.
    model = history = None
    tf.keras.backend.clear_session()
    gc.collect()

    # Each checkpoint has its own vocabulary, so all splits are re-tokenized.
    tokenizer = tokenizer_class.from_pretrained(model_name)
    train_dataset = pipeline(train_df, model_name, tokenizer)
    val_dataset = pipeline(val_df, model_name, tokenizer)
    test_dataset = pipeline(test_df, model_name, tokenizer)

    # Create the model with a 4-way classification head
    model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)

    # Compile the model; the loss works on raw logits and integer labels
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=['accuracy']
    )

    start_time = time.time()
    # Train the model. The datasets are batched explicitly, so `batch_size` is
    # not passed to fit() (it must not be specified for tf.data inputs).
    history = model.fit(train_dataset.shuffle(1000).batch(16),
                        epochs=5,
                        validation_data=val_dataset.batch(16),
                        verbose=0)
    end_time = time.time()

    # Evaluate the model: predicted class = arg-max over the logits
    preds = np.argmax(model.predict(test_dataset.batch(16)).logits, axis=-1)
    precision, recall, fscore, support = score(test_df['label'], preds)

    # Save the model under <model_dir>/<model_name>
    model.save_pretrained(os.path.join(model_dir, model_name))

    # Store the results; `time_taken` covers the fit() call only
    results.append({'model_name': model_name, 'tokenizer_class': tokenizer_class.__name__,
                    'accuracy': accuracy_score(test_df['label'], preds),
                    'precision': precision, 'recall': recall, 'fscore': fscore, 'support': support,
                    'time_taken': end_time - start_time})

# Report the collected metrics, one delimited section per fine-tuned model
for entry in results:
    report_lines = [
        f"Results for {entry['model_name']} ({entry['tokenizer_class']})",
        f"Accuracy: {entry['accuracy']}",
        f"Precision: {entry['precision']}",
        f"Recall: {entry['recall']}",
        f"F-score: {entry['fscore']}",
        f"Support: {entry['support']}",
        f"Time taken: {entry['time_taken']} seconds",
        "="*60,
    ]
    print("\n".join(report_lines))



  0%|          | 0/3 [00:00<?, ?it/s]Some layers from the model checkpoint at google/electra-small-generator were not used when initializing TFElectraForSequenceClassification: ['activation', 'generator_predictions', 'generator_lm_head']
- This IS expected if you are initializing TFElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-generator and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for p



 33%|███▎      | 1/3 [42:09<1:24:18, 2529.37s/it]All model checkpoint layers were used when initializing TFBartForSequenceClassification.

Some layers of TFBartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['classification_head']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.




 33%|███▎      | 1/3 [1:38:56<3:17:52, 5936.35s/it]


ResourceExhaustedError: ignored