In [1]:
!pip install gdown
!pip install transformers datasets torch gdown optuna
!pip install hazm



In [2]:
import gdown
import zipfile

# Define the Google Drive file ID of the file to be downloaded
file_id = '1YgrCYY-Z0h2z0-PfWVfOGt1Tv0JDI-qz'
# Create the URL based on the Google Drive file ID
url = f'https://drive.google.com/uc?id={file_id}'
# Specify the name of the file to save after downloading
output_file_name = 'dataset.zip'

# Download the file from Google Drive
gdown.download(url, output_file_name, quiet=False)

# Extract the contents of the downloaded ZIP file
with zipfile.ZipFile(output_file_name, 'r') as zip_ref:
    # Specify the directory to extract to
    extract_directory = 'dataset'
    zip_ref.extractall(extract_directory)


Downloading...
From: https://drive.google.com/uc?id=1YgrCYY-Z0h2z0-PfWVfOGt1Tv0JDI-qz
To: /content/dataset.zip
100%|██████████| 13.6M/13.6M [00:00<00:00, 85.4MB/s]


In [3]:
# Standard library imports
import os
import re
import json
import copy
import collections

# Data handling and numerical operations
import numpy as np
import pandas as pd

# NLP and text preprocessing tools
from hazm import Normalizer
from transformers import (AutoTokenizer, BertConfig, BertTokenizer, BertModel,
                          AdamW, get_linear_schedule_with_warmup)

# Machine learning utilities
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, f1_score
from sklearn.utils import shuffle

# Neural network tools
import torch
import torch.nn as nn
import torch.nn.functional as F

# Visualization tools
import plotly.express as px
import plotly.graph_objects as go

# Progress bar for loops
from tqdm.notebook import tqdm

In [5]:
import re
from hazm import Normalizer

# Instantiate the text normalizer for Persian processing
text_normalizer = Normalizer()

def preprocess_persian_text(input_text):
    """
    Perform comprehensive preprocessing on Persian text.

    Parameters:
        input_text (str): The original Persian text.

    Returns:
        str: Preprocessed Persian text.
    """
    # Normalize the text using Hazm normalizer
    normalized_text = text_normalizer.normalize(input_text)

    # Replace non-alphanumeric characters with spaces
    alphanumeric_filtered_text = re.sub(r'[^\w\s]', ' ', normalized_text)

    # Arabic to Persian character conversion mapping
    arabic_to_persian_chars = {'ي': 'ی', 'ك': 'ک', 'ؤ': 'و', 'ى': 'ی', 'ة': 'ه', 'ۀ': 'ه'}
    for arabic_char, persian_char in arabic_to_persian_chars.items():
        alphanumeric_filtered_text = alphanumeric_filtered_text.replace(arabic_char, persian_char)

    # Limit repetition of characters to two occurrences
    repeat_filtered_text = re.sub(r'(.)\1{2,}', r'\1\1', alphanumeric_filtered_text)

    # Remove Arabic diacritics
    diacritics_filtered_text = re.sub(r'[\u064B-\u065F\u0670]', '', repeat_filtered_text)

    # Allow only Persian characters and spaces, replace others with space
    persian_char_filtered_text = re.sub(r'[^آ-ی\s]', ' ', diacritics_filtered_text)

    # Numeric character conversion from Persian to English
    persian_to_english_nums = {'۰': '0', '۱': '1', '۲': '2', '۳': '3', '۴': '4',
                               '۵': '5', '۶': '6', '۷': '7', '۸': '8', '۹': '9'}
    for persian_num, english_num in persian_to_english_nums.items():
        persian_char_filtered_text = persian_char_filtered_text.replace(persian_num, english_num)

    # Compile a pattern to match and remove emojis and other special Unicode characters
    emoji_and_special_char_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u200d"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\u3030"
        u"\ufe0f"
        u"\u2069"
        u"\u2066"
        u"\u2068"
        u"\u2067"
        "]+", flags=re.UNICODE)
    emoji_filtered_text = emoji_and_special_char_pattern.sub(r'', persian_char_filtered_text)

    # Clean up spaces
    cleaned_text = re.sub(r'\s+', ' ', emoji_filtered_text).strip()  # Remove extra spaces and trim

    return cleaned_text


In [7]:
from transformers import AutoTokenizer

try:
    # Load the tokenizer for the AriaBERT model, designed for Persian text.
    # Specifying `use_fast=True` to utilize the fast tokenizer if available.
    tokenizer = AutoTokenizer.from_pretrained('ViraIntelligentDataMining/AriaBERT', use_fast=True)
    print("Tokenizer loaded successfully.")
except Exception as e:
    print(f"Failed to load tokenizer: {e}")


Tokenizer loaded successfully.


In [9]:
import pandas as pd

# Define the common delimiter for the CSV files
csv_delimiter = '\t'

# Load the training, development, and test datasets from CSV files
train_df = pd.read_csv('/content/dataset/digimag/train.csv', delimiter=csv_delimiter)
dev_df = pd.read_csv('/content/dataset/digimag/dev.csv', delimiter=csv_delimiter)
test_df = pd.read_csv('/content/dataset/digimag/test.csv', delimiter=csv_delimiter)

# Display the first few rows of the training dataset to confirm loading
print("First few rows of the training dataset:")
print(train_df.head())


First few rows of the training dataset:
   Unnamed: 0                                            content  \
0           0  نمایش تبلیغ در لاک‌اسکرین تعدادی از گوشی‌های ه...   
1           1  شکست Justice League در باکس آفیس پس از بازخورد...   
2           2  کلاسیک بینی؛ همه چیز در یک شب اتفاق افتاد فیلم...   
3           3  اپل دوباره سراغ رنده رفته چراکه آپگرید کردن سط...   
4           4  بررسی جزء به جزء بهترین بخش Ori and the Blind ...   

            label  label_id  
0  علم و تکنولوژی         3  
1     هنر و سینما         5  
2     هنر و سینما         5  
3  علم و تکنولوژی         3  
4    بازی ویدیویی         0  


In [10]:
from sklearn.preprocessing import LabelEncoder

# Initialize the label encoder
label_encoder = LabelEncoder()

# Transform the 'label' column in the training dataset and fit the label encoder
train_df['label'] = label_encoder.fit_transform(train_df['label'])

# Transform the 'label' column in the development and test datasets using the fitted encoder
dev_df['label'] = label_encoder.transform(dev_df['label'])
test_df['label'] = label_encoder.transform(test_df['label'])

# Display unique labels to confirm encoding
print("Unique labels in the training data:", train_df['label'].unique())
print("Unique labels in the development data:", dev_df['label'].unique())
print("Unique labels in the test data:", test_df['label'].unique())

Unique labels in the training data: [3 5 0 6 2 4 1]
Unique labels in the development data: [5 0 3 2 6 4 1]
Unique labels in the test data: [2 3 5 0 6 1 4]


In [12]:
def preprocess_dataframe(df):
    """
    Applies text preprocessing to a specified dataframe.

    Parameters:
    df (pandas.DataFrame): The dataframe containing the data to preprocess.

    Returns:
    pandas.DataFrame: A dataframe with preprocessed text and corresponding labels.

    Raises:
    KeyError: If the expected columns are not in the dataframe.
    """
    try:
        # Apply the additional_preprocessing function to the 'content' column
        df['text'] = df['content'].apply(preprocess_persian_text)
    except KeyError:
        raise KeyError("DataFrame must include a 'content' column.")

    try:
        # Return a new dataframe containing only the preprocessed text and labels
        return df[['text', 'label']]
    except KeyError:
        raise KeyError("DataFrame must include a 'label' column.")


# Apply preprocessing to each dataframe
train_df = preprocess_dataframe(train_df)
dev_df = preprocess_dataframe(dev_df)
test_df = preprocess_dataframe(test_df)

In [13]:
# Calculate the count of occurrences for each unique label in the 'label' column of train_df
label_counts = train_df['label'].value_counts()

# Output the counts to the console, showing how many samples there are for each label
print("Count of data samples per label:")
print(label_counts)

Count of data samples per label:
label
3    2245
0    1593
5    1350
2    1304
6     206
1     101
4      97
Name: count, dtype: int64


In [14]:
import pandas as pd


# Define a dictionary to map existing class labels to new labels
class_mapping = {4: 6, 1: 6}

# Update the 'label' column in each dataset according to the class mapping
train_df['label'] = train_df['label'].replace(class_mapping)
dev_df['label'] = dev_df['label'].replace(class_mapping)
test_df['label'] = test_df['label'].replace(class_mapping)

# Calculate and store the count of occurrences for each label in the updated 'label' column of train_df
updated_label_counts = train_df['label'].value_counts()

# Print the count of data samples per updated label to the console for verification
print("Count of data samples per updated label:")
print(updated_label_counts)


Count of data samples per updated label:
label
3    2245
0    1593
5    1350
2    1304
6     404
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['label'] = train_df['label'].replace(class_mapping)


In [15]:
from datasets import Dataset  # Import Dataset class for conversion

# Convert pandas DataFrames into Hugging Face Dataset objects
train_dataset = Dataset.from_pandas(train_df)
dev_dataset = Dataset.from_pandas(dev_df)
test_dataset = Dataset.from_pandas(test_df)

# Tokenize text data directly within the map method using lambda functions for simplicity
# This applies the tokenizer with truncation and padding to ensure uniform length
train_dataset = train_dataset.map(lambda examples: tokenizer(examples['text'], truncation=True, padding='max_length'), batched=True)
dev_dataset = dev_dataset.map(lambda examples: tokenizer(examples['text'], truncation=True, padding='max_length'), batched=True)
test_dataset = test_dataset.map(lambda examples: tokenizer(examples['text'], truncation=True, padding='max_length'), batched=True)


Map:   0%|          | 0/6896 [00:00<?, ? examples/s]

Map:   0%|          | 0/767 [00:00<?, ? examples/s]

Map:   0%|          | 0/852 [00:00<?, ? examples/s]

In [16]:
import torch  # Ensure PyTorch is imported

# Determine if CUDA (GPU support) is available, otherwise use CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f'device: {device}')  # Display the device that will be used for training

# Check if the GPU is available for training
train_on_gpu = torch.cuda.is_available()

if not train_on_gpu:
    # Inform the user if CUDA is not available, and CPU will be used
    print('CUDA is not available. Training on CPU ...')
else:
    # Confirm that CUDA is available and GPU will be used
    print('CUDA is available! Training on GPU ...')


device: cuda:0
CUDA is available! Training on GPU ...


In [17]:
import os  # Required for directory operations

# General configuration settings
MAX_LEN = 128  # Maximum length of the input tokens
TRAIN_BATCH_SIZE = 16  # Batch size for training
VALID_BATCH_SIZE = 16  # Batch size for validation
TEST_BATCH_SIZE = 16  # Batch size for testing

EPOCHS = 3  # Number of epochs to train the model
EEVERY_EPOCH = 1000  # Appears to be a typo, likely meant to be 'EVAL_EVERY_EPOCH' for evaluation frequency
LEARNING_RATE = 2e-5  # Learning rate for the optimizer
CLIP = 0.0  # Gradient clipping threshold

MODEL_NAME_OR_PATH = 'ViraIntelligentDataMining/AriaBERT'  # Path or name of the model to be used
OUTPUT_PATH = '/content/bert-fa-base-uncased-sentiment-taaghceh/pytorch_model.bin'  # Output path for the trained model

# Ensure the output directory exists, create if necessary
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)


In [None]:
# Print the classes managed by the label encoder to verify the encoding or for debugging purposes
print("Label encoder classes:", label_encoder.classes_)


Label encoder classes: ['بازی ویدیویی' 'راهنمای خرید' 'سلامت و زیبایی' 'علم و تکنولوژی' 'عمومی'
 'هنر و سینما' 'کتاب و ادبیات']


In [18]:
# Define a dictionary mapping numerical IDs to category labels in Persian
id_to_label = {
    0: "عمومی",        # General
    6: "بقیه",         # Others
    2: "سلامت و زیبایی",  # Health and Beauty
    3: "راهنمای خرید",    # Shopping Guide
    5: "کتاب و ادبیات"    # Books and Literature
}

# Create a reverse dictionary mapping category labels back to their numerical IDs
label_to_id = {label: id for id, label in id_to_label.items()}

# Generate a list of category labels from the dictionary for use in other operations
category_labels = list(label_to_id.keys())

# Reverse the order of category labels to handle any specific order requirement or preference
category_labels.reverse()

