# Process Twitter Datasets

## Read Data

In [7]:
import ast
import os

import pandas as pd
import torch

output_file_path = "./files/twitter_text.csv"

# The extraction is skipped when a previous run already produced the output,
# so re-running the notebook does not re-parse the raw dump.
if os.path.exists(output_file_path):
    print("Output file already exists. No further processing is done.")
else:
    try:
        # Load the raw, header-less CSV. Malformed rows are dropped with a
        # warning: `on_bad_lines='warn'` is the replacement for the removed
        # `error_bad_lines=False` keyword (deprecated in pandas 1.3, removed
        # in 2.0, where it raised a TypeError that the handler below swallowed).
        telegram_p1 = pd.read_csv("./files/telegram_data.csv",
                                  header=None,
                                  quotechar='"',
                                  engine='python',
                                  on_bad_lines='warn')

        # Keep only the column labelled 11 (the file has no header, so columns
        # are labelled by position).
        text_column = telegram_p1.loc[:, 11]

        # Persist it as a single-column CSV whose header is 'text'.
        text_column.to_csv(output_file_path, index=False, header=['text'])
        print("Text column saved successfully.")

    except Exception as e:
        # Best effort: report the problem and let the notebook continue.
        print("Error processing file:", e)

Output file already exists. No further processing is done.


In [8]:
# Load the extracted single-column text file; its first line is the 'text' header.
twitter_text = pd.read_csv("./files/twitter_text.csv", header=0)

In [9]:
# Report the row count on either side of removing rows that contain NaN.
rows_before = len(twitter_text)
print(f"Length before dropping Nans: {rows_before}")

twitter_text.dropna(inplace=True)

rows_after = len(twitter_text)
print(f"Length after dropping Nans: {rows_after}")

Length before dropping Nans: 590127
Length after dropping Nans: 506952


**Example:**

In [10]:
# Show the text of the first remaining row as a sample of the data.
print(f"{twitter_text.iloc[0].text}")

#فوری 

مهم  
در جلسه هیأت دولت صورت گرفت؛

🔹اصلاح لایحه بودجه سال ۱۴۰۱ درخصوص نحوه محاسبه نرخ گاز خوراک پتروشیمی‌ها

🔹تصویب آیین نامه اجرایی استرداد مالیات و عوارض کالاهای همراه مسافران و بلیت پروازهای خارجی گردشگران خارجی خریداری شده از شرکت‌های هواپیمایی ایرانی

🔹هیأت وزیران در راستای مصوبه ستاد هماهنگی اقتصادی دولت درخصوص نحوه محاسبه نرخ گاز خوراک پتروشیمی‌ها، با اصلاح لایحه بودجه سال ۱۴۰۱ کل کشور در این خصوص موافقت کرد.

🔹جلسه هیأت دولت بعد از ظهر امروز یکشنبه به ریاست آیت الله سید ابراهیم رئیسی برگزار شد و در آن مصوب شد فرمول محاسبه نرخ گاز خوراک پتروشیمی‌ها مانند سال ۱۴۰۰ محاسبه و اعمال گردد.

🔹همچنین نرخ گاز سوخت پتروشیمی‌ها، پالایشگاه‌ها و صنایع پایین دستی، مجتمع‌های احیای فولاد و مصارف مربوط به یوتیلیتی شامل برق، آب، اکسیژن و غیره آنها معادل ۴۰ درصد نرخ خوراک گاز پتروشیمی میباشد و همچنین صنایع سیمان و سایر صنایع معادل ۱۰ درصد نرخ خوراک پتروشیمی‌ها تعیین میگردد. معادل کاهش در منابع ناشی از این تصمیم نیز میبایستی به صورت کاهش در مصارف تبصره (۱۴) اعمال گردد.


## Apply Rule-based Method to Create Dataset

In [11]:
import pandas as pd
from data.loaders import load_symbols_from_csv
from symbol_detector.rule_based_detector.detector import extract_symbol_spans
from tqdm import tqdm
from joblib import Parallel, delayed
from transformers import XLMRobertaTokenizerFast

# Read the symbol records and keep just the 'symbol' value of each one;
# these are the keywords handed to the rule-based detector.
symbols_data = load_symbols_from_csv(columns=['symbol'], path='./files/symbols.csv')
keywords = [record['symbol'] for record in symbols_data]

# Tokenizer used to split texts into sub-word tokens with character offsets
tokenizer = XLMRobertaTokenizerFast.from_pretrained('xlm-roberta-base')

def tag_text(row, input_keywords, input_tokenizer):
    """Label every sub-word token of ``row['text']`` as 'KEYWORD' or 'O'.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a 'text' entry.
        input_keywords: Keywords forwarded to ``extract_symbol_spans``.
        input_tokenizer: Tokenizer exposing ``encode_plus`` (with offset
            mappings) and ``convert_ids_to_tokens``.

    Returns:
        A ``(labels, tokens)`` pair. ``labels`` holds one 'O'/'KEYWORD' string
        per token and ``tokens`` the token strings. When ``row['text']`` is not
        a string the result is ``([], row['text'])``.
    """
    raw_text = row['text']
    if not isinstance(raw_text, str):
        # Nothing to tokenize; hand the original value back untouched.
        return [], raw_text

    found_spans = extract_symbol_spans(raw_text, input_keywords)

    # Tokenize without special tokens so each token has a character offset pair.
    encoding = input_tokenizer.encode_plus(raw_text, return_offsets_mapping=True, add_special_tokens=False)
    token_ids = encoding['input_ids']
    token_offsets = encoding['offset_mapping']

    # (start, end) character bounds of every detected keyword.
    span_bounds = []
    for span_info in found_spans:
        span_start, span_end = span_info['span']
        span_bounds.append((span_start, span_end))

    # A token is a keyword token when its character range overlaps any span.
    labels = [
        'KEYWORD'
        if any(tok_start < s_end and tok_end > s_start for s_start, s_end in span_bounds)
        else 'O'
        for tok_start, tok_end in token_offsets
    ]

    return labels, input_tokenizer.convert_ids_to_tokens(token_ids)

# Function to process a batch and save intermediate results
def process_batch(batch, batch_index, input_keywords, input_tokenizer):
    """Tag every row of one batch and write the result to an intermediate CSV.

    Args:
        batch: DataFrame with a 'text' column (typically a slice of the full
            dataset). It is not modified; the work is done on a copy.
        batch_index: Number used in the progress-bar label and the file name.
        input_keywords: Keywords forwarded to ``tag_text``.
        input_tokenizer: Tokenizer forwarded to ``tag_text``.

    Returns:
        None. The tagged batch is written to
        ``./files/intermediate_artifacts/intermediate_batch_{batch_index}.csv``.
    """
    # Local import keeps the function self-contained when it is shipped to
    # worker processes.
    import os

    # Create the output directory up front; exist_ok tolerates several workers
    # racing to create it.
    output_dir = './files/intermediate_artifacts'
    os.makedirs(output_dir, exist_ok=True)

    # Adding columns to a slice of another frame triggers pandas'
    # chained-assignment warning and may write through to the caller's data,
    # so operate on a private copy instead.
    batch = batch.copy()

    tqdm.pandas(desc=f"Processing Batch {batch_index}")
    batch[['labels', 'tokens']] = batch.progress_apply(lambda row: tag_text(row, input_keywords, input_tokenizer), axis=1, result_type='expand')
    batch.to_csv(f'{output_dir}/intermediate_batch_{batch_index}.csv', index=False)
    print(f"Batch {batch_index} processed and saved.")

In [12]:
# Number of rows that will be split into batches below.
len(twitter_text)

506952

In [13]:
# twitter_text is the input DataFrame; batch_size trades memory for the
# number of intermediate files.
batch_size = 10_000

# Ceiling division: the last batch may be shorter than batch_size.
num_batches = -(-len(twitter_text) // batch_size)

# One delayed task per consecutive slice of batch_size rows, spread over all cores.
batch_jobs = (
    delayed(process_batch)(twitter_text.iloc[start:start + batch_size], start // batch_size, keywords, tokenizer)
    for start in range(0, len(twitter_text), batch_size)
)
Parallel(n_jobs=-1)(batch_jobs)

# Read the per-batch files back in batch order and merge them into one CSV.
batch_frames = []
for i in range(num_batches):
    batch_frames.append(pd.read_csv(f'./files/intermediate_artifacts/intermediate_batch_{i}.csv'))
consolidated = pd.concat(batch_frames)
consolidated.to_csv('./files/twitter_tagged.csv', index=False)
print("All batches processed. Final output saved.")

Processing Batch 0:   0%|          | 10/10000 [00:00<10:15, 16.23it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (701 > 512). Running this sequence through the model will result in indexing errors
Processing Batch 5:   0%|          | 3/10000 [00:00<06:05, 27.38it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors
Processing Batch 1:   0%|          | 14/10000 [00:00<09:59, 16.65it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (874 > 512). Running this sequence through the model will result in indexing errors
Processing Batch 0:   0%|          | 19/10000 [00:01<09:36, 17.32it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (770 > 512). Running this sequence through the model will result in indexing 

Batch 4 processed and saved.


Processing Batch 3: 100%|██████████| 10000/10000 [13:33<00:00, 12.29it/s]
Processing Batch 1: 100%|█████████▉| 9967/10000 [13:36<00:03,  9.01it/s]

Batch 3 processed and saved.


Processing Batch 1: 100%|██████████| 10000/10000 [13:40<00:00, 12.19it/s]
Processing Batch 7:  99%|█████████▉| 9905/10000 [13:42<00:10,  8.90it/s]

Batch 1 processed and saved.


Processing Batch 0: 100%|██████████| 10000/10000 [13:44<00:00, 12.13it/s]
Processing Batch 7:  99%|█████████▉| 9942/10000 [13:46<00:06,  9.41it/s]

Batch 0 processed and saved.


Processing Batch 6: 100%|█████████▉| 9980/10000 [13:47<00:01, 10.15it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (575 > 512). Running this sequence through the model will result in indexing errors
Processing Batch 6: 100%|██████████| 10000/10000 [13:50<00:00, 12.05it/s]
Processing Batch 9:   2%|▏         | 165/10000 [00:16<15:52, 10.33it/s]]

Batch 6 processed and saved.


Processing Batch 7: 100%|██████████| 10000/10000 [13:52<00:00, 12.01it/s]
Processing Batch 5: 100%|██████████| 10000/10000 [13:53<00:00, 12.00it/s]
Processing Batch 8:   2%|▏         | 237/10000 [00:23<15:47, 10.30it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (641 > 512). Running this sequence through the model will result in indexing errors
Processing Batch 11:   1%|          | 83/10000 [00:08<16:46,  9.85it/s]

Batch 7 processed and saved.


Processing Batch 14:   0%|          | 3/10000 [00:00<09:57, 16.73it/s]]]

Batch 5 processed and saved.


Processing Batch 12:   1%|          | 52/10000 [00:05<18:02,  9.19it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (546 > 512). Running this sequence through the model will result in indexing errors
Processing Batch 2:  99%|█████████▊| 9852/10000 [14:31<00:16,  8.79it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (637 > 512). Running this sequence through the model will result in indexing errors
Processing Batch 2: 100%|██████████| 10000/10000 [14:50<00:00, 11.23it/s]
Processing Batch 8:   8%|▊         | 780/10000 [01:21<16:54,  9.09it/s]]

Batch 2 processed and saved.


Processing Batch 13:   6%|▌         | 559/10000 [01:01<17:22,  9.06it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (971 > 512). Running this sequence through the model will result in indexing errors
Processing Batch 13:   8%|▊         | 816/10000 [01:35<17:31,  8.73it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (525 > 512). Running this sequence through the model will result in indexing errors
Processing Batch 12:   9%|▉         | 943/10000 [01:51<17:01,  8.86it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (592 > 512). Running this sequence through the model will result in indexing errors
Processing Batch 8:  13%|█▎        | 1253/10000 [02:17<15:58,  9.13it/s]]
Processing Batch 12:  10%|▉         | 972/10000 [01:55<17:56,  8.39it/s]
Processing Batch 10:  11%|█▏        | 1147/10000 [02:06<16:12,  9.10it/s]
Processing Batch 

KeyboardInterrupt: 

**TEST:**

In [None]:
def test_tokenizer(index):
    text = twitter_text.iloc[110].text
    print(f"Text:\n {text}")
    print(f"Extract Keywords:\n {extract_symbol_spans(text, keywords)}")
    print(f"Tokenized:")
    labels, tokens = tag_text(twitter_text.iloc[110], keywords, tokenizer)
    for token, label in zip(tokens, labels):
        if label == "KEYWORD":
            print(token)

In [None]:
# Spot-check the tagger output for row 110.
test_tokenizer(110)

## PyTorch Dataset

In [None]:
from torch.utils.data import Dataset
import pandas as pd

class NERDataset(Dataset):
    """Token-classification dataset backed by the tagged CSV.

    Each CSV row must provide a 'text' column and a 'labels' column holding
    the string form of a Python list with one 'O'/'KEYWORD' label per
    sub-word token of the text (tokenized WITHOUT special tokens).

    Args:
        filename: Path of the tagged CSV file.
        input_tokenizer: Tokenizer exposing ``encode_plus``; it must be the
            same tokenizer that produced the stored labels, otherwise tokens
            and labels will not line up.
        max_len: Fixed sequence length every example is padded/truncated to.
    """

    def __init__(self, filename, input_tokenizer, max_len):
        self.data = pd.read_csv(filename)
        self.tokenizer = input_tokenizer
        self.max_len = max_len
        self.label_map = {'O': 0, 'KEYWORD': 1}

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for row ``idx``.

        All three are 1-D tensors of length ``max_len``. Special and padding
        positions are labelled 'O'.
        """
        row = self.data.iloc[idx]
        text = row['text']
        # The labels were serialized as a list literal; literal_eval parses it
        # without executing arbitrary code from the file (unlike eval).
        token_labels = ast.literal_eval(row['labels'])

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_special_tokens_mask=True,  # marks special + padding positions
            return_tensors='pt'
        )

        # The stored labels are already per sub-word token, so alignment is a
        # straight walk: special/padding positions get 'O' and consume nothing,
        # every real token consumes the next stored label. Labels beyond the
        # truncation point are simply never consumed.
        outside_id = self.label_map['O']
        remaining = iter(token_labels)
        special_mask = encoding['special_tokens_mask'].flatten().tolist()
        labels = [
            outside_id if is_special else self.label_map[next(remaining, 'O')]
            for is_special in special_mask
        ]

        # Defensive: guarantee exactly max_len labels even if the tokenizer
        # returned a different length.
        labels = labels[:self.max_len]
        labels += [outside_id] * (self.max_len - len(labels))

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.long)
        }

# Build the dataset from the consolidated tagged file, padding/truncating
# every example to 128 tokens.
tokenizer = XLMRobertaTokenizerFast.from_pretrained('xlm-roberta-base')
dataset = NERDataset('./files/twitter_tagged.csv', tokenizer, 128)

In [None]:
# Number of examples available for training and validation.
len(dataset)

In [None]:
# Inspect the first example (input_ids, attention_mask, labels).
next(iter(dataset))

## Dataloaders

In [None]:
from torch.utils.data import DataLoader, random_split

# `dataset` is the NERDataset instance built above.
# 80% of the examples go to training, the remainder to validation.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Seed the split explicitly so the train/validation partition is the same on
# every run; without a generator, random_split draws from the global RNG and
# the validation set changes between runs.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(dataset, [train_size, val_size], generator=split_generator)

# Training batches are reshuffled each epoch; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

## Training

In [None]:
import torch
import torch.nn as nn
from transformers import XLMRobertaForTokenClassification, AdamW
import numpy as np


# Pretrained XLM-R with a token-classification head; num_labels must match
# the number of distinct label ids produced by the dataset.
model = XLMRobertaForTokenClassification.from_pretrained('xlm-roberta-base', num_labels=2)

# Optimizer over all model parameters
optimizer = AdamW(model.parameters(), lr=5e-5)

# Stand-alone loss object (the training loop reads the loss from the model output)
loss_fn = nn.CrossEntropyLoss()

# Early stopping state: stop after `patience` epochs without a new best
# validation loss.
patience = 3
best_loss = float('inf')
counter = 0

# Training parameters
num_epochs = 1
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)

In [None]:
for epoch in range(num_epochs):
    # ---------------- training pass ----------------
    model.train()
    total_loss = 0

    train_progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1} Training", leave=False)
    for batch in train_progress_bar:
        # Move the three tensors of the batch to the training device.
        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'labels')
        )

        # Passing labels makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss
        train_progress_bar.set_postfix({'Train Loss': batch_loss})

    avg_train_loss = total_loss / len(train_loader)

    # ---------------- validation pass ----------------
    model.eval()
    total_val_loss = 0

    val_progress_bar = tqdm(val_loader, desc=f"Epoch {epoch + 1} Validation", leave=False)
    with torch.no_grad():
        for batch in val_progress_bar:
            input_ids, attention_mask, labels = (
                batch[key].to(device) for key in ('input_ids', 'attention_mask', 'labels')
            )

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            batch_loss = loss.item()
            total_val_loss += batch_loss
            val_progress_bar.set_postfix({'Val Loss': batch_loss})

    avg_val_loss = total_val_loss / len(val_loader)
    print(f'Epoch {epoch + 1} - Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}')

    # ---------------- early stopping ----------------
    # A strictly lower validation loss resets the counter; anything else
    # counts as one epoch without improvement.
    if avg_val_loss < best_loss:
        best_loss = avg_val_loss
        counter = 0
        continue

    counter += 1
    if counter >= patience:
        print("Early stopping triggered")
        break

## Save

In [None]:
# Persist only the learned weights (state dict), not the whole model object.
model_save_path = './models/xlm_roberta_ner_model.pt'
trained_weights = model.state_dict()
torch.save(trained_weights, model_save_path)

## Load and Inference

In [None]:
def load_model(model_path, num_labels):
    """Rebuild the token-classification model and load saved weights into it.

    Args:
        model_path: Path of a state dict written with ``torch.save``.
        num_labels: Number of output labels the classification head was
            trained with.

    Returns:
        The model with the saved weights loaded, switched to eval mode.
    """
    # Initialize the architecture from the base checkpoint
    stock_ner_model = XLMRobertaForTokenClassification.from_pretrained('xlm-roberta-base', num_labels=num_labels)
    # map_location='cpu' lets weights saved from a GPU run be loaded on a
    # machine without CUDA; load_state_dict copies them into the model's own
    # parameters, so the model's device is unaffected.
    state_dict = torch.load(model_path, map_location='cpu')
    stock_ner_model.load_state_dict(state_dict)
    stock_ner_model.eval()
    return stock_ner_model

def predict(text, model, tokenizer, max_len=128):
    """Label every token position of ``text`` with the model's prediction.

    Args:
        text: Input string.
        model: Callable model exposing ``.device`` and returning an object
            with a ``logits`` attribute.
        tokenizer: Tokenizer exposing ``encode_plus`` and
            ``convert_ids_to_tokens``.
        max_len: Length the input is padded/truncated to.

    Returns:
        A list of ``(token, label)`` pairs, one per position of the encoded
        input (padding positions included), with label 'O' or 'KEYWORD'.
    """
    id_to_label = {0: 'O', 1: 'KEYWORD'}  # must mirror the training label mapping

    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    # Inputs have to live on the same device as the model.
    target_device = model.device
    input_ids = encoded['input_ids'].to(target_device)
    attention_mask = encoded['attention_mask'].to(target_device)

    # Forward pass without gradient tracking
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # Most probable class per position
    class_ids = torch.softmax(outputs.logits, dim=-1).argmax(dim=-1)

    token_strings = tokenizer.convert_ids_to_tokens(input_ids.squeeze().tolist())
    label_names = []
    for class_id in class_ids.squeeze().tolist():
        label_names.append(id_to_label[class_id])

    return [(token, label) for token, label in zip(token_strings, label_names)]

In [None]:
# Restore the fine-tuned weights and the matching tokenizer for inference.
model = load_model(model_path='./models/xlm_roberta_ner_model.pt', num_labels=2)
tokenizer = XLMRobertaTokenizerFast.from_pretrained('xlm-roberta-base')

In [None]:
# Run the model on a short sample sentence and print the (token, label) pairs.
sample_text = "خساپا سهم بسیار خوبی است"
predictions = predict(sample_text, model, tokenizer)
print(predictions)

In [None]:
# Second sample sentence; prints the (token, label) pairs returned by predict.
sample_text = "با خرید بورس شستا سود خوبی کردیم"
predictions = predict(sample_text, model, tokenizer)
print(predictions)