In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/authorid-training/AuthorshipClassficiationTrain.xlsx
/kaggle/input/authorid/AuthorshipClassficiationVal.xlsx


In [3]:
import pandas as pd
import os

# 1. Automatic detection of the uploaded files
# Based on your uploads, we search for Train and Val files
train_path = ""
val_path = ""

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        if 'train' in filename.lower():
            train_path = full_path
        elif 'val' in filename.lower():
            val_path = full_path

# 2. Loading and Converting
try:
    if train_path and val_path:
        # Load Excel files
        train_df = pd.read_excel(train_path)
        val_df = pd.read_excel(val_path)
        
        # Save as CSV with correct encoding for Arabic
        train_df.to_csv('train_data.csv', index=False, encoding='utf-8-sig')
        val_df.to_csv('val_data.csv', index=False, encoding='utf-8-sig')
        
        print("Status: Success")
        print(f"Loaded Train: {train_path} ({len(train_df)} rows)")
        print(f"Loaded Val: {val_path} ({len(val_df)} rows)")
        print("-" * 30)
        print("New CSV files created: 'train_data.csv' and 'val_data.csv'")
    else:
        print("Status: Error - Could not find the files. Please check if they are added to the notebook.")
except Exception as e:
    print(f"Status: Exception - {str(e)}")

# 3. Preview to verify column names
if 'train_df' in locals():
    print("\nColumn names found:", train_df.columns.tolist())
    print("\nFirst 2 rows of Training Data:")
    print(train_df.head(2))

Status: Success
Loaded Train: /kaggle/input/authorid-training/AuthorshipClassficiationTrain.xlsx (35122 rows)
Loaded Val: /kaggle/input/authorid/AuthorshipClassficiationVal.xlsx (4157 rows)
------------------------------
New CSV files created: 'train_data.csv' and 'val_data.csv'

Column names found: ['id', 'text_in_author_style', 'author']

First 2 rows of Training Data:
     id                               text_in_author_style      author
0  5843  هذه الكتب التي أصدرتُها منذ بدأت كتابة باب "من...  يوسف إدريس
1  5844  صعب جِدًّا في ظل هذا التقسيم الإرهابي أن أقول ...  يوسف إدريس


Data Preprocessing

In [5]:
import re
import string

def clean_arabic_text(text):
    # Convert to string to avoid errors with non-text cells
    text = str(text)
    
    # 1. Remove URLs, Emails, and Mentions
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\S+@\S+', '', text)
    text = re.sub(r'@\S+', '', text)
    
    # 2. Normalize Arabic letters (Alef, Yeh, and Teh Marbuta)
    # This helps the model see different spellings of the same word as one
    text = re.sub("[إأآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ة", "ه", text)
    
    # 3. Remove Arabic Diacritics (Tashkeel)
    tashkeel_pattern = re.compile(r"[\u064B-\u0652]")
    text = re.sub(tashkeel_pattern, "", text)
    
    # 4. Remove Tatweel (Kashida) like "جميــــل" -> "جميل"
    text = re.sub(r'ـ+', '', text)
    
    # 5. Remove Punctuation (Standard and Arabic)
    arabic_punctuation = '«»"\'()[]{}،؛؟!'
    all_punct = string.punctuation + arabic_punctuation
    text = text.translate(str.maketrans('', '', all_punct))
    
    # 6. Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# Apply the cleaning to the training and validation dataframes
print("Starting Preprocessing... this might take a minute.")
train_df['text_in_author_style'] = train_df['text_in_author_style'].apply(clean_arabic_text)
val_df['text_in_author_style'] = val_df['text_in_author_style'].apply(clean_arabic_text)

print("Status: Preprocessing Complete!")
print("\nSample of Cleaned Text (First 150 characters):")
print(train_df['text_in_author_style'].iloc[0][:150])

Starting Preprocessing... this might take a minute.
Status: Preprocessing Complete!

Sample of Cleaned Text (First 150 characters):
هذه الكتب التي اصدرتها منذ بدات كتابه باب من مفكره يوسف ادريس في الاهرام كل اثنين القت علي شخصيا وعلي الحركه الثقافيه والفنيه المصريه والعربيه سؤالا ل


Label Encoding (Assigning IDs to Authors)

In [6]:
from sklearn.preprocessing import LabelEncoder
import json

# 1. Initialize the LabelEncoder
label_encoder = LabelEncoder()

# 2. Fit and transform the 'author' column to numbers
train_df['label'] = label_encoder.fit_transform(train_df['author'])
val_df['label'] = label_encoder.transform(val_df['author'])

# 3. Create a dictionary to remember which number belongs to which author
id2label = {int(i): label for i, label in enumerate(label_encoder.classes_)}
label2id = {label: int(i) for i, label in enumerate(label_encoder.classes_)}

# 4. Save the mapping to a file (important for the testing phase later)
with open('mapping.json', 'w', encoding='utf-8') as f:
    json.dump(id2label, f, ensure_ascii=False)

print("Status: Success")
print(f"Total Unique Authors: {len(label_encoder.classes_)}")
print("-" * 30)
print("Samples of the mapping (ID to Author Name):")
for i in range(min(5, len(label_encoder.classes_))):
    print(f"ID {i}  ==>  {id2label[i]}")

Status: Success
Total Unique Authors: 21
------------------------------
Samples of the mapping (ID to Author Name):
ID 0  ==>  أحمد أمين
ID 1  ==>  أحمد تيمور باشا
ID 2  ==>  أحمد شوقي
ID 3  ==>  أمين الريحاني
ID 4  ==>  ثروت أباظة


******Tokenization & Sliding Window******

In [7]:
import torch
from transformers import AutoTokenizer

# 1. Load the MARBERTv2 Tokenizer
model_name = "UBC-NLP/MARBERTv2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_with_sliding_window(df, max_len=512, stride=256):
    all_input_ids = []
    all_attention_masks = []
    all_labels = []
    
    print("Processing texts... this might take a few minutes.")
    
    for index, row in df.iterrows():
        text = str(row['text_in_author_style'])
        label = row['label']
        
        # Tokenize the whole text without truncation first
        full_tokens = tokenizer(text, truncation=False, add_special_tokens=True)
        input_ids = full_tokens['input_ids']
        
        # If text is short, just pad it
        if len(input_ids) <= max_len:
            chunks = [input_ids]
        else:
            # Create overlapping chunks
            chunks = [input_ids[i : i + max_len] for i in range(0, len(input_ids), stride)]
        
        for chunk in chunks:
            # Padding if the last chunk is smaller than max_len
            if len(chunk) < max_len:
                chunk = chunk + [tokenizer.pad_token_id] * (max_len - len(chunk))
            else:
                chunk = chunk[:max_len]
            
            mask = [1 if t != tokenizer.pad_token_id else 0 for t in chunk]
            
            all_input_ids.append(chunk)
            all_attention_masks.append(mask)
            all_labels.append(label)
            
    return {
        'input_ids': torch.tensor(all_input_ids),
        'attention_mask': torch.tensor(all_attention_masks),
        'labels': torch.tensor(all_labels)
    }

# Apply to Train and Validation
train_data_final = tokenize_with_sliding_window(train_df)
val_data_final = tokenize_with_sliding_window(val_df)

print(f"\nStatus: Success")
print(f"Original Training Rows: {len(train_df)}")
print(f"Final Training Samples (after sliding window): {len(train_data_final['labels'])}")
print(f"Final Validation Samples: {len(val_data_final['labels'])}")

tokenizer_config.json:   0%|          | 0.00/439 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Processing texts... this might take a few minutes.
Processing texts... this might take a few minutes.

Status: Success
Original Training Rows: 35122
Final Training Samples (after sliding window): 35658
Final Validation Samples: 4208


Formatting Data for the Model****

In [8]:
import torch

class AuthorDataset(torch.utils.data.Dataset):
    def __init__(self, data_dict):
        # We store the dictionary of tensors (input_ids, attention_mask, labels)
        self.data = data_dict
        
    def __len__(self):
        # Return the total number of samples
        return len(self.data['labels'])
    
    def __getitem__(self, idx):
        # Return a single sample (one row) for the model to process
        return {key: val[idx] for key, val in self.data.items()}

# Create the final dataset objects
train_dataset = AuthorDataset(train_data_final)
val_dataset = AuthorDataset(val_data_final)

print("Status: Success")
print(f"Train Dataset: {len(train_dataset)} samples ready.")
print(f"Validation Dataset: {len(val_dataset)} samples ready.")

Status: Success
Train Dataset: 35658 samples ready.
Validation Dataset: 4208 samples ready.


Starting the Training Engine

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import f1_score, accuracy_score

# 1. Load MARBERTv2
model = AutoModelForSequenceClassification.from_pretrained(
    "UBC-NLP/MARBERTv2", 
    num_labels=21
)

# 2. Define Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    f1 = f1_score(labels, predictions, average='macro')
    acc = accuracy_score(labels, predictions)
    return {"macro_f1": f1, "accuracy": acc}

# 3. Training Configurations (Updated for compatibility)
training_args = TrainingArguments(
    output_dir='./author_id_model',
    num_train_epochs=3,
    per_device_train_batch_size=8,   # Reduced to 8 to be safer
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    eval_strategy="epoch",           # Changed from evaluation_strategy
    save_strategy="epoch",
    logging_steps=100,
    fp16=True,                       # GPU acceleration
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    report_to="none"
)

# 4. Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# 5. EXECUTE
print("Starting training with updated settings...")
trainer.train()

In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm.auto import tqdm

# 1. Load the trained model and tokenizer
model_path = "./author_id_model" # This is where the trainer saved it
tokenizer = AutoTokenizer.from_pretrained("UBC-NLP/MARBERTv2")
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.to("cuda") # Move to GPU for fast prediction
model.eval()

# 2. Load the Competition Test Data
test_df = pd.read_excel("/kaggle/input/authorid/AuthorshipClassficiationTest.xlsx")

# 3. Clean the test text (same way we did in training)
test_df['cleaned_text'] = test_df['text_in_author_style'].str.replace(r'[إأآا]', 'ا', regex=True).str.replace(r'ة', 'ه', regex=True)

predictions = []

# 4. Start Predicting
print("Predicting authors for the test set...")
with torch.no_grad():
    for text in tqdm(test_df['cleaned_text']):
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to("cuda")
        outputs = model(**inputs)
        pred_id = torch.argmax(outputs.logits, dim=-1).item()
        # Convert ID back to Author Name using our map from Cell 3
        author_name = id2author[pred_id]
        predictions.append(author_name)

# 5. Create the Submission File
submission = pd.DataFrame({
    'id': test_df['id'],
    'author': predictions
})

submission.to_csv("submission.csv", index=False)
print("Success! Your submission.csv is ready.")

tokenizer_config.json:   0%|          | 0.00/439 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

HFValidationError: Repo id must use alphanumeric chars, '-', '_' or '.'. The name cannot start or end with '-' or '.' and the maximum length is 96: './author_id_model'.