In [None]:
import pandas as pd

In [18]:
df = pd.read_excel("/kaggle/input/authorshipclassficiationval/AuthorshipClassficiationVal.xlsx")

In [19]:
df.head()

Unnamed: 0,id,text_in_author_style,author
0,3835,من طفل في الخمسين\n\nعمري ما احتفلت أو حفلت بع...,يوسف إدريس
1,3836,ذلك الزمن العام هو العداد العام الذي\n\nدام يع...,يوسف إدريس
2,3837,مصر الغنية المثقفة المصنِّعة، والعرب\n\nوقد\n\...,يوسف إدريس
3,3838,ولأنها غريبة وراودتني فيها عن الناس وعن الحياة...,يوسف إدريس
4,3839,وليس ما ذكرته مرارة ولا ندمًا؛ فقد كان لا يمكن...,يوسف إدريس


In [20]:
len(df)

4157

In [21]:
print(df["author"].unique())

['يوسف إدريس' 'فؤاد زكريا' 'حسن حنفي' 'عبد الغفار مكاوي' 'كامل كيلاني'
 'نوال السعداوي' 'أحمد شوقي' 'أحمد تيمور باشا' 'ثروت أباظة' 'سلامة موسى'
 'جبران خليل جبران' 'روبرت بار' 'ويليام شيكسبير' 'طه حسين' 'أمين الريحاني'
 'غوستاف لوبون' 'نجيب محفوظ' 'أحمد أمين' 'محمد حسين هيكل' 'جُرجي زيدان'
 'عباس محمود العقاد']


In [22]:
df.isna().sum()

id                      0
text_in_author_style    0
author                  0
dtype: int64

In [None]:
import pandas as pd
import numpy as np
from openai import OpenAI
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
import json
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

API_KEYS = [
## api key
]

api_index = 0
api_lock = Lock()
request_times = []
request_lock = Lock()

RPM_LIMIT = 40
RATE_WINDOW = 60  

def get_next_api_key():
    """Rotate through API keys"""
    global api_index
    with api_lock:
        key = API_KEYS[api_index]
        api_index = (api_index + 1) % len(API_KEYS)
        return key

def rate_limit_check():
    """Ensure we don't exceed RPM limit"""
    with request_lock:
        current_time = time.time()
        request_times[:] = [t for t in request_times if current_time - t < RATE_WINDOW]
        
        if len(request_times) >= RPM_LIMIT:
            # Calculate sleep time
            oldest_request = request_times[0]
            sleep_time = RATE_WINDOW - (current_time - oldest_request)
            if sleep_time > 0:
                print(f"Rate limit reached. Sleeping for {sleep_time:.2f} seconds...")
                time.sleep(sleep_time)
        
        request_times.append(time.time())

def create_classification_prompt(text, authors_list):
    """
    Create a well-engineered prompt for authorship classification
    """
    prompt = f"""أنت خبير في تحليل الأسلوب الأدبي العربي. مهمتك هي تحديد كاتب النص التالي من بين قائمة من المؤلفين.

قائمة المؤلفين المحتملين:
{', '.join(authors_list)}

النص المراد تحليله:
\"\"\"{text}\"\"\"

قم بتحليل:
1. الأسلوب الأدبي والبلاغي
2. المفردات والتراكيب اللغوية المستخدمة
3. الموضوعات والأفكار المطروحة
4. السمات الأسلوبية المميزة

بناءً على تحليلك، أجب فقط باسم المؤلف من القائمة المعطاة. لا تضف أي تفسيرات أو نصوص إضافية.

الإجابة (اسم المؤلف فقط):"""
    
    return prompt

def classify_text(row_data, authors_list, max_retries=3):
    """
    Classify a single text using the LLM with retry logic
    """
    text_id = row_data['id']
    text = row_data['text_in_author_style']
    true_author = row_data['author']
    
    for attempt in range(max_retries):
        try:
            # Rate limiting
            rate_limit_check()
            
            # Get API key
            api_key = get_next_api_key()
            
            # Create client
            client = OpenAI(
                base_url="https://integrate.api.nvidia.com/v1",
                api_key=api_key
            )
            
            # Create prompt
            prompt = create_classification_prompt(text, authors_list)
            
            # Make API call
            completion = client.chat.completions.create(
                model="deepseek-ai/deepseek-v3.2",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.3,  # Lower temperature for more consistent classification
                top_p=0.95,
                max_tokens=100,  # We only need the author name
                extra_body={"chat_template_kwargs": {"thinking": True}},
                stream=True
            )
            
            # Collect response
            predicted_author = ""
            for chunk in completion:
                if not getattr(chunk, "choices", None):
                    continue
                if chunk.choices and chunk.choices[0].delta.content is not None:
                    predicted_author += chunk.choices[0].delta.content
            
            # Clean the response
            predicted_author = predicted_author.strip()
            
            # Validate that the predicted author is in the list
            if predicted_author not in authors_list:
                # Try to find closest match
                for author in authors_list:
                    if author in predicted_author or predicted_author in author:
                        predicted_author = author
                        break
            
            print(f"✓ ID {text_id}: True={true_author}, Predicted={predicted_author}")
            
            return {
                'id': text_id,
                'text': text,
                'true_author': true_author,
                'predicted_author': predicted_author,
                'success': True
            }
            
        except Exception as e:
            print(f"✗ ID {text_id} - Attempt {attempt + 1} failed: {str(e)}")
            if attempt < max_retries - 1:
                time.sleep(2)  # Wait before retry
            else:
                print(f"✗ ID {text_id} - All attempts failed. Skipping.")
                return None
    
    return None

def parallel_classify(df, max_workers=5):
    """
    Classify texts in parallel using ThreadPoolExecutor
    """
    # Get unique authors
    authors_list = df['author'].unique().tolist()
    
    # Prepare data for parallel processing
    rows_data = []
    for idx, row in df.iterrows():
        rows_data.append({
            'id': row['id'],
            'text_in_author_style': row['text_in_author_style'],
            'author': row['author']
        })
    
    results = []
    
    # Process in parallel
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks
        future_to_row = {
            executor.submit(classify_text, row_data, authors_list): row_data 
            for row_data in rows_data
        }
        
        # Collect results as they complete
        for future in as_completed(future_to_row):
            result = future.result()
            if result is not None and result['success']:
                results.append(result)
    
    return pd.DataFrame(results)

def evaluate_predictions(results_df):
    """
    Calculate evaluation metrics
    """
    y_true = results_df['true_author'].values
    y_pred = results_df['predicted_author'].values
    
    # Accuracy
    accuracy = accuracy_score(y_true, y_pred)
    
    # F1 Score (weighted average for multi-class)
    f1_weighted = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    f1_macro = f1_score(y_true, y_pred, average='macro', zero_division=0)
    
    print("\n" + "="*60)
    print("EVALUATION METRICS")
    print("="*60)
    print(f"Total Samples Processed: {len(results_df)}")
    print(f"Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
    print(f"F1 Score (Weighted): {f1_weighted:.4f}")
    print(f"F1 Score (Macro): {f1_macro:.4f}")
    print("="*60)
    
    # Detailed classification report
    print("\nCLASSIFICATION REPORT:")
    print(classification_report(y_true, y_pred, zero_division=0))
    
    # Confusion matrix
    print("\nCONFUSION MATRIX:")
    cm = confusion_matrix(y_true, y_pred)
    unique_authors = sorted(results_df['true_author'].unique())
    
    # Create a simple confusion matrix display
    print(f"\n{'':20s}", end='')
    for author in unique_authors:
        print(f"{author[:15]:>15s}", end=' ')
    print()
    
    for i, true_author in enumerate(unique_authors):
        print(f"{true_author[:20]:20s}", end='')
        for j in range(len(unique_authors)):
            print(f"{cm[i][j]:>15d}", end=' ')
        print()
    
    return {
        'accuracy': accuracy,
        'f1_weighted': f1_weighted,
        'f1_macro': f1_macro
    }

if __name__ == "__main__":
    print("Loading data...")
    
    print(f"Total samples: {len(df)}")
    print(f"Unique authors: {len(df['author'].unique())}")
    print(f"Authors: {df['author'].unique().tolist()}")
    
    print("\n" + "="*60)
    print("Starting parallel classification...")
    print(f"Using {len(API_KEYS)} API keys with rate limit of {RPM_LIMIT} RPM")
    print("="*60 + "\n")
    
    start_time = time.time()
    
    results_df = parallel_classify(df, max_workers=8)
    
    elapsed_time = time.time() - start_time
    
    print(f"\n✓ Classification completed in {elapsed_time:.2f} seconds")
    print(f"✓ Successfully processed {len(results_df)} out of {len(df)} samples")
    
    results_df.to_csv('authorship_predictions.csv', index=False, encoding='utf-8-sig')
    print("✓ Results saved to 'authorship_predictions.csv'")
    
    metrics = evaluate_predictions(results_df)
    
    with open('evaluation_metrics.json', 'w', encoding='utf-8') as f:
        json.dump(metrics, f, indent=2, ensure_ascii=False)
    print("✓ Metrics saved to 'evaluation_metrics.json'")

In [16]:
import json

with open('/kaggle/working/evaluation_metrics.json', 'r') as f:
    metrics = json.load(f)

accuracy = metrics['accuracy']

print(f"Accuracy: {accuracy}")

Accuracy: 0.12533333333333332


In [17]:
import json

with open('/kaggle/working/evaluation_metrics.json', 'r') as f:
    metrics = json.load(f)

print("All metrics:", metrics)

accuracy = metrics.get('accuracy', None)
print(f"Accuracy: {accuracy}")

All metrics: {'accuracy': 0.12533333333333332, 'f1_weighted': 0.02791785150078989, 'f1_macro': 0.05568720379146919}
Accuracy: 0.12533333333333332
