# Semantic Analysis Model Training
## Based on ChaosLingua System Architecture - Panelist 4
### Implements Romanian BERT for Semantic Understanding and Dialectal Analysis

In [None]:
# Install PyTorch with CUDA support (Kaggle GPU)
# NOTE(review): no CUDA-specific index URL is passed, so pip resolves its
# default build - confirm that build matches the Kaggle GPU image.
%pip install torch torchvision torchaudio
# Hugging Face libraries; `transformers` and `datasets` are imported below.
# NOTE(review): `evaluate` and `accelerate` are not imported in the visible
# notebook - confirm they are still required.
%pip install transformers datasets evaluate accelerate
# Presumably tokenizer-related dependencies (TODO confirm); --quiet trims pip output.
%pip install protobuf sentencepiece tiktoken --quiet
# NOTE(review): networkx is not imported in the visible notebook - confirm it is needed.
%pip install networkx --quiet

In [None]:
# Setup HuggingFace API access
import os

from huggingface_hub import login

# SECURITY: an access token must never be committed inside the notebook.
# The token is read from the HF_TOKEN environment variable instead (for
# example populated from a notebook secret). Any token that was previously
# pasted into this cell should be revoked and replaced.
hf_api_key = os.environ.get("HF_TOKEN")

if hf_api_key:
    login(token=hf_api_key)
else:
    # Without a token the login call is skipped rather than prompting
    # interactively; publicly readable resources can still be downloaded.
    print("HF_TOKEN is not set - skipping Hugging Face login (anonymous access only)")

In [None]:
import pandas as pd
import requests
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
import torch

def load_semantic_dataset_simple(dataset_name, split='train'):
    """Download one split of a Hugging Face dataset as a single DataFrame.

    The function queries
    ``https://huggingface.co/api/datasets/<dataset_name>/parquet/default/<split>``,
    reads every entry of the JSON reply with ``pd.read_parquet`` and
    concatenates the results. It is best effort: failures are printed, a file
    that cannot be read is skipped, and ``None`` is returned when the request
    fails or no file could be read.

    Args:
        dataset_name: Repository id of the dataset on the Hugging Face Hub.
        split: Name of the split to fetch.

    Returns:
        The concatenated ``pandas.DataFrame``, or ``None`` on failure.
    """
    endpoint = f"https://huggingface.co/api/datasets/{dataset_name}/parquet/default/{split}"
    print(f"üîó Loading: {dataset_name} ({split})")

    try:
        # Ask the Hub which parquet files back this split
        reply = requests.get(endpoint, timeout=30)
        if reply.status_code != 200:
            print(f"‚ùå API failed: {reply.status_code}")
            return None

        # presumably a JSON list of parquet URLs - verify against the Hub API
        shard_urls = reply.json()
        print(f"üìÅ Found {len(shard_urls)} parquet file(s)")

        # Read the files one by one; a broken file does not abort the rest
        frames = []
        for position, shard_url in enumerate(shard_urls, start=1):
            print(f"  Loading file {position}: {shard_url}")
            try:
                shard = pd.read_parquet(shard_url)
                frames.append(shard)
                print(f"    ‚úÖ {len(shard)} rows")
            except Exception as read_error:
                print(f"    ‚ùå Failed: {str(read_error)}")

        if frames:
            # Stitch the pieces together under a fresh 0..n-1 index
            merged = pd.concat(frames, ignore_index=True)
            print(f"üéâ SUCCESS: {len(merged)} rows, {len(merged.columns)} columns")
            return merged

        print(f"‚ùå No files loaded successfully")
        return None

    except Exception as load_error:
        print(f"‚ùå Error: {str(load_error)}")
        return None

# Datasets to download, each paired with the splits requested for it
semantic_datasets_config = [
    # Semantic understanding
    ('readerbench/ro-text-summarization', ['train', 'validation', 'test']),
    # Regionalisms and archaisms
    ('fmi-unibuc/RoAcReL', ['train', 'test']),
]

# dataset name -> {split name -> DataFrame}; only successful loads are stored
loaded_semantic_datasets = {}

for dataset_name, splits in semantic_datasets_config:
    print(f"\n{'='*60}")
    print(f"üì¶ Dataset: {dataset_name}")

    dataset_splits = {}
    for split in splits:
        df = load_semantic_dataset_simple(dataset_name, split)
        if df is None:
            # The loader already reported the failure; move on to the next split
            continue

        dataset_splits[split] = df
        print(f"\nüìä {split.upper()} split:")
        print(f"   Shape: {df.shape}")
        print(f"   Columns: {df.columns.tolist()}")
        print(f"   Memory: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

        print(f"\nüìã Sample Data:")
        print(df.head(2))

    # A dataset is recorded only when at least one of its splits loaded
    if len(dataset_splits) > 0:
        loaded_semantic_datasets[dataset_name] = dataset_splits

    print("="*60)

print(f"\nüèÜ RESULTS:")
print(f"   Successfully loaded: {len(loaded_semantic_datasets)} semantic datasets")

# Per-split row counts, accumulated into a grand total along the way
total_rows = 0
for name, splits_dict in loaded_semantic_datasets.items():
    print(f"\n   {name}:")
    for split, df in splits_dict.items():
        print(f"      {split}: {len(df):,} rows")
        total_rows += len(df)

print(f"\n   TOTAL: {total_rows:,} semantic analysis examples! üî•")

In [None]:
# Load the pretrained Romanian BERT checkpoint together with its tokenizer
model_name = "dumitrescustefan/bert-base-romanian-cased-v1"
tokenizer, base_model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModel.from_pretrained(model_name),
)

# Short report of what was loaded, one item per line
print(
    f"‚úÖ Loaded {model_name}",
    f"   Vocabulary size: {tokenizer.vocab_size}",
    f"   Max sequence length: {tokenizer.model_max_length}",
    sep="\n",
)

# Create semantic analysis model
# `nn` has to be bound before the class statement below executes, because the
# base class expression `nn.Module` is evaluated at definition time.
import torch.nn as nn


class SemanticAnalyzer(nn.Module):
    """Shared encoder followed by three independent linear classification heads.

    The heads are: semantic coherence (``num_classes`` outputs), dialect
    (5 outputs, one per Romanian dialect region) and cultural appropriateness
    (2 outputs, appropriate/inappropriate).

    Args:
        base_model: Encoder. It must expose ``config.hidden_size`` and, when
            called with ``input_ids`` and ``attention_mask``, return an object
            carrying ``pooler_output`` (or ``last_hidden_state`` as fallback).
        num_classes: Number of outputs of the coherence head.
    """

    def __init__(self, base_model, num_classes=3):
        super().__init__()
        self.base_model = base_model
        self.dropout = nn.Dropout(0.1)

        # Every head reads the same pooled sentence vector
        hidden_size = base_model.config.hidden_size
        self.coherence_classifier = nn.Linear(hidden_size, num_classes)
        self.dialect_classifier = nn.Linear(hidden_size, 5)
        self.cultural_classifier = nn.Linear(hidden_size, 2)

    def forward(self, input_ids, attention_mask=None):
        """Encode the batch and apply all three heads.

        Returns:
            A dict with ``coherence_logits``, ``dialect_logits``,
            ``cultural_logits`` and ``embeddings`` (the pooled vector after
            dropout, i.e. exactly what the heads received).
        """
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)

        pooled_output = getattr(outputs, 'pooler_output', None)
        if pooled_output is None:
            # Encoder without a pooling layer: use the first token's hidden
            # state instead of failing inside dropout on a None value.
            pooled_output = outputs.last_hidden_state[:, 0]
        pooled_output = self.dropout(pooled_output)

        return {
            'coherence_logits': self.coherence_classifier(pooled_output),
            'dialect_logits': self.dialect_classifier(pooled_output),
            'cultural_logits': self.cultural_classifier(pooled_output),
            'embeddings': pooled_output,
        }

import torch.nn as nn

# `device` is read by this cell and by the training, evaluation and inference
# code further down, so it is defined here: CUDA when PyTorch can see a GPU,
# CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SemanticAnalyzer(base_model).to(device)
print(f"‚úÖ Semantic analyzer model initialized on {device}")

In [None]:
# Process datasets for semantic training
def prepare_semantic_dataset(df, text_column='text', label_column='label', dataset_name=None):
    """Reduce a raw dataset frame to the two columns ``text`` and ``label``.

    The input frame is never modified.

    Args:
        df: Raw ``pandas.DataFrame`` of one dataset split.
        text_column: Preferred text column. When it is absent, the first of
            ``text``, ``sentence``, ``content``, ``document``, ``summary``
            present in ``df`` is used instead.
        label_column: Column holding labels. When it is absent (and no
            synthetic labels apply) every row gets the label ``0``.
        dataset_name: Optional Hub id of the dataset ``df`` came from. It
            selects synthetic labelling:
            ``readerbench/ro-text-summarization`` -> ``min(2, words // 20)``;
            ``fmi-unibuc/RoAcReL`` -> random integers in ``0..4``
            (placeholders, not real dialect annotations). When omitted, the
            name is looked for in ``str(df.columns)`` exactly as before.

    Returns:
        A new frame with columns ``['text', 'label']``, or ``None`` when no
        usable text column exists.
    """
    # Check what columns we have
    print(f"Available columns: {df.columns.tolist()}")

    # Handle different column names across datasets
    if text_column not in df.columns:
        text_candidates = ['text', 'sentence', 'content', 'document', 'summary']
        text_column = next((c for c in text_candidates if c in df.columns), text_column)
    has_text = text_column in df.columns

    # The dataset id cannot be recovered from column names, so prefer the
    # explicit argument; the column-string lookup is kept only for callers
    # that do not pass it.
    origin = dataset_name if dataset_name is not None else str(df.columns)

    synthetic_labels = None
    if has_text:  # synthetic labels are derived from the text column
        if 'readerbench/ro-text-summarization' in origin:
            print("Processing summarization dataset for semantic coherence...")
            # Length-based stand-in for coherence: 0, 1 or 2
            synthetic_labels = df[text_column].apply(
                lambda x: min(2, len(str(x).split()) // 20)
            )
            label_column = 'semantic_coherence'
        elif 'fmi-unibuc/RoAcReL' in origin:
            print("Processing dialect dataset...")
            # Random region ids 0-4; these exceed a 3-class head's range
            synthetic_labels = np.random.randint(0, 5, size=len(df))
            label_column = 'dialect_region'

    print(f"Using text column: {text_column}")
    print(f"Using label column: {label_column}")

    if not has_text:
        print(f"‚ùå Could not find proper text column")
        return None

    # Build the result on a copy so the caller's frame stays untouched
    simplified_df = df[[text_column]].copy()
    simplified_df.columns = ['text']
    if synthetic_labels is not None:
        simplified_df['label'] = synthetic_labels
    elif label_column in df.columns:
        simplified_df['label'] = df[label_column]
    else:
        # Create dummy labels if none exist
        simplified_df['label'] = 0
    return simplified_df

# Run the preparation step over every loaded split.
# Results are stored under "<dataset name>_<split>".
all_semantic_data = {}

for dataset_name, splits_dict in loaded_semantic_datasets.items():
    print(f"\n{'='*60}")
    print(f"üì¶ Processing Semantic: {dataset_name}")

    for split, df in splits_dict.items():
        prepared_df = prepare_semantic_dataset(df)
        if prepared_df is None:
            # No usable text column was found; nothing to store for this split
            continue

        all_semantic_data[f"{dataset_name}_{split}"] = prepared_df
        print(f"   {split}: {len(df)} rows ‚Üí {len(prepared_df)} prepared rows")

    print("="*60)

def _frames_for_split(prepared, split):
    """Return the frames of *prepared* whose key ends with ``_<split>``.

    Keys have the form ``"<dataset name>_<split>"``. Matching the suffix
    (rather than searching the whole key) keeps a dataset whose *name*
    happens to contain "train", "test" or "validation" out of the wrong set.
    """
    suffix = f"_{split}"
    return [frame for key, frame in prepared.items() if key.endswith(suffix)]


# Combine train splits for training
train_dfs = _frames_for_split(all_semantic_data, 'train')
if train_dfs:
    combined_train = pd.concat(train_dfs, ignore_index=True)
    print(f"\nüìä Combined training data: {len(combined_train)}")
else:
    combined_train = None
    print(f"\n‚ö†Ô∏è  No training data available")

# Combine validation splits for validation
val_dfs = _frames_for_split(all_semantic_data, 'validation')
if val_dfs:
    combined_val = pd.concat(val_dfs, ignore_index=True)
    print(f"üìä Combined validation data: {len(combined_val)}")
else:
    combined_val = None
    print(f"‚ö†Ô∏è  No validation data available")

# Combine test splits for testing
test_dfs = _frames_for_split(all_semantic_data, 'test')
if test_dfs:
    combined_test = pd.concat(test_dfs, ignore_index=True)
    print(f"üìä Combined test data: {len(combined_test)}")
else:
    combined_test = None
    print(f"‚ö†Ô∏è  No test data available")

In [None]:
# Tokenize datasets
def tokenize_semantic_data(examples):
    """Run the module-level ``tokenizer`` over ``examples["text"]``.

    Every sequence is padded or truncated to exactly 512 tokens and the
    result is requested as PyTorch tensors.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
        "return_tensors": "pt",
    }
    return tokenizer(examples["text"], **encode_options)

# Convert each combined frame to a HuggingFace Dataset and tokenize it
def _tokenize_frame(frame):
    """Return ``(raw dataset, tokenized dataset)`` for one pandas frame.

    The tokenized dataset has its ``text`` column removed and is set to the
    ``torch`` output format.
    """
    raw = Dataset.from_pandas(frame)
    encoded = raw.map(tokenize_semantic_data, batched=True)
    encoded = encoded.remove_columns(["text"])
    encoded.set_format("torch")
    return raw, encoded


if combined_train is None:
    tokenized_train = None
else:
    train_dataset, tokenized_train = _tokenize_frame(combined_train)
    print(f"‚úÖ Tokenized train dataset: {len(tokenized_train)} examples")

if combined_val is None:
    tokenized_val = None
else:
    val_dataset, tokenized_val = _tokenize_frame(combined_val)
    print(f"‚úÖ Tokenized validation dataset: {len(tokenized_val)} examples")

if combined_test is None:
    tokenized_test = None
else:
    test_dataset, tokenized_test = _tokenize_frame(combined_test)
    print(f"‚úÖ Tokenized test dataset: {len(tokenized_test)} examples")

# Peek at the first training example, when there is one
print(f"\nüìù Sample tokenized data:")
if tokenized_train:
    sample = tokenized_train[0]
    print(
        f"   Input IDs shape: {sample['input_ids'].shape}",
        f"   Attention mask shape: {sample['attention_mask'].shape}",
        f"   Label: {sample['label']}",
        sep="\n",
    )

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm

# Report which accelerator, if any, PyTorch can see
if not torch.cuda.is_available():
    print("‚ö†Ô∏è No GPU detected - check your Kaggle accelerator settings!")
else:
    print(f"‚úÖ GPU available: {torch.cuda.get_device_name(0)}")
    print(f"   CUDA version: {torch.version.cuda}")
    # total_memory is divided by 1024**3 to display GB
    print(f"   GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

print(f"PyTorch version: {torch.__version__}")

# Create data loaders (None for any split that has no tokenized data);
# only the training loader shuffles.
train_loader = (
    DataLoader(tokenized_train, batch_size=8, shuffle=True, num_workers=2)
    if tokenized_train else None
)
val_loader = (
    DataLoader(tokenized_val, batch_size=8, shuffle=False, num_workers=2)
    if tokenized_val else None
)
test_loader = (
    DataLoader(tokenized_test, batch_size=8, shuffle=False, num_workers=2)
    if tokenized_test else None
)

# Training setup: cross-entropy loss and AdamW over all model parameters
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# Training function
def train_epoch(model, train_loader, optimizer, epoch):
    """Train ``model`` for one pass over ``train_loader``.

    Only the ``coherence_logits`` head contributes to the loss. The function
    reads the module-level ``device`` and ``criterion``.

    Args:
        model: Module returning a dict that contains ``coherence_logits``.
        train_loader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``label`` tensors.
        optimizer: Optimizer stepped once per batch.
        epoch: Epoch number, used only in the progress-bar label.

    Returns:
        ``(mean loss per batch, accuracy over all samples seen)``.
    """
    model.train()
    running_loss = 0
    hits = 0
    seen = 0

    progress = tqdm(train_loader, desc=f'Epoch {epoch}')
    for batch in progress:
        # Move the three tensors of the batch to the target device
        input_ids, attention_mask, labels = (
            batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'label')
        )

        # Forward pass; the coherence head is the training target
        logits = model(input_ids=input_ids, attention_mask=attention_mask)['coherence_logits']
        loss = criterion(logits, labels)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Running statistics
        running_loss += loss.item()
        predicted = torch.max(logits, 1).indices
        hits += (predicted == labels).sum().item()
        seen += labels.size(0)

        progress.set_postfix({
            'loss': loss.item(),
            'acc': hits / seen
        })

    return running_loss / len(train_loader), hits / seen

# Evaluation function
def evaluate(model, eval_loader):
    """Score ``model`` on ``eval_loader`` without updating any weights.

    Uses the ``coherence_logits`` head and the module-level ``device`` and
    ``criterion``.

    Args:
        model: Module returning a dict that contains ``coherence_logits``.
        eval_loader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``label`` tensors.

    Returns:
        ``(mean loss per batch, accuracy over all samples)``.
    """
    model.eval()
    running_loss = 0
    hits = 0
    seen = 0

    with torch.no_grad():
        for batch in eval_loader:
            input_ids, attention_mask, labels = (
                batch[field].to(device)
                for field in ('input_ids', 'attention_mask', 'label')
            )

            logits = model(input_ids=input_ids, attention_mask=attention_mask)['coherence_logits']
            running_loss += criterion(logits, labels).item()

            predicted = torch.max(logits, 1).indices
            hits += (predicted == labels).sum().item()
            seen += labels.size(0)

    return running_loss / len(eval_loader), hits / seen

In [None]:
# Training loop: train, validate, and checkpoint on every validation improvement
num_epochs = 5
best_val_acc = 0

print("üöÄ Starting semantic analysis model training...")

for epoch in range(1, num_epochs + 1):
    # Train (metrics are reported as zero when there is no training loader)
    train_loss, train_acc = (
        train_epoch(model, train_loader, optimizer, epoch) if train_loader else (0, 0)
    )

    # Without validation data only the training metrics can be shown
    if not val_loader:
        print(f"Epoch {epoch}: Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}")
        continue

    val_loss, val_acc = evaluate(model, val_loader)
    print(f"Epoch {epoch}: Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, "
          f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

    # Keep the weights of the best epoch seen so far
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), 'best_semantic_model.pth')
        print(f"‚úÖ New best model saved with accuracy: {val_acc:.4f}")

print("üéâ Training completed!")

In [None]:
# Test evaluation: score the final weights, then the best checkpoint if one exists
if test_loader:
    print("üîç Evaluating on test set...")
    test_loss, test_acc = evaluate(model, test_loader)
    print(f"\nüìä Test Results:")
    print(f"   Test Loss: {test_loss:.4f}")
    print(f"   Test Accuracy: {test_acc:.4f}")

    # Load best model for final evaluation.
    # The checkpoint is only written by the training loop when validation
    # data exists and validation accuracy improved, so it may be missing;
    # in that case report it instead of aborting the cell.
    try:
        best_state = torch.load('best_semantic_model.pth')
    except FileNotFoundError:
        best_state = None
        print("   Best-model checkpoint 'best_semantic_model.pth' not found - "
              "skipping best-model evaluation")

    if best_state is not None:
        model.load_state_dict(best_state)
        best_test_loss, best_test_acc = evaluate(model, test_loader)
        print(f"   Best Model Test Accuracy: {best_test_acc:.4f}")
else:
    print("‚ö†Ô∏è No test dataset available for evaluation")

# Print the output of all three heads for the first few test examples
if tokenized_test:
    print(f"\nüìù Sample Semantic Analysis:")
    model.eval()

    with torch.no_grad():
        for i in range(min(5, len(tokenized_test))):
            sample = tokenized_test[i]

            # unsqueeze(0) adds a leading batch dimension of size one
            outputs = model(
                input_ids=sample['input_ids'].unsqueeze(0).to(device),
                attention_mask=sample['attention_mask'].unsqueeze(0).to(device),
            )

            # Turn the logits of every head into probabilities
            coherence_probs, dialect_probs, cultural_probs = (
                torch.softmax(outputs[head], dim=1)
                for head in ('coherence_logits', 'dialect_logits', 'cultural_logits')
            )

            actual_label = sample['label'].item()
            predicted_coherence = torch.argmax(coherence_probs, dim=1).item()

            print(
                f"   Sample {i+1}:",
                f"     Actual Coherence: {actual_label}",
                f"     Predicted Coherence: {predicted_coherence}",
                f"     Coherence Probabilities: {coherence_probs.squeeze().tolist()}",
                f"     Dialect Probabilities: {dialect_probs.squeeze().tolist()}",
                f"     Cultural Appropriateness: {cultural_probs.squeeze().tolist()}",
                sep="\n",
            )
            print()

In [None]:
# Advanced semantic analysis functions
def analyze_semantic_similarity(text1, text2):
    """Return the cosine similarity of the model embeddings of two texts.

    Both texts are tokenized (truncated to 512 tokens) and passed through the
    module-level ``model`` in eval mode; the ``embeddings`` entries of the two
    outputs are compared.

    Returns:
        The similarity as a Python float.
    """
    model.eval()

    with torch.no_grad():
        # Tokenize both texts first, then embed them in the same order
        encodings = [
            tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
            for text in (text1, text2)
        ]
        first, second = [
            model(input_ids=enc['input_ids'], attention_mask=enc['attention_mask'])['embeddings']
            for enc in encodings
        ]

        return torch.cosine_similarity(first, second, dim=1).item()

def detect_dialect(text):
    """Classify ``text`` into one of five Romanian regions.

    Uses the ``dialect_logits`` head of the module-level ``model``.

    Returns:
        A dict with ``predicted_region`` (region name), ``probabilities``
        (softmax output as a list) and ``confidence`` (largest probability).
    """
    # Index i of the dialect head maps to region_names[i]
    region_names = ["Moldova", "Wallachia", "Transylvania", "Banat", "Dobruja"]

    model.eval()
    with torch.no_grad():
        encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
        logits = model(
            input_ids=encoded['input_ids'],
            attention_mask=encoded['attention_mask'],
        )['dialect_logits']

        dialect_probs = torch.softmax(logits, dim=1)
        winner = torch.argmax(dialect_probs, dim=1).item()

        result = {
            'predicted_region': region_names[winner],
            'probabilities': dialect_probs.squeeze().tolist(),
            'confidence': torch.max(dialect_probs).item()
        }

    return result

def assess_cultural_appropriateness(text):
    """Score ``text`` with the two-class cultural head of the module-level model.

    Class index 1 is reported as "appropriate" and index 0 as "inappropriate".

    Returns:
        A dict with ``is_appropriate`` (bool of the winning index),
        ``confidence`` (largest probability), ``appropriate_prob`` and
        ``inappropriate_prob``.
    """
    model.eval()

    with torch.no_grad():
        encoded = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
        logits = model(
            input_ids=encoded['input_ids'],
            attention_mask=encoded['attention_mask'],
        )['cultural_logits']

        cultural_probs = torch.softmax(logits, dim=1)
        winning_class = torch.argmax(cultural_probs, dim=1).item()
        top_probability = torch.max(cultural_probs).item()

        verdict = {
            'is_appropriate': bool(winning_class),
            'confidence': top_probability,
            'appropriate_prob': cultural_probs[0][1].item(),
            'inappropriate_prob': cultural_probs[0][0].item()
        }

    return verdict

# Exercise the three analysis helpers on the first test example, if any
if combined_test is not None and len(combined_test) > 0:
    sample_text = combined_test.iloc[0]['text']
    print(f"\nüî¨ Advanced Semantic Analysis:")
    print(f"   Sample text: {sample_text[:100]}...")

    # Dialect detection
    dialect_result = detect_dialect(sample_text)
    region = dialect_result['predicted_region']
    print(f"   Detected Dialect: {region} (confidence: {dialect_result['confidence']:.3f})")

    # Cultural appropriateness
    cultural_result = assess_cultural_appropriateness(sample_text)
    verdict_mark = '‚úÖ' if cultural_result['is_appropriate'] else '‚ùå'
    print(f"   Cultural Appropriateness: {verdict_mark} (confidence: {cultural_result['confidence']:.3f})")

    # Semantic similarity of the text with itself
    similarity = analyze_semantic_similarity(sample_text, sample_text)
    print(f"   Self-Similarity: {similarity:.3f}")

In [None]:
# Save final model: weights plus the metadata needed to rebuild the heads
checkpoint = {
    'model_state_dict': model.state_dict(),
    'tokenizer_name': model_name,
    'model_config': {
        'num_coherence_classes': 3,
        'num_dialect_classes': 5,
        'num_cultural_classes': 2
    }
}
torch.save(checkpoint, 'semantic_analysis_model.pth')

# Also save the tokenizer for easy loading
tokenizer.save_pretrained('semantic_tokenizer')

print(
    "‚úÖ Semantic analysis model saved successfully!",
    f"üìÅ Model saved to: semantic_analysis_model.pth",
    f"üìÅ Tokenizer saved to: semantic_tokenizer/",
    sep="\n",
)

# Summary of what the saved model offers
print(f"\nüéØ Model Capabilities:")
for capability in (
    "Semantic Coherence Analysis (3 classes)",
    "Dialect Detection (5 Romanian regions)",
    "Cultural Appropriateness Assessment",
    "Semantic Similarity Calculation",
    "Advanced Text Understanding",
):
    print(f"   - {capability}")