# Code Classification Pipeline - Data Exploration

This notebook explores the code classification dataset (split sizes, label distribution, code lengths, and data quality) and demonstrates how to use the pipeline.

In [None]:
import sys
sys.path.insert(0, '../')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Global plot styling: seaborn white-grid theme and a wide default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load Data

In [None]:
# Read the project configuration used by the loader (and by later cells).
from src.utils import load_config
config = load_config('../configs/config.yaml')

# Build the loader from that configuration.
from src.data_loader import DataLoader
data_loader = DataLoader(config)

# Best-effort load: on failure the error is reported and the notebook keeps
# going; later cells check whether the frames exist before using them.
try:
    train_df, val_df, test_df = data_loader.load_all_data()
    for split_name, split_df in (("Train", train_df), ("Val", val_df), ("Test", test_df)):
        print(f"{split_name}: {len(split_df)} samples")
except Exception as e:
    print(f"Error loading data: {e}")
    print("Please make sure data files are in the correct location.")

## 2. Data Overview

In [None]:
# Preview the training split: first rows plus the column/dtype summary.
# Skipped when the loading cell did not define train_df.
if 'train_df' in locals():
    print("Training Data Sample:")
    display(train_df.head())

    print("\nData Info:")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so it is called directly; wrapping it in print() appended a stray "None".
    train_df.info()

## 3. Label Distribution

In [None]:
# Class distribution of the training labels, shown as counts (bar chart) and
# as shares (pie chart). Skipped when train_df or its 'label' column is absent.
if 'train_df' in locals() and 'label' in train_df.columns:
    # Per-label sample counts, ordered by label value
    label_counts = train_df['label'].value_counts().sort_index()

    fig, (count_ax, share_ax) = plt.subplots(1, 2, figsize=(15, 5))

    # Left panel: absolute counts
    label_counts.plot.bar(ax=count_ax, color='skyblue')
    count_ax.set(
        title='Label Distribution (Count)',
        xlabel='Label',
        ylabel='Count',
    )
    count_ax.grid(axis='y', alpha=0.3)

    # Right panel: percentage share of each label
    label_counts.plot.pie(ax=share_ax, autopct='%1.1f%%')
    share_ax.set(title='Label Distribution (Percentage)', ylabel='')

    fig.tight_layout()
    plt.show()

    print("\nLabel Statistics:", label_counts, sep="\n")

## 4. Code Length Analysis

In [None]:
# Character-length distribution of the training code: overall histogram with
# the median marked, and a per-label box plot when a 'label' column exists.
if 'train_df' in locals():
    # Calculate code lengths (stored on train_df as the 'code_length' column)
    train_df['code_length'] = train_df['code'].str.len()

    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    # Histogram
    train_df['code_length'].hist(bins=50, ax=axes[0], color='coral', edgecolor='black')
    axes[0].set_title('Code Length Distribution')
    axes[0].set_xlabel('Length (characters)')
    axes[0].set_ylabel('Frequency')
    axes[0].axvline(train_df['code_length'].median(), color='red', linestyle='--', label='Median')
    axes[0].legend()

    # Box plot by label
    if 'label' in train_df.columns:
        train_df.boxplot(column='code_length', by='label', ax=axes[1])
        # boxplot(by=...) adds a figure-level "Boxplot grouped by label"
        # suptitle; on this shared two-panel figure it would also sit above
        # the histogram, so clear it and rely on the per-axes titles.
        fig.suptitle('')
        axes[1].set_title('Code Length by Label')
        axes[1].set_xlabel('Label')
        axes[1].set_ylabel('Length (characters)')
    else:
        # No labels to group by: hide the unused panel instead of showing an
        # empty frame next to the histogram.
        axes[1].set_visible(False)

    plt.tight_layout()
    plt.show()

    print("\nCode Length Statistics:")
    print(train_df['code_length'].describe())

## 5. Sample Code Snippets

In [None]:
# Print the label, length and a 300-character preview of the first three
# training samples. Skipped when train_df was not loaded.
if 'train_df' in locals():
    print("Sample Code Snippets:\n")

    # Number samples by position: iterrows() yields index *labels*, which need
    # not be 0-based integers (a shuffled or string index would give wrong
    # numbers or make `label + 1` raise).
    for sample_no, (_, row) in enumerate(train_df.head(3).iterrows(), start=1):
        print(f"--- Sample {sample_no} ---")
        if 'label' in row:
            print(f"Label: {row['label']}")
        # A missing code (NaN/None) has no len() and cannot be sliced; treat
        # any non-string value as an empty snippet so the preview still runs.
        code = row['code'] if isinstance(row['code'], str) else ''
        print(f"Code Length: {len(code)} characters")
        print(f"Code Preview:\n{code[:300]}...\n")

## 6. Pipeline Demo - Language Detection

In [None]:
from src.pipeline import CodeClassificationPipeline

# Opt-in switch for the demo below. Detection downloads the model if it is not
# already cached, so it is off by default; set to True to run it.
RUN_LANGUAGE_DETECTION = False

# Initialize pipeline
pipeline = CodeClassificationPipeline(config)

# Test on a few samples (skipped when test_df was not loaded)
if 'test_df' in locals():
    sample_codes = test_df['code'].head(5).tolist()

    if RUN_LANGUAGE_DETECTION:
        # Report the real sample count: test_df may hold fewer than 5 rows.
        print(f"Running language detection on {len(sample_codes)} samples...\n")

        # NOTE(review): assumes run_language_detection returns one result per
        # input, in order, as the zip() pairing requires — confirm.
        detected_languages = pipeline.run_language_detection(sample_codes)
        for i, (code, lang) in enumerate(zip(sample_codes, detected_languages)):
            print(f"Sample {i+1}: {lang}")
            print(f"Code: {code[:100]}...\n")
    else:
        # Say plainly that nothing ran, rather than announcing a detection
        # pass that is not executed.
        print(
            f"Language detection skipped for {len(sample_codes)} samples; "
            "set RUN_LANGUAGE_DETECTION = True to run it."
        )

## 7. Data Quality Checks

In [None]:
# Basic quality report for the training split: missing values, duplicated
# code, very short snippets and (when labels exist) class balance.
if 'train_df' in locals():
    print("Data Quality Report:\n")

    # Per-column count of missing entries
    print("Missing Values:")
    missing_per_column = train_df.isna().sum()
    print(missing_per_column)

    # Rows whose code repeats an earlier row's code
    n_duplicates = train_df.duplicated(subset='code').sum()
    print(f"\nDuplicate Code Samples: {n_duplicates}")

    # Snippets shorter than 10 characters (missing codes compare as False)
    code_lengths = train_df['code'].str.len()
    short_codes = code_lengths.lt(10).sum()
    print(f"Very Short Codes (<10 chars): {short_codes}")

    # Share of each label, in percent
    if 'label' in train_df.columns:
        class_balance = train_df['label'].value_counts(normalize=True).mul(100)
        print(f"\nClass Balance (%):\n{class_balance}")

## 8. Next Steps

To train the pipeline (run from this notebook's directory, since the paths below are relative to it):

```bash
python ../main.py --mode train --output_dir ../models/checkpoints
```

To run predictions:

```bash
python ../main.py --mode predict --checkpoint ../models/checkpoints/bert_best.pt
```