# Reference Classification Model

This notebook implements a machine learning pipeline to classify academic references by publication type.

**Objective**: Automatically categorize bibliographic references into types like journal articles, books, theses, etc.

**Approach**: 
- Text preprocessing and cleaning
- TF-IDF feature extraction
- Multinomial Naive Bayes classification
- Model evaluation and performance analysis

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
import sys
import os

# Add src directory to path for importing utils
sys.path.append('../src')
from utils import clean_reference

# Cell 3 - Data Loading
# Load the reference data
try:
    df = pd.read_csv('../data/references.csv')
    print(f"Data loaded successfully. Shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
    print("\nFirst few rows:")
    print(df.head())
    
    # Handle different column name formats - ADD THIS FIX
    if 'type' in df.columns and 'publication_type' not in df.columns:
        df['publication_type'] = df['type']
        print("✅ Column 'type' renamed to 'publication_type'")
    elif 'publication_type' not in df.columns and 'type' not in df.columns:
        print("❌ ERROR: No 'type' or 'publication_type' column found!")
        print("Available columns:", df.columns.tolist())
        raise KeyError("Missing required label column")
        
except FileNotFoundError:
    print("references.csv not found. Please ensure your data file is in the data/ directory")
    print("Expected columns: 'reference_text', 'publication_type'")
    # Create sample data structure for demonstration
    df = pd.DataFrame({
        'reference_text': ['Sample journal reference...', 'Sample book reference...'],
        'publication_type': ['journal', 'book']
    })

# Cell 4 - Data Preprocessing
# Clean the reference texts
print("\nCleaning reference texts...")
df['cleaned_reference'] = df['reference_text'].apply(clean_reference)

# Remove any empty references after cleaning
df = df[df['cleaned_reference'].str.len() > 0]

print(f"Data after cleaning: {df.shape}")
print("\nPublication type distribution:")
print(df['publication_type'].value_counts())

# Check if we have enough data for train-test split
if len(df) < 4:
    print("⚠️  WARNING: Very small dataset. Consider adding more samples for better results.")

# Cell 5 - Train-Test Split
# Split the data
X = df['cleaned_reference']
y = df['publication_type']

# Adjust test_size based on dataset size
test_size = 0.2 if len(df) >= 10 else 0.3
min_samples_per_class = y.value_counts().min()

if min_samples_per_class < 2:
    print("⚠️  WARNING: Some classes have only 1 sample. Stratification disabled.")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42
    )
else:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42, stratify=y
    )

print(f"\nTraining set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print(f"Training distribution:")
print(y_train.value_counts())

# Cell 6 - Model Training
# Create and train the classification pipeline
# Adjust max_features based on dataset size
max_features = min(5000, len(X_train) * 10)  # Reasonable upper limit

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=max_features, 
        stop_words='english', 
        ngram_range=(1, 2),
        min_df=1  # Allow features that appear in at least 1 document
    )),
    ('classifier', MultinomialNB(alpha=1.0))
])

print("\nTraining the model...")
pipeline.fit(X_train, y_train)
print("✅ Model training completed!")

# Cell 7 - Model Evaluation
# Make predictions
y_pred = pipeline.predict(X_test)

# Print classification report
print("\nClassification Report:")
print("=" * 50)
print(classification_report(y_test, y_pred, zero_division=0))

# Print confusion matrix
print("\nConfusion Matrix:")
print("=" * 30)
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Calculate and display accuracy
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

# Cell 8 - Feature Analysis
# Get feature names and importance (top TF-IDF features)
feature_names = pipeline.named_steps['tfidf'].get_feature_names_out()
tfidf_matrix = pipeline.named_steps['tfidf'].transform(X_train)

# Calculate mean TF-IDF scores across all documents
tfidf_scores = tfidf_matrix.mean(axis=0).A1

# Get top features (limit to available features)
n_top_features = min(20, len(feature_names))
top_features_idx = tfidf_scores.argsort()[-n_top_features:][::-1]
top_features = [(feature_names[i], tfidf_scores[i]) for i in top_features_idx]

print(f"\nTop {n_top_features} TF-IDF Features:")
print("=" * 40)
for feature, score in top_features:
    print(f"{feature}: {score:.4f}")

# Cell 9 - Sample Predictions (NEW ADDITION)
print("\n🎯 Testing Model on Sample References:")
print("=" * 45)

# Test the model on a few sample references
sample_refs = [
    "Smith, J. (2023). Machine Learning in Practice. Journal of AI Research, 15(3), 45-67.",
    "Brown, A. (2022). Data Science Fundamentals. MIT Press, Cambridge.",
    "Johnson, M. (2024). Deep Learning Applications. PhD Thesis, University of Technology.",
    "Wilson, K. et al. (2023). Neural Networks in Healthcare. Proceedings of IEEE Conference, pp. 123-135."
]

for i, ref in enumerate(sample_refs, 1):
    try:
        cleaned_ref = clean_reference(ref)
        if cleaned_ref:  # Only predict if cleaning was successful
            prediction = pipeline.predict([cleaned_ref])[0]
            confidence = max(pipeline.predict_proba([cleaned_ref])[0])
            
            print(f"\nSample {i}:")
            print(f"Reference: {ref[:60]}...")
            print(f"Predicted: {prediction} (Confidence: {confidence:.3f})")
        else:
            print(f"\nSample {i}: Could not process reference")
    except Exception as e:
        print(f"\nSample {i}: Error in prediction - {e}")

# Cell 10 - Model Summary (NEW ADDITION)
print(f"\n📊 MODEL SUMMARY")
print("=" * 30)
print(f"Dataset size: {len(df)} references")
print(f"Classes: {len(y.unique())}")
print(f"Features extracted: {len(feature_names)}")
print(f"Training accuracy: {pipeline.score(X_train, y_train):.4f}")
print(f"Test accuracy: {accuracy:.4f}")
print(f"\nClass distribution:")
for class_name, count in y.value_counts().items():
    print(f"  {class_name}: {count} samples")

print(f"\n✅ Model is ready for classifying new references!")
print("To use: pipeline.predict([clean_reference('your reference here')])")

Data loaded successfully. Shape: (10, 2)
Columns: ['reference_text', 'type']

First few rows:
                                      reference_text              type
0  Smith J. (2015). Deep Learning Advances. Journ...   Journal Article
1    Brown, A. (2020). Modern Agriculture. Springer.              Book
2  Zimba, M. (2022). Smart Farming in Zambia. MSc...            Thesis
3  Mwale, K. (2018). Cyber Security. Proc. of IEE...  Conference Paper
4    WHO. (2021). Global Health Report. Geneva: WHO.            Report
✅ Column 'type' renamed to 'publication_type'

Cleaning reference texts...
Data after cleaning: (10, 4)

Publication type distribution:
publication_type
Journal Article     2
Book                2
Thesis              2
Report              2
Conference Paper    1
Web Resource        1
Name: count, dtype: int64

Training set size: 8
Test set size: 2
Training distribution:
publication_type
Journal Article     2
Report              2
Web Resource        1
Thesis              1
Co