# Reference Classification Model

This notebook implements a machine learning pipeline to classify academic references by publication type.

**Objective**: Automatically categorize bibliographic references into types like journal articles, books, theses, etc.

**Approach**: 
- Text preprocessing and cleaning
- TF-IDF feature extraction
- Multinomial Naive Bayes classification
- Model evaluation and performance analysis

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
import sys
import os

# Add src directory to path for importing utils
sys.path.append('../src')
from utils import clean_reference

# Cell 3 - Data Loading
# Load the reference data.
# The rest of the notebook reads exactly two columns from `df`:
#   'reference_text'   - the raw reference string
#   'publication_type' - the class label to predict
REQUIRED_COLUMNS = ['reference_text', 'publication_type']

try:
    # Keep the try body to the single call that can raise FileNotFoundError.
    df = pd.read_csv('../data/references.csv')
except FileNotFoundError:
    print("references.csv not found. Please ensure your data file is in the data/ directory")
    print("Expected columns: 'reference_text', 'publication_type'")
    # Create sample data structure for demonstration.
    # Several rows per class are provided so that the stratified 80/20 split
    # further down has enough samples of every class to work with.
    sample_journals = [
        'Smith, J. and Lee, K. (2019). Citation parsing with sequence models. '
        'Journal of Informetrics, 13(2), 101-115.',
        'Garcia, M. (2021). Measuring research impact. '
        'Scientometrics, 126(4), 3301-3320.',
        'Chen, L. and Patel, R. (2018). Text classification with naive Bayes. '
        'Machine Learning Journal, 107(9), 1477-1494.',
    ]
    sample_books = [
        'Brown, A. (2015). Foundations of Library Science. '
        'Oxford: Oxford University Press.',
        'Nguyen, T. (2017). Statistical Learning in Practice (2nd ed.). '
        'New York: Springer.',
        'Miller, D. (2012). Writing for Scholarly Publication. '
        'Chicago: University of Chicago Press.',
    ]
    df = pd.DataFrame({
        'reference_text': sample_journals + sample_books,
        'publication_type': ['journal'] * len(sample_journals) + ['book'] * len(sample_books),
    })
else:
    print(f"Data loaded successfully. Shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
    print("\nFirst few rows:")
    print(df.head())

    # Fail early with a clear message instead of a bare KeyError in a later cell.
    missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
    if missing_columns:
        raise ValueError(
            f"references.csv is missing required column(s): {missing_columns}. "
            f"Expected columns: {REQUIRED_COLUMNS}"
        )

# Cell 4 - Data Preprocessing
# Rows with a missing reference text or a missing label cannot be used for
# supervised training. Drop them before cleaning so that clean_reference
# (imported from utils; its handling of NaN is not visible from this notebook)
# only ever receives real values, and so that no NaN label reaches the
# train/test split or the classifier.
n_rows_before = len(df)
df = df.dropna(subset=['reference_text', 'publication_type']).copy()
n_dropped = n_rows_before - len(df)
if n_dropped:
    print(f"Dropped {n_dropped} row(s) with missing reference text or publication type")

# Clean the reference texts
print("Cleaning reference texts...")
df['cleaned_reference'] = df['reference_text'].apply(clean_reference)

# Remove any empty references after cleaning
df = df[df['cleaned_reference'].str.len() > 0]

print(f"Data after cleaning: {df.shape}")
print("\nPublication type distribution:")
print(df['publication_type'].value_counts())

# Cell 5 - Train-Test Split
# Split the data into features (cleaned text) and labels (publication type).
X = df['cleaned_reference']
y = df['publication_type']

TEST_FRACTION = 0.2

# A stratified split needs at least two samples of every class, and both the
# train and the test side must be able to hold at least one sample per class.
# train_test_split raises ValueError otherwise, which happens with tiny or
# heavily imbalanced datasets. In that case fall back to a plain random split
# so the rest of the notebook can still run.
class_counts = y.value_counts()
n_test = int(np.ceil(TEST_FRACTION * len(y)))  # same rounding as scikit-learn
n_train = len(y) - n_test
can_stratify = (
    len(class_counts) > 0
    and class_counts.min() >= 2
    and min(n_train, n_test) >= len(class_counts)
)
if not can_stratify:
    print("Warning: too few samples per class for a stratified split; "
          "using a random (non-stratified) split instead.")

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=42,  # fixed seed so the split is reproducible
    stratify=y if can_stratify else None,
)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

# Cell 6 - Model Training
# Build the two pipeline stages separately, then chain them:
#   1. TF-IDF over unigrams and bigrams, English stop words removed,
#      vocabulary capped at 5000 terms.
#   2. Multinomial Naive Bayes with alpha=1.0 smoothing.
tfidf_step = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    max_features=5000,
)
nb_step = MultinomialNB(alpha=1.0)

pipeline = Pipeline(steps=[
    ('tfidf', tfidf_step),
    ('classifier', nb_step),
])

# Fit the vectorizer and the classifier on the training split only.
print("Training the model...")
pipeline.fit(X_train, y_train)
print("Model training completed!")

# Cell 7 - Model Evaluation
# Predict the publication type of every held-out reference.
y_pred = pipeline.predict(X_test)

# Per-class precision / recall / F1 on the test split.
print("Classification Report:")
print("=" * 50)
print(classification_report(y_test, y_pred))

# Confusion matrix, printed as a labelled table so each row (actual class) and
# column (predicted class) is identifiable. The label order is the sorted union
# of the true and predicted classes, which is also the order confusion_matrix
# uses when no labels are given, so the counts are unchanged.
cm_labels = sorted(set(y_test) | set(y_pred))
cm_table = pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=cm_labels),
    index=pd.Index(cm_labels, name='actual'),
    columns=pd.Index(cm_labels, name='predicted'),
)

print("\nConfusion Matrix:")
print("=" * 30)
print(cm_table.to_string())

# Cell 8 - Feature Analysis
# Rank vocabulary terms by their mean TF-IDF weight over the training set.
# NOTE: this shows which terms carry the most weight on average in the corpus;
# it is not a measure of how much the classifier relies on each term.
tfidf = pipeline.named_steps['tfidf']
feature_names = tfidf.get_feature_names_out()

# The column-wise mean of a sparse result may come back as np.matrix or as a
# plain ndarray depending on the sparse container type. np.asarray(...).ravel()
# yields a flat 1-D array in both cases, whereas `.A1` exists only on np.matrix.
tfidf_scores = np.asarray(tfidf.transform(X_train).mean(axis=0)).ravel()

# Get top features (fewer than 20 when the vocabulary itself is smaller).
top_n = min(20, len(feature_names))
top_features_idx = tfidf_scores.argsort()[-top_n:][::-1] if top_n else []
top_features = [(feature_names[i], tfidf_scores[i]) for i in top_features_idx]

print(f"Top {top_n} TF-IDF Features:")
print("=" * 40)
for feature, score in top_features:
    print(f"{feature}: {score:.4f}")