# News Article Classification - Part 5: Predict New Articles

## Overview
This notebook demonstrates how to use the trained model to predict categories for new news articles.

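## Steps:
1. Import libraries and define the text preprocessing functions
2. Load the trained model, TF-IDF vectorizer, feature scaler, and label encoder
3. Define the prediction function (predicted category plus probabilities for all categories)
4. Predict categories for sample articles
5. Predict the category of an article entered interactively
6. Predict categories in batch from a CSV file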


## Step 1: Import Libraries and Load Preprocessing Functions


In [None]:
# Standard library
import pickle
import re

# Data manipulation
import numpy as np
import pandas as pd
from scipy.sparse import hstack

# Text processing
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download NLTK data if needed.
# Each entry maps the lookup path checked by nltk.data.find to the package
# name handed to nltk.download. 'punkt_tab' is listed next to 'punkt' because
# newer NLTK releases make word_tokenize load the 'punkt_tab' tables instead
# of the pickled 'punkt' models; fetching both keeps either version working.
NLTK_RESOURCES = {
    'tokenizers/punkt': 'punkt',
    'tokenizers/punkt_tab': 'punkt_tab',
    'corpora/stopwords': 'stopwords',
    'corpora/wordnet': 'wordnet',
}

for resource_path, package_name in NLTK_RESOURCES.items():
    try:
        nltk.data.find(resource_path)
    except LookupError:
        # Resource is not installed locally yet, so fetch it once.
        nltk.download(package_name)

# Initialize text preprocessing components
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalize raw article text into lowercase, letters-only, single-spaced form.

    Missing values (anything pd.isna reports as missing) yield an empty string.
    Otherwise the input is converted to str and, in order: HTML tags are
    removed, URLs are removed, every character that is not an ASCII letter or
    whitespace is removed, the text is lowercased, and whitespace runs are
    collapsed to a single space with the ends trimmed.
    """
    if pd.isna(text):
        return ""

    without_tags = re.sub(r'<[^>]+>', '', str(text))
    without_urls = re.sub(r'http\S+|www\S+|https\S+', '', without_tags, flags=re.MULTILINE)
    # Removed characters are deleted, not replaced by a space, so "well-known"
    # becomes "wellknown".
    letters_only = re.sub(r'[^a-zA-Z\s]', '', without_urls).lower()
    return re.sub(r'\s+', ' ', letters_only).strip()

def preprocess_text(text):
    """Run the full preprocessing pipeline and return a space-joined token string.

    Stages: clean_text -> word_tokenize -> drop tokens present in stop_words
    -> lemmatize each remaining token -> join with single spaces.
    """
    normalized = clean_text(text)
    kept_lemmas = (
        lemmatizer.lemmatize(word)
        for word in word_tokenize(normalized)
        if word not in stop_words
    )
    return ' '.join(kept_lemmas)

# Confirmation message so it is visible that this setup cell ran to completion.
print("Libraries and preprocessing functions loaded!")


## Step 2: Load Trained Model and Vectorizers


In [None]:
def _load_pickle(path):
    """Open `path` in binary mode and return the object unpickled from it."""
    # NOTE: pickle.load can execute arbitrary code embedded in the file, so
    # only point this at artifacts you produced yourself or otherwise trust.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# TF-IDF vectorizer
tfidf_vectorizer = _load_pickle('models/tfidf_vectorizer.pkl')

# Feature scaler
feature_scaler = _load_pickle('models/feature_scaler.pkl')

# Label encoder
label_encoder = _load_pickle('models/label_encoder.pkl')

# Best model
model = _load_pickle('models/best_model.pkl')

# Report what was loaded so a mismatched artifact is easy to spot.
print("Model and vectorizers loaded successfully!")
print(f"Model type: {type(model).__name__}")
print(f"Label classes: {label_encoder.classes_}")


## Step 3: Prediction Function


In [None]:
def predict_category(article_text):
    """
    Predict category for a given article text

    Parameters:
    -----------
    article_text : str
        The news article text to analyze

    Returns:
    --------
    dict : Dictionary with the keys
        'category'      - label decoded with label_encoder.inverse_transform
        'prediction'    - raw value returned by model.predict
        'probabilities' - {category label: probability} for every class the
                          model reports, or None if the model has no
                          predict_proba method
        'confidence'    - the largest of those probabilities, or None
        'original_text' - the article_text argument, unchanged
        'cleaned_text'  - the output of preprocess_text(article_text)
    """
    # Preprocess the text
    cleaned_text = preprocess_text(article_text)

    # Extract textual features
    char_count = len(cleaned_text)
    word_count = len(cleaned_text.split())
    # The denominator is word_count + 1 (not word_count); left untouched
    # because the fitted scaler presumably saw the same formula - confirm
    # against the feature-engineering notebook before changing it.
    avg_word_length = char_count / (word_count + 1) if word_count > 0 else 0
    # NOTE(review): preprocess_text strips every non-letter character, so these
    # two counts are always 0 here. Kept as-is so the feature vector keeps the
    # five columns the scaler expects; verify how training computed them
    # before switching to counts taken from the raw article_text.
    exclamation_count = cleaned_text.count('!')
    question_count = cleaned_text.count('?')

    # Transform text using TF-IDF
    text_tfidf = tfidf_vectorizer.transform([cleaned_text])

    # Scale textual features (single row, five columns)
    textual_features = np.array([[char_count, word_count, avg_word_length,
                                  exclamation_count, question_count]])
    textual_features_scaled = feature_scaler.transform(textual_features)

    # Combine features: TF-IDF columns first, then the scaled numeric columns
    features = hstack([text_tfidf, textual_features_scaled])

    # Make prediction
    prediction = model.predict(features)[0]
    category = label_encoder.inverse_transform([prediction])[0]

    # Get prediction probability if available
    if hasattr(model, 'predict_proba'):
        probabilities = model.predict_proba(features)[0]
        # predict_proba columns follow model.classes_, which need not be
        # exactly 0..n-1 (e.g. a class absent from the training split), so
        # decode the labels from model.classes_ rather than assuming column i
        # belongs to label_encoder.classes_[i].
        model_classes = getattr(model, 'classes_', None)
        if model_classes is not None:
            class_labels = label_encoder.inverse_transform(model_classes)
        else:
            # No classes_ attribute: fall back to positional alignment.
            class_labels = label_encoder.classes_
        prob_dict = dict(zip(class_labels, probabilities))
        confidence = max(probabilities)
    else:
        prob_dict = None
        confidence = None

    return {
        'category': category,
        'prediction': prediction,
        'probabilities': prob_dict,
        'confidence': confidence,
        'original_text': article_text,
        'cleaned_text': cleaned_text
    }

# Confirmation message so it is visible that predict_category has been defined.
print("Prediction function created!")


## Step 4: Predict Category for Sample Articles


In [None]:
# Sample articles for testing
sample_articles = [
    "Scientists discover new breakthrough in renewable energy technology that could revolutionize solar power efficiency.",
    "Local basketball team wins championship after thrilling overtime victory in the final game of the season.",
    "New study reveals benefits of meditation and mindfulness practices for mental health and stress reduction.",
    "Political leaders meet to discuss climate change policies and international cooperation agreements.",
    "Tech company announces revolutionary AI system that can understand and process natural language more accurately."
]

banner = "=" * 80
print(banner)
print("PREDICTING CATEGORY FOR SAMPLE ARTICLES")
print(banner)

for article_number, article in enumerate(sample_articles, start=1):
    result = predict_category(article)
    class_probabilities = result['probabilities']

    print(f"\nArticle {article_number}:")
    # Only the first 100 characters are shown; the ellipsis is always appended.
    print(f"Text: {article[:100]}...")
    print(f"Predicted Category: {result['category'].upper()}")
    if class_probabilities:
        # Rank categories from most to least probable and show the best three
        ranked = sorted(class_probabilities.items(), key=lambda pair: pair[1], reverse=True)
        print("Top 3 Probabilities:")
        for label, probability in ranked[:3]:
            print(f"  {label}: {probability:.2%}")
        print(f"Confidence: {result['confidence']:.2%}")
    print("-" * 80)


## Step 5: Interactive Prediction


In [None]:
# Enter your own article here
# input() waits for text to be submitted, so this cell needs an interactive session.
new_article = input("Enter a news article to analyze: ")

if not new_article.strip():
    # Blank or whitespace-only input: nothing to classify
    print("No article entered.")
else:
    result = predict_category(new_article)
    divider = "=" * 80

    print("\n" + divider)
    print("PREDICTION RESULT")
    print(divider)
    print(f"\nArticle: {new_article}")
    print(f"\nPredicted Category: {result['category'].upper()}")

    if result['probabilities']:
        print(f"\nConfidence: {result['confidence']:.2%}")
        print("\nTop 5 Category Probabilities:")
        # Rank categories from most to least probable and show the best five
        ranked = sorted(result['probabilities'].items(), key=lambda pair: pair[1], reverse=True)
        for label, probability in ranked[:5]:
            print(f"  {label}: {probability:.2%}")

    print(divider)


## Step 6: Batch Prediction from File


In [None]:
# Example: Predict category for multiple articles from a CSV file
# Uncomment the call below and point it at your own file:
#
# results_df = predict_categories_from_csv('path/to/your/articles.csv')

def predict_categories_from_csv(input_path, text_column='article',
                                output_path='predictions_results.csv'):
    """
    Predict a category for every article in a CSV file and save the results

    Parameters:
    -----------
    input_path : str
        Path of the CSV file to read
    text_column : str
        Name of the column holding the article text (default 'article')
    output_path : str
        Path the results CSV is written to (default 'predictions_results.csv')

    Returns:
    --------
    pandas.DataFrame : The loaded rows plus a 'predicted_category' column

    Raises:
    -------
    KeyError : If text_column is not a column of the loaded file
    """
    # Load articles from CSV file
    articles_df = pd.read_csv(input_path)
    if text_column not in articles_df.columns:
        raise KeyError(
            f"Column '{text_column}' not found in {input_path}; "
            f"available columns: {list(articles_df.columns)}"
        )

    # Make predictions (one predict_category call per row)
    predicted = [predict_category(text)['category'] for text in articles_df[text_column]]

    # Add predictions as a new column without mutating the loaded frame
    results_df = articles_df.assign(predicted_category=predicted)

    # Save results
    results_df.to_csv(output_path, index=False)
    print(f"Predictions saved for {len(results_df)} articles!")
    return results_df


## Summary

### Key Features:
1. ✅ Load trained model and preprocessing components
2. ✅ Preprocess new article text
3. ✅ Make category predictions
4. ✅ Get prediction probabilities and confidence scores
5. ✅ Support for single and batch predictions

### Usage Tips:
- The model works best with articles similar to the training data
- Longer articles generally provide better predictions
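- The confidence score is the highest class probability the model assigns (reported only when the model supports `predict_proba`); it indicates how certain the model is about its prediction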
- Lower confidence scores may indicate ambiguous articles
