# News Article Classification - Part 2: Feature Engineering

## Overview
This notebook covers:
1. Loading preprocessed data
2. Extracting textual features (word count, character count, etc.)
3. TF-IDF Vectorization
4. Word2Vec Embeddings (optional)
5. Preparing features for model training


## Step 1: Import Libraries


In [None]:
# Data manipulation
import pandas as pd
import numpy as np
import os
import pickle

# Feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

# Utilities
from sklearn.model_selection import train_test_split
import warnings
# Silence every warning category for the rest of the session so that cell
# output stays compact. Note that this also hides deprecation notices
# emitted by the libraries imported above.
warnings.filterwarnings('ignore')

# Visible confirmation that the import cell ran to completion.
print("Libraries imported successfully!")


## Step 2: Load Preprocessed Data


In [None]:
# Load preprocessed data
df = pd.read_csv('data/processed_articles.csv')

# Fail fast with a clear message if the columns used by later steps
# ('cleaned_text' for the features, 'category' for the target) are absent.
missing_columns = {'cleaned_text', 'category'} - set(df.columns)
if missing_columns:
    raise KeyError(
        f"data/processed_articles.csv is missing required columns: {sorted(missing_columns)}"
    )

# read_csv parses empty fields as NaN (a float). A NaN document makes
# TfidfVectorizer.fit_transform raise and has no .split() method, so every
# article text is normalised to a (possibly empty) string up front.
n_missing_text = int(df['cleaned_text'].isna().sum())
df['cleaned_text'] = df['cleaned_text'].fillna('').astype(str)

print("Dataset loaded successfully!")
print(f"Dataset shape: {df.shape}")
if n_missing_text:
    print(f"Articles with empty cleaned_text (replaced by ''): {n_missing_text}")
print("First few rows:")
df.head()


## Step 3: Extract Textual Features


In [None]:
# Extract various textual features
def extract_text_features(df):
    """Derive simple length and character-count features from article text.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``cleaned_text`` column. Missing values in that column
        are treated as empty strings.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (the input is not modified) in which ``cleaned_text``
        has been normalised to strings and the following columns were added:
        ``char_count``, ``word_count``, ``sentence_count``,
        ``avg_word_length``, ``uppercase_count``, ``digit_count``,
        ``special_char_count``, ``exclamation_count``, ``question_count``.
    """
    features = df.copy()

    # NaN / None text would propagate NaN into every derived column and break
    # later string operations, so normalise to plain strings first.
    text = features['cleaned_text'].fillna('').astype(str)
    features['cleaned_text'] = text

    # Basic length features
    features['char_count'] = text.str.len()
    features['word_count'] = text.str.split().str.len()
    # Number of '.'-separated segments (a text without any '.' counts as 1)
    features['sentence_count'] = text.str.split('.').str.len()

    # Approximate average word length: char_count includes whitespace, and the
    # +1 in the denominator only guards against division by zero for empty text.
    features['avg_word_length'] = features['char_count'] / (features['word_count'] + 1)

    # Character-class counts (may all be zero after preprocessing)
    features['uppercase_count'] = text.str.count(r'[A-Z]')
    features['digit_count'] = text.str.count(r'\d')
    features['special_char_count'] = text.str.count(r'[^a-zA-Z0-9\s]')

    # Punctuation counts. Series.str.count interprets its argument as a regular
    # expression, so '?' must be escaped: a bare '?' is an invalid pattern and
    # raises re.error ("nothing to repeat").
    features['exclamation_count'] = text.str.count('!')
    features['question_count'] = text.str.count(r'\?')

    return features

# Build the feature table from the loaded articles
df_features = extract_text_features(df)

# Summary statistics for the main length features
print("Textual features extracted!")
print("\nFeature Statistics:")
print(df_features.loc[:, ['char_count', 'word_count', 'avg_word_length']].describe())

# Preview the text next to its derived features; the bare last expression
# is rendered by the notebook's rich display.
print("\nSample features:")
df_features.loc[:, ['cleaned_text', 'char_count', 'word_count', 'avg_word_length']].head()


## Step 4: TF-IDF Vectorization


In [None]:
# Configure the TF-IDF vectorizer with settings commonly used for
# text classification.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),   # unigrams and bigrams
    max_features=5000,    # cap the vocabulary at 5000 terms
    min_df=2,             # a term must occur in at least 2 documents
    max_df=0.95,          # drop terms present in more than 95% of documents
    sublinear_tf=True,    # sublinear tf scaling
)

# Learn the vocabulary and transform the cleaned article text in one pass.
# NOTE(review): the vectorizer is fitted on every row of df_features, i.e.
# before the train/test split in Step 8, so the vocabulary and IDF weights
# also reflect documents that later end up in the test set.
print("Fitting TF-IDF vectorizer...")
X_tfidf = tfidf_vectorizer.fit_transform(df_features['cleaned_text'])

# Report the size of the resulting document-term matrix
print(f"TF-IDF matrix shape: {X_tfidf.shape}")
print(f"Number of features: {X_tfidf.shape[1]}")

# Peek at the learned vocabulary
feature_names = tfidf_vectorizer.get_feature_names_out()
print(f"\nSample feature names (first 20):")
print(feature_names[:20])


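## Step 5: Word2Vec Embeddings (Optional)

Note: Word2Vec is optional as a modelling feature set, but Steps 6, 8 and 9 below use `X_word2vec` and `word2vec_model`, so this cell must be run for the rest of the notebook to execute.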


In [None]:
# Prepare tokenized sentences for Word2Vec: one whitespace-split token list
# per article.
tokenized_reviews = [review.split() for review in df_features['cleaned_text']]

# Train Word2Vec model.
# Training is stochastic. A fixed seed (the same value as the random_state
# used for the train/test split) together with a single worker thread removes
# the run-to-run variation caused by random initialisation and thread
# scheduling. Per the gensim documentation, reproducibility across interpreter
# launches additionally requires a fixed PYTHONHASHSEED.
print("Training Word2Vec model...")
word2vec_model = Word2Vec(
    sentences=tokenized_reviews,
    vector_size=100,  # Dimension of word vectors
    window=5,         # Context window size
    min_count=2,      # Minimum word frequency
    workers=1,        # Single thread so that results are repeatable
    sg=0,             # 0 for CBOW, 1 for Skip-gram
    seed=42,          # Fixed seed for repeatable embeddings
)

print("Word2Vec model trained!")
print(f"Vocabulary size: {len(word2vec_model.wv.key_to_index)}")

# Create document vectors by averaging word vectors
def get_document_vector(words, model):
    """Average the word vectors of a document's in-vocabulary tokens.

    Parameters
    ----------
    words : iterable of str
        Tokens of one document.
    model : object
        Trained embedding model exposing ``wv`` (supports ``token in wv`` and
        ``wv[token]``) and ``vector_size``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.vector_size``: the element-wise mean of
        the vectors of all tokens found in ``model.wv``, or an all-zero
        float32 vector when no token is in the vocabulary.
    """
    vectors = [model.wv[word] for word in words if word in model.wv]
    if not vectors:
        # gensim stores word vectors as float32. A float64 fallback here would
        # upcast the whole stacked document matrix as soon as a single
        # document has no in-vocabulary token.
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

# Stack one averaged vector per tokenised article into a single array
print("Creating document vectors...")
X_word2vec = np.array(
    [
        get_document_vector(tokens, word2vec_model)
        for tokens in tokenized_reviews
    ]
)

print(f"Word2Vec matrix shape: {X_word2vec.shape}")


## Step 6: Combine Features


In [None]:
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

# Hand-crafted numeric columns that are appended to both feature variants
textual_features = df_features[
    ['char_count', 'word_count', 'avg_word_length', 'exclamation_count', 'question_count']
].to_numpy()

# Standardise the hand-crafted columns.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# in Step 8, so its statistics include rows that later land in the test set.
scaler = StandardScaler()
textual_features_scaled = scaler.fit_transform(textual_features)

print(f"Textual features shape: {textual_features_scaled.shape}")

# Option 1: TF-IDF matrix + scaled textual columns (scipy sparse hstack)
X_combined_tfidf = hstack([X_tfidf, textual_features_scaled])

print(f"Combined TF-IDF + Textual features shape: {X_combined_tfidf.shape}")

# Option 2: Word2Vec document vectors + scaled textual columns (dense hstack)
X_combined_word2vec = np.hstack([X_word2vec, textual_features_scaled])

print(f"Combined Word2Vec + Textual features shape: {X_combined_word2vec.shape}")


## Step 7: Prepare Target Variable


In [None]:
from sklearn.preprocessing import LabelEncoder

# Map each category label to an integer class id
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df_features['category'])

# How many articles fall into each encoded class
print(f"Target variable shape: {y.shape}")
print(f"Class distribution:")
print(pd.Series(y).value_counts())

# Show which integer stands for which original category
print(f"\nLabel mapping:")
for class_id, class_name in enumerate(label_encoder.classes_):
    print(f"  {class_id}: {class_name}")


## Step 8: Split Data into Train and Test Sets


In [None]:
# Split both feature matrices and the labels in a single call.
# train_test_split applies the same row partition to every array it is given,
# so the TF-IDF rows, the Word2Vec rows and the labels stay aligned by
# construction instead of relying on two separate calls sharing a seed. This
# also avoids assigning to `_`, which IPython uses for the last cell output.
(
    X_train_tfidf, X_test_tfidf,
    X_train_word2vec, X_test_word2vec,
    y_train, y_test,
) = train_test_split(
    X_combined_tfidf, X_combined_word2vec, y,
    test_size=0.2, random_state=42, stratify=y
)

# Sanity check: every split has matching row counts across both feature sets
assert X_train_tfidf.shape[0] == X_train_word2vec.shape[0] == y_train.shape[0]
assert X_test_tfidf.shape[0] == X_test_word2vec.shape[0] == y_test.shape[0]

print("Data split completed!")
print(f"Training set size (TF-IDF): {X_train_tfidf.shape}")
print(f"Test set size (TF-IDF): {X_test_tfidf.shape}")
print(f"Training set size (Word2Vec): {X_train_word2vec.shape}")
print(f"Test set size (Word2Vec): {X_test_word2vec.shape}")
print(f"Training labels: {y_train.shape}")
print(f"Test labels: {y_test.shape}")


## Step 9: Save Features and Vectorizers


In [None]:
# Make sure the output directory for the fitted objects exists
os.makedirs('models', exist_ok=True)

# Pickle the fitted scikit-learn objects, one file per object
for pkl_path, fitted_obj in (
    ('models/tfidf_vectorizer.pkl', tfidf_vectorizer),
    ('models/feature_scaler.pkl', scaler),
    ('models/label_encoder.pkl', label_encoder),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(fitted_obj, f)

# The Word2Vec model is written through its own save() method
word2vec_model.save('models/word2vec_model.model')

# Store the train/test feature matrices and labels in one archive.
# NOTE(review): X_train_tfidf / X_test_tfidf come from splitting a scipy
# sparse matrix, so np.savez presumably stores them as pickled object arrays
# and reading them back would need np.load(..., allow_pickle=True) — confirm
# against the loader in the next notebook (scipy.sparse.save_npz is the
# dedicated alternative).
np.savez(
    'data/features.npz',
    X_train_tfidf=X_train_tfidf,
    X_test_tfidf=X_test_tfidf,
    X_train_word2vec=X_train_word2vec,
    X_test_word2vec=X_test_word2vec,
    y_train=y_train,
    y_test=y_test,
)

# List everything that was written
print(
    "All features and vectorizers saved successfully!",
    "Files saved:",
    "- models/tfidf_vectorizer.pkl",
    "- models/word2vec_model.model",
    "- models/feature_scaler.pkl",
    "- models/label_encoder.pkl",
    "- data/features.npz",
    sep="\n",
)


## Summary

### Key Accomplishments:
1. ✅ Extracted textual features (word count, character count, etc.)
2. ✅ Created TF-IDF vectors (unigrams and bigrams, capped at 5000 features)
3. ✅ Generated Word2Vec embeddings (100 dimensions)
4. ✅ Combined features for model training
5. ✅ Split data into train and test sets
6. ✅ Saved all vectorizers and features

### Next Steps:
- Proceed to Model Development notebook
