# News Article Classification - Part 1: Data Exploration & Preprocessing

## Overview
This notebook covers:
1. Data Loading and Initial Exploration
2. Data Quality Assessment
3. Exploratory Data Analysis (EDA)
4. Text Preprocessing
5. Data Cleaning


## Step 1: Import Libraries


In [None]:
# Data manipulation
import pandas as pd
import numpy as np
import os

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Text processing
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Download NLTK data (run once)
# Maps each downloadable package to the path nltk.data.find() uses to check
# whether it is already installed. 'punkt_tab' is listed next to 'punkt'
# because recent NLTK releases load it when word_tokenize() is called; without
# it tokenization fails with a LookupError on those versions.
nltk_resources = {
    'punkt': 'tokenizers/punkt',
    'punkt_tab': 'tokenizers/punkt_tab',
    'stopwords': 'corpora/stopwords',
    'wordnet': 'corpora/wordnet',
    'averaged_perceptron_tagger': 'taggers/averaged_perceptron_tagger',
}

for nltk_package, nltk_path in nltk_resources.items():
    try:
        nltk.data.find(nltk_path)
    except LookupError:
        nltk.download(nltk_package)

# Set style
try:
    plt.style.use('seaborn-v0_8-darkgrid')
except OSError:
    # Matplotlib releases older than 3.6 only ship the un-prefixed style name
    plt.style.use('seaborn-darkgrid')
sns.set_palette("husl")

# Warnings
# NOTE: this silences every warning for the rest of the session, including
# deprecation notices from pandas / matplotlib.
import warnings
warnings.filterwarnings('ignore')

# Display settings: show all columns, up to 100 rows, and up to 200 characters
# per cell so headlines and descriptions are not truncated too early
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', 200)

print("Libraries imported successfully!")
print("NLTK data downloaded/verified!")


## Step 2: Load Dataset


In [None]:
# Read the raw news dataset from disk into the working dataframe
df = pd.read_csv('data/news_data.csv')

# Report the basic dimensions and the available columns
n_rows, n_cols = df.shape
print("Dataset loaded successfully!")
print(f"Dataset shape: {df.shape}")
print(f"Number of rows: {n_rows}")
print(f"Number of columns: {n_cols}")
print(f"Column names: {df.columns.tolist()}")


## Step 3: Initial Data Exploration


In [None]:
# Display first few rows
# Preview the first 10 records to get a feel for the raw values before any
# cleaning is applied.
df.head(10)


In [None]:
# Display column information
# Prints each column's dtype and non-null count, which gives a first hint
# about missing values before the explicit check in the next cell.
df.info()


In [None]:
# Count missing values per column and express them as a share of all rows
missing_values = df.isnull().sum()
missing_percent = missing_values / len(df) * 100

# One row per column: name, absolute count, percentage
missing_df = pd.DataFrame({
    'Column': missing_values.index,
    'Missing Count': missing_values.values,
    'Missing Percentage': missing_percent.values
})

# Keep only the columns that actually have gaps, worst offenders first
has_missing = missing_df['Missing Count'] > 0
missing_df = missing_df.loc[has_missing].sort_values(by='Missing Count', ascending=False)

if missing_df.empty:
    print("No missing values found in the dataset!")
else:
    print("Missing Values Found:")
    print(missing_df)


In [None]:
# Count rows that are exact copies of an earlier row
duplicate_count = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")

# Drop the repeats (the first occurrence of each row is kept)
if duplicate_count:
    print("Removing duplicates...")
    df = df.drop_duplicates()
    print(f"Dataset shape after removing duplicates: {df.shape}")


## Step 4: Target Variable Analysis (Category)


In [None]:
# Normalize category names: strip surrounding whitespace and convert to
# UPPER case so differently-cased spellings collapse into one label
df['category'] = df['category'].str.strip().str.upper()

# Absolute and relative frequency of every category
category_counts = df['category'].value_counts()
category_percentages = df['category'].value_counts(normalize=True) * 100

print("Category Distribution:")
print(category_counts)
print("\nCategory Percentages:")
print(category_percentages)
print(f"\nTotal number of categories: {df['category'].nunique()}")

# Side-by-side figure: bar chart on the left, pie chart on the right
fig, axes = plt.subplots(1, 2, figsize=(16, 6))
bar_ax, pie_ax = axes

# Left panel: counts of the 15 most frequent categories
top_categories = category_counts.head(15)
top_categories.plot.bar(ax=bar_ax, color='steelblue')
bar_ax.set_title('Top 15 Categories Distribution (Count)', fontsize=14, fontweight='bold')
bar_ax.set_xlabel('Category', fontsize=12)
bar_ax.set_ylabel('Count', fontsize=12)
bar_ax.tick_params(axis='x', rotation=45)

# Right panel: pie chart of the 10 most frequent categories
# NOTE(review): the pie re-scales its input to sum to 100%, so the labels show
# each category's share within the top 10, not within the whole dataset.
top_10_percentages = category_percentages.head(10)
top_10_percentages.plot.pie(ax=pie_ax, autopct='%1.1f%%', startangle=90)
pie_ax.set_title('Top 10 Categories Distribution (Percentage)', fontsize=14, fontweight='bold')
pie_ax.set_ylabel('')

plt.tight_layout()
plt.show()


## Step 5: Combine Text Features


In [None]:
# Build a single text field per article from headline + short description.
# Missing values are replaced with empty strings first so the concatenation
# never produces NaN.
for text_column in ('headline', 'short_description'):
    df[text_column] = df[text_column].fillna('')

df['combined_text'] = df['headline'] + ' ' + df['short_description']

# Length of the combined text in characters and in whitespace-separated words
df['text_length'] = df['combined_text'].str.len()
df['word_count'] = df['combined_text'].str.split().str.len()

print("Text Length Statistics:")
print(df[['text_length', 'word_count']].describe())

# Restrict the box plots to the five most frequent categories
top_categories_list = category_counts.head(5).index.tolist()
df_top = df[df['category'].isin(top_categories_list)]

# One box plot per length measure: (column, panel title, y-axis label)
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
length_panels = [
    ('text_length', 'Text Length (Characters) by Category', 'Character Count'),
    ('word_count', 'Word Count by Category', 'Word Count'),
]
for panel_ax, (length_column, panel_title, y_label) in zip(axes, length_panels):
    df_top.boxplot(column=length_column, by='category', ax=panel_ax)
    panel_ax.set_title(panel_title, fontsize=12, fontweight='bold')
    panel_ax.set_xlabel('Category', fontsize=10)
    panel_ax.set_ylabel(y_label, fontsize=10)
    plt.setp(panel_ax.xaxis.get_majorticklabels(), rotation=45)

# Set the figure-level title after the box plots are drawn
plt.suptitle('Text Length Analysis by Category', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()


## Step 6: Word Cloud Visualization by Category


In [None]:
# Create word clouds for top categories
def create_wordcloud(text, title, ax):
    """Draw a word cloud of ``text`` on ``ax`` under the heading ``title``.

    Args:
        text: The raw text to visualise.
        title: Heading shown above the panel.
        ax: Matplotlib axes to draw on.

    The axes frame and ticks are always hidden. When ``text`` is empty or
    contains only whitespace nothing is drawn, because WordCloud needs at
    least one word to build an image.
    """
    # Hide ticks and frame in every case so an empty panel does not show a
    # bare coordinate system.
    ax.axis('off')

    if not text.strip():
        return

    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        max_words=100,
    ).generate(text)
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.set_title(title, fontsize=12, fontweight='bold')

# The four most frequent categories, one per panel of a 2x2 grid
top_4_categories = category_counts.head(4).index.tolist()

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# axes.flat walks the grid row by row: top-left, top-right, bottom-left, ...
for panel_ax, category in zip(axes.flat, top_4_categories):
    in_category = df['category'] == category
    # Join every article of the category into one long string for WordCloud
    category_text = ' '.join(df.loc[in_category, 'combined_text'].astype(str))
    create_wordcloud(category_text, f'{category} Articles', panel_ax)

plt.tight_layout()

# Persist the figure; the target directory is created on first use
os.makedirs('models', exist_ok=True)
plt.savefig('models/wordclouds_by_category.png', dpi=300, bbox_inches='tight')
plt.show()


## Step 7: Text Preprocessing Functions


In [None]:
# Initialize lemmatizer and stemmer
# Module-level NLP helpers shared by the preprocessing functions in this cell.
lemmatizer = WordNetLemmatizer()
# NOTE(review): nothing visible in this notebook calls `stemmer` — confirm
# whether it is still needed.
stemmer = PorterStemmer()
# Stored as a set so the membership test used when filtering tokens is fast.
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise raw article text to lowercase ASCII letters and single spaces.

    HTML tags, URLs, digits, punctuation and any other non-letter characters
    are each replaced by a space (not deleted), so words that were separated
    only by markup or punctuation stay separate: ``"well-known<br>fact"``
    becomes ``"well known fact"`` rather than ``"wellknownfact"``.

    Args:
        text: Any value; missing values (``None`` / ``NaN``) are accepted.

    Returns:
        The cleaned string, or ``""`` when ``text`` is missing.
    """
    if pd.isna(text):
        return ""

    text = str(text)

    # HTML tags -> space, so "first<br>second" remains two words
    text = re.sub(r'<[^>]+>', ' ', text)

    # URLs must go before punctuation is stripped, otherwise their pieces
    # ("https", "example", "com") would survive as words. The pattern is
    # anchored on a word boundary and requires "://" or "www." so ordinary
    # words that merely contain "http" or "www" are left alone.
    text = re.sub(r'\bhttps?://\S+|\bwww\.\S+', ' ', text)

    # Everything that is not an ASCII letter or whitespace -> space
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    text = text.lower()

    # Collapse the runs of whitespace introduced above and trim the ends
    return re.sub(r'\s+', ' ', text).strip()

def tokenize_text(text):
    """Split ``text`` into a list of word tokens with NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

def remove_stopwords(tokens):
    """Return the tokens that are not in the module-level ``stop_words`` set.

    The comparison is exact (case-sensitive); token order is preserved.
    """
    is_stop_word = stop_words.__contains__
    return [word for word in tokens if not is_stop_word(word)]

def lemmatize_text(tokens):
    """Map every token to its lemma using the module-level ``lemmatizer``.

    ``lemmatize`` is called without a part-of-speech argument, so the
    lemmatizer's default part of speech applies to every token.
    """
    return list(map(lemmatizer.lemmatize, tokens))

def preprocess_text(text, use_lemmatization=True, remove_stop=True):
    """Run the full pipeline: clean -> tokenize -> filter -> lemmatize -> join.

    Args:
        text: Raw text (missing values are handled by ``clean_text``).
        use_lemmatization: When true, reduce each token to its lemma.
        remove_stop: When true, drop English stop words before lemmatizing.

    Returns:
        The processed tokens joined by single spaces.
    """
    tokens = tokenize_text(clean_text(text))

    # Stop words are filtered before lemmatization
    if remove_stop:
        tokens = remove_stopwords(tokens)

    if use_lemmatization:
        tokens = lemmatize_text(tokens)

    return ' '.join(tokens)

print("Text preprocessing functions created!")


## Step 8: Apply Text Preprocessing


In [None]:
# Run the preprocessing pipeline over every article's combined text
print("Starting text preprocessing...")
print("This may take a few minutes for large datasets...")

# Work on a copy so the exploratory dataframe `df` stays untouched
df_processed = df.copy()

# Series.apply forwards the keyword arguments to preprocess_text for each row
# (stop-word removal and lemmatization both enabled)
df_processed['cleaned_text'] = df_processed['combined_text'].apply(
    preprocess_text, use_lemmatization=True, remove_stop=True
)

print("Text preprocessing completed!")

# Show the first article before and after cleaning (first 200 characters)
first_article = df_processed.iloc[0]
print("\nSample original text:")
print(first_article['combined_text'][:200])
print("\nSample cleaned text:")
print(first_article['cleaned_text'][:200])


## Step 9: Save Preprocessed Data


In [None]:
# Save preprocessed data
os.makedirs('data', exist_ok=True)

# Articles whose text became empty during cleaning (e.g. only digits,
# punctuation or stop words) would be written as empty CSV fields, which
# pandas reads back as NaN by default. Keep them out of the model-ready file
# so downstream feature extraction only ever sees real strings.
has_text = df_processed['cleaned_text'].str.strip().ne('')
empty_text_rows = int((~has_text).sum())
if empty_text_rows:
    print(f"Skipping {empty_text_rows} rows with empty cleaned text in processed_articles.csv")

# Save the model-ready columns (cleaned text + label)
df_processed.loc[has_text, ['cleaned_text', 'category']].to_csv('data/processed_articles.csv', index=False)

# Also save the full dataframe (all rows, original columns included) for reference
df_processed.to_csv('data/full_processed_data.csv', index=False)

print("Preprocessed data saved successfully!")
print("Files saved:")
print("- data/processed_articles.csv")
print("- data/full_processed_data.csv")


## Summary

### Key Findings:
1. Dataset loaded and explored
2. Category distribution analyzed
3. Text length statistics calculated
4. Text preprocessing completed (cleaning, tokenization, stop-word removal, lemmatization)
5. Preprocessed data saved for feature engineering

### Next Steps:
- Proceed to Feature Engineering notebook
