# NOS NL Articles Dataset Documentation

This notebook documents the structure, format, and contents of the `NOS_NL_articles_2015_mar_2025.feather` dataset. 

The dataset contains Dutch news articles from NOS (Nederlandse Omroep Stichting) spanning from 2015 to March 2025, stored in Apache Feather format for efficient data processing.

## 1. Import Required Libraries

Import pandas for data manipulation, along with the other libraries used to analyze the Feather dataset.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os

# Notebook-wide pandas display configuration: never hide columns, and cap
# the rendered width of any single cell value at 100 characters.
display_options = {
    'display.max_columns': None,
    'display.max_colwidth': 100,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

# Confirm the environment is ready and record the library versions in use.
print("Libraries imported successfully!")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

## 2. Load the Feather Dataset

Load the `NOS_NL_articles_2015_mar_2025.feather` file using pandas and display basic information about the dataset size.

In [None]:
# Load the feather dataset
file_path = "data/NOS_NL_articles_2015_mar_2025.feather"

print(f"Loading dataset from: {file_path}")
file_exists = os.path.exists(file_path)
print(f"File exists: {file_exists}")

if not file_exists:
    # Fail fast: every later cell reads `df`. Merely printing a message here
    # would leave `df` undefined (NameError downstream) or, on a reused
    # kernel, silently keep a stale frame from an earlier run.
    raise FileNotFoundError(f"File not found! Please check the file path: {file_path}")

# On-disk size, reported in MB
file_size = os.path.getsize(file_path)
print(f"File size: {file_size / (1024**2):.2f} MB")

# Load the dataset
df = pd.read_feather(file_path)

print("\nDataset loaded successfully!")
print(f"Shape: {df.shape} (rows, columns)")
# deep=True also measures the Python objects held by object-dtype columns
print(f"Memory usage: {df.memory_usage(deep=True).sum() / (1024**2):.2f} MB")

## 3. Examine Dataset Structure

Display the dataset shape, column names, and basic structure to understand the overall organization of the data.

In [None]:
print("=== DATASET STRUCTURE ===")
print(f"Shape: {df.shape}")
print(f"\nColumn names ({len(df.columns)} total):")
for i, col in enumerate(df.columns, 1):
    print(f"{i:2d}. {col}")

print("\n=== BASIC INFO ===")
# DataFrame.info() writes its report to stdout itself and returns None;
# wrapping it in print() appended a stray "None" line after the report.
df.info()

print("\n=== FIRST FEW ROWS ===")
# Last expression of the cell, so the notebook renders it as a rich table
df.head()

## 4. Analyze Column Data Types

Examine each column's data type, check for null values, and understand the data structure of each field.

In [None]:
print("=== COLUMN DATA TYPES AND NULL VALUES ===")
# Per-column null counts, computed once and reused for the percentage column
null_counts = df.isnull().sum()
column_info = pd.DataFrame({
    'Column': df.columns,
    'Data Type': df.dtypes,
    'Non-Null Count': df.count(),
    'Null Count': null_counts,
    'Null Percentage': (null_counts / len(df) * 100).round(2),
    'Unique Values': [df[col].nunique() for col in df.columns]
})

print(column_info.to_string(index=False))

print("\n=== SAMPLE VALUES FOR EACH COLUMN ===")
for col in df.columns:
    non_null_values = df[col].dropna()
    print(f"\n{col} (Type: {df[col].dtype}):")
    print(f"  Sample values: {non_null_values.head(3).tolist()}")
    if df[col].dtype == 'object':
        # Measure only real values: calling astype(str) on the full column
        # would stringify nulls (e.g. "nan" / "None") and report their
        # length as the column's minimum.
        value_lengths = non_null_values.astype(str).str.len()
        print(f"  Max length: {value_lengths.max()}")
        print(f"  Min length: {value_lengths.min()}")

## 5. Explore Article Content

Sample and display article content to understand the text format, language, and typical article structure.

In [None]:
# Look for text content columns (likely containing article text)
text_columns = []
text_length_by_column = {}
for col in df.columns:
    if df[col].dtype == 'object':
        # Character lengths of the non-null values only; stringified nulls
        # ("nan" / "None") would otherwise drag the average down.
        value_lengths = df[col].dropna().astype(str).str.len()
        # An all-null column has a NaN mean, which never passes the threshold
        if value_lengths.mean() > 50:  # Likely text content if average length > 50 chars
            text_columns.append(col)
            text_length_by_column[col] = value_lengths

print("=== POTENTIAL TEXT CONTENT COLUMNS ===")
for col in text_columns:
    value_lengths = text_length_by_column[col]
    print(f"\n{col}:")
    print(f"  Average length: {value_lengths.mean():.1f} characters")
    print(f"  Max length: {value_lengths.max()} characters")

print("\n=== SAMPLE ARTICLE CONTENT ===")
# Display a few sample articles
for i in range(min(3, len(df))):
    print(f"\n--- Article {i+1} ---")
    # Fetch the row once instead of rebuilding it for every column
    row = df.iloc[i]
    for col in df.columns:
        value = row[col]
        if pd.isna(value):
            print(f"{col}: [NULL]")
        elif isinstance(value, str) and len(value) > 100:
            # Truncate long text so the preview stays readable
            print(f"{col}: {value[:100]}...")
        else:
            print(f"{col}: {value}")

## 6. Check Date Range and Distribution

Analyze the date columns to verify the 2015 to March 2025 range and examine the temporal distribution of articles.

In [None]:
# Look for date columns
date_keywords = ['date', 'time', 'published', 'created']
date_columns = []
for col in df.columns:
    col_lower = col.lower()
    if any(keyword in col_lower for keyword in date_keywords):
        # Name-based match: accepted without inspecting the values
        date_columns.append(col)
    elif df[col].dtype == 'object':
        # Check if values look like dates
        sample = df[col].dropna().head(10)
        if sample.empty:
            # An empty sample parses "successfully", which would misclassify
            # an all-null column as a date column.
            continue
        try:
            pd.to_datetime(sample)
        except (ValueError, TypeError, OverflowError):
            # Not parseable as dates. Catching only parse-related errors keeps
            # KeyboardInterrupt / SystemExit from being swallowed.
            continue
        date_columns.append(col)

print("=== POTENTIAL DATE COLUMNS ===")
print(f"Found columns: {date_columns}")

for col in date_columns:
    print(f"\n{col}:")
    try:
        # Try to convert to datetime; unparseable entries become NaT
        dates = pd.to_datetime(df[col], errors='coerce')
        valid_dates = dates.dropna()

        if len(valid_dates) > 0:
            print(f"  Date range: {valid_dates.min()} to {valid_dates.max()}")
            print(f"  Valid dates: {len(valid_dates)}/{len(df)} ({len(valid_dates)/len(df)*100:.1f}%)")

            # Year distribution
            years = valid_dates.dt.year.value_counts().sort_index()
            print("  Articles per year:")
            for year, count in years.items():
                print(f"    {year}: {count}")
        else:
            print("  No valid dates found")
    except Exception as e:
        # Best effort: report the problem and move on to the next candidate
        print(f"  Error parsing dates: {e}")
        print(f"  Sample values: {df[col].head(3).tolist()}")

## 7. Sample Data Exploration

Display sample rows and examine specific articles to understand the data quality and content format.

In [None]:
print("=== RANDOM SAMPLE OF ARTICLES ===")
# Show a random sample of articles (fixed seed so re-runs show the same rows)
sample_df = df.sample(n=min(5, len(df)), random_state=42)

for idx, (index, row) in enumerate(sample_df.iterrows(), start=1):
    print(f"\n--- Sample Article {idx} (Row {index}) ---")
    for col in df.columns:
        value = row[col]
        if pd.isna(value):
            print(f"{col}: [NULL]")
        elif isinstance(value, str) and len(value) > 200:
            # Truncate long text but report its full size
            print(f"{col}: {value[:200]}... ({len(value)} chars total)")
        else:
            print(f"{col}: {value}")

print("\n=== DATA QUALITY ASSESSMENT ===")
object_df = df.select_dtypes(include='object')
print(f"Total rows: {len(df)}")
print(f"Completely empty rows: {df.isnull().all(axis=1).sum()}")
print(f"Rows with all text fields filled: {object_df.notna().all(axis=1).sum()}")

# Check for duplicates
print(f"Duplicate rows: {df.duplicated().sum()}")
# Guard on the existence of an object column, not on the total column count:
# the old check raised IndexError for frames without object columns and
# skipped the check for a frame whose only column is text.
if len(object_df.columns) > 0:
    first_text_column = object_df.columns[0]
    print(f"Duplicate articles (by first text column): {df.duplicated(subset=[first_text_column]).sum()}")

## 8. Dataset Statistics and Summary

Generate descriptive statistics, word counts, and other relevant metrics to summarize the dataset characteristics.

In [None]:
print("=== DATASET SUMMARY STATISTICS ===")

# Text length statistics for string columns
for col in df.select_dtypes(include='object').columns:
    # Drop nulls first: astype(str) would otherwise stringify them
    # (e.g. "nan" / "None") and count them as very short texts.
    lengths = df[col].dropna().astype(str).str.len()
    print(f"\n{col} - Text Length Statistics:")
    print(f"  Mean: {lengths.mean():.1f} characters")
    print(f"  Median: {lengths.median():.1f} characters")
    print(f"  Min: {lengths.min()} characters")
    print(f"  Max: {lengths.max()} characters")
    print(f"  Standard deviation: {lengths.std():.1f}")

# Word count analysis for likely content columns
# `text_columns` is built by the "Explore Article Content" cell
print("\n=== WORD COUNT ANALYSIS ===")
for col in text_columns:
    # Whitespace-split word counts over non-null values only; a stringified
    # null would otherwise be counted as a one-word article.
    word_counts = df[col].dropna().astype(str).str.split().str.len()
    print(f"\n{col} - Word Count Statistics:")
    print(f"  Mean: {word_counts.mean():.1f} words")
    print(f"  Median: {word_counts.median():.1f} words")
    print(f"  Min: {word_counts.min()} words")
    print(f"  Max: {word_counts.max()} words")

print("\n=== FINAL SUMMARY ===")
print("Dataset: NOS_NL_articles_2015_mar_2025.feather")
print(f"Total articles: {len(df):,}")
print(f"Total columns: {len(df.columns)}")
print("File format: Apache Feather (.feather)")
print("Primary language: Dutch (Nederlandse)")
print("Source: NOS (Nederlandse Omroep Stichting)")
print("Time period: 2015 to March 2025")
print(f"Storage efficiency: {df.memory_usage(deep=True).sum() / (1024**2):.1f} MB in memory")