# Notebook 02: Data Preprocessing and Feature Engineering

## Objective
Clean and prepare the collected data for machine learning:
1. Parse text fields to numeric/datetime values
2. Handle missing values
3. Engineer features
4. Save processed datasets

In [None]:
import sys
import os
import pandas as pd

# Append the absolute path of the working directory's parent to sys.path so
# that the `src` package imported below can be resolved.
parent_dir = os.path.join(os.getcwd(), os.pardir)
sys.path.append(os.path.abspath(parent_dir))

from src import preprocess

# Notebook banner: the title framed above and below by an 80-character rule.
print("=" * 80, "DATA PREPROCESSING", "=" * 80, sep="\n")

## Part 1: Preprocess Scraped Data

Parse and clean data collected via web scraping.

In [None]:
# Section header for Part 1.
print("\nPART 1: PREPROCESSING SCRAPED DATA", "=" * 80, sep="\n")

# Run the scraped-data preprocessing helper on the raw CSV. The `output` path
# is passed through to the helper; presumably it writes the processed CSV
# there (confirm in src/preprocess.py).
scraped_io = {
    "filename": "../data/raw/scraped_videos.csv",
    "output": "../data/processed/scraped_processed.csv",
}
df_scraped = preprocess.preprocess_scraped_data(**scraped_io)

# Preview the first three rows of the returned frame.
print("\nScraped Data Sample (after preprocessing):")
print(df_scraped.head(3))

# Report the frame's shape and its column names.
print("\nScraped Data Info:")
for info_line in (
    f"  Shape: {df_scraped.shape}",
    f"  Columns: {list(df_scraped.columns)}",
):
    print(info_line)

## Part 2: Preprocess API Data

Parse and clean data collected via the YouTube Data API.

In [None]:
# Section header for Part 2.
print("\nPART 2: PREPROCESSING API DATA", "=" * 80, sep="\n")

# Run the API-data preprocessing helper on the raw CSV. The `output` path is
# passed through to the helper; presumably it writes the processed CSV there
# (confirm in src/preprocess.py).
api_io = {
    "filename": "../data/raw/api_videos.csv",
    "output": "../data/processed/api_processed.csv",
}
df_api = preprocess.preprocess_api_data(**api_io)

# Preview the first three rows of the returned frame.
print("\nAPI Data Sample (after preprocessing):")
print(df_api.head(3))

# Report the frame's shape and its column names.
print("\nAPI Data Info:")
for info_line in (
    f"  Shape: {df_api.shape}",
    f"  Columns: {list(df_api.columns)}",
):
    print(info_line)

## Summary

In [None]:
# Summary header, preceded by a blank line.
print("\n" + "=" * 80, "PREPROCESSING SUMMARY", "=" * 80, sep="\n")

# List every column of each processed frame under its own heading.
feature_sections = (
    ("\nScraped Data Features:", df_scraped),
    ("\nAPI Data Features:", df_api),
)
for heading, frame in feature_sections:
    print(heading)
    for col in frame.columns:
        print(f"  - {col}")

# Closing block: row counts, the output file names, and the next step.
closing_lines = (
    "\n" + "=" * 80,
    f"Scraped: {len(df_scraped):,} videos processed",
    f"API: {len(df_api):,} videos processed",
    "\nFiles saved:",
    "  - data/processed/scraped_processed.csv",
    "  - data/processed/api_processed.csv",
    "\nNext: Run notebook 03 & 04 to train models",
    "=" * 80,
)
for line in closing_lines:
    print(line)