# Phase 1: Exploratory Data Analysis

**Course**: IME 565 - Predictive Data Analytics for Engineers  
**Team**: Nicolo DiFerdinando, Joe Mascher, Rithvik Shetty  
**Phase**: Foundation Analytics

This notebook uses modular code from `src/` for clean, reusable analysis.

## 1. Setup and Imports

In [None]:
# Standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from pathlib import Path

# Add the parent of the current working directory to the import path so the
# `src` package imported below can be resolved.
# NOTE(review): this presumably assumes the notebook is launched from a
# directory that sits next to src/ — confirm.
project_root = str(Path.cwd().parent)
# Guard against duplicates: re-running this cell would otherwise append the
# same entry to sys.path every time.
if project_root not in sys.path:
    sys.path.append(project_root)

# Import our custom modules
from src.data_processing import (
    load_spotify_data,
    identify_audio_features,
    clean_dataset,
    identify_column_names,
    get_dataset_summary
)
from src.feature_engineering import (
    create_composite_features,
    add_context_classification,
    get_normalized_features,
    get_composite_features
)
from src.visualization import (
    plot_feature_distributions,
    plot_correlation_matrix,
    plot_top_items,
    plot_context_distribution,
    print_summary_stats
)

# Settings
# Global, process-wide configuration for the rest of the notebook.
import warnings
# Called with only the action argument, this filter applies to every warning.
# NOTE(review): that also hides pandas/seaborn deprecation and data-quality
# warnings during exploration — consider narrowing to specific categories.
warnings.filterwarnings('ignore')
# None removes the cap on how many columns pandas shows when displaying frames.
pd.set_option('display.max_columns', None)

# The matplotlib style is applied first, then the seaborn palette is set on top
# of it; keep this order.
# NOTE(review): the 'seaborn-v0_8-*' style names presumably require a recent
# matplotlib release — confirm against the pinned version.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Only reached when every import and setting above succeeded.
print("All packages imported successfully!")

## 2. Load Data

In [None]:
# Load dataset
# load_spotify_data (from src.data_processing) returns two values, unpacked
# here as the data and a file name that is echoed below.
# NOTE(review): the relative path presumably assumes the notebook runs from a
# directory that is a sibling of data/ — confirm.
df, filename = load_spotify_data('../data/raw')
print(f"\nSuccessfully loaded: {filename}")

In [None]:
# Identify available audio features
# The result is passed on to cleaning, plotting and the final summary, and is
# also used to select columns from the cleaned frame.
# NOTE(review): presumably a list of column names found in df — confirm
# against src.data_processing.identify_audio_features.
audio_features = identify_audio_features(df)

In [None]:
# Preview data
# The head() call is deliberately the last expression of the cell so the
# notebook renders its result (the first 10 rows) as the cell output.
print("\nFirst 10 rows:")
df.head(10)

In [None]:
# Dataset info
# Assuming df is a pandas DataFrame, info() prints a per-column summary
# (dtype and non-null count) plus memory usage.
df.info()

## 3. Data Cleaning

In [None]:
# Clean the dataset
# The cleaned result is bound to a new name, df_clean, which every later cell
# works on.
# NOTE(review): whether clean_dataset also modifies df in place is not visible
# here — check src.data_processing before reusing df afterwards.
df_clean = clean_dataset(df, audio_features)

In [None]:
# Check for missing values
# Count nulls per column, then keep only the columns that still have any.
missing = df_clean.isna().sum()
columns_with_gaps = missing[missing > 0]

if columns_with_gaps.empty:
    print("\nNo missing values in cleaned dataset!")
else:
    print("\nRemaining missing values:")
    print(columns_with_gaps)

## 4. Feature Engineering

In [None]:
# Create composite features
# df_clean is rebound to the function's return value.
# NOTE(review): the names of the added columns are not visible here; they are
# presumably what get_composite_features(df_clean) reports later — confirm.
df_clean = create_composite_features(df_clean)

In [None]:
# Add context classification
# df_clean is rebound to the function's return value.
# NOTE(review): presumably this is what adds the 'context' column that the
# context analysis groups by — confirm in src.feature_engineering.
df_clean = add_context_classification(df_clean)

## 5. Audio Feature Analysis

In [None]:
# Get normalized features for visualization
# Derived from the detected audio feature names only (no data is passed in).
# NOTE(review): presumably the subset of features on a common 0-1 scale —
# confirm against src.feature_engineering.get_normalized_features.
normalized_features = get_normalized_features(audio_features)
print(f"\nNormalized features for visualization: {normalized_features}")

In [None]:
# Plot feature distributions
# Skipped entirely when no normalized features were found, so the plotting
# helper is never called with an empty selection.
if normalized_features:
    plot_feature_distributions(df_clean, normalized_features)

In [None]:
# Correlation matrix
# Only drawn when at least two audio features are available.
if len(audio_features) > 1:
    plot_correlation_matrix(df_clean, audio_features)

In [None]:
# Basic statistics
# Left as the cell's last expression so the notebook renders the summary table.
# NOTE(review): unlike the plotting cells, this has no guard for an empty
# audio_features selection — confirm that case cannot occur.
df_clean[audio_features].describe()

## 6. Composite Features Analysis

In [None]:
# Get composite features that were created
# Look them up on the engineered frame; the statistics are printed only when
# at least one was found.
composite_features = get_composite_features(df_clean)

if composite_features:
    print("\nComposite Feature Statistics:")
    composite_stats = df_clean[composite_features].describe()
    print(composite_stats)

In [None]:
# Visualize composite features
# Reuses composite_features from the previous cell and the same distribution
# plot helper as the audio features, with an explicit figure size.
if composite_features:
    plot_feature_distributions(df_clean, composite_features, figsize=(12, 8))

## 7. Top Charts Analysis

In [None]:
# Identify column names
# The result is indexed with the 'artist' and 'genre' keys by the top-chart
# cells and is passed to the final summary.
# NOTE(review): presumably a mapping whose value is falsy when no matching
# column exists — confirm against src.data_processing.identify_column_names.
columns = identify_column_names(df_clean)

In [None]:
# Top Artists
# The chart and listing are produced only when an artist column was identified.
artist_column = columns['artist']
if artist_column:
    top_artists = plot_top_items(
        df_clean, artist_column, "Top 20 Artists by Track Count", top_n=20
    )
    # Heading and the returned ranking, one per line.
    print("\nTop 20 Artists:", top_artists, sep="\n")

In [None]:
# Top Genres
# The chart and listing are produced only when a genre column was identified.
genre_column = columns['genre']
if genre_column:
    # Plot options that differ from the helper's defaults.
    genre_plot_options = {
        'top_n': 20,
        'color': 'lightcoral',
        'horizontal': False,
    }
    top_genres = plot_top_items(
        df_clean, genre_column, "Top 20 Genres by Track Count", **genre_plot_options
    )
    # Heading and the returned ranking, one per line.
    print("\nTop 20 Genres:", top_genres, sep="\n")

## 8. Context Analysis

In [None]:
# Plot context distribution
# Called unconditionally on the engineered frame.
# NOTE(review): presumably relies on the 'context' column added during feature
# engineering — confirm in src.visualization.
plot_context_distribution(df_clean)

In [None]:
# Average audio features by context
# Candidate features, restricted (in this order) to those present in the frame.
feature_cols = ['energy', 'valence', 'danceability', 'acousticness', 'instrumentalness']
present_columns = set(df_clean.columns)
available_cols = [name for name in feature_cols if name in present_columns]

if available_cols:
    print("\nAverage Audio Features by Context:")
    # Per-context mean of each available feature, shown to three decimals.
    grouped_by_context = df_clean.groupby('context')
    context_features = grouped_by_context[available_cols].mean()
    print(context_features.round(3))

## 9. Export Processed Data

In [None]:
# Save processed data
output_path = Path('../data/processed/processed_spotify_data.csv')
# to_csv does not create missing directories, so make sure the target folder
# exists first (a fresh checkout may not contain data/processed/ yet).
output_path.parent.mkdir(parents=True, exist_ok=True)
df_clean.to_csv(output_path, index=False)

# Report what was written; the size is read back from disk and shown in MB.
print(f"\n✓ Processed data saved to: {output_path}")
print(f"  Rows: {len(df_clean):,}")
print(f"  Columns: {len(df_clean.columns)}")
print(f"  File size: {output_path.stat().st_size / (1024*1024):.2f} MB")

## 10. Summary

In [None]:
# Print comprehensive summary
print_summary_stats(df_clean, audio_features, columns)

# Closing banner followed by the hand-off checklist, emitted in one call.
closing_lines = [
    "\n✅ Phase 1 Analysis Complete!",
    "\nNext Steps:",
    "  1. Review visualizations and insights",
    "  2. Run Streamlit dashboard: streamlit run app/spotify_dashboard.py",
    "  3. Prepare for Phase 2 (Playlist Intelligence)",
]
print("\n".join(closing_lines))