# 01. Data Exploration

This notebook explores the Spotify dataset to understand user listening patterns, track features, and data distributions.

In [None]:
import warnings
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Suppress all warnings for the rest of the session to keep cell output clean
warnings.filterwarnings(action='ignore')

# Plot appearance: apply the dark-grid style sheet first, then the 'husl'
# colour palette on top of it (order kept so the palette is set last)
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette(palette='husl')

## Load Data

In [None]:
# Read the two raw CSV files (listening history first, then track features)
history_csv = '../data/raw/listening_history.csv'
features_csv = '../data/raw/track_features.csv'
listening_history = pd.read_csv(history_csv)
track_features = pd.read_csv(features_csv)

# Report each frame's (rows, columns) as a quick sanity check
history_shape = listening_history.shape
features_shape = track_features.shape
print(f"Listening History: {history_shape}")
print(f"Track Features: {features_shape}")

## Basic Data Overview

In [None]:
# Listening history overview: column dtypes / non-null counts, then a sample
print("=== Listening History ===\n")
# DataFrame.info() prints its report itself and returns None, so it is
# called bare; wrapping it in print() emitted an extra "None" line.
listening_history.info()
print("\nFirst 5 rows:")
# Last expression of the cell, so the notebook renders it as a rich table
listening_history.head()

In [None]:
# Track features overview: column dtypes / non-null counts, then a sample
print("=== Track Features ===\n")
# DataFrame.info() prints its report itself and returns None, so it is
# called bare; wrapping it in print() emitted an extra "None" line.
track_features.info()
print("\nFirst 5 rows:")
# Last expression of the cell, so the notebook renders it as a rich table
track_features.head()

## User Listening Patterns

In [None]:
# User activity distribution
# Parse timestamps BEFORE aggregating so that min/max compare points in time
# rather than raw CSV values (comparing unparsed strings is only chronological
# for uniformly formatted ISO-8601 text). A separate working frame is used so
# listening_history itself is not modified by this cell.
plays_parsed = listening_history[['user_id', 'track_id']].copy()
plays_parsed['timestamp'] = pd.to_datetime(listening_history['timestamp'])

# Per user: number of non-null track_id rows plus earliest / latest play time
user_stats = plays_parsed.groupby('user_id').agg({
    'track_id': 'count',
    'timestamp': ['min', 'max']
}).reset_index()

# Flatten the two-level column index produced by the mixed aggregation
user_stats.columns = ['user_id', 'total_plays', 'first_play', 'last_play']
# Whole days elapsed between a user's first and last play (a span, not a
# count of distinct days on which the user listened)
user_stats['active_days'] = (user_stats['last_play'] - user_stats['first_play']).dt.days

print(f"Total users: {user_stats.shape[0]:,}")
print(f"Average plays per user: {user_stats['total_plays'].mean():.1f}")
print(f"Median plays per user: {user_stats['total_plays'].median():.1f}")

In [None]:
# Visualize user activity distribution on a 2x2 grid of panels
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_plays, ax_top), (ax_days, ax_scatter) = axes

# Shared look for both histograms
hist_style = dict(bins=50, edgecolor='black', alpha=0.7)

# Top-left: histogram of plays per user, log-scaled counts
ax_plays.hist(user_stats['total_plays'], **hist_style)
ax_plays.set(xlabel='Total Plays',
             ylabel='Number of Users',
             title='Distribution of User Play Counts')
ax_plays.set_yscale('log')

# Top-right: the 20 users with the highest play totals, in rank order
top_users = user_stats.nlargest(20, 'total_plays')
rank_positions = range(len(top_users))
ax_top.bar(rank_positions, top_users['total_plays'])
ax_top.set(xlabel='User Rank',
           ylabel='Total Plays',
           title='Top 20 Most Active Users')

# Bottom-left: histogram of each user's first-to-last-play span in days
ax_days.hist(user_stats['active_days'], **hist_style)
ax_days.set(xlabel='Active Days',
            ylabel='Number of Users',
            title='User Activity Period Distribution')

# Bottom-right: play total against activity span, log-scaled play axis
ax_scatter.scatter(user_stats['active_days'], user_stats['total_plays'], alpha=0.5)
ax_scatter.set(xlabel='Active Days',
               ylabel='Total Plays',
               title='Plays vs Activity Period')
ax_scatter.set_yscale('log')

plt.tight_layout()
plt.show()

## Track Popularity Analysis

In [None]:
# Track play counts: one row per track_id, most played first
track_popularity = listening_history['track_id'].value_counts().reset_index()
track_popularity.columns = ['track_id', 'play_count']

# Merge with track features (left join keeps every played track, in
# play-count order, even when no feature row exists for it)
track_analysis = track_popularity.merge(track_features, on='track_id', how='left')

# A track "has features" when its id appears in track_features. Counting
# track_analysis.dropna() rows instead would exclude any track with a single
# null column and would double-count tracks duplicated by the merge.
has_features = track_popularity['track_id'].isin(track_features['track_id'])

print(f"Total unique tracks: {track_popularity.shape[0]:,}")
print(f"Tracks with features: {has_features.sum():,}")
print(f"\nTop 10 most played tracks:")
track_analysis.head(10)[['track_id', 'track_name', 'artist_name', 'play_count']]

In [None]:
# Track popularity distribution
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: histogram of plays per track on log-log axes
axes[0].hist(track_popularity['play_count'], bins=50, edgecolor='black', alpha=0.7)
axes[0].set_xlabel('Play Count')
axes[0].set_ylabel('Number of Tracks')
axes[0].set_title('Track Popularity Distribution')
axes[0].set_yscale('log')
axes[0].set_xscale('log')

# Right panel: cumulative share of plays, tracks ordered most played first
sorted_plays = np.sort(track_popularity['play_count'])[::-1]
cumulative_pct = np.cumsum(sorted_plays) / np.sum(sorted_plays) * 100
# track_pct[k] is the share of tracks covered by the top (k + 1) tracks
track_pct = np.arange(1, len(sorted_plays) + 1) / len(sorted_plays) * 100

axes[1].plot(track_pct, cumulative_pct)
axes[1].axhline(y=80, color='r', linestyle='--', label='80% of plays')
axes[1].set_xlabel('Percentage of Tracks')
axes[1].set_ylabel('Cumulative Percentage of Plays')
axes[1].set_title('Cumulative Play Distribution')
axes[1].legend()
axes[1].grid(True)

plt.tight_layout()
plt.show()

# Find the 80-20 point
# idx_80 is the zero-based position of the first track at which the running
# share reaches 80%, so idx_80 + 1 tracks are needed. track_pct[idx_80]
# already encodes that count; dividing idx_80 itself by the track total
# under-reported the share by one track.
idx_80 = np.where(cumulative_pct >= 80)[0][0]
print(f"{track_pct[idx_80]:.1f}% of tracks account for 80% of all plays")

## Audio Feature Distributions

In [None]:
# Audio feature columns to profile; any that are absent from track_features
# are skipped and their panel is left empty
audio_features = [
    'danceability', 'energy', 'loudness',
    'speechiness', 'acousticness', 'instrumentalness',
    'liveness', 'valence', 'tempo',
]

# One histogram per feature on a 3x3 grid, flattened for simple iteration
fig, axes = plt.subplots(3, 3, figsize=(15, 12))
axes = axes.ravel()

for panel, feature in zip(axes, audio_features):
    if feature not in track_features.columns:
        continue

    values = track_features[feature]
    label = feature.capitalize()

    values.hist(bins=50, ax=panel, edgecolor='black', alpha=0.7)
    panel.set_title(f'{label} Distribution')
    panel.set_xlabel(label)
    panel.set_ylabel('Count')

    # Dashed vertical marker at the feature's mean, shown in the legend
    feature_mean = values.mean()
    panel.axvline(feature_mean, color='red', linestyle='--',
                  label=f'Mean: {feature_mean:.2f}')
    panel.legend()

plt.tight_layout()
plt.show()

In [None]:
# Feature correlation heatmap
# Drawn only when every name in audio_features is a column of track_features
missing_features = [name for name in audio_features
                    if name not in track_features.columns]
if not missing_features:
    plt.figure(figsize=(10, 8))
    correlation_matrix = track_features[audio_features].corr()
    # Diverging colour map centred on zero, with each coefficient annotated
    heatmap_style = dict(annot=True, cmap='coolwarm', center=0,
                         square=True, linewidths=1, cbar_kws={"shrink": 0.8})
    sns.heatmap(correlation_matrix, **heatmap_style)
    plt.title('Audio Feature Correlations')
    plt.tight_layout()
    plt.show()

## Temporal Patterns

In [None]:
# Convert timestamp to datetime and derive calendar parts.
# These columns are added to listening_history in place; the summary cell
# later reads the 'datetime' column created here.
listening_history['datetime'] = pd.to_datetime(listening_history['timestamp'])
listening_history['hour'] = listening_history['datetime'].dt.hour
listening_history['day_of_week'] = listening_history['datetime'].dt.dayofweek
listening_history['month'] = listening_history['datetime'].dt.month

# Hourly listening patterns
hourly_plays = listening_history.groupby('hour').size()

# Day of week patterns (dt.dayofweek numbers Monday as 0 ... Sunday as 6)
days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
# Reindex to all seven weekdays so a day with no plays becomes an explicit 0.
# Without this, groupby omits that day and the 7-position bar chart below
# either fails on a length mismatch or shifts counts under the wrong labels.
daily_plays = (
    listening_history.groupby('day_of_week').size()
    .reindex(range(len(days)), fill_value=0)
)

# Create temporal visualization
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Hourly pattern
axes[0].bar(hourly_plays.index, hourly_plays.values, edgecolor='black')
axes[0].set_xlabel('Hour of Day')
axes[0].set_ylabel('Number of Plays')
axes[0].set_title('Listening Activity by Hour')
axes[0].set_xticks(range(0, 24, 2))

# Daily pattern: bars positioned by weekday number so each label matches
axes[1].bar(daily_plays.index, daily_plays.values, edgecolor='black')
axes[1].set_xlabel('Day of Week')
axes[1].set_ylabel('Number of Plays')
axes[1].set_title('Listening Activity by Day of Week')
axes[1].set_xticks(range(len(days)))
axes[1].set_xticklabels(days)

plt.tight_layout()
plt.show()

## Genre Analysis

In [None]:
# Genre distribution (if available)
if 'genre' not in track_features.columns:
    print("Genre information not available in the dataset")
else:
    # Attach each play's genre via a left join on track_id
    track_genres = track_features[['track_id', 'genre']]
    history_with_features = listening_history.merge(
        track_genres, on='track_id', how='left'
    )

    # The 20 genres with the most plays, in descending order
    genre_counts = history_with_features['genre'].value_counts()
    genre_plays = genre_counts.head(20)

    plt.figure(figsize=(12, 6))
    genre_plays.plot.bar(edgecolor='black')
    plt.xlabel('Genre')
    plt.ylabel('Number of Plays')
    plt.title('Top 20 Genres by Play Count')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

## Summary Statistics

In [None]:
# Create summary report
# Series reused across the sections below. 'datetime' is the column added to
# listening_history by the temporal-patterns cell, so that cell must run first.
event_times = listening_history['datetime']
plays_per_user = user_stats['total_plays']
plays_per_track = track_popularity['play_count']

print("=== DATASET SUMMARY ===")
print(f"\nTotal listening events: {len(listening_history):,}")
print(f"Unique users: {listening_history['user_id'].nunique():,}")
print(f"Unique tracks: {listening_history['track_id'].nunique():,}")
print(f"Date range: {event_times.min()} to {event_times.max()}")

print("\n=== USER STATISTICS ===")
print(f"Avg plays per user: {plays_per_user.mean():.1f}")
print(f"Median plays per user: {plays_per_user.median():.1f}")
print(f"Max plays by single user: {plays_per_user.max():,}")

print("\n=== TRACK STATISTICS ===")
print(f"Avg plays per track: {plays_per_track.mean():.1f}")
print(f"Median plays per track: {plays_per_track.median():.1f}")
print(f"Most played track: {plays_per_track.max():,} plays")

# Feature averages are printed only when every audio feature column exists
if set(audio_features).issubset(track_features.columns):
    print("\n=== AUDIO FEATURE AVERAGES ===")
    for feature_name in audio_features:
        feature_avg = track_features[feature_name].mean()
        print(f"{feature_name.capitalize()}: {feature_avg:.3f}")

## Data Quality Check

In [None]:
# Check for missing values: null count per column in each frame
history_nulls = listening_history.isnull().sum()
features_nulls = track_features.isnull().sum()

print("=== MISSING VALUES ===")
print("\nListening History:")
print(history_nulls)
print("\nTrack Features:")
print(features_nulls)

# Check for duplicates: fully identical history rows, and repeated track_id
# values in the feature table
dup_history = listening_history.duplicated().sum()
dup_tracks = track_features.duplicated(subset=['track_id']).sum()

print("\n=== DUPLICATES ===")
print(f"Duplicate listening events: {dup_history:,}")
print(f"Duplicate track entries: {dup_tracks:,}")

# Track coverage: share of distinct played track ids that also appear in
# the feature table
tracks_in_history = set(listening_history['track_id'].unique())
tracks_with_features = set(track_features['track_id'].unique())
covered_tracks = tracks_in_history & tracks_with_features
coverage = len(covered_tracks) / len(tracks_in_history) * 100

print(f"\n=== FEATURE COVERAGE ===")
print(f"Tracks with audio features: {coverage:.1f}%")

## Save Processed Data

In [None]:
# to_csv does not create missing parent directories, so make sure the output
# folder exists first (no-op when it is already there). Without this the
# cell fails on a checkout that has no data/processed directory yet.
processed_dir = Path('../data/processed')
processed_dir.mkdir(parents=True, exist_ok=True)

# Save user statistics for future use
user_stats.to_csv(processed_dir / 'user_statistics.csv', index=False)
print("User statistics saved to ../data/processed/user_statistics.csv")

# Save track popularity data (play counts joined with track features)
track_analysis.to_csv(processed_dir / 'track_popularity.csv', index=False)
print("Track popularity data saved to ../data/processed/track_popularity.csv")