# Drake Billboard Hot 100 - Exploratory Data Analysis

An in-depth analysis of Drake's complete Billboard Hot 100 chart history from 2009 to 2025.

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from datetime import datetime

# Shared colors used by every figure in this notebook
AMBER = '#f59e0b'
BG_COLOR = '#0a0f14'
TEXT_COLOR = '#e8e6e3'

# Style sheet first, then the amber palette (AMBER is its first entry)
plt.style.use('dark_background')
sns.set_palette([AMBER, '#fbbf24', '#fcd34d', '#fef3c7'])

In [None]:
# Load the dataset
# Decode explicitly as UTF-8 so that non-ASCII text in the file is read the
# same way on every platform instead of depending on the locale default.
with open('viz/public/data/drake_billboard_data.json', encoding='utf-8') as f:
    data = json.load(f)

# Echo the collection metadata as a quick sanity check of what was loaded
metadata = data['metadata']
print("Dataset loaded successfully!")
print(f"Artist: {metadata['artist']}")
print(f"Total songs found: {metadata['total_songs_found']}")
print(f"Data collected at: {metadata['collected_at']}")

## 1. Dataset Overview

Let's examine the structure and summary statistics of our dataset.

In [None]:
# Headline figures come from the dataset's 'section_1_hook' entry
hook_data = data['section_1_hook']
banner = "=" * 50

print(banner)
print("DRAKE'S BILLBOARD HOT 100 SUMMARY")
print(banner)

# Simple "label: value" rows, printed in display order
scalar_rows = (
    ("Total Chart Entries", 'total_entries'),
    ("#1 Hits", 'number_one_hits'),
    ("Top 10 Hits", 'top_ten_hits'),
    ("Total Weeks on Chart", 'total_weeks_on_chart'),
    ("Average Weeks per Song", 'average_weeks_per_song'),
    ("Average Peak Position", 'average_peak_position'),
)
for label, key in scalar_rows:
    print(f"{label}: {hook_data[key]}")

# The last two rows read nested records rather than plain values
longest = hook_data['longest_running_song']
print(f"\nLongest Running Song: {longest['title']} ({longest['weeks']} weeks)")
debut = hook_data['first_chart_entry']
print(f"First Chart Entry: {debut['title']} ({debut['date']})")

In [None]:
# Combine all songs from all sections into a single DataFrame.
#
# Each song is stored as a *copy* tagged with its era, so the dicts inside
# `data` are left untouched and re-running this cell gives the same result.
all_songs = []

# Era sections in chronological order, paired with the label written to 'era'
sections = [
    ('section_2_origin', '2009-2011'),
    ('section_3_ascent', '2012-2015'),
    ('section_4_domination', '2016-2018'),
    ('section_5_collab', '2019-2025'),
]


def _song_key(song):
    """Return a hashable fingerprint of a song's content, ignoring 'era'.

    'era' is excluded because it is the tag this cell adds; two records for
    the same song must compare equal whether or not they have been tagged.
    """
    content = {field: value for field, value in song.items() if field != 'era'}
    return json.dumps(content, sort_keys=True)


# Fingerprints of every song collected so far (O(1) duplicate checks)
seen_keys = set()

for section_key, era in sections:
    if section_key in data:
        section = data[section_key]
        for song in section.get('songs', []):
            seen_keys.add(_song_key(song))
            all_songs.append({**song, 'era': era})

# Also check for additional sections: any other 'section_*' entry that holds
# a 'songs' list. Songs already collected above are skipped.
handled_keys = {'section_1_hook', *(section_key for section_key, _ in sections)}
for key, section in data.items():
    if not key.startswith('section_') or key in handled_keys:
        continue
    if not (isinstance(section, dict) and 'songs' in section):
        continue
    extra_era = section.get('name', 'Unknown')
    for song in section['songs']:
        fingerprint = _song_key(song)
        if fingerprint in seen_keys:
            continue
        seen_keys.add(fingerprint)
        all_songs.append({**song, 'era': extra_era})

df = pd.DataFrame(all_songs)
print(f"Total songs in DataFrame: {len(df)}")
df.head(10)

In [None]:
# Schema check: column names, then null counts and dtypes per column
print("\nDataset Info:")
print(f"Columns: {df.columns.tolist()}")

column_reports = (
    ("Missing Values", df.isna().sum()),
    ("Data Types", df.dtypes),
)
for heading, report in column_reports:
    print(f"\n{heading}:")
    print(report)

## 2. Peak Position Distribution

Analyzing how Drake's songs performed in terms of their highest chart position.

In [None]:
# Peak position distribution: histogram (left) and counts per tier (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5), facecolor=BG_COLOR)
ax1, ax2 = axes
peaks = df['peak_pos']

# Left panel: histogram of peak positions
ax1.hist(peaks, bins=20, color=AMBER, edgecolor='black', alpha=0.8)
ax1.invert_xaxis()  # Lower position is better

# Right panel: number of songs whose peak falls in each tier
peak_bins = [
    ('#1', int((peaks == 1).sum())),
    ('#2-10', int(peaks.between(2, 10).sum())),
    ('#11-40', int(peaks.between(11, 40).sum())),
    ('#41-100', int((peaks > 40).sum())),
]
labels, values = zip(*peak_bins)
colors = [AMBER, '#fbbf24', '#78716c', '#44403c']
ax2.bar(labels, values, color=colors, edgecolor='black')

# Write each count just above its bar
for x_pos, count in enumerate(values):
    ax2.text(x_pos, count + 2, str(count), ha='center', color=TEXT_COLOR, fontweight='bold')

# Styling shared by both panels
panel_text = (
    (ax1, 'Peak Position', 'Distribution of Peak Positions'),
    (ax2, 'Peak Position Range', 'Songs by Peak Position Category'),
)
for panel, x_label, heading in panel_text:
    panel.set_facecolor(BG_COLOR)
    panel.set_xlabel(x_label, color=TEXT_COLOR)
    panel.set_ylabel('Number of Songs', color=TEXT_COLOR)
    panel.set_title(heading, color=TEXT_COLOR, fontsize=14, fontweight='bold')
    panel.tick_params(colors=TEXT_COLOR)

plt.tight_layout()
plt.savefig('eda_peak_distribution.png', facecolor=BG_COLOR, dpi=150, bbox_inches='tight')
plt.show()

## 3. Weeks on Chart Analysis

Understanding the longevity of Drake's songs on the Billboard Hot 100.

In [None]:
# Descriptive statistics for the weeks-on-chart column
weeks = df['weeks_on_chart']
print("Weeks on Chart Statistics:")
print(f"Mean: {weeks.mean():.1f} weeks")
print(f"Median: {weeks.median():.1f} weeks")
print(f"Min: {weeks.min()} weeks")
print(f"Max: {weeks.max()} weeks")
print(f"Standard Deviation: {weeks.std():.1f} weeks")

# The ten songs with the most weeks on the chart, longest first
banner = "=" * 50
print("\n" + banner)
print("TOP 10 LONGEST CHARTING SONGS")
print(banner)
top_longevity = df[['title', 'weeks_on_chart', 'peak_pos']].nlargest(10, 'weeks_on_chart')
ranked_rows = zip(
    top_longevity['title'],
    top_longevity['weeks_on_chart'],
    top_longevity['peak_pos'],
)
for song_title, song_weeks, song_peak in ranked_rows:
    print(f"{song_title}: {song_weeks} weeks (peaked at #{song_peak})")

In [None]:
# Histogram of weeks on chart with dashed reference lines at the mean and median
weeks = df['weeks_on_chart']

fig, ax = plt.subplots(figsize=(12, 5), facecolor=BG_COLOR)
ax.set_facecolor(BG_COLOR)
ax.hist(weeks, bins=30, color=AMBER, edgecolor='black', alpha=0.8)

reference_lines = (
    ('Mean', weeks.mean(), 'white'),
    ('Median', weeks.median(), '#fbbf24'),
)
for stat_name, stat_value, line_color in reference_lines:
    ax.axvline(stat_value, color=line_color, linestyle='--', label=f'{stat_name}: {stat_value:.1f}')

ax.set_xlabel('Weeks on Chart', color=TEXT_COLOR)
ax.set_ylabel('Number of Songs', color=TEXT_COLOR)
ax.set_title('Distribution of Weeks on Chart', color=TEXT_COLOR, fontsize=14, fontweight='bold')
ax.tick_params(colors=TEXT_COLOR)
ax.legend(facecolor=BG_COLOR, edgecolor=TEXT_COLOR, labelcolor=TEXT_COLOR)

plt.tight_layout()
plt.savefig('eda_weeks_distribution.png', facecolor=BG_COLOR, dpi=150, bbox_inches='tight')
plt.show()

## 4. Yearly Trend Analysis

Examining Drake's chart entries over time.

In [None]:
# Derive each song's charting year from its first chart date
first_dates = pd.to_datetime(df['first_chart_date'])
df['chart_year'] = first_dates.dt.year

# Number of songs per charting year (index: year, value: count)
yearly_entries = df.groupby('chart_year').size()

fig, ax = plt.subplots(figsize=(14, 6), facecolor=BG_COLOR)
ax.set_facecolor(BG_COLOR)

bars = ax.bar(yearly_entries.index, yearly_entries.values, color=AMBER, edgecolor='black')

# Highlight the bar of the year with the most entries
peak_year = yearly_entries.idxmax()
peak_bar = bars[list(yearly_entries.index).index(peak_year)]
peak_bar.set_color('#fbbf24')
peak_bar.set_edgecolor(AMBER)
peak_bar.set_linewidth(2)

ax.set_xlabel('Year', color=TEXT_COLOR)
ax.set_ylabel('Number of Chart Entries', color=TEXT_COLOR)
ax.set_title('Drake\'s Billboard Hot 100 Entries by Year', color=TEXT_COLOR, fontsize=14, fontweight='bold')
ax.tick_params(colors=TEXT_COLOR)

# Print each year's count just above its bar, centered horizontally
for rect, count in zip(bars, yearly_entries.values):
    x_center = rect.get_x() + rect.get_width() / 2.
    ax.text(x_center, count + 0.5, f'{int(count)}',
            ha='center', va='bottom', color=TEXT_COLOR, fontsize=9)

plt.tight_layout()
plt.savefig('eda_yearly_entries.png', facecolor=BG_COLOR, dpi=150, bbox_inches='tight')
plt.show()

print(f"\nPeak year: {peak_year} with {yearly_entries.loc[peak_year]} entries")

## 5. Collaboration Analysis

Analyzing Drake's collaborative patterns and most frequent partners.

In [None]:
# Count how often each collaborator appears across all songs
collab_column = df['collaborators']

# Flatten every collaborator list; cells that are not lists are skipped
all_collaborators = [
    artist
    for collabs in collab_column
    if isinstance(collabs, list)
    for artist in collabs
]
collab_counts = Counter(all_collaborators)

# Solo vs Collaboration: a song counts as solo when its cell is an empty
# list or is not a list at all
is_solo = collab_column.apply(lambda cell: not isinstance(cell, list) or len(cell) == 0)
solo_songs = int(is_solo.sum())
total_songs = len(df)
collab_songs = total_songs - solo_songs

print(f"Solo Songs: {solo_songs} ({solo_songs/total_songs*100:.1f}%)")
print(f"Collaborations: {collab_songs} ({collab_songs/total_songs*100:.1f}%)")
print(f"\nUnique Collaborators: {len(collab_counts)}")

banner = "=" * 50
print("\n" + banner)
print("TOP 15 COLLABORATORS")
print(banner)
for artist, count in collab_counts.most_common(15):
    print(f"{artist}: {count} songs")

In [None]:
# Horizontal bar chart of the ten most frequent collaborators
ranked = collab_counts.most_common(10)
top_collabs = dict(ranked)

# barh draws the first item at the bottom, so reverse to put #1 on top
artist_names = [artist for artist, _ in reversed(ranked)]
song_totals = [total for _, total in reversed(ranked)]

fig, ax = plt.subplots(figsize=(12, 6), facecolor=BG_COLOR)
ax.set_facecolor(BG_COLOR)

bars = ax.barh(artist_names, song_totals, color=AMBER, edgecolor='black')

ax.set_xlabel('Number of Songs', color=TEXT_COLOR)
ax.set_title('Drake\'s Top 10 Collaborators', color=TEXT_COLOR, fontsize=14, fontweight='bold')
ax.tick_params(colors=TEXT_COLOR)

# Write each count just past the end of its bar, vertically centered
for rect, total in zip(bars, song_totals):
    y_center = rect.get_y() + rect.get_height() / 2.
    ax.text(total + 0.3, y_center, f'{int(total)}',
            ha='left', va='center', color=TEXT_COLOR, fontweight='bold')

plt.tight_layout()
plt.savefig('eda_collaborators.png', facecolor=BG_COLOR, dpi=150, bbox_inches='tight')
plt.show()

## 6. Era Comparison

Comparing performance across different periods of Drake's career.

In [None]:
# Per-era summary: song count, mean peak position, mean weeks on chart,
# and the sum of the 'reached_number_one' column
era_stats = df.groupby('era').agg(
    total_songs=('title', 'count'),
    avg_peak=('peak_pos', 'mean'),
    avg_weeks=('weeks_on_chart', 'mean'),
    number_ones=('reached_number_one', 'sum'),
)

print("Performance by Era:")
print(era_stats.round(1))

In [None]:
from matplotlib.lines import Line2D

# Scatter of longevity (x) against peak position (y); songs flagged in
# 'reached_number_one' are drawn larger and in amber
fig, ax = plt.subplots(figsize=(12, 8), facecolor=BG_COLOR)
ax.set_facecolor(BG_COLOR)

muted = '#78716c'
hit_flags = list(df['reached_number_one'])
colors = [AMBER if is_hit else muted for is_hit in hit_flags]
sizes = [100 if is_hit else 30 for is_hit in hit_flags]

scatter = ax.scatter(df['weeks_on_chart'], df['peak_pos'], c=colors, s=sizes, alpha=0.7, edgecolors='black')

ax.set_xlabel('Weeks on Chart', color=TEXT_COLOR, fontsize=12)
ax.set_ylabel('Peak Position', color=TEXT_COLOR, fontsize=12)
ax.set_title('Peak Position vs. Longevity', color=TEXT_COLOR, fontsize=14, fontweight='bold')
ax.tick_params(colors=TEXT_COLOR)
ax.invert_yaxis()  # Lower position is better

# Hand-built legend entries, since the scatter is drawn as a single artist
legend_spec = (
    (AMBER, 12, '#1 Hit'),
    (muted, 8, 'Other Songs'),
)
legend_elements = [
    Line2D([0], [0], marker='o', color='w', markerfacecolor=face, markersize=marker_size, label=text)
    for face, marker_size, text in legend_spec
]
ax.legend(handles=legend_elements, facecolor=BG_COLOR, edgecolor=TEXT_COLOR, labelcolor=TEXT_COLOR)

plt.tight_layout()
plt.savefig('eda_peak_vs_weeks.png', facecolor=BG_COLOR, dpi=150, bbox_inches='tight')
plt.show()

## 7. Key Findings Summary

In [None]:
# Final report. It combines the precomputed headline figures in `hook_data`
# with values derived from `df` in the earlier cells.
# NOTE: the section headers use emoji and '•' bullets; keep this file UTF-8.
print("=" * 60)
print("KEY FINDINGS FROM EXPLORATORY DATA ANALYSIS")
print("=" * 60)

print("\n📊 DATASET OVERVIEW")
print(f"   • Total chart entries: {len(df)}")
print(f"   • Time span: 2009-2025 ({df['chart_year'].max() - df['chart_year'].min() + 1} years)")

print("\n🏆 CHART PERFORMANCE")
print(f"   • #1 Hits: {hook_data['number_one_hits']}")
print(f"   • Top 10 Hits: {hook_data['top_ten_hits']} ({hook_data['top_ten_hits']/len(df)*100:.1f}% hit rate)")
print(f"   • Average peak position: #{hook_data['average_peak_position']:.1f}")

print("\n⏱️ LONGEVITY")
print(f"   • Total weeks on chart: {hook_data['total_weeks_on_chart']:,}")
print(f"   • Average weeks per song: {df['weeks_on_chart'].mean():.1f}")
print(f"   • Longest charting: {hook_data['longest_running_song']['title']} ({hook_data['longest_running_song']['weeks']} weeks)")

print("\n🤝 COLLABORATIONS")
print(f"   • Unique collaborators: {len(collab_counts)}")
print(f"   • Collaboration rate: {collab_songs/len(df)*100:.1f}%")
# most_common(1) is empty when no song lists a collaborator; avoid indexing into it
top_collaborator = collab_counts.most_common(1)
if top_collaborator:
    top_artist, top_count = top_collaborator[0]
    print(f"   • Top collaborator: {top_artist} ({top_count} songs)")
else:
    print("   • Top collaborator: none")

print("\n📈 PEAK PRODUCTIVITY")
print(f"   • Most entries in a year: {yearly_entries.max()} ({peak_year})")
print(f"   • Average entries per year: {yearly_entries.mean():.1f}")