# 📊 Feb 08: Choosing the Right Chart - Practical Examples

Learn to select the perfect visualization for your data story through real-world scenarios.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Apply seaborn's "whitegrid" style and "husl" colour palette as the default
# theme for the figures drawn in the cells below.
sns.set_theme(style="whitegrid", palette="husl")
# IPython magic (not plain Python): render matplotlib figures inline in the notebook.
%matplotlib inline

## Scenario 1: Comparing Sales Across Products

**Data**: Sales figures for 6 products  
**Goal**: Compare which products perform best  
**Best Choice**: Bar Chart

In [None]:
# Scenario 1: compare six products, first in arbitrary order, then ranked.
products = ['Product A', 'Product B', 'Product C', 'Product D', 'Product E', 'Product F']
sales = [85000, 62000, 95000, 48000, 71000, 53000]

# Rank the (product, sales) pairs from best to worst seller so that bar
# position itself conveys the ranking.
sorted_data = sorted(zip(products, sales), key=lambda pair: pair[1], reverse=True)
products_sorted = [name for name, _ in sorted_data]
sales_sorted = [amount for _, amount in sorted_data]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: original (unsorted) order
ax1.bar(products, sales, color='#95a5a6', edgecolor='black')
ax1.set_title('‚ùå Unsorted - Harder to Compare', fontweight='bold', fontsize=12)

# Right panel: ranked order
bars = ax2.bar(products_sorted, sales_sorted, color='#3498db', edgecolor='black')
ax2.set_title('‚úÖ Sorted - Easy to Compare', fontweight='bold', fontsize=12)

# Highlight the top performer (first bar after sorting). Use set_facecolor
# rather than set_color: set_color would also overwrite the black edge,
# leaving the highlighted bar styled differently from its neighbours.
bars[0].set_facecolor('#2ecc71')

# Axis styling shared by both panels: dollar label, slanted product names,
# and y ticks shown in thousands (e.g. $85K). Each axis gets its own
# formatter instance.
for ax in (ax1, ax2):
    ax.set_ylabel('Sales ($)')
    ax.tick_params(axis='x', rotation=45)
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'${x/1000:.0f}K'))

plt.suptitle('Scenario 1: Product Sales Comparison', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

print("üí° Insight: Sorting makes the ranking immediately clear!")

## Scenario 2: Monthly Revenue Trend

**Data**: Monthly revenue over 12 months  
**Goal**: Show trend and growth  
**Best Choice**: Line Chart

In [None]:
# Scenario 2: twelve months of revenue, drawn as bars and as a line.
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
revenue = [45000, 48000, 52000, 50000, 55000, 58000, 62000, 60000, 65000, 68000, 72000, 75000]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: bars treat each month as a separate category, so the
# month-to-month movement is harder to follow.
ax1.bar(months, revenue, color='#95a5a6', edgecolor='black')
ax1.set_title('‚ùå Bar Chart - Trend Less Clear', fontweight='bold', fontsize=12)

# Right panel: connected points plus a light fill under the line.
# The fill uses positions 0..11, which is where the month labels sit.
ax2.plot(months, revenue, marker='o', linewidth=2.5, markersize=8, color='#3498db')
ax2.fill_between(range(len(months)), revenue, alpha=0.3, color='#3498db')
ax2.set_title('‚úÖ Line Chart - Trend is Clear', fontweight='bold', fontsize=12)
ax2.grid(True, alpha=0.3)

# Styling common to both panels (y ticks in thousands of dollars).
for ax in (ax1, ax2):
    ax.set_ylabel('Revenue ($)')
    ax.tick_params(axis='x', rotation=45)
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'${x/1000:.0f}K'))

plt.suptitle('Scenario 2: Monthly Revenue Trend', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

print("üí° Insight: Line charts make trends and patterns immediately visible!")

## Scenario 3: Customer Age Distribution

**Data**: Ages of 500 customers  
**Goal**: Understand age demographics  
**Best Choice**: Histogram with KDE

In [None]:
# Scenario 3: distribution of customer ages.
# Simulate ages as a mix of three normal clusters (fixed seed for
# reproducibility), then drop anything outside the 18-80 range.
np.random.seed(42)
ages = np.concatenate([
    np.random.normal(35, 8, 200),   # Young adults
    np.random.normal(50, 10, 200),  # Middle-aged
    np.random.normal(65, 7, 100)    # Seniors
])
ages = ages[(ages >= 18) & (ages <= 80)]  # Filter realistic ages

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Bar chart of age ranges (less informative)
age_ranges = ['18-30', '31-40', '41-50', '51-60', '61-70', '71+']
# Ages are continuous floats, so the buckets must be contiguous: a label such
# as '18-30' means "18 up to, but not including, 31". Closed integer bounds
# (<= 30, then >= 31) would drop every age that falls between two whole
# numbers at a boundary (e.g. 30.4). np.histogram uses half-open bins
# [lo, hi), with only the last bin closed on the right; the final edge of 81
# covers everything up to the 80 cap applied above.
age_edges = [18, 31, 41, 51, 61, 71, 81]
counts = np.histogram(ages, bins=age_edges)[0]
ax1.bar(age_ranges, counts, color='#95a5a6', edgecolor='black')
ax1.set_title('‚ùå Grouped Ranges - Loses Detail', fontweight='bold', fontsize=12)
ax1.set_xlabel('Age Range')
ax1.set_ylabel('Count')

# Histogram with KDE (better for distribution): 30 fine-grained bins plus a
# smoothed density curve.
sns.histplot(ages, bins=30, kde=True, color='#3498db', ax=ax2)
ax2.set_title('‚úÖ Histogram + KDE - Shows Full Distribution', fontweight='bold', fontsize=12)
ax2.set_xlabel('Age')
ax2.set_ylabel('Count')

plt.suptitle('Scenario 3: Customer Age Distribution', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

print("üí° Insight: Histograms reveal the full distribution shape!")

## Scenario 4: Salary Comparison Across Departments

**Data**: Salaries for employees in 4 departments  
**Goal**: Compare salary distributions  
**Best Choice**: Box Plot or Violin Plot

In [None]:
# Scenario 4: salary distributions for four departments.
# Generate salary data (fixed seed for reproducibility)
np.random.seed(42)
departments = ['Sales', 'Engineering', 'Marketing', 'HR']

# (mean, standard deviation) of the simulated salaries for each department
salary_params = {
    'Sales': (65000, 20000),
    'Engineering': (80000, 15000),
    'Marketing': (60000, 12000),
    'HR': (55000, 10000),
}

salary_data = []
for dept in departments:
    mean, std = salary_params[dept]
    salaries = np.random.normal(mean, std, 50)
    # Apply a 30K floor so the normal draw cannot produce implausibly low pay.
    salary_data.extend(
        {'Department': dept, 'Salary': sal} for sal in np.maximum(salaries, 30000)
    )

df_salary = pd.DataFrame(salary_data)

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 5))

# One panel per chart type, from least to most distributional detail.
panels = [
    (ax1, sns.barplot, '‚ùå Bar Chart - Only Shows Mean'),
    (ax2, sns.boxplot, '‚úÖ Box Plot - Shows Quartiles'),
    (ax3, sns.violinplot, '‚úÖ Violin Plot - Shows Full Shape'),
]
for ax, draw, title in panels:
    # seaborn 0.13 deprecates passing `palette` without `hue`. Mapping the
    # x variable to hue with legend=False gives the same per-department
    # colours without the FutureWarning (requires seaborn >= 0.13).
    draw(data=df_salary, x='Department', y='Salary', hue='Department',
         palette='Set2', legend=False, ax=ax)
    ax.set_title(title, fontweight='bold', fontsize=11)
    ax.set_ylabel('Salary ($)')
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'${x/1000:.0f}K'))

plt.suptitle('Scenario 4: Salary Distribution by Department', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

print("üí° Insight: Box/Violin plots reveal distribution patterns that averages hide!")

## Scenario 5: Marketing Budget Allocation

**Data**: Budget percentages across 4 channels  
**Goal**: Show how budget is divided  
**Best Choice**: Pie Chart (for few categories) or Horizontal Bar

In [None]:
# Scenario 5: how a marketing budget (in percent) splits across four channels.
channels = ['Social Media', 'Email', 'Content', 'Paid Ads']
budget = [35, 20, 25, 20]

# One colour per channel, reused in both panels; only the first (largest)
# slice is pulled out of the pie.
colors = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12']
explode = (0.1, 0, 0, 0)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: pie chart with percentage labels on each slice
ax1.pie(
    budget,
    labels=channels,
    autopct='%1.1f%%',
    startangle=90,
    colors=colors,
    explode=explode,
    shadow=True,
)
ax1.set_title('‚úÖ Pie Chart - Clear Proportions', fontweight='bold', fontsize=12)

# Right panel: the same numbers as horizontal bars
ax2.barh(channels, budget, color=colors, edgecolor='black')
ax2.set_xlabel('Budget (%)')
ax2.set_title('‚úÖ Horizontal Bar - Easy to Compare', fontweight='bold', fontsize=12)
# Write each value just past the end of its bar.
for row, share in enumerate(budget):
    ax2.text(share + 1, row, f'{share}%', va='center', fontweight='bold')

plt.suptitle('Scenario 5: Marketing Budget Allocation', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

print("üí° Insight: For few categories, both pie and bar charts work well!")

## Scenario 6: Advertising Spend vs Sales

**Data**: Advertising spend and sales for 50 campaigns  
**Goal**: Find relationship between spending and sales  
**Best Choice**: Scatter Plot with Regression Line

In [None]:
# Scenario 6: relationship between advertising spend and sales for 50 campaigns.
# Simulated data: sales is 2.5x spend plus normally distributed noise
# (fixed seed for reproducibility).
np.random.seed(42)
ad_spend = np.random.uniform(5000, 50000, 50)
noise = np.random.normal(0, 10000, 50)
sales = ad_spend * 2.5 + noise

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: raw points only
ax1.scatter(ad_spend, sales, alpha=0.6, s=80, color='#95a5a6', edgecolor='black')
ax1.set_title('‚ùå Scatter Only - Trend Not Clear', fontweight='bold', fontsize=12)

# Right panel: same points with a fitted regression line on top
sns.regplot(
    x=ad_spend,
    y=sales,
    ax=ax2,
    scatter_kws={'alpha':0.6, 's':80, 'edgecolor':'black'},
    line_kws={'color':'red', 'linewidth':2.5},
)
ax2.set_title('‚úÖ With Regression - Trend is Clear', fontweight='bold', fontsize=12)

# Labels, grid and $K tick formatting are identical on both panels.
for ax in (ax1, ax2):
    ax.set_xlabel('Advertising Spend ($)')
    ax.set_ylabel('Sales ($)')
    ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'${x/1000:.0f}K'))
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'${x/1000:.0f}K'))
    ax.grid(True, alpha=0.3)

plt.suptitle('Scenario 6: Advertising Spend vs Sales', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

print("üí° Insight: Regression lines make relationships and correlations obvious!")

## Scenario 7: Multi-Year Quarterly Comparison

**Data**: Quarterly sales for 3 years  
**Goal**: Compare performance across years and quarters  
**Best Choice**: Grouped Bar Chart or Multiple Lines

In [None]:
# Scenario 7: quarterly sales for three consecutive years.
quarters = ['Q1', 'Q2', 'Q3', 'Q4']
sales_2023 = [45000, 52000, 48000, 58000]
sales_2024 = [50000, 58000, 55000, 65000]
sales_2025 = [55000, 62000, 60000, 70000]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Bar geometry: one slot per quarter, three bars of equal width per slot.
x = np.arange(len(quarters))
width = 0.25

# Per-year drawing spec: (legend label, values, colour, bar offset, line marker)
yearly_series = [
    ('2023', sales_2023, '#95a5a6', -width, 'o'),
    ('2024', sales_2024, '#3498db', 0, 's'),
    ('2025', sales_2025, '#2ecc71', width, '^'),
]

# Left panel: grouped bars, each year shifted sideways within the quarter slot
for year, values, color, offset, _marker in yearly_series:
    ax1.bar(x + offset, values, width, label=year, color=color, edgecolor='black')
ax1.set_title('‚úÖ Grouped Bars - Easy Comparison', fontweight='bold', fontsize=12)
ax1.set_xticks(x)
ax1.set_xticklabels(quarters)
ax1.grid(True, alpha=0.3, axis='y')

# Right panel: one line per year, distinguished by colour and marker
for year, values, color, _offset, marker in yearly_series:
    ax2.plot(quarters, values, marker=marker, linewidth=2.5, label=year, color=color)
ax2.set_title('‚úÖ Line Chart - Shows Trends', fontweight='bold', fontsize=12)
ax2.grid(True, alpha=0.3)

# Labels, legend and $K tick formatting shared by both panels
for ax in (ax1, ax2):
    ax.set_xlabel('Quarter')
    ax.set_ylabel('Sales ($)')
    ax.legend()
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: f'${x/1000:.0f}K'))

plt.suptitle('Scenario 7: Multi-Year Quarterly Sales', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

print("üí° Insight: Both work! Grouped bars for comparison, lines for trends.")

## 🏆 Practice Exercise: Choose the Right Chart

For each scenario below, create the most appropriate visualization:

1. **Website Traffic**: Daily visitors for the past 30 days
2. **Product Ratings**: Distribution of 1-5 star ratings from 1000 customers
3. **Regional Sales**: Total sales for 8 different regions
4. **Employee Tenure**: Years of service for 210 employees (70 per department) across 3 departments
5. **Price vs Quality**: Relationship between price and quality score for 40 products

In [None]:
# Sample datasets for the five practice exercises.
# Everything is drawn from one seeded random stream, so the order of the
# sections below determines the values each dataset receives.
np.random.seed(42)

# 1. Website traffic (time series): upward trend of 50 visitors per day
#    plus an integer jitter in [-200, 200).
days = list(range(1, 31))
visitors = []
for day_index in range(30):
    baseline = 1000 + day_index * 50
    visitors.append(baseline + np.random.randint(-200, 200))

# 2. Product ratings (distribution): 1000 star ratings weighted towards 4 and 5
star_values = [1, 2, 3, 4, 5]
star_weights = [0.05, 0.1, 0.2, 0.35, 0.3]
ratings = np.random.choice(star_values, 1000, p=star_weights)

# 3. Regional sales (comparison)
regions = ['North', 'South', 'East', 'West', 'Central', 'Northeast', 'Southeast', 'Northwest']
regional_sales = [125000, 98000, 87000, 110000, 75000, 82000, 91000, 68000]

# 4. Employee tenure (distribution by category): 70 employees per department,
#    each with an integer tenure in [0, 20).
tenure_data = [
    {'Department': dept, 'Years': np.random.randint(0, 20)}
    for dept in ['Sales', 'Engineering', 'Marketing']
    for _ in range(70)
]
df_tenure = pd.DataFrame(tenure_data)

# 5. Price vs Quality (relationship): quality is 0.3 x price plus normal noise
price = np.random.uniform(10, 200, 40)
quality = price * 0.3 + np.random.normal(0, 10, 40)

# Your code here - create 5 appropriate visualizations


## Decision Framework Summary

| Your Goal | Data Type | Best Chart |
|-----------|-----------|------------|
| **Compare** categories | Categorical + Numerical | Bar Chart (sorted) |
| **Show trend** over time | Time Series | Line Chart |
| **Show distribution** | Continuous | Histogram + KDE |
| **Compare distributions** | Categorical + Continuous | Box/Violin Plot |
| **Show composition** | Parts of whole | Pie (few) or Stacked Bar |
| **Find relationship** | Two continuous | Scatter + Regression |
| **Multi-series comparison** | Multiple series | Grouped Bar or Multiple Lines |

## Key Takeaways

✅ **Always ask**: What am I trying to show?  
✅ **Comparison** → Bar charts (sorted when possible)  
✅ **Trends** → Line charts  
✅ **Distribution** → Histograms, box plots, violin plots  
✅ **Composition** → Pie charts (few categories) or stacked bars  
✅ **Relationships** → Scatter plots with regression lines  
✅ **Simplicity wins** → Start simple, add complexity only if needed  
✅ **Test readability** → Can someone understand it in 5 seconds?  