# Data Visualization with Pandas and Matplotlib - Solutions

This notebook contains solutions to all exercises from Lecture 4: Data Visualization.
Try solving them yourself first before looking at the solutions!


## Part 1: Pandas Plotting Basics


### Exercise: Create Your First Plot Step by Step


In [None]:
# Solution
import pandas as pd
import matplotlib.pyplot as plt

# Step 1: Build a five-month sales table, then make Month the index so the
# month names become the x-axis categories of the plot.
sample_data = pd.DataFrame(
    {
        'Month': ['Jan', 'Feb', 'Mar', 'Apr', 'May'],
        'Sales': [100, 120, 140, 130, 150],
    }
)
sample_df = sample_data.set_index('Month')

# Step 2: Draw the Sales column on explicitly created axes.
# A Series plot defaults to a line chart.
fig, ax = plt.subplots(figsize=(8, 5))
sample_df.loc[:, 'Sales'].plot(ax=ax)

# Steps 3-4: Title and axis labels, set in a single call
ax.set(
    title='Monthly Sales Trend',
    xlabel='Month',
    ylabel='Sales',
)

# Step 5: Faint grid so the trend stays the visual focus
ax.grid(True, alpha=0.3)

# Step 6: Tidy the margins and render the figure
fig.tight_layout()
plt.show()
print("‚úÖ Plot created step by step!")


## Part 2: Line Plots


### ✏️ Challenge: Create Monthly Sales Line Plot


In [None]:
# Solution
import pandas as pd
import matplotlib.pyplot as plt
import os

data_dir = os.path.join('..', 'data')
sales_file = os.path.join(data_dir, 'sales_data.csv')

# Load the raw sales table and parse the Date column into real datetimes
sales = pd.read_csv(sales_file)
sales['Date'] = pd.to_datetime(sales['Date'])

# Drop rows without a Sales value and derive the month bucket in one chain.
# .assign() returns a new DataFrame, so no column is written onto the filtered
# result of dropna() -- that in-place write is what raises pandas'
# SettingWithCopyWarning.
sales_clean = (
    sales
    .dropna(subset=['Sales'])
    .assign(Month=lambda df: df['Date'].dt.to_period('M'))
)

# Create monthly sales: one total per calendar month
monthly_sales = sales_clean.groupby('Month')['Sales'].sum()

# Create line plot
fig, ax = plt.subplots(figsize=(10, 6))
monthly_sales.plot(ax=ax, kind='line', color='#2ecc71', linewidth=3, marker='o', markersize=8)
ax.set_title('Monthly Sales Trend', fontsize=14, fontweight='bold')
ax.set_xlabel('Month', fontsize=12)
ax.set_ylabel('Total Sales', fontsize=12)
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45)  # slant the month labels so they do not collide
plt.tight_layout()
plt.show()
print("✅ Monthly sales trend visualized!")


## Part 3: Bar Charts


### ✏️ Challenge: Create Top 5 Products Bar Chart


In [None]:
# Solution
import pandas as pd
import matplotlib.pyplot as plt
import os

data_dir = os.path.join('..', 'data')
sales_file = os.path.join(data_dir, 'sales_data.csv')

# Read the sales table and discard rows that have no Sales value
sales = pd.read_csv(sales_file)
sales_clean = sales.dropna(subset=['Sales'])

# Total sales per product, largest first; the first five rows are the top 5
product_sales = (
    sales_clean
    .groupby('Product')['Sales']
    .sum()
    .sort_values(ascending=False)
)
top_5_products = product_sales.head(5)

# Bar chart of the five totals
fig, ax = plt.subplots(figsize=(10, 6))
top_5_products.plot.bar(ax=ax, color='#2ecc71', edgecolor='black', linewidth=1.5)
ax.set_title('Top 5 Products by Sales', fontsize=14, fontweight='bold')
ax.set_xlabel('Product', fontsize=12)
ax.set_ylabel('Total Sales', fontsize=12)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(True, alpha=0.3, axis='y')

# Write each total just above its bar. Bars sit at x = 0, 1, 2, ... in the
# same order as the Series, so the enumerate position is the bar's x position.
for bar_position, bar_total in enumerate(top_5_products.to_numpy()):
    ax.text(
        bar_position,
        bar_total,
        f'${bar_total:,.0f}',
        ha='center',
        va='bottom',
        fontweight='bold',
    )

fig.tight_layout()
plt.show()
print("‚úÖ Top 5 products visualized with value labels!")


## Part 4: Histograms


### ✏️ Challenge: Compare Sales Distributions


In [None]:
# Solution
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

data_dir = os.path.join('..', 'data')
sales_file = os.path.join(data_dir, 'sales_data.csv')

# Load data and drop rows that have no Sales value
sales = pd.read_csv(sales_file)
sales_clean = sales.dropna(subset=['Sales'])

# Filter North and South regions (.loc selects rows and column in one step)
north_sales = sales_clean.loc[sales_clean['Region'] == 'North', 'Sales']
south_sales = sales_clean.loc[sales_clean['Region'] == 'South', 'Sales']

# Compute ONE set of 20 bin edges over both regions combined. Passing bins=20
# to each histogram separately would give each region its own edges and bin
# widths, so bar heights could not be compared between the two.
shared_bins = np.histogram_bin_edges(pd.concat([north_sales, south_sales]), bins=20)

# Create overlapping histograms; alpha < 1 keeps both visible where they overlap
fig, ax = plt.subplots(figsize=(10, 6))
north_sales.plot(kind='hist', ax=ax, bins=shared_bins, alpha=0.6, label='North',
                 color='#3498db', edgecolor='black', linewidth=1)
south_sales.plot(kind='hist', ax=ax, bins=shared_bins, alpha=0.6, label='South',
                 color='#e74c3c', edgecolor='black', linewidth=1)
ax.set_title('Sales Distribution: North vs South', fontsize=14, fontweight='bold')
ax.set_xlabel('Sales', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()
print("✅ Regional sales distributions compared!")


## Part 5: Scatter Plots


### ✏️ Challenge: Create Correlation Scatter Plot


In [None]:
# Solution
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

data_dir = os.path.join('..', 'data')
customer_file = os.path.join(data_dir, 'customer_data.csv')

# Load data and keep only rows where both plotted columns are present
customers = pd.read_csv(customer_file)
customers_clean = customers.dropna(subset=['Age', 'TotalSpent'])

# Calculate correlation between the two columns (shown in the title below)
correlation = customers_clean['Age'].corr(customers_clean['TotalSpent'])

# Create scatter plot
fig, ax = plt.subplots(figsize=(10, 6))
customers_clean.plot(kind='scatter', x='Age', y='TotalSpent', ax=ax,
                    s=80, alpha=0.6, color='#2ecc71', edgecolors='black', linewidth=0.5)

# Fit a degree-1 polynomial (straight line) of TotalSpent on Age
z = np.polyfit(customers_clean['Age'], customers_clean['TotalSpent'], 1)
p = np.poly1d(z)

# Draw the line between the smallest and largest age only. Plotting it through
# every customer in file order makes the path double back along the same
# straight line, and the overlapping dashes smear into an uneven, near-solid line.
trend_x = np.array([customers_clean['Age'].min(), customers_clean['Age'].max()])
ax.plot(trend_x, p(trend_x),
        "r--", alpha=0.8, linewidth=2, label='Trend Line')

# Add title with correlation value
ax.set_title(f'Age vs Total Spent (Correlation: {correlation:.2f})',
             fontsize=14, fontweight='bold')
ax.set_xlabel('Age', fontsize=12)
ax.set_ylabel('Total Spent', fontsize=12)
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
print(f"✅ Scatter plot with trend line! Correlation: {correlation:.2f}")


## Part 6: Box Plots


### ✏️ Challenge: Compare Sales by Region


In [None]:
# Solution
import pandas as pd
import matplotlib.pyplot as plt
import os

data_dir = os.path.join('..', 'data')
sales_file = os.path.join(data_dir, 'sales_data.csv')

# Read the sales table and discard rows that have no Sales value
sales = pd.read_csv(sales_file)
sales_clean = sales.dropna(subset=['Sales'])

# One box per Region. patch_artist=True makes the boxes fillable patches,
# which is what allows a facecolor to be applied to them.
box_style = dict(facecolor='#2ecc71', alpha=0.7)
median_style = dict(color='red', linewidth=2)

fig, ax = plt.subplots(figsize=(10, 6))
sales_clean.boxplot(
    column='Sales',
    by='Region',
    ax=ax,
    patch_artist=True,
    boxprops=box_style,
    medianprops=median_style,
)
ax.set_title('Sales Distribution by Region', fontsize=14, fontweight='bold')
ax.set_xlabel('Region', fontsize=12)
ax.set_ylabel('Sales', fontsize=12)

# Blank out the figure-level title so only the axes title above is shown
fig.suptitle('')
fig.tight_layout()
plt.show()
print("‚úÖ Regional sales distributions compared!")


## Part 7: Pie Charts


### ✏️ Challenge: Create Product Sales Pie Chart


In [None]:
# Solution
import pandas as pd
import matplotlib.pyplot as plt
import os

data_dir = os.path.join('..', 'data')
sales_file = os.path.join(data_dir, 'sales_data.csv')

# Read the sales table and discard rows that have no Sales value
sales = pd.read_csv(sales_file)
sales_clean = sales.dropna(subset=['Sales'])

# Total sales per product, largest first; the first three rows are the top 3
product_sales = (
    sales_clean
    .groupby('Product')['Sales']
    .sum()
    .sort_values(ascending=False)
)
top_3_products = product_sales.head(3)

# Styling for the three slices, in the same order as top_3_products
slice_colors = ['#3498db', '#e74c3c', '#2ecc71']
slice_offsets = (0.05, 0.05, 0.05)  # Explode slices
label_style = {'fontsize': 12, 'fontweight': 'bold'}

# Pie chart with every slice pulled slightly away from the centre.
# Percentages are relative to the three plotted products only.
fig, ax = plt.subplots(figsize=(8, 8))
top_3_products.plot.pie(
    ax=ax,
    autopct='%1.1f%%',
    startangle=90,
    colors=slice_colors,
    explode=slice_offsets,
    shadow=True,
    textprops=label_style,
)
ax.set_title('Top 3 Products Sales Share', fontsize=14, fontweight='bold')
ax.set_ylabel('')  # clear the y-axis label
fig.tight_layout()
plt.show()
print("‚úÖ Pie chart with exploded slices!")


## Part 8: Time Series Visualization


### ✏️ Challenge: Create Sales Trend with Moving Average


In [None]:
# Solution
import pandas as pd
import matplotlib.pyplot as plt
import os

data_dir = os.path.join('..', 'data')
sales_file = os.path.join(data_dir, 'sales_data.csv')

# Read the sales table, parse Date into datetimes, drop rows without Sales
sales = pd.read_csv(sales_file)
sales['Date'] = pd.to_datetime(sales['Date'])
sales_clean = sales.dropna(subset=['Sales'])

# One total per date present in the data, in chronological order
daily_sales = (
    sales_clean
    .groupby('Date')['Sales']
    .sum()
    .sort_index()
)

# Mean over a sliding window of 7 consecutive rows of daily_sales.
# The window counts rows, not calendar days, so it spans exactly one week
# only when every date is present. The first six values are NaN because
# a full window is not yet available.
moving_avg = daily_sales.rolling(7).mean()

# Faint raw series underneath, bold smoothed series on top, on shared axes
fig, ax = plt.subplots(figsize=(14, 6))
daily_sales.plot(ax=ax, color='#3498db', linewidth=1, alpha=0.5, label='Daily Sales')
moving_avg.plot(ax=ax, color='#e74c3c', linewidth=2, label='7-Day Moving Average')
ax.set_title('Daily Sales with Moving Average', fontsize=14, fontweight='bold')
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Sales', fontsize=12)
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()
print("‚úÖ Time series with moving average smoothing!")


## Part 9: Subplots and Layouts


### ✏️ Challenge: Create Customer Analysis Dashboard


In [None]:
# Solution
import pandas as pd
import matplotlib.pyplot as plt
import os

data_dir = os.path.join('..', 'data')
customer_file = os.path.join(data_dir, 'customer_data.csv')

# Read the customer table; keep only rows with both Age and TotalSpent present
customers = pd.read_csv(customer_file)
customers_clean = customers.dropna(subset=['Age', 'TotalSpent'])

# 2x2 grid of panels, unpacked by position so each panel has a readable name
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(ax_age, ax_spent), (ax_cities, ax_scatter) = axes

# Plots 1 and 2 (top row): one 20-bin histogram per numeric column.
# Each entry is (panel, column, x-axis label, panel title, bar colour).
histogram_panels = [
    (ax_age, 'Age', 'Age', 'Age Distribution', '#3498db'),
    (ax_spent, 'TotalSpent', 'Total Spent', 'Total Spent Distribution', '#e74c3c'),
]
for panel, column, x_label, panel_title, bar_color in histogram_panels:
    customers_clean[column].plot(kind='hist', ax=panel, bins=20,
                                 color=bar_color, edgecolor='black')
    panel.set_title(panel_title, fontweight='bold')
    panel.set_xlabel(x_label)
    panel.grid(True, alpha=0.3, axis='y')

# Plot 3 (bottom left): the five cities with the highest summed TotalSpent
city_sales = (
    customers_clean
    .groupby('City')['TotalSpent']
    .sum()
    .sort_values(ascending=False)
    .head(5)
)
city_sales.plot(kind='bar', ax=ax_cities, color='#2ecc71')
ax_cities.set_title('Top 5 Cities by Spending', fontweight='bold')
ax_cities.set_xticklabels(ax_cities.get_xticklabels(), rotation=45, ha='right')
ax_cities.grid(True, alpha=0.3, axis='y')

# Plot 4 (bottom right): one point per customer, Age against TotalSpent
customers_clean.plot(kind='scatter', x='Age', y='TotalSpent', ax=ax_scatter,
                     s=30, alpha=0.6, color='#f39c12')
ax_scatter.set_title('Age vs Total Spent', fontweight='bold')
ax_scatter.grid(True, alpha=0.3)

# Figure-level title above all four panels
fig.suptitle('Customer Demographics Dashboard', fontsize=16, fontweight='bold', y=0.995)
fig.tight_layout()
plt.show()
print("‚úÖ Customer analysis dashboard created!")


## 🎓 Project 1: Sales Analysis Dashboard


### Comprehensive Sales Analysis Dashboard


In [None]:
# Solution
import pandas as pd
import matplotlib.pyplot as plt
import os

data_dir = os.path.join('..', 'data')
sales_file = os.path.join(data_dir, 'sales_data.csv')

# Read the sales table, parse Date into datetimes, drop rows without Sales
sales = pd.read_csv(sales_file)
sales['Date'] = pd.to_datetime(sales['Date'])
sales_clean = sales.dropna(subset=['Sales'])

# Text styling shared by the six small panels
panel_title_style = dict(fontsize=12, fontweight='bold')


def _slant_tick_labels(axis):
    """Rotate the x tick labels of `axis` by 45 degrees, right-aligned, at font size 9."""
    axis.set_xticklabels(axis.get_xticklabels(), rotation=45, ha='right', fontsize=9)


# 3x3 grid: one full-width panel on top, three panels in each row below
fig = plt.figure(figsize=(18, 12))
gs = fig.add_gridspec(3, 3, hspace=0.4, wspace=0.3)

# Top row (all three columns): total sales per date, in chronological order
ax1 = fig.add_subplot(gs[0, :])
daily_sales = (
    sales_clean
    .groupby('Date')['Sales']
    .sum()
    .sort_index()
)
daily_sales.plot(ax=ax1, color='#3498db', linewidth=2, marker='o', markersize=3)
ax1.set_title('Daily Sales Trend', fontsize=14, fontweight='bold')
ax1.set_xlabel('Date', fontsize=11)
ax1.set_ylabel('Sales', fontsize=11)
ax1.grid(True, alpha=0.3)

# Middle left: the five products with the highest summed Sales
ax2 = fig.add_subplot(gs[1, 0])
product_sales = (
    sales_clean
    .groupby('Product')['Sales']
    .sum()
    .sort_values(ascending=False)
    .head(5)
)
product_sales.plot(kind='bar', ax=ax2, color='#e74c3c')
ax2.set_title('Top 5 Products', **panel_title_style)
_slant_tick_labels(ax2)
ax2.grid(True, alpha=0.3, axis='y')

# Middle centre: each region's share of summed Sales
ax3 = fig.add_subplot(gs[1, 1])
region_sales = sales_clean.groupby('Region')['Sales'].sum()
region_sales.plot(kind='pie', ax=ax3, autopct='%1.1f%%', fontsize=9)
ax3.set_title('Sales by Region', **panel_title_style)
ax3.set_ylabel('')

# Middle right: a single box summarising every Sales value
ax4 = fig.add_subplot(gs[1, 2])
box_fill = dict(facecolor='#2ecc71', alpha=0.7)
sales_clean['Sales'].plot(kind='box', ax=ax4, vert=True, patch_artist=True,
                          boxprops=box_fill)
ax4.set_title('Sales Distribution', **panel_title_style)
ax4.grid(True, alpha=0.3, axis='y')

# Bottom left: summed Sales per calendar month
ax5 = fig.add_subplot(gs[2, 0])
month_of_sale = sales_clean['Date'].dt.to_period('M')
monthly_sales = sales_clean.groupby(month_of_sale)['Sales'].sum()
monthly_sales.plot(kind='bar', ax=ax5, color='#f39c12')
ax5.set_title('Monthly Sales', **panel_title_style)
_slant_tick_labels(ax5)
ax5.grid(True, alpha=0.3, axis='y')

# Bottom centre: one point per row, Quantity against Sales
ax6 = fig.add_subplot(gs[2, 1])
sales_clean.plot(kind='scatter', x='Quantity', y='Sales', ax=ax6,
                 s=30, alpha=0.6, color='#9b59b6')
ax6.set_title('Sales vs Quantity', **panel_title_style)
ax6.grid(True, alpha=0.3)

# Bottom right: grouped bars, one group per product and one bar per region.
# unstack() pivots Region into columns; missing product/region pairs become 0.
ax7 = fig.add_subplot(gs[2, 2])
product_region = (
    sales_clean
    .groupby(['Product', 'Region'])['Sales']
    .sum()
    .unstack(fill_value=0)
)
product_region.plot(kind='bar', ax=ax7, width=0.8)
ax7.set_title('Sales by Product & Region', **panel_title_style)
_slant_tick_labels(ax7)
ax7.legend(title='Region', fontsize=8, title_fontsize=9)
ax7.grid(True, alpha=0.3, axis='y')

# Figure-level title above all panels
fig.suptitle('Comprehensive Sales Analysis Dashboard', fontsize=18, fontweight='bold', y=0.995)
fig.tight_layout()
plt.show()
print("‚úÖ Comprehensive sales dashboard created!")


### Customer Demographics Analysis Dashboard


In [None]:
# Solution
import pandas as pd
import matplotlib.pyplot as plt
import os

data_dir = os.path.join('..', 'data')
customer_file = os.path.join(data_dir, 'customer_data.csv')

# Load the customer table
customers = pd.read_csv(customer_file)

# Keep rows with both Age and TotalSpent, and add the AgeGroup column in the
# same chain. .assign() returns a new DataFrame, so no column is written onto
# the filtered result of dropna() -- that in-place write is what raises pandas'
# SettingWithCopyWarning.
# pd.cut uses right-closed intervals by default: (0, 30], (30, 50], (50, 100];
# ages outside that range get a missing AgeGroup.
customers_clean = (
    customers
    .dropna(subset=['Age', 'TotalSpent'])
    .assign(
        AgeGroup=lambda df: pd.cut(
            df['Age'],
            bins=[0, 30, 50, 100],
            labels=['Young', 'Middle', 'Senior'],
        )
    )
)

# Create customer analysis dashboard: 2 rows x 3 columns of panels
fig = plt.figure(figsize=(16, 10))
gs = fig.add_gridspec(2, 3, hspace=0.35, wspace=0.3)

# Age distribution
ax1 = fig.add_subplot(gs[0, 0])
customers_clean['Age'].plot(kind='hist', ax=ax1, bins=20, color='#3498db',
                            edgecolor='black', linewidth=1)
ax1.set_title('Age Distribution', fontsize=12, fontweight='bold')
ax1.set_xlabel('Age', fontsize=10)
ax1.set_ylabel('Frequency', fontsize=10)
ax1.grid(True, alpha=0.3, axis='y')

# TotalSpent distribution
ax2 = fig.add_subplot(gs[0, 1])
customers_clean['TotalSpent'].plot(kind='hist', ax=ax2, bins=20, color='#e74c3c',
                                   edgecolor='black', linewidth=1)
ax2.set_title('Total Spent Distribution', fontsize=12, fontweight='bold')
ax2.set_xlabel('Total Spent', fontsize=10)
ax2.set_ylabel('Frequency', fontsize=10)
ax2.grid(True, alpha=0.3, axis='y')

# Sales by City: the five cities with the highest summed TotalSpent
ax3 = fig.add_subplot(gs[0, 2])
city_sales = customers_clean.groupby('City')['TotalSpent'].sum().sort_values(ascending=False).head(5)
city_sales.plot(kind='bar', ax=ax3, color='#2ecc71', edgecolor='black', linewidth=1)
ax3.set_title('Top 5 Cities by Spending', fontsize=12, fontweight='bold')
ax3.set_xticklabels(ax3.get_xticklabels(), rotation=45, ha='right', fontsize=9)
ax3.grid(True, alpha=0.3, axis='y')

# Age vs TotalSpent scatter
ax4 = fig.add_subplot(gs[1, 0])
customers_clean.plot(kind='scatter', x='Age', y='TotalSpent', ax=ax4,
                     s=40, alpha=0.6, color='#f39c12', edgecolors='black', linewidth=0.5)
ax4.set_title('Age vs Total Spent', fontsize=12, fontweight='bold')
ax4.set_xlabel('Age', fontsize=10)
ax4.set_ylabel('Total Spent', fontsize=10)
ax4.grid(True, alpha=0.3)

# Box plot: TotalSpent by City, restricted to the three highest-spending cities
ax5 = fig.add_subplot(gs[1, 1])
top_cities = customers_clean.groupby('City')['TotalSpent'].sum().nlargest(3).index
city_data = customers_clean[customers_clean['City'].isin(top_cities)]
city_data.boxplot(column='TotalSpent', by='City', ax=ax5, patch_artist=True)
ax5.set_title('Spending by Top Cities', fontsize=12, fontweight='bold')
ax5.set_xlabel('City', fontsize=10)
ax5.set_ylabel('Total Spent', fontsize=10)
plt.suptitle('')  # blank the figure-level title here; the dashboard title is set at the end
ax5.grid(True, alpha=0.3, axis='y')

# Age groups: summed TotalSpent per AgeGroup.
# observed=False is spelled out because AgeGroup is categorical: it keeps a bar
# (sum 0) for any group with no customers and avoids the FutureWarning newer
# pandas emits when the argument is omitted.
ax6 = fig.add_subplot(gs[1, 2])
age_group_sales = customers_clean.groupby('AgeGroup', observed=False)['TotalSpent'].sum()
age_group_sales.plot(kind='bar', ax=ax6, color='#9b59b6', edgecolor='black', linewidth=1)
ax6.set_title('Spending by Age Group', fontsize=12, fontweight='bold')
ax6.set_xlabel('Age Group', fontsize=10)
ax6.set_ylabel('Total Spent', fontsize=10)
ax6.set_xticklabels(ax6.get_xticklabels(), rotation=0, fontsize=9)
ax6.grid(True, alpha=0.3, axis='y')

# Figure-level title above all panels
plt.suptitle('Customer Demographics Analysis Dashboard', fontsize=16, fontweight='bold', y=0.995)
plt.tight_layout()
plt.show()
print("✅ Customer demographics dashboard created!")
print("\n**Key Insights:**")
print("1. Age distribution of customers")
print("2. Total spending distribution")
print("3. Top cities by customer spending")
print("4. Relationship between age and spending")
print("5. Spending patterns by city")
print("6. Spending by age groups")
