# NumPy and Pandas - Solutions

This notebook contains solutions to all exercises from Lecture 2: NumPy and Pandas.
Try solving each exercise yourself before looking at the solutions!

## Part 1: NumPy

### ✏️ Challenge: Create Identity Matrix

**Problem:** Create a 4x4 identity matrix (1s on the main diagonal, 0s everywhere else).

In [None]:
# Solution
# np.eye(n) builds an n x n array with ones on the main diagonal and zeros elsewhere.
import numpy as np

identity = np.eye(4)
print(identity, "\nUse np.eye() to create identity matrix!", sep="\n")

### ✏️ Challenge: Temperature Conversion

In [None]:
# Solution
# Arithmetic on a NumPy array is applied element-wise, so the formula
# F = C * 9/5 + 32 converts every temperature without an explicit loop.
celsius = np.array([0, 10, 20, 30, 37, 100])
fahrenheit = celsius * 9 / 5 + 32

print(f"Celsius: {celsius}", f"Fahrenheit: {fahrenheit}", sep="\n")
print("\nConverted all temperatures in one line with NumPy!")

### ✏️ Challenge: Filter Valid Exam Scores

In [None]:
# Solution
all_scores = np.array([95, -5, 102, 88, 150, 76, 0, 100])

# A score is valid when it lies in the closed range [0, 100].
# np.logical_and is the function form of the element-wise `&` operator.
valid_mask = np.logical_and(all_scores >= 0, all_scores <= 100)
valid_scores = all_scores[valid_mask]

print(f"All scores: {all_scores}")
print(f"Valid scores: {valid_scores}")
print(f"Found {valid_scores.size} valid scores out of {all_scores.size} total")
print("Boolean indexing makes filtering data super easy! ✅")

### 📊 Real-World Case: Student Grade Analysis

In [None]:
# Solution
scores = np.array([85, 92, 78, 90, 88])

# Summary statistics via the array's own reduction methods
avg_score = scores.mean()
highest, lowest = scores.max(), scores.min()

# Keep only the scores strictly greater than the mean
above_avg = scores[scores > avg_score]

print(f"Test scores: {scores}")
print(f"Average: {avg_score:.2f}")
print(f"Above average scores: {above_avg}")
print(f"Highest: {highest}, Lowest: {lowest}")

## Part 2: Pandas

### ✏️ Challenge: Calculate Discounted Prices

In [None]:
# Solution
import pandas as pd

# Building the Series from a dict uses the keys as the index labels
prices = pd.Series({'apple': 1.5, 'banana': 0.8, 'orange': 1.2, 'grape': 2.0})

# Work on a copy so the original Series stays untouched
discounted_prices = prices.copy()

# Items priced above 1.0 are multiplied by 0.8 (20% off); the rest keep their price
discount_mask = discounted_prices > 1.0
discounted_prices.loc[discount_mask] *= 0.8

print("Original prices:", prices, sep="\n")
print("\nDiscounted prices:", discounted_prices, sep="\n")
print("\n20% discount applied to items over $1.00! 🎉")

### 📊 Real-World Case: Employee Salary Analysis

In [None]:
# Solution
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Diana'],
    'Age': [25, 30, 35, 28],
    'City': ['New York', 'Paris', 'London', 'Tokyo'],
    'Salary': [70000, 80000, 75000, 85000],
}
df = pd.DataFrame(data)

print("Original DataFrame:", df, "", sep="\n")

# One boolean Series per criterion
young = df['Age'] < 30
high_salary = df['Salary'] > 75000

# Rows that satisfy both criteria at once (all columns retained)
qualified_employees = df.loc[young & high_salary]

# Narrow the result down to the two columns of interest
result_employees = qualified_employees.loc[:, ['Name', 'Salary']]
match_count = len(result_employees)

print("Qualified employees (Age < 30 AND Salary > $75,000):")
print(result_employees)
print(f"\nFound {match_count} employee(s) matching criteria")
print("These are young, high-earning employees! 💼")

### ✏️ Challenge: Identify Honor Students

In [None]:
# Solution
students = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    'Math': [85, 92, 78, 95, 88],
    'Science': [90, 85, 82, 98, 91],
    'English': [88, 79, 85, 92, 87]
})

# The three graded subjects, named once so both steps below share the list
subject_columns = ['Math', 'Science', 'English']

# Per-student average: row-wise sum of the subject scores divided by 3
students['Average'] = students[subject_columns].sum(axis=1) / 3

# Honor roll needs an average of at least 90 AND at least 85 in every subject
high_avg = students['Average'] >= 90
all_scores_good = (students[subject_columns] >= 85).all(axis=1)

# Keep the rows that pass both checks and collect their names
honor_students = students.loc[high_avg & all_scores_good]
honor_names = list(honor_students['Name'])
names_text = ', '.join(honor_names)

print("All students:", students, sep="\n")
print(f"\n🎓 Honor students: {names_text}")
print(f"Total: {len(honor_names)} students qualify for honor roll!")

### 📊 Real-World Case: Regional Sales Performance

In [None]:
# Solution
sales = pd.DataFrame({
    'Product': ['Laptop', 'Phone', 'Tablet', 'Laptop', 'Phone', 'Tablet', 'Laptop', 'Phone'],
    'Region': ['East', 'East', 'East', 'West', 'West', 'West', 'East', 'West'],
    'Sales': [1200, 800, 500, 1100, 850, 480, 1250, 790],
    'Quantity': [3, 5, 2, 2, 6, 3, 4, 4]
})

print("Sales DataFrame:")
print(sales)
print()

# Calculate totals for both regions
west_sales = sales[sales['Region'] == 'West']
total_qty_west = west_sales['Quantity'].sum()

east_sales = sales[sales['Region'] == 'East']
total_qty_east = east_sales['Quantity'].sum()


def pick_better_region(west_qty, east_qty):
    """Name the region with the larger quantity total.

    Parameters:
        west_qty: total units sold in the West region.
        east_qty: total units sold in the East region.

    Returns:
        'West' or 'East' for the region with the strictly larger total,
        or 'Tie' when both totals are equal.
    """
    if west_qty == east_qty:
        # Equal totals must not be reported as a win for either region
        return 'Tie'
    return 'West' if west_qty > east_qty else 'East'


# Determine better region (a tie is reported as such instead of defaulting to East)
better_region = pick_better_region(total_qty_west, total_qty_east)

print(f"📦 West region: {total_qty_west} units sold")
print(f"📦 East region: {total_qty_east} units sold")
if better_region == 'Tie':
    print("🤝 Both regions sold the same quantity!")
else:
    print(f"🏆 {better_region} region is performing better in quantity!")

## Final Capstone Project

### 🎓 Final Capstone Project: Sales Data Analysis

In [None]:
# Solution
import numpy as np
import pandas as pd

# Step 1: Create sample data
np.random.seed(42)  # Fixed seed: every run produces the same random draws

N_ROWS = 100  # number of simulated transactions

dates = pd.date_range('2024-01-01', periods=100, freq='D')
products = ['Laptop', 'Phone', 'Tablet', 'Monitor', 'Keyboard']
categories = ['Electronics', 'Electronics', 'Electronics', 'Accessories', 'Accessories']
regions = ['North', 'South', 'East', 'West']

# Products and categories are paired by position: product i belongs to category i
category_map = dict(zip(products, categories))

# The dict entries are evaluated top to bottom, so the random draws happen in
# column order (Date, Product, Region, Sales, Quantity); reordering them would
# change the generated values. Category is then looked up from each row's Product.
sales_data = pd.DataFrame({
    'Date': np.random.choice(dates, N_ROWS),
    'Product': np.random.choice(products, N_ROWS),
    'Region': np.random.choice(regions, N_ROWS),
    'Sales': np.random.randint(100, 2000, N_ROWS),
    'Quantity': np.random.randint(1, 10, N_ROWS),
}).assign(Category=lambda frame: frame['Product'].map(category_map))

print("Step 1: Sample of our data:", sales_data.head(10), "", sep="\n")

In [None]:
# Step 2: Data Overview
# .shape is a (rows, columns) tuple; describe() summarizes the numeric columns
data_shape = sales_data.shape
row_count, column_count = data_shape
print(f"Step 2: Dataset has {row_count} rows and {column_count} columns")
print("\nBasic statistics:", sales_data.describe(), "", sep="\n")

In [None]:
# Step 3: Total revenue per category
# Sum Sales within each Category, then order the categories from highest to lowest total
category_totals = sales_data.groupby('Category')['Sales'].sum()
revenue_by_category = category_totals.sort_values(ascending=False)
print("Step 3: Total revenue per category:", revenue_by_category, "", sep="\n")

In [None]:
# Step 4: Top 3 products by revenue
# Rank products by their summed Sales (largest first) and keep the first three
product_totals = sales_data.groupby('Product')['Sales'].sum()
top_products = product_totals.sort_values(ascending=False).iloc[:3]
print("Step 4: Top 3 products by revenue:", top_products, "", sep="\n")

In [None]:
# Step 5: Average sales by region
# Mean Sales value per Region, ordered from the highest average to the lowest
region_means = sales_data.groupby('Region')['Sales'].mean()
avg_by_region = region_means.sort_values(ascending=False)
print("Step 5: Average sales by region:", avg_by_region, "", sep="\n")

In [None]:
# Step 6: Monthly trend analysis
# Derive the month number from each transaction date and store it as a new
# 'Month' column on sales_data, then total Sales per month.
transaction_dates = pd.to_datetime(sales_data['Date'])
sales_data['Month'] = transaction_dates.dt.month

monthly_totals = sales_data.groupby('Month')['Sales'].sum()
# Ordered by total (largest first), not chronologically
monthly_sales = monthly_totals.sort_values(ascending=False)
print("Step 6: Monthly sales totals:", monthly_sales, "", sep="\n")

In [None]:
# Step 7: Summary Statistics
# Per-region aggregation spec: three statistics for Sales, one for Quantity.
# Passing a list for a column yields a two-level column header in the result.
region_aggregations = {
    'Sales': ['sum', 'mean', 'count'],
    'Quantity': 'sum',
}
summary_stats = sales_data.groupby('Region').agg(region_aggregations)
print("Step 7: Comprehensive regional summary:", summary_stats, "", sep="\n")

In [None]:
# Step 8: Key Insights
# Headline figures across all transactions
total_revenue = sales_data['Sales'].sum()
total_items = sales_data['Quantity'].sum()
avg_transaction = sales_data['Sales'].mean()

# top_products and avg_by_region were sorted in descending order in Steps 4 and 5,
# so position 0 of each holds the leader.
best_product_name, best_product_revenue = top_products.index[0], top_products.iloc[0]
top_region_name, top_region_average = avg_by_region.index[0], avg_by_region.iloc[0]

print("Step 8: Executive Summary")
print(f"📊 Total Revenue: ${total_revenue:,.0f}")
print(f"📦 Total Items Sold: {total_items}")
print(f"💰 Average Transaction: ${avg_transaction:.2f}")
print(f"🏆 Best Product: {best_product_name} (${best_product_revenue:,.0f})")
print(f"🌍 Top Region by Avg Sale: {top_region_name} (${top_region_average:.2f})")
print()

recommendations = (
    "1. Focus marketing on top-performing products",
    "2. Investigate why certain regions outperform others",
    "3. Increase inventory for best-selling months",
    "4. Consider bundling products from different categories",
)
print("🎯 Business Recommendations:")
print(*recommendations, sep="\n")

**✏️ Your Turn - Solutions:**

In [None]:
# Answer 1: Total quantity sold for 'Laptop'
# Select the Quantity values of the Laptop rows in a single .loc step, then add them up
is_laptop = sales_data['Product'] == 'Laptop'
laptop_qty = sales_data.loc[is_laptop, 'Quantity'].sum()
print(f"1. Total quantity sold for 'Laptop': {laptop_qty} laptops")

# Answer 2: Region with most transactions
# value_counts() orders regions by row count, largest first, so position 0 is the busiest
region_transactions = sales_data['Region'].value_counts()
most_transactions_region = region_transactions.index[0]
top_region_count = region_transactions.iloc[0]
print(f"\n2. Region with most transactions: {most_transactions_region} ({top_region_count} transactions)")

# Answer 3: Average quantity per transaction
# Each row is one transaction, so the column mean is the per-transaction average
avg_qty_per_transaction = sales_data['Quantity'].mean()
print(f"\n3. Average quantity per transaction: {avg_qty_per_transaction:.2f} items")