# 第八章：實戰專案

本章將通過三個完整的實戰專案來綜合應用前面學到的 Spark 技術。每個專案都模擬真實的業務場景，包含完整的資料處理流程。

## 專案概覽
1. **日誌分析系統** - 分析網站訪問日誌，識別異常行為
2. **即時監控系統** - 監控系統指標，即時告警
3. **推薦系統** - 基於協同過濾的商品推薦

## 學習目標
- 掌握端到端的 Spark 專案開發流程
- 學習如何處理真實的業務問題
- 了解 Spark 在不同領域的應用
- 培養系統性思維和解決問題的能力

In [None]:
# 導入必要的庫
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.clustering import KMeans
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
import re
from datetime import datetime, timedelta
import json
from collections import defaultdict

# Build (or reuse) the SparkSession used by the project cells below.
spark = (
    SparkSession.builder
    .appName("Spark Real-world Projects")
    # Adaptive query execution, with partition coalescing and skew-join handling.
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .config("spark.sql.adaptive.skewJoin.enabled", "true")
    # Use Kryo as the serializer.
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate()
)

# Only log messages at WARN level or above.
spark.sparkContext.setLogLevel("WARN")

# Report the Spark version and the context's default parallelism.
print(f"Spark 版本: {spark.version}")
print(f"可用核心數: {spark.sparkContext.defaultParallelism}")

## 專案一：日誌分析系統

### 專案背景
網站每天產生大量訪問日誌，需要分析這些日誌來：
- 識別異常訪問模式
- 分析用戶行為
- 監控系統性能
- 檢測潛在的安全威脅

### 技術要點
- 日誌解析和資料清理
- 時間序列分析
- 異常檢測
- 資料視覺化

In [None]:
# 專案一：日誌分析系統

class LogAnalyzer:
    """Project 1: generate synthetic web access logs and analyse them with Spark.

    The analyser owns one DataFrame, ``log_df``, created by
    ``generate_sample_logs``. The four report methods (``basic_statistics``,
    ``detect_anomalies``, ``time_series_analysis``,
    ``generate_security_report``) read it and print their findings to stdout.

    The Spark SQL helpers used below (``col``, ``count``, ``avg``, ``hour`` ...)
    and the schema types come from the file's wildcard imports of
    ``pyspark.sql.functions`` and ``pyspark.sql.types``.
    """

    # Values used for the injected "attack" traffic. They are also part of the
    # normal pools below, so regular traffic can occasionally pick them too.
    _SUSPICIOUS_IPS = ("suspicious.ip.1", "suspicious.ip.2", "suspicious.ip.3")
    _ATTACK_PATHS = ("/admin", "/../../etc/passwd", "/admin/../../etc/passwd")
    _ATTACK_STATUS_CODES = (401, 403, 404)
    _BOT_USER_AGENTS = ("bot/crawler", "malicious-bot")

    # Pools sampled for regular traffic.
    _IPS = (
        tuple(f"192.168.1.{i}" for i in range(1, 100))
        + tuple(f"10.0.0.{i}" for i in range(1, 50))
        + _SUSPICIOUS_IPS
    )
    _USER_AGENTS = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
    ) + _BOT_USER_AGENTS
    _PATHS = (
        "/", "/home", "/about", "/contact", "/products", "/login", "/register",
        "/api/users", "/api/products", "/api/orders",
        "/admin", "/admin/users", "/admin/config",  # sensitive paths
        "/../../etc/passwd", "/admin/../../etc/passwd",  # simulated attacks
    )
    _METHODS = ("GET", "POST", "PUT", "DELETE", "OPTIONS")
    _STATUS_CODES = (200, 201, 301, 302, 400, 401, 403, 404, 500, 502, 503)

    def __init__(self, spark):
        """Store the Spark session; ``log_df`` stays ``None`` until logs are generated."""
        self.spark = spark
        self.log_df = None

    def _require_logs(self):
        """Return ``log_df``, raising ``RuntimeError`` if no logs were generated yet."""
        if self.log_df is None:
            raise RuntimeError(
                "log_df is not loaded; call generate_sample_logs() first"
            )
        return self.log_df

    @classmethod
    def _make_log_record(cls, rng, base_time):
        """Build one synthetic log record.

        Args:
            rng: the ``random`` module or a ``random.Random`` instance.
            base_time: ``datetime`` marking the start of the simulated week.

        Returns:
            A 9-tuple ``(timestamp, ip, method, path, status_code,
            response_size, user_agent, response_time, raw_log)`` in the column
            order of the schema built in ``generate_sample_logs``.
        """
        # Random offset of up to 6 days 23:59:59 after base_time.
        timestamp = base_time + timedelta(
            days=rng.randint(0, 6),
            hours=rng.randint(0, 23),
            minutes=rng.randint(0, 59),
            seconds=rng.randint(0, 59),
        )

        if rng.random() < 0.05:
            # About 5% of requests are drawn from the attack pools only.
            ip = rng.choice(cls._SUSPICIOUS_IPS)
            path = rng.choice(cls._ATTACK_PATHS)
            status_code = rng.choice(cls._ATTACK_STATUS_CODES)
            user_agent = rng.choice(cls._BOT_USER_AGENTS)
            response_size = rng.randint(100, 500)
            response_time = rng.uniform(0.1, 2.0)
        else:
            ip = rng.choice(cls._IPS)
            path = rng.choice(cls._PATHS)
            status_code = rng.choice(cls._STATUS_CODES)
            user_agent = rng.choice(cls._USER_AGENTS)
            response_size = rng.randint(500, 10000)
            response_time = rng.uniform(0.05, 1.0)

        method = rng.choice(cls._METHODS)

        # Access-log style line: the Common Log Format fields followed by the
        # quoted user agent and the response time in seconds.
        stamp = timestamp.strftime("%d/%b/%Y:%H:%M:%S +0000")
        raw_log = (
            f'{ip} - - [{stamp}] "{method} {path} HTTP/1.1" '
            f'{status_code} {response_size} "{user_agent}" {response_time:.3f}'
        )

        return (
            timestamp,
            ip,
            method,
            path,
            status_code,
            response_size,
            user_agent,
            response_time,
            raw_log,
        )

    def generate_sample_logs(self, num_logs=100000, seed=None):
        """Generate synthetic access logs and store them in ``self.log_df``.

        Args:
            num_logs: number of log records to create.
            seed: optional seed. When given, a private ``random.Random(seed)``
                is used so the sampled fields are reproducible (timestamps
                still depend on the current time). When ``None`` the
                module-level ``random`` generator is used.

        Returns:
            The generated DataFrame (also kept on ``self.log_df``).
        """
        print(f"生成 {num_logs} 條模擬日誌...")

        rng = random if seed is None else random.Random(seed)
        base_time = datetime.now() - timedelta(days=7)
        logs = [self._make_log_record(rng, base_time) for _ in range(num_logs)]

        schema = StructType([
            StructField("timestamp", TimestampType(), True),
            StructField("ip", StringType(), True),
            StructField("method", StringType(), True),
            StructField("path", StringType(), True),
            StructField("status_code", IntegerType(), True),
            StructField("response_size", IntegerType(), True),
            StructField("user_agent", StringType(), True),
            StructField("response_time", DoubleType(), True),
            StructField("raw_log", StringType(), True)
        ])

        self.log_df = self.spark.createDataFrame(logs, schema)
        print(f"成功生成 {self.log_df.count()} 條日誌記錄")

        return self.log_df

    def basic_statistics(self):
        """Print totals, status-code and method distributions, and response-time stats.

        Raises:
            RuntimeError: if no logs have been generated yet.
        """
        print("\n=== 基本統計分析 ===")
        logs = self._require_logs()

        # Overall counts
        total_requests = logs.count()
        unique_ips = logs.select("ip").distinct().count()
        unique_paths = logs.select("path").distinct().count()

        print(f"總請求數: {total_requests:,}")
        print(f"唯一 IP 數: {unique_ips:,}")
        print(f"唯一路徑數: {unique_paths:,}")

        # Status-code distribution
        print("\n狀態碼分佈:")
        status_dist = logs.groupBy("status_code") \
                          .count() \
                          .orderBy("status_code") \
                          .collect()

        for row in status_dist:
            percentage = (row['count'] / total_requests) * 100
            print(f"  {row['status_code']}: {row['count']:,} ({percentage:.1f}%)")

        # Request-method distribution, most frequent first
        print("\n請求方法分佈:")
        method_dist = logs.groupBy("method") \
                          .count() \
                          .orderBy(col("count").desc()) \
                          .collect()

        for row in method_dist:
            percentage = (row['count'] / total_requests) * 100
            print(f"  {row['method']}: {row['count']:,} ({percentage:.1f}%)")

        # Response-time summary; describe() returns its values as strings
        print("\n響應時間統計:")
        response_time_stats = logs.select("response_time").describe().collect()
        for row in response_time_stats:
            value = row['response_time']
            if value is None:
                # A summary value can be null (e.g. no rows); float(None) would raise.
                print(f"  {row['summary']}: N/A")
            else:
                print(f"  {row['summary']}: {float(value):.3f}s")

    def detect_anomalies(self):
        """Run five anomaly checks, print each result and return the DataFrames.

        Returns:
            dict with keys ``high_freq_ips``, ``error_analysis``,
            ``sensitive_access``, ``slow_requests`` and ``bot_behavior``.

        Raises:
            RuntimeError: if no logs have been generated yet.
        """
        print("\n=== 異常檢測 ===")
        logs = self._require_logs()

        # 1. IPs with more than 1000 requests (possible attack or crawler)
        print("\n1. 高頻率請求 IP 檢測:")
        high_freq_ips = logs.groupBy("ip") \
                            .count() \
                            .filter(col("count") > 1000) \
                            .orderBy(col("count").desc())

        high_freq_ips.show()

        # 2. (ip, status) pairs with more than 50 error responses.
        #    ">= 400" already covers both 4xx and 5xx.
        print("\n2. 異常狀態碼分析:")
        error_analysis = logs.filter(col("status_code") >= 400) \
                             .groupBy("ip", "status_code") \
                             .count() \
                             .filter(col("count") > 50) \
                             .orderBy(col("count").desc())

        error_analysis.show()

        # 3. Access to sensitive paths
        print("\n3. 敏感路徑訪問檢測:")
        sensitive_access = logs.filter(col("path").rlike("admin|etc/passwd|config")) \
                               .groupBy("ip", "path") \
                               .count() \
                               .orderBy(col("count").desc())

        sensitive_access.show(truncate=False)

        # 4. Slow responses: more than mean + 2 standard deviations
        print("\n4. 異常響應時間檢測:")

        response_time_stats = logs.select(
            mean("response_time").alias("mean_response_time"),
            stddev("response_time").alias("std_response_time")
        ).collect()[0]

        mean_time = response_time_stats['mean_response_time']
        std_time = response_time_stats['std_response_time']
        if mean_time is None:
            # No data: use a threshold that no request can exceed.
            threshold = float("inf")
        else:
            # A null stddev is treated as zero spread.
            threshold = mean_time + 2 * (std_time or 0.0)

        slow_requests = logs.filter(col("response_time") > threshold) \
                            .select("timestamp", "ip", "path", "response_time") \
                            .orderBy(col("response_time").desc())

        print(f"響應時間異常閾值: {threshold:.3f}s")
        print(f"異常慢請求數量: {slow_requests.count()}")
        slow_requests.show(10)

        # 5. Crawler behaviour, based on the user-agent string
        print("\n5. 爬蟲行為檢測:")
        bot_behavior = logs.filter(col("user_agent").rlike("bot|crawler|spider")) \
                           .groupBy("ip", "user_agent") \
                           .count() \
                           .orderBy(col("count").desc())

        bot_behavior.show(truncate=False)

        return {
            'high_freq_ips': high_freq_ips,
            'error_analysis': error_analysis,
            'sensitive_access': sensitive_access,
            'slow_requests': slow_requests,
            'bot_behavior': bot_behavior
        }

    def time_series_analysis(self):
        """Aggregate requests per hour-of-day and per date, print and plot them.

        Returns:
            ``(hourly_stats, daily_stats)`` DataFrames.

        Raises:
            RuntimeError: if no logs have been generated yet.
        """
        print("\n=== 時間序列分析 ===")
        logs = self._require_logs()

        # Per hour-of-day statistics
        hourly_stats = logs.withColumn("hour", hour("timestamp")) \
                           .groupBy("hour") \
                           .agg(
                               count("*").alias("request_count"),
                               avg("response_time").alias("avg_response_time"),
                               countDistinct("ip").alias("unique_ips")
                           ) \
                           .orderBy("hour")

        print("\n每小時請求統計:")
        hourly_stats.show(24)

        # Per calendar-date statistics
        daily_stats = logs.withColumn("date", date_format("timestamp", "yyyy-MM-dd")) \
                          .groupBy("date") \
                          .agg(
                              count("*").alias("request_count"),
                              avg("response_time").alias("avg_response_time"),
                              countDistinct("ip").alias("unique_ips")
                          ) \
                          .orderBy("date")

        print("\n每日請求統計:")
        daily_stats.show()

        # Plot the hourly series: three line charts plus a status-code pie
        hourly_data = hourly_stats.toPandas()

        fig, axes = plt.subplots(2, 2, figsize=(15, 10))

        # (column, title, y label, extra plot style)
        line_panels = [
            ('request_count', '每小時請求量', '請求數', {}),
            ('avg_response_time', '每小時平均響應時間', '響應時間 (秒)', {'color': 'red'}),
            ('unique_ips', '每小時唯一 IP 數', '唯一 IP 數', {'color': 'green'}),
        ]
        # axes.flat walks the grid row by row: [0,0], [0,1], [1,0], [1,1]
        for ax, (column, title, ylabel, style) in zip(axes.flat, line_panels):
            ax.plot(hourly_data['hour'], hourly_data[column], marker='o', **style)
            ax.set_title(title)
            ax.set_xlabel('小時')
            ax.set_ylabel(ylabel)
            ax.grid(True)

        # Status-code distribution
        status_data = logs.groupBy("status_code").count().toPandas()
        axes[1, 1].pie(status_data['count'], labels=status_data['status_code'], autopct='%1.1f%%')
        axes[1, 1].set_title('狀態碼分佈')

        plt.tight_layout()
        plt.show()

        return hourly_stats, daily_stats

    def generate_security_report(self):
        """Count several attack signatures, print a threat list and fixed advice.

        Returns:
            list of human-readable threat descriptions (empty if none found).

        Raises:
            RuntimeError: if no logs have been generated yet.
        """
        print("\n=== 安全分析報告 ===")
        logs = self._require_logs()

        threats = []

        # Possible injection attempts. (?i) makes the keyword match
        # case-insensitive so "UNION SELECT" is caught as well as "union select".
        sql_injection = logs.filter(
            col("path").rlike("(?i)(union|select|drop|insert|update|delete|script|alert)") |
            col("path").contains("'") |
            col("path").contains("--")
        ).count()

        if sql_injection > 0:
            threats.append(f"檢測到 {sql_injection} 次可能的 SQL 注入嘗試")

        # Directory traversal attempts
        directory_traversal = logs.filter(
            col("path").contains("../") |
            col("path").contains("..\\")
        ).count()

        if directory_traversal > 0:
            threats.append(f"檢測到 {directory_traversal} 次可能的目錄遍歷攻擊")

        # Brute force: IPs with more than 10 401 responses on /login or /admin paths
        brute_force = logs.filter(
            (col("path").contains("/login") | col("path").contains("/admin")) &
            (col("status_code") == 401)
        ).groupBy("ip").count().filter(col("count") > 10).count()

        if brute_force > 0:
            threats.append(f"檢測到 {brute_force} 個 IP 可能進行暴力破解攻擊")

        # DDoS: an IP with more than 5000 requests is treated as suspicious
        ddos_threshold = 5000
        ddos_ips = logs.groupBy("ip").count().filter(col("count") > ddos_threshold).count()

        if ddos_ips > 0:
            threats.append(f"檢測到 {ddos_ips} 個 IP 可能進行 DDoS 攻擊")

        # Threat summary
        if threats:
            print("\n🚨 檢測到以下安全威脅:")
            for i, threat in enumerate(threats, 1):
                print(f"  {i}. {threat}")
        else:
            print("\n✅ 未檢測到明顯的安全威脅")

        # Fixed list of recommendations
        print("\n💡 安全建議:")
        suggestions = [
            "定期監控異常 IP 地址的活動",
            "對敏感端點實施訪問控制",
            "設置速率限制以防止暴力破解",
            "啟用 Web 應用防火牆 (WAF)",
            "定期更新安全規則和簽名",
            "建立即時警報系統"
        ]

        for i, suggestion in enumerate(suggestions, 1):
            print(f"  {i}. {suggestion}")

        return threats

# Run project 1: the log analysis system.
print("=== 專案一：日誌分析系統 ===")

log_analyzer = LogAnalyzer(spark)
log_data = log_analyzer.generate_sample_logs(num_logs=50000)

# Preview the first rows without truncating long column values.
print("\n日誌樣本:")
log_data.show(n=10, truncate=False)

# Run the four analysis passes in order, keeping their results.
log_analyzer.basic_statistics()
anomalies = log_analyzer.detect_anomalies()
hourly_stats, daily_stats = log_analyzer.time_series_analysis()
threats = log_analyzer.generate_security_report()

## 專案二：即時監控系統

### 專案背景
建立一個即時監控系統來監控服務器和應用程式的健康狀態：
- 監控系統指標（CPU、記憶體、磁碟、網路）
- 檢測異常情況並發送警報
- 提供歷史趨勢分析
- 預測潛在的系統問題

### 技術要點
- 即時資料處理
- 閾值監控
- 趨勢分析
- 異常預測

In [None]:
# 專案二：即時監控系統

class MonitoringSystem:
    """Project 2: generate synthetic server metrics and monitor them with Spark.

    ``generate_metrics_data`` fills ``metrics_df``. The other public methods
    read it to raise threshold alerts, print a health dashboard, analyse
    trends and produce an overall health report. Output goes to stdout.

    The file's wildcard import of ``pyspark.sql.functions`` rebinds ``min``,
    ``max``, ``sum`` and ``abs`` to Spark column functions. ``sum(...)`` and
    ``abs(...)`` below rely on that. Plain-Python clamping goes through
    ``_clamp`` instead of ``min``/``max``, which would resolve to the Spark
    aggregates and raise ``TypeError`` when called with two numbers.
    """

    _SERVERS = tuple(f"server-{i:02d}" for i in range(1, 21))  # 20 servers
    _SERVICES = ("web", "api", "database", "cache", "queue")

    # Mean (cpu, memory) per service; services not listed use the default.
    _SERVICE_BASELINES = {"database": (60, 70), "web": (30, 40)}
    _DEFAULT_BASELINE = (45, 50)

    # (metric column / threshold key, label used in messages, unit suffix)
    _ALERT_SPECS = (
        ("cpu_usage", "CPU 使用率", "%"),
        ("memory_usage", "記憶體使用率", "%"),
        ("disk_usage", "磁碟使用率", "%"),
        ("network_latency", "網路延遲", "ms"),
        ("error_rate", "錯誤率", "%"),
    )

    def __init__(self, spark):
        """Store the Spark session and the default alert thresholds."""
        self.spark = spark
        self.metrics_df = None
        self.alert_thresholds = {
            'cpu_usage': 80.0,
            'memory_usage': 85.0,
            'disk_usage': 90.0,
            'network_latency': 100.0,
            'error_rate': 5.0
        }

    @staticmethod
    def _clamp(value, lower=0.0, upper=None):
        """Return ``value`` limited to ``[lower, upper]``, always as a ``float``.

        Comparisons are used instead of ``min``/``max`` because those names
        are shadowed by Spark functions in this file. The result is a float
        because the ``DoubleType`` columns it feeds do not accept Python ints.
        """
        value = float(value)
        if value < lower:
            return float(lower)
        if upper is not None and value > upper:
            return float(upper)
        return value

    def _require_metrics(self):
        """Return ``metrics_df``, raising ``RuntimeError`` if none was generated yet."""
        if self.metrics_df is None:
            raise RuntimeError(
                "metrics_df is not loaded; call generate_metrics_data() first"
            )
        return self.metrics_df

    @classmethod
    def _make_metric_record(cls, rng, timestamp):
        """Build one synthetic metrics record for ``timestamp``.

        Args:
            rng: the ``random`` module or a ``random.Random`` instance.
            timestamp: ``datetime`` of the data point.

        Returns:
            A 10-tuple ``(timestamp, server, service, cpu_usage, memory_usage,
            disk_usage, network_latency, error_rate, request_count,
            response_time)`` in the column order of the schema built in
            ``generate_metrics_data``.
        """
        clamp = cls._clamp
        server = rng.choice(cls._SERVERS)
        service = rng.choice(cls._SERVICES)

        # Per-service baseline with Gaussian noise
        cpu_mean, memory_mean = cls._SERVICE_BASELINES.get(service, cls._DEFAULT_BASELINE)
        cpu_base = cpu_mean + rng.gauss(0, 10)
        memory_base = memory_mean + rng.gauss(0, 10)

        if rng.random() < 0.1:
            # About 10% of points are anomalous: every metric is shifted upwards.
            cpu_usage = clamp(cpu_base + rng.gauss(30, 10), 0.0, 100.0)
            memory_usage = clamp(memory_base + rng.gauss(25, 10), 0.0, 100.0)
            disk_usage = clamp(rng.gauss(85, 10), 0.0, 100.0)
            network_latency = clamp(rng.gauss(120, 30))
            error_rate = clamp(rng.gauss(8, 3))
        else:
            cpu_usage = clamp(cpu_base, 0.0, 100.0)
            memory_usage = clamp(memory_base, 0.0, 100.0)
            disk_usage = clamp(rng.gauss(65, 15), 0.0, 100.0)
            network_latency = clamp(rng.gauss(25, 10))
            error_rate = clamp(rng.gauss(1, 0.5))

        request_count = int(clamp(rng.gauss(100, 30)))
        response_time = clamp(rng.gauss(200, 50))

        return (
            timestamp,
            server,
            service,
            cpu_usage,
            memory_usage,
            disk_usage,
            network_latency,
            error_rate,
            request_count,
            response_time,
        )

    def generate_metrics_data(self, num_records=50000, seed=None):
        """Generate synthetic system metrics and store them in ``self.metrics_df``.

        Data points are spaced 30 seconds apart starting 24 hours before now,
        so more than 2,880 records extend past the current time.

        Args:
            num_records: number of data points to create.
            seed: optional seed. When given, a private ``random.Random(seed)``
                makes the sampled values reproducible (timestamps still depend
                on the current time). When ``None`` the module-level
                ``random`` generator is used.

        Returns:
            The generated DataFrame (also kept on ``self.metrics_df``).
        """
        print(f"生成 {num_records} 條系統指標資料...")

        rng = random if seed is None else random.Random(seed)
        base_time = datetime.now() - timedelta(hours=24)
        metrics = [
            self._make_metric_record(rng, base_time + timedelta(seconds=i * 30))
            for i in range(num_records)
        ]

        schema = StructType([
            StructField("timestamp", TimestampType(), True),
            StructField("server", StringType(), True),
            StructField("service", StringType(), True),
            StructField("cpu_usage", DoubleType(), True),
            StructField("memory_usage", DoubleType(), True),
            StructField("disk_usage", DoubleType(), True),
            StructField("network_latency", DoubleType(), True),
            StructField("error_rate", DoubleType(), True),
            StructField("request_count", IntegerType(), True),
            StructField("response_time", DoubleType(), True)
        ])

        self.metrics_df = self.spark.createDataFrame(metrics, schema)
        print(f"成功生成 {self.metrics_df.count()} 條系統指標")

        return self.metrics_df

    def monitor_alerts(self):
        """Compare each metric with its threshold and print the breaching records.

        Returns:
            list of alert summary strings, one per metric that had breaches.

        Raises:
            RuntimeError: if no metrics have been generated yet.
        """
        print("\n=== 警報監控 ===")
        metrics = self._require_metrics()

        alerts = []

        for metric, label, unit in self._ALERT_SPECS:
            limit = self.alert_thresholds[metric]
            breaches = metrics.filter(col(metric) > limit) \
                              .select("timestamp", "server", "service", metric)

            breach_count = breaches.count()
            if breach_count > 0:
                alerts.append(f"{label}警報: {breach_count} 次")
                print(f"\n🚨 {label}超過 {limit}{unit} 的記錄:")
                # Show the ten worst offenders
                breaches.orderBy(col(metric).desc()).show(10)

        # Alert summary
        print("\n📊 警報摘要:")
        if alerts:
            for alert in alerts:
                print(f"  • {alert}")
        else:
            print("  ✅ 沒有觸發警報")

        return alerts

    def system_health_dashboard(self):
        """Print per-service and per-server averages and plot the service averages.

        Returns:
            ``(service_health, server_health)`` DataFrames.

        Raises:
            RuntimeError: if no metrics have been generated yet.
        """
        print("\n=== 系統健康儀表板 ===")
        metrics = self._require_metrics()

        # Average metrics per service
        service_health = metrics.groupBy("service").agg(
            avg("cpu_usage").alias("avg_cpu"),
            avg("memory_usage").alias("avg_memory"),
            avg("disk_usage").alias("avg_disk"),
            avg("network_latency").alias("avg_latency"),
            avg("error_rate").alias("avg_error_rate"),
            avg("response_time").alias("avg_response_time")
        ).orderBy("service")

        print("\n各服務健康狀態:")
        service_health.show()

        # Average metrics per server
        server_health = metrics.groupBy("server").agg(
            avg("cpu_usage").alias("avg_cpu"),
            avg("memory_usage").alias("avg_memory"),
            avg("disk_usage").alias("avg_disk"),
            count("*").alias("data_points")
        ).orderBy("server")

        print("\n各服務器健康狀態:")
        server_health.show()

        # Busiest servers. `sum` is the Spark aggregate from the wildcard import.
        busy_servers = metrics.groupBy("server").agg(
            avg("cpu_usage").alias("avg_cpu"),
            avg("memory_usage").alias("avg_memory"),
            sum("request_count").alias("total_requests")
        ).orderBy(col("avg_cpu").desc())

        print("\n最繁忙的服務器 (按 CPU 使用率排序):")
        busy_servers.show(10)

        # Plot the per-service averages as six bar charts
        service_data = service_health.toPandas()

        fig, axes = plt.subplots(2, 3, figsize=(18, 12))

        # (column, title, y label, extra bar style)
        panels = [
            ('avg_cpu', '各服務平均 CPU 使用率', 'CPU 使用率 (%)', {}),
            ('avg_memory', '各服務平均記憶體使用率', '記憶體使用率 (%)', {'color': 'orange'}),
            ('avg_disk', '各服務平均磁碟使用率', '磁碟使用率 (%)', {'color': 'green'}),
            ('avg_latency', '各服務平均網路延遲', '延遲 (ms)', {'color': 'red'}),
            ('avg_error_rate', '各服務平均錯誤率', '錯誤率 (%)', {'color': 'purple'}),
            ('avg_response_time', '各服務平均響應時間', '響應時間 (ms)', {'color': 'brown'}),
        ]
        # axes.flat walks the grid row by row, matching the panel order above
        for ax, (column, title, ylabel, style) in zip(axes.flat, panels):
            ax.bar(service_data['service'], service_data[column], **style)
            ax.set_title(title)
            ax.set_ylabel(ylabel)
            ax.tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.show()

        return service_health, server_health

    def trend_analysis(self):
        """Print hourly trends, flag CPU readings far from their moving average, plot.

        Returns:
            ``(hourly_trends, anomalies)`` DataFrames.

        Raises:
            RuntimeError: if no metrics have been generated yet.
        """
        print("\n=== 趨勢分析 ===")
        metrics = self._require_metrics()

        # Trends per hour-of-day
        hourly_trends = metrics.withColumn("hour", hour("timestamp")) \
                               .groupBy("hour") \
                               .agg(
                                   avg("cpu_usage").alias("avg_cpu"),
                                   avg("memory_usage").alias("avg_memory"),
                                   avg("network_latency").alias("avg_latency"),
                                   sum("request_count").alias("total_requests")
                               ) \
                               .orderBy("hour")

        print("\n每小時趨勢:")
        hourly_trends.show(24)

        print("\n異常趨勢檢測:")

        # 5-point moving average over the current row and the 4 before it.
        # Partitioning by (server, service) keeps each series separate; an
        # unpartitioned window would average unrelated servers together and
        # move every row into a single partition.
        window_spec = Window.partitionBy("server", "service") \
                            .orderBy("timestamp") \
                            .rowsBetween(-4, 0)

        # `abs` is the Spark column function from the wildcard import.
        with_moving_avg = metrics.withColumn(
            "cpu_moving_avg", avg("cpu_usage").over(window_spec)
        ).withColumn(
            "cpu_deviation", abs(col("cpu_usage") - col("cpu_moving_avg"))
        )

        # Points deviating more than 20 percentage points from the moving average
        anomalies = with_moving_avg.filter(col("cpu_deviation") > 20) \
                                   .select("timestamp", "server", "service", "cpu_usage", "cpu_moving_avg", "cpu_deviation") \
                                   .orderBy(col("cpu_deviation").desc())

        print(f"檢測到 {anomalies.count()} 個 CPU 使用率異常點:")
        anomalies.show(10)

        # Plot the hourly trends
        trend_data = hourly_trends.toPandas()

        fig, axes = plt.subplots(2, 2, figsize=(15, 10))

        # (column, title, y label, extra plot style)
        panels = [
            ('avg_cpu', '每小時平均 CPU 使用率趨勢', 'CPU 使用率 (%)', {}),
            ('avg_memory', '每小時平均記憶體使用率趨勢', '記憶體使用率 (%)', {'color': 'orange'}),
            ('avg_latency', '每小時平均網路延遲趨勢', '延遲 (ms)', {'color': 'red'}),
            ('total_requests', '每小時總請求量趨勢', '請求數', {'color': 'green'}),
        ]
        for ax, (column, title, ylabel, style) in zip(axes.flat, panels):
            ax.plot(trend_data['hour'], trend_data[column], marker='o', **style)
            ax.set_title(title)
            ax.set_xlabel('小時')
            ax.set_ylabel(ylabel)
            ax.grid(True)

        plt.tight_layout()
        plt.show()

        return hourly_trends, anomalies

    @classmethod
    def _health_scores(cls, avg_metrics):
        """Turn average metrics into 0-floored scores plus their mean.

        Args:
            avg_metrics: mapping with keys ``avg_cpu``, ``avg_memory``,
                ``avg_disk``, ``avg_latency`` and ``avg_error_rate``.

        Returns:
            dict with float scores under ``cpu``, ``memory``, ``disk``,
            ``latency``, ``error`` and their average under ``overall``.
        """
        scores = {
            'cpu': cls._clamp(100 - avg_metrics['avg_cpu']),
            'memory': cls._clamp(100 - avg_metrics['avg_memory']),
            'disk': cls._clamp(100 - avg_metrics['avg_disk']),
            'latency': cls._clamp(100 - avg_metrics['avg_latency']),
            # Each percentage point of error rate costs 10 score points.
            'error': cls._clamp(100 - avg_metrics['avg_error_rate'] * 10),
        }
        scores['overall'] = (
            scores['cpu'] + scores['memory'] + scores['disk']
            + scores['latency'] + scores['error']
        ) / 5
        return scores

    def generate_monitoring_report(self):
        """Print an overall health score, per-metric details and recommendations.

        Returns:
            dict with keys ``overall_score``, ``health_status``,
            ``avg_metrics`` (the collected Row of averages) and
            ``recommendations``.

        Raises:
            RuntimeError: if no metrics have been generated yet.
        """
        print("\n=== 監控報告 ===")
        metrics = self._require_metrics()

        # Fleet-wide averages
        avg_metrics = metrics.agg(
            avg("cpu_usage").alias("avg_cpu"),
            avg("memory_usage").alias("avg_memory"),
            avg("disk_usage").alias("avg_disk"),
            avg("network_latency").alias("avg_latency"),
            avg("error_rate").alias("avg_error_rate")
        ).collect()[0]

        scores = self._health_scores(avg_metrics)
        cpu_score = scores['cpu']
        memory_score = scores['memory']
        disk_score = scores['disk']
        latency_score = scores['latency']
        error_score = scores['error']
        overall_score = scores['overall']

        print(f"\n🏥 系統整體健康度評分: {overall_score:.1f}/100")

        # Health grade
        if overall_score >= 80:
            health_status = "優秀 😊"
        elif overall_score >= 60:
            health_status = "良好 🙂"
        elif overall_score >= 40:
            health_status = "一般 😐"
        else:
            health_status = "需要關注 😰"

        print(f"健康度等級: {health_status}")

        # Per-metric details
        print("\n📊 詳細指標:")
        print(f"  CPU 使用率: {avg_metrics['avg_cpu']:.1f}% (評分: {cpu_score:.1f})")
        print(f"  記憶體使用率: {avg_metrics['avg_memory']:.1f}% (評分: {memory_score:.1f})")
        print(f"  磁碟使用率: {avg_metrics['avg_disk']:.1f}% (評分: {disk_score:.1f})")
        print(f"  網路延遲: {avg_metrics['avg_latency']:.1f}ms (評分: {latency_score:.1f})")
        print(f"  錯誤率: {avg_metrics['avg_error_rate']:.1f}% (評分: {error_score:.1f})")

        # Recommendations: (metric key, limit, advice)
        print("\n💡 優化建議:")
        advice_rules = [
            ('avg_cpu', 70, "CPU 使用率偏高，建議優化程式碼或增加服務器資源"),
            ('avg_memory', 80, "記憶體使用率偏高，建議檢查記憶體洩漏或增加記憶體"),
            ('avg_disk', 85, "磁碟使用率偏高，建議清理或擴展磁碟空間"),
            ('avg_latency', 50, "網路延遲較高，建議檢查網路配置或優化網路拓撲"),
            ('avg_error_rate', 3, "錯誤率偏高，建議檢查應用程式日誌和錯誤處理"),
        ]
        recommendations = [
            advice for key, limit, advice in advice_rules
            if avg_metrics[key] > limit
        ]

        if recommendations:
            for i, rec in enumerate(recommendations, 1):
                print(f"  {i}. {rec}")
        else:
            print("  ✅ 系統運行良好，暫無特別建議")

        return {
            'overall_score': overall_score,
            'health_status': health_status,
            'avg_metrics': avg_metrics,
            'recommendations': recommendations
        }

# Run project 2: the real-time monitoring system.
print("\n=== 專案二：即時監控系統 ===")

monitoring_system = MonitoringSystem(spark)
metrics_data = monitoring_system.generate_metrics_data(num_records=20000)

# Preview the first rows of the generated metrics.
print("\n系統指標樣本:")
metrics_data.show(n=10)

# Run the monitoring passes in order, keeping their results.
# NOTE: `anomalies` rebinds the module-level name assigned by the project-1 cell.
alerts = monitoring_system.monitor_alerts()
service_health, server_health = monitoring_system.system_health_dashboard()
hourly_trends, anomalies = monitoring_system.trend_analysis()
monitoring_report = monitoring_system.generate_monitoring_report()

## 專案三：推薦系統

### 專案背景
建立一個基於協同過濾的商品推薦系統：
- 分析用戶行為和購買歷史
- 使用協同過濾演算法生成推薦
- 評估推薦系統的性能
- 提供個性化推薦結果

### 技術要點
- 協同過濾 (Collaborative Filtering)
- 矩陣分解 (Matrix Factorization)
- 推薦評估指標
- 大規模推薦系統

In [None]:
# 專案三：推薦系統

class RecommendationSystem:
    """Collaborative-filtering product recommender built on Spark ML's ALS.

    Intended call order: ``generate_sample_data()`` ->
    ``exploratory_analysis()`` -> ``build_recommendation_model()`` ->
    ``generate_recommendations()`` / ``analyze_recommendations()`` /
    ``evaluate_recommendation_quality()``.
    """

    def __init__(self, spark):
        """Store the SparkSession; data frames and the model are set later."""
        self.spark = spark
        self.ratings_df = None
        self.products_df = None
        self.users_df = None
        self.model = None

    def _require_model(self):
        """Fail with a clear error when a model-dependent method runs too early."""
        if self.model is None:
            raise RuntimeError("尚未訓練模型，請先呼叫 build_recommendation_model()")

    @staticmethod
    def _overall_quality(coverage, diversity, novelty):
        """Average coverage, diversity and novelty into a single score.

        Negative novelty (recommendations skew towards popular items) counts
        as zero rather than being turned into a bonus by ``abs()``.
        """
        return (coverage + diversity + max(novelty, 0.0)) / 3

    def generate_sample_data(self, num_users=1000, num_products=500, num_ratings=50000):
        """Generate synthetic product, user and rating data.

        Args:
            num_users: Number of users to create (ids 1..num_users).
            num_products: Number of products to create (ids 1..num_products).
            num_ratings: Number of rating rows to draw. User/product pairs are
                drawn independently, so the same pair can occur more than once.

        Returns:
            Tuple ``(ratings_df, products_df, users_df)``; the same frames are
            also stored on the instance.
        """
        print(f"生成推薦系統資料: {num_users} 用戶, {num_products} 商品, {num_ratings} 評分...")

        # Product catalogue
        categories = ['Electronics', 'Books', 'Clothing', 'Sports', 'Home', 'Beauty', 'Automotive']
        products = []

        for i in range(num_products):
            products.append((
                i + 1,  # product_id
                f"Product {i+1}",
                random.choice(categories),
                random.uniform(10, 500),  # price
                random.uniform(3.0, 5.0)  # avg_rating
            ))

        products_schema = StructType([
            StructField("product_id", IntegerType(), True),
            StructField("product_name", StringType(), True),
            StructField("category", StringType(), True),
            StructField("price", DoubleType(), True),
            StructField("avg_rating", DoubleType(), True)
        ])

        self.products_df = self.spark.createDataFrame(products, products_schema)

        # User profiles
        users = []
        age_groups = ['18-25', '26-35', '36-45', '46-55', '56+']
        genders = ['M', 'F']

        for i in range(num_users):
            users.append((
                i + 1,  # user_id
                f"User {i+1}",
                random.choice(age_groups),
                random.choice(genders),
                random.choice(categories)  # preferred_category
            ))

        users_schema = StructType([
            StructField("user_id", IntegerType(), True),
            StructField("user_name", StringType(), True),
            StructField("age_group", StringType(), True),
            StructField("gender", StringType(), True),
            StructField("preferred_category", StringType(), True)
        ])

        self.users_df = self.spark.createDataFrame(users, users_schema)

        # Ratings
        ratings = []
        now = datetime.now()  # one reference instant for every rating

        for _ in range(num_ratings):
            user_id = random.randint(1, num_users)
            product_id = random.randint(1, num_products)

            # Simulate preference: users rate their preferred category higher.
            user_preferred_category = users[user_id - 1][4]
            product_category = products[product_id - 1][2]

            if user_preferred_category == product_category:
                # Preferred category: ratings skewed to the high end.
                rating = random.choices([3, 4, 5], weights=[0.2, 0.4, 0.4])[0]
            else:
                # Any other category: ratings spread over the full scale.
                rating = random.choices([1, 2, 3, 4, 5], weights=[0.1, 0.2, 0.4, 0.2, 0.1])[0]

            timestamp = now - timedelta(days=random.randint(0, 365))

            ratings.append((
                user_id,
                product_id,
                float(rating),
                timestamp
            ))

        ratings_schema = StructType([
            StructField("user_id", IntegerType(), True),
            StructField("product_id", IntegerType(), True),
            StructField("rating", FloatType(), True),
            StructField("timestamp", TimestampType(), True)
        ])

        self.ratings_df = self.spark.createDataFrame(ratings, ratings_schema)

        print(f"生成完成: {self.users_df.count()} 用戶, {self.products_df.count()} 商品, {self.ratings_df.count()} 評分")

        return self.ratings_df, self.products_df, self.users_df

    def exploratory_analysis(self):
        """Print and plot descriptive statistics of the rating data.

        Returns:
            Tuple ``(popular_products, category_preference)`` of DataFrames:
            per-product and per-category rating count / average rating, both
            ordered by rating count descending.
        """
        print("\n=== 探索性資料分析 ===")

        # Basic counts
        total_ratings = self.ratings_df.count()
        unique_users = self.ratings_df.select("user_id").distinct().count()
        unique_products = self.ratings_df.select("product_id").distinct().count()

        print(f"總評分數: {total_ratings:,}")
        print(f"活躍用戶數: {unique_users:,}")
        print(f"被評分商品數: {unique_products:,}")

        # Rating distribution
        print("\n評分分佈:")
        rating_dist = self.ratings_df.groupBy("rating").count().orderBy("rating")
        rating_dist.show()

        # User activity: number of ratings per user (column is named "count").
        print("\n用戶活躍度分析:")
        user_activity = self.ratings_df.groupBy("user_id").count()
        user_activity_stats = user_activity.describe("count")
        user_activity_stats.show()

        # Product popularity
        print("\n最受歡迎的商品:")
        popular_products = self.ratings_df.groupBy("product_id").agg(
            count("rating").alias("rating_count"),
            avg("rating").alias("avg_rating")
        ).join(self.products_df, "product_id") \
         .orderBy(col("rating_count").desc())

        popular_products.select("product_name", "category", "rating_count", "avg_rating").show(10)

        # Category preference
        print("\n分類偏好分析:")
        category_preference = self.ratings_df.join(self.products_df, "product_id") \
                                           .groupBy("category").agg(
                                               count("rating").alias("rating_count"),
                                               avg("rating").alias("avg_rating")
                                           ).orderBy(col("rating_count").desc())

        category_preference.show()

        # Visualization (small aggregates only are pulled to the driver)
        rating_data = rating_dist.toPandas()
        category_data = category_preference.toPandas()

        fig, axes = plt.subplots(2, 2, figsize=(15, 10))

        # Rating distribution
        axes[0, 0].bar(rating_data['rating'], rating_data['count'])
        axes[0, 0].set_title('評分分佈')
        axes[0, 0].set_xlabel('評分')
        axes[0, 0].set_ylabel('數量')

        # Ratings per category
        axes[0, 1].bar(category_data['category'], category_data['rating_count'])
        axes[0, 1].set_title('各分類評分數量')
        axes[0, 1].set_xlabel('分類')
        axes[0, 1].set_ylabel('評分數量')
        axes[0, 1].tick_params(axis='x', rotation=45)

        # Average rating per category
        axes[1, 0].bar(category_data['category'], category_data['avg_rating'], color='green')
        axes[1, 0].set_title('各分類平均評分')
        axes[1, 0].set_xlabel('分類')
        axes[1, 0].set_ylabel('平均評分')
        axes[1, 0].tick_params(axis='x', rotation=45)

        # User activity distribution
        user_activity_data = user_activity.toPandas()
        axes[1, 1].hist(user_activity_data['count'], bins=20, alpha=0.7)
        axes[1, 1].set_title('用戶活躍度分佈')
        axes[1, 1].set_xlabel('評分數量')
        axes[1, 1].set_ylabel('用戶數量')

        plt.tight_layout()
        plt.show()

        return popular_products, category_preference

    def build_recommendation_model(self):
        """Train an ALS model on an 80/20 split and report its test RMSE.

        Returns:
            Tuple ``(model, rmse)``; the model is also stored on the instance.
        """
        print("\n=== 建立推薦模型 ===")

        # Train/test split (fixed seed for reproducibility)
        train_df, test_df = self.ratings_df.randomSplit([0.8, 0.2], seed=42)

        print(f"訓練資料: {train_df.count()} 筆")
        print(f"測試資料: {test_df.count()} 筆")

        # ALS (Alternating Least Squares) matrix factorization
        als = ALS(
            userCol="user_id",
            itemCol="product_id",
            ratingCol="rating",
            rank=50,  # number of latent factors
            maxIter=10,
            regParam=0.01,
            coldStartStrategy="drop",
            nonnegative=True
        )

        print("\n訓練 ALS 模型...")
        self.model = als.fit(train_df)

        # Evaluate on the held-out set
        predictions = self.model.transform(test_df)

        evaluator = RegressionEvaluator(
            metricName="rmse",
            labelCol="rating",
            predictionCol="prediction"
        )

        rmse = evaluator.evaluate(predictions)
        print(f"\n模型 RMSE: {rmse:.4f}")

        # Show a few predictions
        print("\n預測結果樣本:")
        predictions.select("user_id", "product_id", "rating", "prediction").show(10)

        return self.model, rmse

    def generate_recommendations(self, user_id, num_recommendations=10):
        """Print and return the top recommendations for one user.

        Args:
            user_id: Id of the user to recommend for.
            num_recommendations: Maximum number of products to recommend.

        Returns:
            List of dicts with keys ``product_id``, ``product_name``,
            ``category``, ``price`` and ``predicted_rating``. The list is
            empty when the model yields nothing for this user.

        Raises:
            RuntimeError: If no model has been trained yet.
        """
        self._require_model()
        print(f"\n=== 為用戶 {user_id} 生成推薦 ===")

        # Products the user has already rated
        user_ratings = self.ratings_df.filter(col("user_id") == user_id)
        rated_products = [row["product_id"] for row in user_ratings.select("product_id").collect()]

        print(f"用戶 {user_id} 已評分商品數: {len(rated_products)}")

        # Show the user's rating history
        print("\n用戶評分歷史:")
        user_history = user_ratings.join(self.products_df, "product_id") \
                                  .select("product_name", "category", "rating") \
                                  .orderBy(col("rating").desc())
        user_history.show(10)

        # Ask the model for recommendations.
        # NOTE(review): results are not filtered against rated_products, so
        # already-rated items may reappear -- confirm whether that is intended.
        user_df = self.spark.createDataFrame([(user_id,)], ["user_id"])
        user_recommendations = self.model.recommendForUserSubset(user_df, num_recommendations)
        rows = user_recommendations.select("user_id", "recommendations").collect()

        # No row comes back for this user (e.g. the user is not in the
        # training split), or the row carries an empty array.
        recs = rows[0]["recommendations"] if rows else []
        if not recs:
            print(f"\n用戶 {user_id} 沒有可用的推薦（該用戶可能不在訓練資料中）")
            return []

        print(f"\n為用戶 {user_id} 推薦的商品:")

        # Fetch all recommended products in one job instead of one per item.
        wanted_ids = [rec["product_id"] for rec in recs]
        product_lookup = {
            row["product_id"]: row
            for row in self.products_df.filter(col("product_id").isin(wanted_ids)).collect()
        }

        recommended_products = []
        for rec in recs:
            product_info = product_lookup.get(rec["product_id"])
            if product_info is None:
                # Product missing from the catalogue: nothing to display.
                continue
            recommended_products.append({
                'product_id': rec["product_id"],
                'product_name': product_info['product_name'],
                'category': product_info['category'],
                'price': product_info['price'],
                'predicted_rating': rec["rating"]
            })

        if not recommended_products:
            print(f"\n用戶 {user_id} 沒有可用的推薦（該用戶可能不在訓練資料中）")
            return []

        # Display the result
        rec_df = self.spark.createDataFrame(recommended_products)
        rec_df.show(num_recommendations, truncate=False)

        return recommended_products

    def analyze_recommendations(self):
        """Summarize recommendations generated for up to 10 sample users.

        Prints the category and price distribution of the recommended
        products and plots both.

        Returns:
            Flat list of recommendation dicts (see
            ``generate_recommendations``); empty when nothing was recommended.

        Raises:
            RuntimeError: If no model has been trained yet.
        """
        self._require_model()
        print("\n=== 推薦系統分析 ===")

        # Pick up to 10 distinct users that have ratings
        sample_users = [
            row["user_id"]
            for row in self.ratings_df.select("user_id").distinct().limit(10).collect()
        ]

        print(f"為 {len(sample_users)} 個用戶生成推薦...")

        all_recommendations = []
        for user_id in sample_users:
            all_recommendations.extend(self.generate_recommendations(user_id, 5))

        if not all_recommendations:
            # np.min/np.max raise on empty input and the plots would be blank.
            print("\n沒有產生任何推薦，略過分析")
            return all_recommendations

        # Diversity: how often each category is recommended
        recommended_categories = defaultdict(int)
        for rec in all_recommendations:
            recommended_categories[rec['category']] += 1

        print("\n推薦分類分佈:")
        for category, times in sorted(recommended_categories.items(), key=lambda x: x[1], reverse=True):
            print(f"  {category}: {times} 次推薦")

        # Price distribution of the recommended products
        recommended_prices = [rec['price'] for rec in all_recommendations]

        print("\n推薦價格統計:")
        print(f"  平均價格: ${np.mean(recommended_prices):.2f}")
        print(f"  價格中位數: ${np.median(recommended_prices):.2f}")
        print(f"  最低價格: ${np.min(recommended_prices):.2f}")
        print(f"  最高價格: ${np.max(recommended_prices):.2f}")

        # Visualization
        fig, axes = plt.subplots(1, 2, figsize=(15, 6))

        # Category distribution
        categories = list(recommended_categories.keys())
        counts = list(recommended_categories.values())

        axes[0].bar(categories, counts)
        axes[0].set_title('推薦分類分佈')
        axes[0].set_xlabel('分類')
        axes[0].set_ylabel('推薦次數')
        axes[0].tick_params(axis='x', rotation=45)

        # Price distribution
        axes[1].hist(recommended_prices, bins=20, alpha=0.7)
        axes[1].set_title('推薦價格分佈')
        axes[1].set_xlabel('價格 ($)')
        axes[1].set_ylabel('頻率')

        plt.tight_layout()
        plt.show()

        return all_recommendations

    def evaluate_recommendation_quality(self):
        """Compute coverage, diversity and novelty of top-5 recommendations.

        * coverage  -- share of catalogue products recommended to any user
        * diversity -- share of categories present among those products
        * novelty   -- ``1 - avg popularity of recommended products /
          avg popularity of all rated products`` (popularity = rating count);
          positive means less popular products are being recommended

        Returns:
            Dict with keys ``coverage``, ``diversity`` and ``novelty``.

        Raises:
            RuntimeError: If no model has been trained yet.
        """
        self._require_model()
        print("\n=== 推薦系統品質評估 ===")

        total_products = self.products_df.count()

        # Distinct products appearing in any user's top-5 list. Cached because
        # it feeds three separate actions below and is costly to recompute.
        recommended_products = self.model.recommendForAllUsers(5).select(
            explode("recommendations").alias("recommendation")
        ).select(
            col("recommendation.product_id").alias("product_id")
        ).distinct().cache()

        try:
            # Coverage
            coverage = recommended_products.count() / total_products if total_products else 0.0
            print(f"商品覆蓋率: {coverage:.2%}")

            # Diversity
            category_diversity = recommended_products.join(
                self.products_df, "product_id"
            ).select("category").distinct().count()

            total_categories = self.products_df.select("category").distinct().count()
            diversity = category_diversity / total_categories if total_categories else 0.0

            print(f"分類多樣性: {diversity:.2%}")

            # Novelty: popularity is the rating count per product ("count").
            product_popularity = self.ratings_df.groupBy("product_id").count()

            recommended_popularity = recommended_products.join(
                product_popularity, "product_id"
            ).select("count")

            avg_recommended_popularity = recommended_popularity.agg(avg("count")).collect()[0][0]
            avg_overall_popularity = product_popularity.agg(avg("count")).collect()[0][0]
        finally:
            recommended_products.unpersist()

        # avg() over an empty frame yields None; treat that as "no signal".
        if avg_recommended_popularity is None or not avg_overall_popularity:
            novelty = 0.0
        else:
            novelty = 1 - (avg_recommended_popularity / avg_overall_popularity)
        print(f"新穎性指標: {novelty:.2%}")

        # Quality report
        quality_metrics = {
            'coverage': coverage,
            'diversity': diversity,
            'novelty': novelty
        }

        print("\n📊 推薦系統品質總結:")
        print(f"  覆蓋率: {coverage:.2%} (推薦了 {coverage*100:.1f}% 的商品)")
        print(f"  多樣性: {diversity:.2%} (覆蓋了 {diversity*100:.1f}% 的分類)")
        print(f"  新穎性: {novelty:.2%} (傾向推薦 {'較不受歡迎' if novelty > 0 else '較受歡迎'} 的商品)")

        # Overall score; negative novelty contributes zero (see helper).
        overall_quality = self._overall_quality(coverage, diversity, novelty)
        print(f"  整體品質評分: {overall_quality:.2%}")

        return quality_metrics

# Run project three: the recommendation system.
print("\n=== 專案三：推薦系統 ===")

# Build the recommender on the shared SparkSession and generate synthetic
# ratings, products and users with the method's default sizes.
recommendation_system = RecommendationSystem(spark)
ratings_data, products_data, users_data = recommendation_system.generate_sample_data()

# Show the first 10 rows of the ratings and product tables.
print("\n評分資料樣本:")
ratings_data.show(10)

print("\n商品資料樣本:")
products_data.show(10)

# Explore the data, then train and evaluate the ALS model.
popular_products, category_preference = recommendation_system.exploratory_analysis()
model, rmse = recommendation_system.build_recommendation_model()

# Generate 5 recommendations for each of a few hand-picked user ids.
# Only the last user's result remains bound to `recommendations`.
sample_user_ids = [1, 10, 50, 100]
for user_id in sample_user_ids:
    recommendations = recommendation_system.generate_recommendations(user_id, 5)

# Analyze the recommender's overall behavior and quality metrics.
all_recommendations = recommendation_system.analyze_recommendations()
quality_metrics = recommendation_system.evaluate_recommendation_quality()

## 總結

本章通過三個實戰專案展示了 Spark 在不同領域的應用。

In [None]:
# 實戰專案總結
def projects_summary():
    """Print the closing recap of the chapter's three hands-on projects.

    Output order: a per-project breakdown (core techniques, use cases, key
    takeaways), the overall learning outcomes, the suggested next steps and a
    congratulatory footer. Returns nothing.
    """
    rule = "=" * 60

    def banner(title):
        # Section header: blank line, rule, title, rule.
        print("\n" + rule)
        print(title)
        print(rule)

    def bulleted(heading, entries, marker):
        # One headed list; every entry is prefixed with the given marker.
        print(f"\n{heading}:")
        for entry in entries:
            print(f"  {marker} {entry}")

    # (project title, {section heading: entries}) in display order.
    project_overview = [
        ("專案一：日誌分析系統", {
            "核心技術": [
                "日誌解析和資料清理",
                "時間序列分析",
                "異常檢測和安全分析",
                "資料視覺化",
            ],
            "應用場景": [
                "網站訪問日誌分析",
                "安全威脅檢測",
                "系統性能監控",
                "用戶行為分析",
            ],
            "關鍵收穫": [
                "學會處理半結構化資料",
                "掌握異常檢測技術",
                "了解安全分析方法",
                "熟悉時間序列處理",
            ],
        }),
        ("專案二：即時監控系統", {
            "核心技術": [
                "即時資料處理",
                "閾值監控和警報",
                "趨勢分析",
                "健康度評估",
            ],
            "應用場景": [
                "系統監控",
                "應用程式監控",
                "基礎設施監控",
                "業務指標監控",
            ],
            "關鍵收穫": [
                "掌握即時監控技術",
                "學會設計警報系統",
                "了解趨勢分析方法",
                "熟悉系統健康評估",
            ],
        }),
        ("專案三：推薦系統", {
            "核心技術": [
                "協同過濾演算法",
                "矩陣分解 (ALS)",
                "推薦品質評估",
                "個性化推薦",
            ],
            "應用場景": [
                "電商商品推薦",
                "內容推薦",
                "社交網絡推薦",
                "廣告推薦",
            ],
            "關鍵收穫": [
                "掌握推薦系統原理",
                "學會使用 MLlib ALS",
                "了解推薦評估指標",
                "熟悉大規模推薦系統",
            ],
        }),
    ]

    learning_outcomes = {
        "技術能力": [
            "熟練掌握 Spark 核心 API",
            "具備大資料處理能力",
            "掌握機器學習應用",
            "具備系統設計思維",
        ],
        "實戰經驗": [
            "完成端到端專案開發",
            "解決實際業務問題",
            "掌握性能優化技巧",
            "具備問題診斷能力",
        ],
        "領域知識": [
            "日誌分析和安全監控",
            "系統監控和運維",
            "推薦系統和機器學習",
            "資料科學和分析",
        ],
    }

    suggested_steps = (
        "在實際項目中應用所學技能",
        "深入學習特定領域的進階技術",
        "探索 Spark 的新特性和更新",
        "學習相關的大資料生態系統工具",
        "參與開源項目，貢獻社區",
        "持續關注技術發展趨勢",
    )

    closing_lines = (
        "\n🎉 恭喜您完成了 Spark 101 的所有課程！",
        "您現在具備了使用 Apache Spark 解決實際問題的能力。",
        "繼續學習，不斷實踐，成為大資料處理專家！",
    )

    # Per-project breakdown
    banner("實戰專案總結")
    for title, sections in project_overview:
        print(f"\n{title}:")
        print("-" * len(title))
        for heading, entries in sections.items():
            bulleted(heading, entries, "•")

    # Overall learning outcomes
    banner("整體學習成果")
    for heading, entries in learning_outcomes.items():
        bulleted(heading, entries, "✓")

    # Numbered next steps
    banner("下一步建議")
    for number, step in enumerate(suggested_steps, start=1):
        print(f"  {number}. {step}")

    # Footer
    for line in closing_lines:
        print(line)

# Print the chapter summary.
projects_summary()

In [None]:
# Release resources: stop the SparkSession created at the top of the notebook.
# Cells that use `spark` cannot be re-run after this without recreating it.
print("\n清理資源...")
spark.stop()
print("Spark 會話已結束")
print("\n感謝您完成所有實戰專案！")