In [2]:
import pandas as pd
import json
import logging
import os
from kafka import KafkaProducer
from kafka.admin import KafkaAdminClient, NewTopic
from kafka.errors import KafkaError
import time

# --- Logging configuration ---
# Root logger: INFO and above, timestamped, written through a stream handler.
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# --- Kafka configuration ---
KAFKA_TOPIC = 'football_players'
# Broker address can be overridden through the KAFKA_BROKER_URL environment variable.
KAFKA_BROKER_URL = os.environ.get('KAFKA_BROKER_URL', 'localhost:9092')

# --- File paths ---
OUTPUT_CSV = "Filtered_Final_Data.csv"  # CSV file where the filtered data is saved
INPUT_CSV = "Cleaned_Data.csv"  # Replace with the path to your own CSV file

# --- Check for the Kafka topic and create it if needed ---
def check_and_create_topic():
    """Ensure that ``KAFKA_TOPIC`` exists on the broker at ``KAFKA_BROKER_URL``.

    Lists the broker's topics and, if ``KAFKA_TOPIC`` is not among them,
    creates it with one partition and a replication factor of one.

    Raises:
        Exception: any error raised while connecting, listing or creating
            is logged and then re-raised unchanged.
    """
    admin_client = None
    try:
        admin_client = KafkaAdminClient(bootstrap_servers=[KAFKA_BROKER_URL])
        if KAFKA_TOPIC in admin_client.list_topics():
            logger.info(f"Topic '{KAFKA_TOPIC}' already exists.")
            return

        logger.info(f"Topic '{KAFKA_TOPIC}' does not exist. Creating...")
        new_topic = NewTopic(name=KAFKA_TOPIC, num_partitions=1, replication_factor=1)
        admin_client.create_topics(new_topics=[new_topic], validate_only=False)
        logger.info(f"Topic '{KAFKA_TOPIC}' created successfully.")
    except Exception as e:
        logger.error(f"Error checking/creating topic: {e}")
        raise
    finally:
        # Close the admin connection on every path (success, early return,
        # or error) so a failed list/create call does not leak it.
        if admin_client is not None:
            admin_client.close()

# --- Functions for sending data to Kafka (asynchronous) ---
def on_send_success(record_metadata):
    """Success callback for an asynchronous send; intentionally does nothing."""
    return None

def on_send_error(excp):
    """Error callback for an asynchronous send: log the failure at ERROR level."""
    message = f"ERROR sending message to Kafka: {excp}"
    logger.error(message)

def send_to_kafka_async(producer, topic, data, max_retries=3):
    """Hand ``data`` to ``producer.send`` for ``topic``, retrying on exceptions.

    The send is asynchronous: ``True`` only means that ``producer.send``
    returned without raising and the success/error callbacks were attached.
    Later delivery failures are reported through ``on_send_error``.

    Args:
        producer: object exposing ``send(topic, value=...)``; a falsy value
            short-circuits to ``False``.
        topic: name of the destination topic.
        data: payload to send; a falsy value (``None``, empty dict, ...)
            short-circuits to ``False``.
        max_retries: maximum number of ``send`` attempts.

    Returns:
        bool: ``True`` if an attempt succeeded, ``False`` otherwise
        (including when ``max_retries`` is zero or negative).
    """
    if not data or not producer:
        return False

    for attempt in range(max_retries):
        try:
            producer.send(topic, value=data).add_callback(on_send_success).add_errback(on_send_error)
            return True
        except Exception as e:
            logger.error(f"Attempt {attempt+1}/{max_retries} failed: {e}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # Exponential backoff between attempts

    # Reached only when no attempt succeeded. Log only if at least one
    # attempt was actually made, and always return an explicit bool.
    if max_retries > 0:
        logger.error(f"Failed to send after {max_retries} attempts.")
    return False

# --- Filter the player data loaded from the CSV ---
def _is_non_negative_number(value):
    """Return a truthy result when ``value`` is an int/float that is >= 0."""
    return isinstance(value, (int, float)) and value >= 0


def filter_player_data(df):
    """Return the rows of ``df`` that pass every player-data validity check.

    A row is kept when all of the following hold:
      * ``position`` is not ``'Goalkeeper'``;
      * none of the required columns is missing (NaN/None);
      * ``strong_foot`` is one of ``'left'``, ``'right'``, ``'both'``;
      * every numeric column holds an int/float value that is >= 0.

    The original index labels and all columns of ``df`` are preserved, even
    when no row passes the checks.

    Args:
        df: DataFrame that must contain every required column.

    Returns:
        DataFrame: the subset of ``df`` that passed the checks.

    Raises:
        ValueError: if any required column is absent from ``df``.
    """
    required_columns = [
        "player_id", "name", "player_club", "age", "position", "market_value",
        "nationality", "player_height", "strong_foot", "contract_value_time",
        "goalkeeper_or_not", "appearances", "PPG", "goals", "assists", "own_goals",
        "substitutions_on", "substitutions_off", "yellow_cards", "second_yellow_cards",
        "red_cards", "penalty_goals", "minutes_per_goal", "minutes_played",
        "goals_conceded", "clean_sheet", "crawl_timestamp"
    ]

    # Fail early if the CSV does not provide every required column.
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        logger.error(f"Missing required columns in CSV: {missing_columns}")
        raise ValueError(f"CSV file must contain all required columns: {required_columns}")

    # Row-level checks, combined into a single boolean mask over ``df``.
    keep = (
        (df['position'] != 'Goalkeeper')                      # drop goalkeepers
        & df[required_columns].notna().all(axis=1)            # drop rows missing any required field
        & df['strong_foot'].isin(['left', 'right', 'both'])   # strong_foot must be a known value
    )

    # Numeric columns must hold non-negative numbers.
    numeric_columns = [
        'age', 'market_value', 'player_height', 'appearances', 'PPG', 'goals',
        'assists', 'own_goals', 'substitutions_on', 'substitutions_off',
        'yellow_cards', 'second_yellow_cards', 'red_cards', 'penalty_goals',
        'minutes_per_goal', 'minutes_played', 'goals_conceded', 'clean_sheet', 'crawl_timestamp'
    ]
    for col in numeric_columns:
        # ``apply`` on an empty Series keeps the column's own dtype rather than
        # bool; casting keeps the mask boolean so indexing always filters rows
        # instead of being interpreted as a column selection.
        keep = keep & df[col].apply(_is_non_negative_number).astype(bool)

    return df[keep]

# --- Main function ---
def main():
    """Load the player CSV, filter it, and publish every remaining row to Kafka.

    Steps:
      1. Read ``INPUT_CSV``; log and return if it cannot be read.
      2. Filter the rows with ``filter_player_data``; return if nothing is left.
      3. Make sure the Kafka topic exists, then create a producer; log and
         return if the producer cannot be created.
      4. Send each row (with ``crawl_timestamp`` replaced by the current time
         in milliseconds) and, after every 10 rows, write the rows processed
         so far to ``OUTPUT_CSV`` and flush the producer.
      5. Always write the full filtered data to ``OUTPUT_CSV`` and flush/close
         the producer at the end, even if the send loop raised.

    Exceptions raised by ``filter_player_data`` or ``check_and_create_topic``
    propagate to the caller.
    """
    # Read the CSV file
    try:
        df = pd.read_csv(INPUT_CSV, encoding='utf-8-sig')
        logger.info(f"Loaded {len(df)} player records from {INPUT_CSV}")
    except Exception as e:
        logger.error(f"Error reading CSV file {INPUT_CSV}: {e}")
        return

    # Filter the data
    filtered_df = filter_player_data(df)
    logger.info(f"After filtering, {len(filtered_df)} player records remain")

    if filtered_df.empty:
        logger.error("No player data passed the filtering criteria. Exiting.")
        return

    # Check for the Kafka topic and create it if needed
    check_and_create_topic()

    # Create the Kafka producer
    try:
        logger.info(f"Connecting to Kafka broker at {KAFKA_BROKER_URL}...")
        producer = KafkaProducer(
            bootstrap_servers=[KAFKA_BROKER_URL],
            value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode('utf-8'),
            retries=5,
            acks='all'
        )
        logger.info("Successfully connected to Kafka.")
    except Exception as e:
        logger.error(f"CRITICAL: Error connecting to Kafka: {e}")
        return

    # Counters for Kafka send attempts
    successful_sends_attempted = 0
    failed_prepares = 0

    try:
        # Process each player. ``processed`` is a 1-based positional counter:
        # the index labels yielded by iterrows() are not positions once the
        # filter has dropped rows, so they must not drive the checkpoint
        # cadence or the ``iloc`` slice below.
        for processed, (_, row) in enumerate(filtered_df.iterrows(), start=1):
            player_data = row.to_dict()
            player_data['crawl_timestamp'] = int(time.time() * 1000)  # current time in ms

            # Send the record to Kafka
            if send_to_kafka_async(producer, KAFKA_TOPIC, player_data):
                successful_sends_attempted += 1
                logger.info(f"Successfully sent data for player ID {player_data['player_id']} to Kafka")
            else:
                failed_prepares += 1
                logger.error(f"Failed to send data for player ID {player_data['player_id']} to Kafka")

            # Checkpoint to CSV after every 10 players
            if processed % 10 == 0:
                filtered_df.iloc[:processed].to_csv(OUTPUT_CSV, index=False, encoding='utf-8-sig')
                producer.flush()
                logger.info(f"Saved {processed} players to {OUTPUT_CSV}")

    finally:
        logger.info("-" * 30)
        logger.info("Processing finished.")
        logger.info(f"Total Kafka send attempts prepared: {successful_sends_attempted}")
        logger.info(f"Total Kafka send preparation failures: {failed_prepares}")

        try:
            # Final save of the full filtered data
            filtered_df.to_csv(OUTPUT_CSV, index=False, encoding='utf-8-sig')
            logger.info(f"Final filtered data saved to {OUTPUT_CSV}")
        finally:
            # Flush and close the producer even if the final save failed,
            # so pending messages are not dropped and the connection is released.
            logger.info("Flushing Kafka producer (waiting for pending messages)...")
            try:
                producer.flush(timeout=60)
                logger.info("Kafka producer flushed.")
            except Exception as flush_e:
                logger.error(f"ERROR during producer flush: {flush_e}")
            finally:
                logger.info("Closing Kafka producer.")
                producer.close()

            logger.info("Script finished.")

# Run the pipeline only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()

2025-05-26 17:24:49,337 - INFO - Loaded 3363 player records from Cleaned_Data.csv
2025-05-26 17:24:49,414 - INFO - After filtering, 3363 player records remain
2025-05-26 17:24:49,421 - INFO - <BrokerConnection client_id=kafka-python-2.1.5, node_id=bootstrap-0 host=localhost:9092 <connecting> [IPv6 ('::1', 9092, 0, 0)]>: connecting to localhost:9092 [('::1', 9092, 0, 0) IPv6]
2025-05-26 17:24:49,603 - INFO - <BrokerConnection client_id=kafka-python-2.1.5, node_id=bootstrap-0 host=localhost:9092 <checking_api_versions_recv> [IPv6 ('::1', 9092, 0, 0)]>: Broker version identified as 2.6
2025-05-26 17:24:49,605 - INFO - <BrokerConnection client_id=kafka-python-2.1.5, node_id=bootstrap-0 host=localhost:9092 <connected> [IPv6 ('::1', 9092, 0, 0)]>: Connection complete.
2025-05-26 17:24:49,676 - INFO - <BrokerConnection client_id=kafka-python-2.1.5, node_id=1 host=localhost:9092 <connecting> [IPv6 ('::1', 9092, 0, 0)]>: connecting to localhost:9092 [('::1', 9092, 0, 0) IPv6]
2025-05-26 17:24:4