In [9]:
# Import standard libraries
import os
import pandas as pd
import numpy as np
import logging
import datetime
import shutil
from pathlib import Path
import glob
import re
from typing import Dict, List, Tuple, Optional, Any

# For database connection
from data201 import db_connection

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('premier_league_import.log'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Define paths
DATA_DIR = Path('./data')
PROCESSED_DIR = Path('./processed')
ERROR_DIR = Path('./error')

# Create directories if they don't exist
for directory in [DATA_DIR, PROCESSED_DIR, ERROR_DIR]:
    directory.mkdir(exist_ok=True)

print("Libraries imported successfully")
print(f"Python pandas version: {pd.__version__}")
print(f"Data directory: {DATA_DIR.absolute()}")

Libraries imported successfully
Python pandas version: 2.2.3
Data directory: /Users/lseverini/Library/CloudStorage/Dropbox/SJSU/ Classes/DATA 201-21/Final Project/data


In [10]:
# Establish database connection
try:
    # Connect to the database using db_connection
    conn = db_connection(config_file='premier_league_analytics.ini')
    cursor = conn.cursor()
    
    # Test connection with a simple query
    cursor.execute("SELECT VERSION()")
    db_version = cursor.fetchone()[0]
    print(f"Connected to database. Server version: {db_version}")
    
    # Check if required tables exist
    cursor.execute("""
    SELECT COUNT(*) 
    FROM INFORMATION_SCHEMA.TABLES 
    WHERE TABLE_SCHEMA = DATABASE() 
    AND TABLE_NAME IN ('Teams', 'Matches', 'Seasons', 'stg_premier_league_raw')
    """)
    
    tables_count = cursor.fetchone()[0]
    if tables_count < 4:
        print("WARNING: Some required tables don't exist. Make sure to run the database setup notebook first.")
    else:
        print("All required tables exist. Ready to import data.")
        
except Exception as e:
    print(f"Error connecting to database: {e}")
    raise

Connected to database. Server version: 9.2.0
All required tables exist. Ready to import data.


In [11]:
def register_file_for_processing(filename: str, filepath: str) -> int:
    """
    Register a file in the ETLLog table and return its ID.
    
    Args:
        filename: Name of the file
        filepath: Full path to the file
        
    Returns:
        log_id: ID of the ETL log record
    """
    try:
        # Create a new ETL log entry for this file
        process_name = f"Import_{filename}"
        
        cursor.execute(
            """
            INSERT INTO `ETLLog` (`ProcessName`, `StartTime`, `Status`)
            VALUES (%s, NOW(), 'Pending')
            """,
            (process_name,)
        )
        log_id = cursor.lastrowid
        conn.commit()
        logger.info(f"Registered file for processing: {filename}, LogID: {log_id}")
        return log_id
        
    except Exception as e:
        logger.error(f"Error registering file {filename}: {e}")
        if 'conn' in locals():
            conn.rollback()
        raise

In [12]:
def update_file_status(log_id: int, status: str, records_processed: int = 0, 
                      records_failed: int = 0, error_message: Optional[str] = None) -> bool:
    """
    Update the status of an ETL process in the ETLLog table.
    
    Args:
        log_id: ID of the ETL log record
        status: New status ('Running', 'Completed', 'Failed')
        records_processed: Number of records successfully processed
        records_failed: Number of records that failed processing
        error_message: Error message if processing failed
        
    Returns:
        success: True if the update was successful, False otherwise
    """
    try:
        if status == 'Running':
            # Update status to Running
            cursor.execute(
                """
                UPDATE `ETLLog` 
                SET `Status` = %s
                WHERE `LogID` = %s
                """,
                (status, log_id)
            )
        else:
            # Update status to Completed or Failed with stats
            cursor.execute(
                """
                UPDATE `ETLLog` 
                SET `Status` = %s, 
                    `EndTime` = NOW(), 
                    `RecordsProcessed` = %s,
                    `RecordsFailed` = %s,
                    `ErrorMessage` = %s
                WHERE `LogID` = %s
                """,
                (status, records_processed, records_failed, error_message, log_id)
            )
        
        conn.commit()
        logger.info(f"Updated ETL status to {status} for LogID: {log_id}")
        return True
        
    except Exception as e:
        logger.error(f"Error updating ETL status: {e}")
        if 'conn' in locals():
            conn.rollback()
        return False

In [13]:
def extract_season_from_filename(filename: str) -> str:
    """
    Extract the season from the filename pattern.
    
    Args:
        filename: Name of the file
        
    Returns:
        season: Season name in format '2023-24'
    """
    # Try pattern with dash (e.g., 2023-24)
    dash_pattern = r'(\d{4}-\d{2})'
    dash_match = re.search(dash_pattern, filename)
    if dash_match:
        return dash_match.group(1)
    
    # Try pattern with underscore (e.g., 2023_2024)
    under_pattern = r'(\d{4}_\d{4})'
    under_match = re.search(under_pattern, filename)
    if under_match:
        year_str = under_match.group(1)
        # Convert 2023_2024 to 2023-24
        years = year_str.split('_')
        return f"{years[0]}-{years[1][2:4]}"
    
    # If we can extract a four-digit year, assume current season
    year_pattern = r'(\d{4})'
    year_match = re.search(year_pattern, filename)
    if year_match:
        year = int(year_match.group(1))
        # If this looks like a valid recent year
        if 2000 <= year <= 2100:
            return f"{year}-{str(year+1)[2:4]}"
    
    # Fall back to the current season if we can't determine it
    current_year = datetime.datetime.now().year
    next_year = str(current_year + 1)[2:4]  # Get last two digits
    return f"{current_year}-{next_year}"

In [14]:
def load_csv_to_staging(file_path: str) -> Tuple[int, int, int]:
    """
    Load a CSV file into the staging table.
    
    Args:
        file_path: Path to the CSV file
        
    Returns:
        rows_read: Number of rows read from the file
        rows_processed: Number of rows successfully processed
        rows_failed: Number of rows that failed processing
    """
    file_name = os.path.basename(file_path)
    rows_read = 0
    rows_processed = 0
    rows_failed = 0
    
    try:
        # Register ETL process for this file
        log_id = register_file_for_processing(file_name, str(file_path))
        
        # Update ETL status to Running
        update_file_status(log_id, 'Running')
        
        # Extract season from filename
        season = extract_season_from_filename(file_name)
        print(f"Identified season as: {season}")
        
        # Read the CSV file
        print(f"Reading file: {file_name}")
        df = pd.read_csv(file_path, encoding='utf-8')
        rows_read = len(df)
        print(f"Read {rows_read} rows from the file")
        
        # Basic validation
        if len(df) == 0:
            raise ValueError("CSV file is empty")
        
        # Check for required columns
        required_columns = ['Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR']
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            raise ValueError(f"Missing required columns: {', '.join(missing_columns)}")
        
        # Display sample of the data
        print("\nSample of CSV data:")
        display(df.head(3))
        
        # Clean and prepare data
        print("\nCleaning and preparing data...")
        
        # Handle date format (attempting multiple formats)
        if 'Date' in df.columns:
            try:
                df['Date'] = pd.to_datetime(df['Date'], dayfirst=True).dt.strftime('%Y-%m-%d')
            except Exception as e:
                print(f"Error converting dates with dayfirst=True: {e}")
                try:
                    # Try UK format (DD/MM/YY)
                    df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%y').dt.strftime('%Y-%m-%d')
                except Exception as e:
                    print(f"Error converting dates with UK format: {e}")
                    # Last resort - try with any format
                    df['Date'] = pd.to_datetime(df['Date'], errors='coerce').dt.strftime('%Y-%m-%d')
        
        # Handle missing time
        if 'Time' not in df.columns:
            df['Time'] = '15:00'  # Default time
        
        # Add season column if not present
        df['Season'] = season
        
        # Add source file information
        df['SourceFile'] = file_name
        
        # Get the existing columns from the staging table
        cursor.execute("DESCRIBE `stg_premier_league_raw`")
        valid_columns = [row[0] for row in cursor.fetchall()]
        
        # Show all columns that are in the CSV but not in the database
        csv_columns = set(df.columns)
        db_columns = set(valid_columns)
        unknown_columns = csv_columns - db_columns
        
        if unknown_columns:
            print(f"\nWARNING: These columns from the CSV are not in the database table: {unknown_columns}")
            print("These columns will be ignored.")
        
        # Handle NaN values for numeric and string columns
        for col in df.select_dtypes(include=['float64', 'int64']).columns:
            if col in valid_columns:
                df[col] = df[col].fillna(0)
            
        for col in df.select_dtypes(include=['object']).columns:
            if col in valid_columns:
                df[col] = df[col].fillna('')
        
        # Insert data into staging table
        print(f"Inserting {len(df)} rows into staging table...")
        successful_inserts = 0
        failed_inserts = 0
        
        # Begin transaction
        cursor.execute("START TRANSACTION")
        
        for _, row in df.iterrows():
            try:
                # Prepare SQL and values - only include columns that exist in the database
                columns = []
                placeholders = []
                values = []
                
                for col in df.columns:
                    # Only include columns that exist in the database table
                    if col in valid_columns and pd.notna(row[col]) and row[col] != '':
                        columns.append(f"`{col}`")
                        placeholders.append("%s")
                        values.append(str(row[col]))
                
                # Add ProcessedFlag if it exists in valid_columns
                if 'ProcessedFlag' in valid_columns and 'ProcessedFlag' not in columns:
                    columns.append("`ProcessedFlag`")
                    placeholders.append("%s")
                    values.append(0)
                
                # Skip if no valid columns found
                if not columns:
                    failed_inserts += 1
                    continue
                
                # Construct SQL
                sql = f"""
                INSERT INTO `stg_premier_league_raw` ({', '.join(columns)})
                VALUES ({', '.join(placeholders)})
                """
                
                # Execute SQL
                cursor.execute(sql, values)
                successful_inserts += 1
                
            except Exception as e:
                failed_inserts += 1
                error_msg = str(e)
                logger.warning(f"Error inserting row: {error_msg}")
                print(f"Error inserting row: {error_msg}")
                
                # Log to dead letter table
                try:
                    cursor.execute(
                        """
                        INSERT INTO `ETLDeadLetter` (`SourceTable`, `SourceId`, `ErrorMessage`, `RawData`)
                        VALUES (%s, %s, %s, %s)
                        """,
                        ('stg_premier_league_raw', None, error_msg, str(row.to_dict()))
                    )
                except Exception as e2:
                    logger.error(f"Error logging to dead letter table: {e2}")
        
        # Commit transaction
        conn.commit()
        rows_processed = successful_inserts
        rows_failed = failed_inserts
        
        # Update ETL status to Completed
        update_file_status(
            log_id, 
            'Completed', 
            rows_processed, 
            rows_failed
        )
        
        print(f"Successfully inserted {rows_processed} rows into staging table")
        print(f"Failed to insert {rows_failed} rows")
        
        # Move file to processed directory
        processed_path = PROCESSED_DIR / file_name
        shutil.move(str(file_path), str(processed_path))
        print(f"Moved file to processed directory: {processed_path}")
        
        return rows_read, rows_processed, rows_failed
        
    except Exception as e:
        error_msg = str(e)
        logger.error(f"Error loading file to staging: {error_msg}")
        print(f"Error loading file to staging: {error_msg}")
        
        # Update ETL status to Failed
        if 'log_id' in locals():
            update_file_status(
                log_id, 
                'Failed', 
                rows_processed, 
                rows_failed, 
                error_msg
            )
        
        # Rollback any open transaction
        if 'conn' in locals():
            conn.rollback()
        
        return rows_read, rows_processed, rows_failed

In [15]:
def load_csv_files():
    # Find available CSV files
    print(f"Loading csv files from folder: {DATA_DIR}")
    csv_files = list(DATA_DIR.glob('*.csv'))
    print(f"Found {len(csv_files)} CSV files in the data directory:")
    for file in csv_files:
        print(f"  - {file.name}")

    # Test with first file if available
    if csv_files:
        first_file = csv_files[0]
        if first_file:
            print(f"\nTesting import with file: {first_file}")
            
            # Load the file to staging
            rows_read, rows_processed, rows_failed = load_csv_to_staging(first_file)
            
            # Check results
            print(f"\nImport summary:")
            print(f"  - Rows read: {rows_read}")
            print(f"  - Rows processed: {rows_processed}")
            print(f"  - Rows failed: {rows_failed}")
            
            # Verify data in staging table
            try:
                cursor.execute("""
                SELECT COUNT(*) FROM `stg_premier_league_raw` 
                WHERE `SourceFile` = %s
                """, (first_file.name,))
                
                stage_count = cursor.fetchone()[0]
                print(f"\nVerified {stage_count} rows in staging table from this file")
                
                # Show sample from staging table - get actual columns that exist
                cursor.execute("DESCRIBE `stg_premier_league_raw`")
                actual_columns = [row[0] for row in cursor.fetchall()]
                
                # Only include columns that we know exist
                display_columns = ['Id', 'Div', 'Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR']
                # Filter to only include columns that actually exist in the table
                display_columns = [col for col in display_columns if col in actual_columns]
                
                # Add SourceFile if it exists
                if 'SourceFile' in actual_columns:
                    display_columns.append('SourceFile')
                
                if display_columns:
                    # Build the query with only existing columns
                    columns_sql = ', '.join([f'`{col}`' for col in display_columns])
                    query = f"""
                    SELECT {columns_sql}
                    FROM `stg_premier_league_raw`
                    WHERE `SourceFile` = %s
                    LIMIT 5
                    """
                    
                    cursor.execute(query, (first_file.name,))
                    
                    columns = [col[0] for col in cursor.description]
                    sample_data = cursor.fetchall()
                    
                    print("\nSample data in staging table:")
                    sample_df = pd.DataFrame(sample_data, columns=columns)
                    display(sample_df)
                else:
                    print("Could not determine which columns to display")
                
                # Check if this file has been logged in ETLLog
                cursor.execute("""
                SELECT `LogID`, `Status`, `RecordsProcessed`, `RecordsFailed` 
                FROM `ETLLog`
                WHERE `ProcessName` LIKE %s
                ORDER BY `LogID` DESC
                LIMIT 1
                """, (f"Import_{first_file.name}%",))
                
                log_data = cursor.fetchone()
                if log_data:
                    log_id, status, records_processed, records_failed = log_data
                    print(f"\nETL Log: LogID={log_id}, Status={status}, Processed={records_processed}, Failed={records_failed}")
                
            except Exception as e:
                print(f"Error verifying data: {e}")
                return False
            
            return True
    else:
        print("No CSV files found in the data directory. Please add files to import.")
    
    return False

In [None]:
def transform_load_data():
    """Transform and load data from staging to operational tables"""
    conn = None
    cursor = None
    records_processed = 0
    records_failed = 0

    try:
        # 1) connect & fetch staging rows
        conn = db_connection(config_file='premier_league_analytics.ini')
        cursor = conn.cursor(dictionary=True)
        cursor.execute("""
            SELECT * 
            FROM `stg_premier_league_raw`
            WHERE `ProcessedFlag` = 0
            ORDER BY `Id`
        """)

        staging = cursor.fetchall()
        if not staging:
            print("No unprocessed data found in staging table")
            return True
        print(f"Found {len(staging)} records to process")

        # 2) cache MarketIDs & BookmakerIDs
        cursor.execute("SELECT MarketID FROM `Markets` WHERE MarketType='MatchResult'")
        m = cursor.fetchone()
        match_result_mid = m['MarketID'] if m else None

        cursor.execute("SELECT MarketID FROM `Markets` WHERE MarketType='OverUnder' AND Parameter='2.5'")
        m = cursor.fetchone()
        ou_mid = m['MarketID'] if m else None
 
        cursor.execute("SELECT BookmakerCode, BookmakerID FROM `Bookmakers`")
        bookmaker_ids = {r['BookmakerCode']: r['BookmakerID'] for r in cursor.fetchall()}

        # 3) mapping for 1X2 odds columns
        three_way = [
            ('B365','B365H','B365D','B365A'),
            ('BW','BWH','BWD','BWA'),
            ('IW','IWH','IWD','IWA'),
            ('PS','PSH','PSD','PSA'),
            ('WH','WHH','WHD','WHA'),
            ('VC','VCH','VCD','VCA'),
        ]

        # 4) process each row
        for row in staging:
            try:
                # a) skip if essential missing
                if not all([row.get('Div'), row.get('Date'),
                            row.get('HomeTeam'), row.get('AwayTeam')]):
                    print(f"Skipping {row['Id']}: missing data")
                    records_failed += 1
                    continue

                # b) parse match date
                raw = row['Date']
                if isinstance(raw, str):
                    md = datetime.strptime(raw, '%Y-%m-%d').date()
                else:
                    md = raw

                # c) season logic Aug→May = XYAB
                if md.month >= 8:
                    sy = md.year % 100
                    ey = (md.year + 1) % 100
                else:
                    sy = (md.year - 1) % 100
                    ey = md.year % 100
                season_name = f"{sy:02d}-{ey:02d}"

                # d) upsert Seasons
                cursor.execute(
                    "SELECT SeasonID FROM `Seasons` WHERE SeasonName=%s",
                    (season_name,)
                )
                r = cursor.fetchone()
                if r:
                    season_id = r['SeasonID']
                else:
                    # set bounds
                    if md.month >= 8:
                        start_dt = f"{md.year}-08-01"
                        end_dt   = f"{md.year+1}-05-31"
                    else:
                        start_dt = f"{md.year-1}-08-01"
                        end_dt   = f"{md.year}-05-31"
                    cursor.execute(
                        "INSERT INTO `Seasons`(SeasonName,StartDate,EndDate) VALUES(%s,%s,%s)",
                        (season_name, start_dt, end_dt)
                    )
                    season_id = cursor.lastrowid

                # e) upsert Divisions
                div = row['Div']
                cursor.execute(
                    "SELECT DivisionID FROM `Divisions` WHERE DivisionCode=%s",
                    (div,)
                )
                r = cursor.fetchone()
                if r:
                    div_id = r['DivisionID']
                else:
                    cursor.execute(
                        "INSERT INTO `Divisions`(DivisionCode,LeagueName) VALUES(%s,%s)",
                        (div, f"{div} League")
                    )
                    div_id = cursor.lastrowid

                # f) upsert Teams
                def upsert_team(name):
                    cursor.execute("SELECT TeamID FROM `Teams` WHERE TeamName=%s", (name,))
                    rr = cursor.fetchone()
                    if rr:
                        return rr['TeamID']
                    short = ''.join(w[0] for w in name.split()).upper()
                    cursor.execute(
                        "INSERT INTO `Teams`(TeamName,ShortName) VALUES(%s,%s)",
                        (name, short)
                    )
                    return cursor.lastrowid

                home_id = upsert_team(row['HomeTeam'])
                away_id = upsert_team(row['AwayTeam'])

                # g) upsert Referee
                ref_id = None
                if row.get('Referee'):
                    ref = row['Referee'].strip()
                    cursor.execute(
                        "SELECT RefereeID FROM `Referees` WHERE RefereeName=%s",
                        (ref,)
                    )
                    rr = cursor.fetchone()
                    if rr:
                        ref_id = rr['RefereeID']
                    else:
                        cursor.execute(
                            "INSERT INTO `Referees`(RefereeName) VALUES(%s)",
                            (ref,)
                        )
                        ref_id = cursor.lastrowid

                # h) insert Matches
                cursor.execute("""
                    INSERT INTO `Matches`(
                      SeasonID,DivisionID,MatchDate,MatchTime,
                      HomeTeamID,AwayTeamID,FTHG,FTAG,FTR,
                      HTHG,HTAG,HTR,RefereeID
                    ) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                """,(
                    season_id, div_id, md, row.get('Time'),
                    home_id, away_id,
                    row.get('FTHG',0), row.get('FTAG',0), row.get('FTR'),
                    row.get('HTHG'), row.get('HTAG'), row.get('HTR'),
                    ref_id
                ))
                match_id = cursor.lastrowid

                # i) insert MatchStatistics
                cursor.execute("""
                    INSERT INTO `MatchStatistics`(
                      MatchID,HomeShots,AwayShots,
                      HomeShotsTarget,AwayShotsTarget,
                      HomeCorners,AwayCorners,
                      HomeFouls,AwayFouls,
                      HomeYellowCards,AwayYellowCards,
                      HomeRedCards,AwayRedCards
                    ) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
                """,(
                    match_id,
                    row.get('HS'),  row.get('AS'),
                    row.get('HST'), row.get('AST'),
                    row.get('HC'),  row.get('AC'),
                    row.get('HF'),  row.get('AF'),
                    row.get('HY'),  row.get('AY'),
                    row.get('HR'),  row.get('AR'),
                ))

                # j) insert BettingOdds 1X2
                if match_result_mid:
                    for code, hcol, dcol, acol in three_way:
                        if code in bookmaker_ids:
                            try:
                                h = float(row[hcol])
                                d = float(row[dcol])
                                a = float(row[acol])
                            except (TypeError, ValueError):
                                continue
                            if h>1 and d>1 and a>1:
                                bid = bookmaker_ids[code]
                                for oc, val in [('H',h),('D',d),('A',a)]:
                                    cursor.execute(
                                        "INSERT INTO `BettingOdds`"
                                        "(MatchID,BookmakerID,MarketID,OutcomeCode,OddsValue)"
                                        " VALUES(%s,%s,%s,%s,%s)",
                                        (match_id,bid,match_result_mid,oc,val)
                                    )

                # k) insert BettingOdds Over/Under 2.5
                if ou_mid and 'B365' in bookmaker_ids:
                    ocol, ucol = 'B365>2.5','B365<2.5'
                    try:
                        o = float(row.get(ocol,0))
                        u = float(row.get(ucol,0))
                    except (TypeError, ValueError):
                        o=u=0
                    if o>1 and u>1:
                        bid = bookmaker_ids['B365']
                        for oc, val in [('O',o),('U',u)]:
                            cursor.execute(
                                "INSERT INTO `BettingOdds`"
                                "(MatchID,BookmakerID,MarketID,OutcomeCode,OddsValue)"
                                " VALUES(%s,%s,%s,%s,%s)",
                                (match_id,bid,ou_mid,oc,val)
                            )

                # l) mark processed
                cursor.execute(
                    "UPDATE `stg_premier_league_raw` SET ProcessedFlag=1 WHERE Id=%s",
                    (row['Id'],)
                )
                conn.commit()
                records_processed += 1

            except Exception as rec_e:
                conn.rollback()
                print(f"Error processing row {row['Id']}: {rec_e}")
                records_failed += 1

        print(f"Transform-load completed. Processed: {records_processed}, Failed: {records_failed}")
        return True

    except Exception as e:
        if conn:
            conn.rollback()
        print(f"Fatal error in transform-load: {e}")
        return False

    finally:
        if cursor:
            cursor.close()
        if conn:
            conn.close()

if __name__ == '__main__':
    if load_csv_files():
        transform_load_data()

2025-05-08 17:07:38,025 - INFO - Registered file for processing: PL22-23.csv, LogID: 1
2025-05-08 17:07:38,027 - INFO - Updated ETL status to Running for LogID: 1


Loading csv files from folder: data
Found 1 CSV files in the data directory:
  - PL22-23.csv

Testing import with file: data/PL22-23.csv
Identified season as: 2025-26
Reading file: PL22-23.csv
Read 380 rows from the file

Sample of CSV data:


Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
0,E0,05/08/2022,20:00,Crystal Palace,Arsenal,0,2,A,0,1,...,1.76,0.5,2.09,1.84,2.04,1.88,2.09,1.88,2.03,1.85
1,E0,06/08/2022,12:30,Fulham,Liverpool,2,2,D,1,0,...,2.73,1.75,1.9,2.03,1.91,2.02,2.01,2.06,1.89,1.99
2,E0,06/08/2022,15:00,Bournemouth,Aston Villa,2,0,H,1,0,...,1.76,0.5,1.93,2.0,1.93,2.0,1.94,2.04,1.88,2.0



Cleaning and preparing data...

These columns will be ignored.
Inserting 380 rows into staging table...


2025-05-08 17:07:38,367 - INFO - Updated ETL status to Completed for LogID: 1


Successfully inserted 380 rows into staging table
Failed to insert 0 rows
Moved file to processed directory: processed/PL22-23.csv

Import summary:
  - Rows read: 380
  - Rows processed: 380
  - Rows failed: 0

Verified 380 rows in staging table from this file

Sample data in staging table:


Unnamed: 0,Id,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,SourceFile
0,1,E0,2022-08-05,Crystal Palace,Arsenal,0,2,A,PL22-23.csv
1,2,E0,2022-08-06,Fulham,Liverpool,2,2,D,PL22-23.csv
2,3,E0,2022-08-06,Bournemouth,Aston Villa,2,0,H,PL22-23.csv
3,4,E0,2022-08-06,Leeds,Wolves,2,1,H,PL22-23.csv
4,5,E0,2022-08-06,Newcastle,Nott'm Forest,2,0,H,PL22-23.csv



ETL Log: LogID=1, Status=Completed, Processed=380, Failed=0
Found 380 records to process
Transform-load completed. Processed: 380, Failed: 0
