In [2]:
# 2_load_analytical_data.ipynb

import pandas as pd
import numpy as np
import logging
from data201 import db_connection, df_query

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Connect to the database
conn = db_connection(config_file='premier_league_analytics.ini')
cursor = conn.cursor()

# 1. Load Time dimension
print("Loading Time dimension...")
cursor.execute("""
INSERT IGNORE INTO `dim_Time` (
    `Date`, `DayOfWeek`, `DayName`, `DayOfMonth`, 
    `DayOfYear`, `WeekOfYear`, `Month`, `MonthName`, 
    `Quarter`, `Year`, `IsWeekend`
)
SELECT DISTINCT 
    m.`MatchDate`,
    DAYOFWEEK(m.`MatchDate`),
    DAYNAME(m.`MatchDate`),
    DAY(m.`MatchDate`),
    DAYOFYEAR(m.`MatchDate`),
    WEEK(m.`MatchDate`),
    MONTH(m.`MatchDate`),
    MONTHNAME(m.`MatchDate`),
    QUARTER(m.`MatchDate`),
    YEAR(m.`MatchDate`),
    CASE WHEN DAYOFWEEK(m.`MatchDate`) IN (1, 7) THEN TRUE ELSE FALSE END
FROM `Matches` m
ORDER BY m.`MatchDate`
""")

time_rows = cursor.rowcount
print(f"Loaded {time_rows} time dimension rows")

# Update with season information
cursor.execute("""
UPDATE `dim_Time` t
JOIN `Seasons` s ON t.`Date` BETWEEN s.`StartDate` AND s.`EndDate`
SET t.`Season` = s.`SeasonName`
WHERE t.`Season` IS NULL
""")

season_updates = cursor.rowcount
print(f"Updated {season_updates} time dimension rows with season info")

# 2. Load match facts
print("Loading match facts...")
cursor.execute("""
INSERT INTO `fact_MatchResult` (
    `MatchID`, `TimeID`, `HomeTeamID`, `AwayTeamID`, `RefereeID`,
    `SeasonID`, `DivisionID`, `HomeGoals`, `AwayGoals`, `Result`,
    `HalfTimeHomeGoals`, `HalfTimeAwayGoals`, `HalfTimeResult`
)
SELECT 
    m.`MatchID`,
    dt.`TimeID`,
    m.`HomeTeamID`,
    m.`AwayTeamID`,
    m.`RefereeID`,
    m.`SeasonID`,
    m.`DivisionID`,
    m.`FTHG`,
    m.`FTAG`,
    m.`FTR`,
    m.`HTHG`,
    m.`HTAG`,
    m.`HTR`
FROM `Matches` m
JOIN `dim_Time` dt ON m.`MatchDate` = dt.`Date`
LEFT JOIN `fact_MatchResult` fmr ON m.`MatchID` = fmr.`MatchID`
WHERE fmr.`MatchID` IS NULL
""")

match_rows = cursor.rowcount
print(f"Loaded {match_rows} match fact rows")

# 3. Load team match stats
print("Loading team match stats...")
# Home team stats
cursor.execute("""
INSERT INTO `fact_TeamMatchStats` (
    `MatchID`, `TeamID`, `TimeID`, `SeasonID`,
    `IsHomeTeam`, `OpponentID`, `Goals`, `GoalsConceded`,
    `Shots`, `ShotsOnTarget`, `Corners`, `Fouls`, 
    `YellowCards`, `RedCards`, `Result`, `Points`
)
SELECT 
    m.`MatchID`,
    m.`HomeTeamID`,
    dt.`TimeID`,
    m.`SeasonID`,
    TRUE,
    m.`AwayTeamID`,
    m.`FTHG`,
    m.`FTAG`,
    ms.`HomeShots`,
    ms.`HomeShotsTarget`,
    ms.`HomeCorners`,
    ms.`HomeFouls`,
    ms.`HomeYellowCards`,
    ms.`HomeRedCards`,
    CASE 
        WHEN m.`FTR` = 'H' THEN 'W'
        WHEN m.`FTR` = 'D' THEN 'D'
        ELSE 'L'
    END,
    CASE 
        WHEN m.`FTR` = 'H' THEN 3
        WHEN m.`FTR` = 'D' THEN 1
        ELSE 0
    END
FROM `Matches` m
JOIN `dim_Time` dt ON m.`MatchDate` = dt.`Date`
JOIN `MatchStatistics` ms ON m.`MatchID` = ms.`MatchID`
LEFT JOIN `fact_TeamMatchStats` tms 
    ON m.`MatchID` = tms.`MatchID` AND m.`HomeTeamID` = tms.`TeamID`
WHERE tms.`TeamStatsID` IS NULL
""")

home_stats = cursor.rowcount
print(f"Loaded {home_stats} home team stats rows")

# Away team stats
cursor.execute("""
INSERT INTO `fact_TeamMatchStats` (
    `MatchID`, `TeamID`, `TimeID`, `SeasonID`,
    `IsHomeTeam`, `OpponentID`, `Goals`, `GoalsConceded`,
    `Shots`, `ShotsOnTarget`, `Corners`, `Fouls`, 
    `YellowCards`, `RedCards`, `Result`, `Points`
)
SELECT 
    m.`MatchID`,
    m.`AwayTeamID`,
    dt.`TimeID`,
    m.`SeasonID`,
    FALSE,
    m.`HomeTeamID`,
    m.`FTAG`,
    m.`FTHG`,
    ms.`AwayShots`,
    ms.`AwayShotsTarget`,
    ms.`AwayCorners`,
    ms.`AwayFouls`,
    ms.`AwayYellowCards`,
    ms.`AwayRedCards`,
    CASE 
        WHEN m.`FTR` = 'A' THEN 'W'
        WHEN m.`FTR` = 'D' THEN 'D'
        ELSE 'L'
    END,
    CASE 
        WHEN m.`FTR` = 'A' THEN 3
        WHEN m.`FTR` = 'D' THEN 1
        ELSE 0
    END
FROM `Matches` m
JOIN `dim_Time` dt ON m.`MatchDate` = dt.`Date`
JOIN `MatchStatistics` ms ON m.`MatchID` = ms.`MatchID`
LEFT JOIN `fact_TeamMatchStats` tms 
    ON m.`MatchID` = tms.`MatchID` AND m.`AwayTeamID` = tms.`TeamID`
WHERE tms.`TeamStatsID` IS NULL
""")

away_stats = cursor.rowcount
print(f"Loaded {away_stats} away team stats rows")


# After loading fact_MatchResult data, add this code to create a league snapshot

def create_initial_league_snapshot():
    """Create an initial league table snapshot from match data"""
    conn = db_connection(config_file='premier_league_analytics.ini')
    cursor = conn.cursor()
    
    try:
        # Find season with ID 6 (2023-24 season)
        cursor.execute("SELECT SeasonID, SeasonName FROM Seasons WHERE SeasonID = 6")
        season_result = cursor.fetchone()
        
        if not season_result:
            print("Season ID 6 not found!")
            return False
            
        season_id = season_result[0]
        season_name = season_result[1]
        print(f"Creating league snapshot for {season_name} (ID: {season_id})")
        
        # Get Premier League division
        cursor.execute("SELECT DivisionID FROM Divisions WHERE DivisionCode = 'E0'")
        division_result = cursor.fetchone()
        
        if not division_result:
            # Use first division as fallback
            cursor.execute("SELECT DivisionID FROM Divisions LIMIT 1")
            division_result = cursor.fetchone()
            
        if not division_result:
            print("No divisions found in database")
            return False
            
        division_id = division_result[0]
        
        # Get most recent date in dim_Time
        cursor.execute("SELECT MAX(TimeID) FROM dim_Time")
        time_id = cursor.fetchone()[0]
        
        # Create the league snapshot
        cursor.execute("""
        INSERT INTO fact_LeagueSnapshot (
            SeasonID, DivisionID, TimeID, TeamID, Position,
            MatchesPlayed, Won, Drawn, Lost, GoalsFor, GoalsAgainst,
            Points, Form
        )
        WITH TeamStats AS (
            SELECT 
                t.TeamID,
                t.TeamName,
                COUNT(DISTINCT CASE WHEN mr.HomeTeamID = t.TeamID OR mr.AwayTeamID = t.TeamID 
                                  THEN mr.MatchID END) AS MatchesPlayed,
                SUM(CASE WHEN (mr.HomeTeamID = t.TeamID AND mr.Result = 'H') OR
                               (mr.AwayTeamID = t.TeamID AND mr.Result = 'A') 
                          THEN 1 ELSE 0 END) AS Won,
                SUM(CASE WHEN mr.Result = 'D' AND 
                               (mr.HomeTeamID = t.TeamID OR mr.AwayTeamID = t.TeamID)
                          THEN 1 ELSE 0 END) AS Drawn,
                SUM(CASE WHEN (mr.HomeTeamID = t.TeamID AND mr.Result = 'A') OR
                               (mr.AwayTeamID = t.TeamID AND mr.Result = 'H') 
                          THEN 1 ELSE 0 END) AS Lost,
                SUM(CASE WHEN mr.HomeTeamID = t.TeamID THEN mr.HomeGoals
                          WHEN mr.AwayTeamID = t.TeamID THEN mr.AwayGoals
                          ELSE 0 END) AS GoalsFor,
                SUM(CASE WHEN mr.HomeTeamID = t.TeamID THEN mr.AwayGoals
                          WHEN mr.AwayTeamID = t.TeamID THEN mr.HomeGoals
                          ELSE 0 END) AS GoalsAgainst,
                SUM(CASE WHEN (mr.HomeTeamID = t.TeamID AND mr.Result = 'H') THEN 3
                          WHEN (mr.AwayTeamID = t.TeamID AND mr.Result = 'A') THEN 3
                          WHEN mr.Result = 'D' AND (mr.HomeTeamID = t.TeamID OR mr.AwayTeamID = t.TeamID) THEN 1
                          ELSE 0 END) AS Points
            FROM Teams t
            JOIN fact_MatchResult mr ON t.TeamID = mr.HomeTeamID OR t.TeamID = mr.AwayTeamID
            WHERE mr.SeasonID = %s
            GROUP BY t.TeamID, t.TeamName
            HAVING MatchesPlayed > 0
        ),
        RankedTeams AS (
            SELECT
                *,
                ROW_NUMBER() OVER (
                    ORDER BY Points DESC, 
                            (GoalsFor - GoalsAgainst) DESC, 
                            GoalsFor DESC
                ) AS Position
            FROM TeamStats
        )
        SELECT 
            %s AS SeasonID,
            %s AS DivisionID,
            %s AS TimeID,
            TeamID,
            Position,
            MatchesPlayed,
            Won,
            Drawn,
            Lost,
            GoalsFor,
            GoalsAgainst,
            Points,
            '' AS Form -- Empty for initial load
        FROM RankedTeams
        """, (season_id, season_id, division_id, time_id))
        
        rows_affected = cursor.rowcount
        print(f"Created league table snapshot with {rows_affected} team positions")
        
        # Commit changes
        conn.commit()
        return True
        
    except Exception as e:
        print(f"Error creating league snapshot: {e}")
        if conn:
            conn.rollback()
        return False
        
    finally:
        cursor.close()
        conn.close()

# Create the initial league snapshot
create_initial_league_snapshot()


# 4. Create league snapshot
print("Creating league table snapshot...")
# Get the time ID for today
cursor.execute("SELECT TimeID FROM dim_Time WHERE Date = CURRENT_DATE")
today_time_id = cursor.fetchone()

if not today_time_id:
    # Get the most recent date
    cursor.execute("SELECT MAX(TimeID) FROM dim_Time")
    today_time_id = cursor.fetchone()

today_time_id = today_time_id[0]

# Get active seasons
cursor.execute("""
SELECT SeasonID 
FROM Seasons 
WHERE CURRENT_DATE BETWEEN StartDate AND EndDate
ORDER BY StartDate DESC
""")

active_season = cursor.fetchone()
if active_season:
    active_season_id = active_season[0]
else:
    # Use most recent season
    cursor.execute("SELECT SeasonID FROM Seasons ORDER BY EndDate DESC LIMIT 1")
    active_season_id = cursor.fetchone()[0]

# Get Premier League division
cursor.execute("SELECT DivisionID FROM Divisions WHERE DivisionCode = 'E0'")
division_result = cursor.fetchone()
if division_result:
    division_id = division_result[0]
else:
    # Use first division as fallback
    cursor.execute("SELECT DivisionID FROM Divisions LIMIT 1")
    division_id = cursor.fetchone()[0]

# Create snapshot
cursor.execute("""
INSERT INTO fact_LeagueSnapshot (
    SeasonID, DivisionID, TimeID, TeamID, Position,
    MatchesPlayed, Won, Drawn, Lost, GoalsFor, GoalsAgainst,
    Points, Form
)
WITH TeamStandings AS (
    SELECT
        tms.TeamID,
        COUNT(DISTINCT tms.MatchID) AS MatchesPlayed,
        SUM(CASE WHEN tms.Result = 'W' THEN 1 ELSE 0 END) AS Won,
        SUM(CASE WHEN tms.Result = 'D' THEN 1 ELSE 0 END) AS Drawn,
        SUM(CASE WHEN tms.Result = 'L' THEN 1 ELSE 0 END) AS Lost,
        SUM(tms.Goals) AS GoalsFor,
        SUM(tms.GoalsConceded) AS GoalsAgainst,
        SUM(tms.Points) AS Points,
        GROUP_CONCAT(
            SUBSTRING(tms.Result, 1, 1) 
            ORDER BY fmr.TimeID DESC 
            SEPARATOR ''
        ) AS FormString
    FROM fact_TeamMatchStats tms
    JOIN fact_MatchResult fmr ON tms.MatchID = fmr.MatchID
    WHERE tms.SeasonID = %s
    GROUP BY tms.TeamID
),
RankedTeams AS (
    SELECT
        ts.*,
        ROW_NUMBER() OVER (
            ORDER BY ts.Points DESC, 
                    (ts.GoalsFor - ts.GoalsAgainst) DESC, 
                    ts.GoalsFor DESC
        ) AS Position
    FROM TeamStandings ts
)
SELECT 
    %s,
    %s,
    %s,
    rt.TeamID,
    rt.Position,
    rt.MatchesPlayed,
    rt.Won,
    rt.Drawn,
    rt.Lost,
    rt.GoalsFor,
    rt.GoalsAgainst,
    rt.Points,
    SUBSTRING(rt.FormString, 1, 5)
FROM RankedTeams rt
""", (active_season_id, active_season_id, division_id, today_time_id))

snapshot_rows = cursor.rowcount
print(f"Created league table snapshot with {snapshot_rows} team positions")

conn.commit()
print("Data loading complete!")

# Display loaded data summary
analytical_summary = df_query(conn, """
SELECT 
    (SELECT COUNT(*) FROM dim_Time) AS TimeDimCount,
    (SELECT COUNT(*) FROM fact_MatchResult) AS MatchFactCount,
    (SELECT COUNT(*) FROM fact_TeamMatchStats) AS TeamStatsFactCount,
    (SELECT COUNT(*) FROM fact_LeagueSnapshot) AS LeagueSnapshotCount
""")

print("\nAnalytical Database Summary:")
display(analytical_summary)

# Display current league table
league_table = df_query(conn, "SELECT * FROM premier_league_analytics.vw_LeagueTable")
print("\nCurrent League Table:")
display(league_table)

# Close connection
cursor.close()
conn.close()

Loading Time dimension...
Loaded 117 time dimension rows
Updated 117 time dimension rows with season info
Loading match facts...
Loaded 0 match fact rows
Loading team match stats...
Loaded 0 home team stats rows
Loaded 0 away team stats rows
Creating league snapshot for 2223 (ID: 6)
Error creating league snapshot: 1205 (HY000): Lock wait timeout exceeded; try restarting transaction
Creating league table snapshot...
Created league table snapshot with 0 team positions
Data loading complete!

Analytical Database Summary:


Unnamed: 0,TimeDimCount,MatchFactCount,TeamStatsFactCount,LeagueSnapshotCount
0,234,380,760,20



Current League Table:


Unnamed: 0,SeasonName,LeagueName,TeamName,Position,MatchesPlayed,Won,Drawn,Lost,GoalsFor,GoalsAgainst,GoalDifference,Points,PointsPerGame,Form
0,2223,English Premier League,Man City,1,38,28,5,5,94,33,61,89,2.34,
1,2223,English Premier League,Arsenal,2,38,26,6,6,88,43,45,84,2.21,
2,2223,English Premier League,Man United,3,38,23,6,9,58,43,15,75,1.97,
3,2223,English Premier League,Newcastle,4,38,19,14,5,68,33,35,71,1.87,
4,2223,English Premier League,Liverpool,5,38,19,10,9,75,47,28,67,1.76,
5,2223,English Premier League,Brighton,6,38,18,8,12,72,53,19,62,1.63,
6,2223,English Premier League,Aston Villa,7,38,18,7,13,51,46,5,61,1.61,
7,2223,English Premier League,Tottenham,8,38,18,6,14,70,63,7,60,1.58,
8,2223,English Premier League,Brentford,9,38,15,14,9,58,46,12,59,1.55,
9,2223,English Premier League,Fulham,10,38,15,7,16,55,53,2,52,1.37,
