In [1]:
import os
import sys
# Add the parent directory to the module search path (presumably so that the
# project-local duck_db_helper module can be imported — TODO confirm).
sys.path.append('../')
# Make the parent directory the working directory; every relative path used
# later in this notebook is resolved from there.
os.chdir('../')

In [2]:
import pandas as pd
import duck_db_helper
from duck_db_helper import get_table_df
import duckdb
import matplotlib.pyplot as plt

In [3]:
# Do not cap the number of displayed columns and widen the console output.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 500)

# Read both tables from the analytical sandbox. The connection is closed in a
# `finally` clause so it is released even when one of the reads raises.
conn = duckdb.connect('../data/analytical_sandboxes/analytical_sandbox_zone.db')
try:
    match_df = get_table_df("matches", conn)
    player_df = get_table_df("players", conn)
finally:
    conn.close()

# Data Quality

For our data quality process, we have chosen to utilize the outputs from the analytical sandboxes. This decision is based on the fact that the data in these sandboxes is in its raw form, without any encoding or feature generation. By evaluating the data quality at this stage, we can ensure its integrity before proceeding to further data manipulations. If the data exhibits poor quality at this initial stage, it is likely to maintain the same level of quality in subsequent steps.

## 1. Understanding and Identifying Data Quality Issues

### 1.1. Data Profiling

First, we list all the abbreviations used in the datasets and their meanings.

#### Matches

1. Match data
    * **Date**: The date on which the match was played.
    * **HomeTeam**: Team that plays the match on its own field.
    * **AwayTeam**: Visiting team, i.e. the team that plays the match away from its own field.
    * **FTHG**: HomeTeam goals.
    * **FTAG**: AwayTeam goals.
    * **FTR**: Result of the match, H means HomeTeam won, D means Draw and A means AwayTeam won. 
2. Betting Statistics
    * **AvgH**: Average betting odds for a HomeTeam win.
    * **AvgD**: Average betting odds for a draw.
    * **AvgA**: Average betting odds for an AwayTeam win.
3. Weather
    * **PRESS**: Atmospheric pressure, in hectopascals.
    * **WDIR**: Wind direction, given as a compass point (e.g. N, SW, NNE).
    * **WSPD**: Wind speed in miles per hour.
    * **CLOUD**: Cloud coverage in scale from 0 to 8.
    * **TEMP**: Temperature in Celsius.
    * **TDEW**: Dew point temperature in Celsius.
    
#### Players

1. Match specific information:
    * **match_date**: The date on which the match was played.
    * **team_x**: The team being analyzed in the fantasy context.
    * **opp_team_name**: The opposing team's name.
    * **was_home**: Indicates whether team_x played at its home ground (1) or as the away team (0).
2. Basic player information
    * **name**: The name of the player.
    * **position**: The playing position of the player (e.g., forward, midfielder, defender, goalkeeper).
3. Statistics
    * **assists**: Number of assists made by the player.
    * **bonus**: Bonus points earned by the player based on performance.
    * **bps**: Bonus Points System - a tally used to award additional points based on player performance.
    * **clean_sheets**: Indicates whether the player's team did not concede any goals while the player was on the field (applicable to defenders and goalkeepers).
    * **creativity**: A metric that reflects the player's ability to create goal-scoring opportunities.
    * **element**: An identifier for the player in the fantasy game.
    * **goals_conceded**: Number of goals conceded while the player was on the field (usually relevant for defenders and goalkeepers).
    * **goals_scored**: Number of goals scored by the player.
    * **ict_index**: Index combining Influence, Creativity, and Threat metrics to gauge a player's overall performance.
    * **influence**: A measure of a player's influence on the game.
    * **minutes**: Minutes played by the player in the match.
    * **own_goals**: Number of own goals scored by the player.
    * **penalties_missed**: Number of penalties missed by the player.
    * **penalties_saved**: Number of penalties saved by the player (relevant for goalkeepers).
    * **yellow_cards**: Number of yellow cards received by the player.
    * **red_cards**: Number of red cards received by the player.
    * **round**: The matchday or round of the fantasy league.
    * **saves**: Number of saves made by the player (relevant for goalkeepers).
    * **selected**: Indicates how often the player is chosen in fantasy teams.
    * **threat**: A metric indicating the player's potential for scoring.
    * **total_points**: Total fantasy points accumulated by the player.
    * **value**: The value of the player in the fantasy league, typically related to their performance and popularity.

In [4]:
# Plot one histogram per remaining player column; the match date and the
# home/away flag are dropped before plotting.
player_df.drop(columns=['match_date', 'was_home']).hist(
    bins=15,
    figsize=(15, 10),
)
plt.suptitle("Player Statistics Data Distributions")
plt.show()

In [5]:
# Plot one histogram per remaining match column; the match date is dropped
# before plotting.
match_df.drop(columns=['Date']).hist(
    bins=15,
    figsize=(15, 10),
)
plt.suptitle("Match Information Data Distributions")
plt.show()

Upon initial manual inspection and profiling of the data, no anomalies were observed.

### 1.2. Data Quality Dimensions
#### 1.2.1. Completeness

In ensuring completeness, we carefully verify that every necessary detail is present in both player performance and match information datasets. This includes checking for missing values in key fields such as 'goals_scored' and 'match results', ensuring our data is comprehensive and ready for analysis.

In [6]:
# Completeness check: count the missing entries of every column.
print("Missing Values in Player Performance Dataset:")
print(player_df.isna().sum())

print("\nMissing Values in Match Information Dataset:")
print(match_df.isna().sum())

#### 1.2.2. Uniqueness

To maintain uniqueness, we scrutinize our datasets for duplicate entries. In the player performance dataset, we ensure each player's data is recorded only once per match. Similarly, in the match information dataset, we confirm that each match is uniquely represented, preventing data redundancy.

In [7]:
# Uniqueness check: show rows that repeat an earlier row in every column.
print("\nDuplicate Rows in Player Performance Dataset:")
print(player_df.loc[player_df.duplicated()])

print("\nDuplicate Rows in Match Information Dataset:")
print(match_df.loc[match_df.duplicated()])

#### 1.2.3. Consistency

In [8]:
# Consistency check: list the distinct labels so that spelling variants of
# the same position or team can be spotted by eye.
print("Unique values in 'position' column in Player Dataset:")
print(player_df.loc[:, 'position'].unique())

print("\nUnique values in 'team_x' column in Player Dataset:")
print(player_df.loc[:, 'team_x'].unique())

## 2. Denial Constraints

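We first create a class that runs every denial constraint against a DataFrame and, for each violating row, interactively asks for corrected values.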

In [9]:
class DenialConstraintsChecker:
    """Run denial constraints against a DataFrame and prompt for corrections.

    Each denial constraint is a callable that receives the DataFrame and
    returns a pair ``(invalid_rows, relevant_columns)``: the index labels of
    the violating rows and the columns offered for correction.

    Args:
        denial_constraints: Iterable of constraint callables.
        df: DataFrame to check; corrections are written into it in place.
        key_columns: Columns shown in the prompt to identify a row.
        input_fn: Callable taking the prompt text and returning the reply.
            Defaults to the built-in ``input`` (resolved when prompting).
    """

    def __init__(self, denial_constraints, df, key_columns, input_fn=None):
        self.dc_list = denial_constraints
        self.df = df
        self.key_columns = key_columns
        self.input_fn = input_fn

    def check_denial_constraint(self, dc):
        """Evaluate one constraint and prompt for fixes if it is violated."""
        invalid_rows, relevant_columns = dc(self.df)
        if len(invalid_rows) > 0:
            self.correct_rows(invalid_rows, relevant_columns)

    def check_denial_constraints(self):
        """Evaluate every registered constraint in order."""
        for dc in self.dc_list:
            self.check_denial_constraint(dc)
        print('FINISH: All denial constraints checked.')

    def _coerce_to_column_dtype(self, col, raw_value):
        """Convert the typed reply to the dtype of column ``col``.

        Raises:
            ValueError: If the reply cannot be parsed for that dtype.
        """
        dtype = self.df[col].dtype
        # Booleans are tested first because pandas also reports them as numeric.
        if pd.api.types.is_bool_dtype(dtype):
            lowered = raw_value.strip().lower()
            if lowered in ('true', '1'):
                return True
            if lowered in ('false', '0'):
                return False
            raise ValueError(f"not a boolean: {raw_value!r}")
        if pd.api.types.is_integer_dtype(dtype):
            return int(raw_value)
        if pd.api.types.is_float_dtype(dtype):
            return float(raw_value)
        if pd.api.types.is_datetime64_any_dtype(dtype):
            return pd.Timestamp(raw_value)
        # Object/string columns keep the reply verbatim.
        return raw_value

    def correct_rows(self, invalid_rows, relevant_columns):
        """Ask for a new value for every relevant column of every invalid row.

        An empty reply keeps the current value. A reply that cannot be
        converted to the column dtype is reported and ignored, so a numeric
        column never ends up holding the raw reply string.
        """
        ask = self.input_fn if self.input_fn is not None else input
        for index in invalid_rows:
            key_values = ", ".join(f"{key}: {self.df.at[index, key]}" for key in self.key_columns)
            for col in relevant_columns:
                current_value = self.df.at[index, col]
                user_input = ask(f"Input new value for {col} ({key_values}) (Current: {current_value}, Enter for no change): ")
                if not user_input:
                    continue
                try:
                    new_value = self._coerce_to_column_dtype(col, user_input)
                except ValueError:
                    print(f"Ignored {user_input!r}: not a valid value for column {col} ({self.df[col].dtype}).")
                    continue
                self.df.at[index, col] = new_value


### 2.1. Denial Constraints for Matches

In [10]:
def check_unique_matches(match_df):
    """Flag matches that repeat an earlier (Date, HomeTeam, AwayTeam) triple.

    Returns:
        A pair ``(row labels, columns)``. Only the repeated occurrences are
        reported, not the first one; both lists are empty when every match
        is unique.
    """
    match_key = ['Date', 'HomeTeam', 'AwayTeam']
    repeated_mask = match_df.duplicated(subset=match_key)
    if not repeated_mask.any():
        return [], []
    print("DC1 Violation: Duplicate match entries found.")
    return match_df.loc[repeated_mask].index.tolist(), match_key

def check_non_negative_goals(match_df):
    """Flag matches where the home or away goal count is below zero.

    Returns:
        A pair ``(row labels, ['FTHG', 'FTAG'])``, or two empty lists when
        no goal count is negative.
    """
    home_negative = match_df['FTHG'] < 0
    away_negative = match_df['FTAG'] < 0
    offenders = match_df.loc[home_negative | away_negative]
    if offenders.empty:
        return [], []
    print("DC2 Violation: Negative goal values found.")
    return offenders.index.tolist(), ['FTHG', 'FTAG']

def check_match_result_logic(match_df):
    """Flag matches whose FTR label disagrees with the goals scored.

    A home win must be labelled 'H', an away win 'A' and a tie 'D'.

    Returns:
        A pair ``(row labels, ['FTHG', 'FTAG', 'FTR'])``, or two empty lists
        when every label matches the score.
    """
    home_goals = match_df['FTHG']
    away_goals = match_df['FTAG']
    outcome = match_df['FTR']

    home_win_mislabelled = (home_goals > away_goals) & (outcome != 'H')
    away_win_mislabelled = (home_goals < away_goals) & (outcome != 'A')
    draw_mislabelled = (home_goals == away_goals) & (outcome != 'D')
    inconsistent = home_win_mislabelled | away_win_mislabelled | draw_mislabelled

    if not inconsistent.any():
        return [], []
    print("DC3, DC4, DC5 Violation: Inconsistent match results found.")
    return match_df[inconsistent].index.tolist(), ['FTHG', 'FTAG', 'FTR']

def check_betting_odds_validity(match_df):
    """Flag matches where any of the three average odds is zero or negative.

    Returns:
        A pair ``(row labels, ['AvgH', 'AvgD', 'AvgA'])``, or two empty
        lists when all odds are positive. Missing odds are not flagged.
    """
    odds_columns = ['AvgH', 'AvgD', 'AvgA']
    non_positive = (match_df[odds_columns] <= 0).any(axis=1)
    if not non_positive.any():
        return [], []
    print("DC6 Violation: Invalid betting odds found.")
    return match_df.loc[non_positive].index.tolist(), odds_columns

def check_weather_data_integrity(match_df):
    """Flag matches whose pressure or temperature is outside the accepted range.

    Pressure must lie within [800, 1080] and temperature within [-50, 60].

    Returns:
        A pair ``(row labels, ['PRESS', 'TEMP'])``, or two empty lists when
        every reading is within range. Missing readings are not flagged.
    """
    pressure = match_df['PRESS']
    temperature = match_df['TEMP']
    pressure_out_of_range = (pressure < 800) | (pressure > 1080)
    temperature_out_of_range = (temperature < -50) | (temperature > 60)

    offenders = match_df.loc[pressure_out_of_range | temperature_out_of_range]
    if offenders.empty:
        return [], []
    print("DC7, DC8 Violation: Weather data out of expected range found.")
    return offenders.index.tolist(), ['PRESS', 'TEMP']

def check_wind_direction_consistency(match_df):
    """Flag matches whose WDIR is not one of the 16 compass points.

    Missing directions are flagged as well, because they are not members of
    the accepted set. The offending rows are printed with their teams and date.

    Returns:
        A pair ``(row labels, ['WDIR'])``, or two empty lists when every
        direction is accepted.
    """
    compass_points = {
        'N', 'NNE', 'NE', 'ENE',
        'E', 'ESE', 'SE', 'SSE',
        'S', 'SSW', 'SW', 'WSW',
        'W', 'WNW', 'NW', 'NNW',
    }
    is_accepted = match_df['WDIR'].isin(compass_points)
    offenders = match_df.loc[~is_accepted]
    if offenders.empty:
        return [], []
    print("DC9 Violation: Invalid wind directions found.")
    print(offenders[['HomeTeam','AwayTeam', 'Date','WDIR']])
    return offenders.index.tolist(), ['WDIR']

### 2.2. Denial Constraints for Players

In [11]:
def check_position_consistency(player_df):
    """Flag every row of a player who appears with more than one position.

    Players are grouped by name; all rows of a conflicting name are printed
    and returned, not only the rows with the minority position.

    Returns:
        A pair ``(row labels, ['position'])``, or two empty lists when each
        name has a single position.
    """
    positions_by_player = player_df.groupby('name')['position'].unique()
    distinct_counts = positions_by_player.apply(len)
    conflicted_names = positions_by_player[distinct_counts > 1].index
    if conflicted_names.empty:
        return [], []

    affected_rows = player_df[player_df['name'].isin(conflicted_names)]
    print("DC1 Violation: Players with inconsistency in positions:")
    print(affected_rows[['name', 'position']])
    return affected_rows.index.tolist(), ['position']

def check_max_points(player_df, max_points=20):
    """Flag player rows whose total_points exceeds ``max_points``.

    Args:
        player_df: Player performance rows.
        max_points: Highest total still considered plausible (inclusive).

    Returns:
        A pair ``(row labels, ['total_points'])``, or two empty lists when
        no row exceeds the cap.
    """
    exceeds_cap = player_df['total_points'] > max_points
    if not exceeds_cap.any():
        return [], []
    flagged = player_df.loc[exceeds_cap]
    print("DC2 Violation: Players with unrealistically high total points found")
    print(flagged[['name', 'match_date', 'total_points']])
    return flagged.index.tolist(), ['total_points']

def check_double_red_card(player_df):
    """Flag player rows that record more than one red card in a match.

    Returns:
        A pair ``(row labels, ['red_cards'])``, or two empty lists when no
        row has more than one red card.
    """
    too_many_reds = player_df['red_cards'] > 1
    if not too_many_reds.any():
        return [], []
    flagged = player_df.loc[too_many_reds]
    print("DC3 Violation: Players with illegal red cards found")
    print(flagged[['name', 'match_date']])
    return flagged.index.tolist(), ['red_cards']

def check_goalkeepers_goals(player_df):
    """Flag rows where a player in position 'GK' has scored at least one goal.

    Returns:
        A pair ``(row labels, ['goals_scored'])``, or two empty lists when
        no goalkeeper row has goals.
    """
    is_goalkeeper = player_df['position'].eq('GK')
    has_scored = player_df['goals_scored'].gt(0)
    flagged = player_df.loc[is_goalkeeper & has_scored]
    if flagged.empty:
        return [], []
    print("DC4 Violation: Goalkeepers with goals found")
    print(flagged[['name', 'match_date', 'goals_scored']])
    return flagged.index.tolist(), ['goals_scored']

def check_unique_player_identification(player_df):
    """Flag rows sharing the same (name, match_date, team_x) combination.

    Every member of a duplicated group is reported, including the first
    occurrence.

    Returns:
        A pair ``(row labels, ['name', 'match_date', 'team_x'])``, or two
        empty lists when each combination occurs once.
    """
    identity_columns = ['name', 'match_date', 'team_x']
    shares_identity = player_df.duplicated(subset=identity_columns, keep=False)
    if not shares_identity.any():
        return [], []
    flagged = player_df.loc[shares_identity]
    print("DC5 Violation: Duplicate player records in a single match from the same team found")
    print(flagged[identity_columns])
    return flagged.index.tolist(), identity_columns

def check_player_play_time(player_df):
    """Flag player rows that record more than 90 minutes in a match.

    Returns:
        A pair ``(row labels, ['minutes'])``, or two empty lists when no row
        exceeds 90 minutes.
    """
    over_ninety = player_df['minutes'] > 90
    if not over_ninety.any():
        return [], []
    flagged = player_df.loc[over_ninety]
    print("DC6 Violation: Players who played more than 90 minutes:")
    print(flagged[['name', 'match_date', 'minutes']])
    return flagged.index.tolist(), ['minutes']

def check_player_metrics(player_df):
    """Flag player rows with a negative creativity, threat or influence value.

    Returns:
        A pair ``(row labels, ['creativity', 'threat', 'influence'])``, or
        two empty lists when all three metrics are non-negative.
    """
    metric_columns = ['creativity', 'threat', 'influence']
    has_negative_metric = (player_df[metric_columns] < 0).any(axis=1)
    if not has_negative_metric.any():
        return [], []
    flagged = player_df.loc[has_negative_metric]
    print("DC7 Violation: Negative values in creativity, threat, or influence metrics.")
    print(flagged[['name', 'match_date']])
    return flagged.index.tolist(), metric_columns

### 2.3. Denial Constraints for both

In [12]:
def check_player_goals_vs_team_goals(joined_df):
    """Flag player rows that score more goals than their own team did.

    The team's goal count is FTHG when ``was_home`` equals 1 and FTAG
    otherwise. It is computed column-wise into a temporary Series, so the
    caller's frame is not modified and an empty frame is handled.

    Args:
        joined_df: Player rows joined with their match row; must contain
            'was_home', 'FTHG', 'FTAG' and 'goals_scored'.

    Returns:
        A pair ``(row labels, ['goals_scored'])``; the label list is empty
        when there is no violation.
    """
    played_at_home = joined_df['was_home'] == 1
    # Keep FTHG where the player was at home, otherwise fall back to FTAG.
    team_goals = joined_df['FTHG'].where(played_at_home, joined_df['FTAG'])
    violations = joined_df[joined_df['goals_scored'] > team_goals].index.tolist()
    if violations:
        print("DC1 Violation: Players scoring more goals than their team in a match.")
    return violations, ['goals_scored']

def check_player_home_status_consistency(joined_df):
    """Flag rows whose ``was_home`` flag disagrees with the match teams.

    A row is a violation when it is marked home (``was_home == 1``) but its
    team is not the HomeTeam, or marked away (``was_home == 0``) but its
    team is not the AwayTeam. Both directions are checked; a home-only check
    would let a wrongly marked away row pass.

    Args:
        joined_df: Player rows joined with their match row; must contain
            'was_home', 'team_x', 'HomeTeam' and 'AwayTeam'.

    Returns:
        A pair ``(row labels, ['was_home', 'team_x', 'HomeTeam'])``; the
        label list is empty when there is no violation.
    """
    team = joined_df['team_x']
    wrongly_home = (joined_df['was_home'] == 1) & (team != joined_df['HomeTeam'])
    wrongly_away = (joined_df['was_home'] == 0) & (team != joined_df['AwayTeam'])
    violations = joined_df[wrongly_home | wrongly_away].index.tolist()
    if violations:
        print("DC2 Violation: Inconsistency in 'was_home' status.")
    return violations, ['was_home','team_x', 'HomeTeam']

def check_player_team_consistency(joined_df):
    """Flag rows whose team is neither the HomeTeam nor the AwayTeam.

    Returns:
        A pair ``(row labels, ['team_x', 'HomeTeam', 'AwayTeam'])``; the
        label list is empty when there is no violation.
    """
    team = joined_df['team_x']
    not_home_side = team.ne(joined_df['HomeTeam'])
    not_away_side = team.ne(joined_df['AwayTeam'])
    violations = joined_df.loc[not_home_side & not_away_side].index.tolist()
    if violations:
        print("DC3 Violation: Inconsistency in team names for players.")
    return violations, ['team_x','HomeTeam','AwayTeam']

### 2.4 Denial Constraints checking


In [13]:
# Convert the listed player statistic columns to numeric dtypes; entries that
# cannot be parsed become NaN (errors='coerce').
for stat_column in (
    'goals_scored',
    'assists',
    'red_cards',
    'total_points',
    'minutes',
    'creativity',
    'threat',
    'influence',
):
    player_df[stat_column] = pd.to_numeric(player_df[stat_column], errors='coerce')


# Constraint registries. Each entry is a function that takes a DataFrame and
# returns (violating row labels, columns offered for correction).
matches_denial_constraints = [
    check_unique_matches,
    check_non_negative_goals,
    check_match_result_logic,
    check_betting_odds_validity,
    check_weather_data_integrity,
    check_wind_direction_consistency,
]

players_denial_constraints = [
    check_position_consistency,
    check_max_points,
    check_double_red_card,
    check_goalkeepers_goals,
    check_unique_player_identification,
    check_player_play_time,
    check_player_metrics,
]

joined_denial_constraints = [
    check_player_goals_vs_team_goals,
    check_player_home_status_consistency,
    check_player_team_consistency,
]

# MATCHES DF
match_dc = DenialConstraintsChecker(
    denial_constraints=matches_denial_constraints,
    df=match_df,
    key_columns=['HomeTeam', 'AwayTeam', 'Date'],
)
match_dc.check_denial_constraints()

# PLAYER DF
player_dc = DenialConstraintsChecker(
    denial_constraints=players_denial_constraints,
    df=player_df,
    key_columns=['name', 'match_date'],
)
player_dc.check_denial_constraints()

# BOTH DF
# Attach the match row to every player row twice, matching the player's team
# first against the home side and then against the away side, and stack both
# results into one frame with a fresh index.
df_home, df_away = (
    pd.merge(
        player_df,
        match_df,
        left_on=['team_x', 'match_date'],
        right_on=[team_column, 'Date'],
        how='inner',
    )
    for team_column in ('HomeTeam', 'AwayTeam')
)
joined_df = pd.concat([df_home, df_away], ignore_index=True)

joined_dc = DenialConstraintsChecker(
    denial_constraints=joined_denial_constraints,
    df=joined_df,
    key_columns=['name', 'match_date'],
)
joined_dc.check_denial_constraints()

Input new value for WDIR (HomeTeam: Newcastle, AwayTeam: Crystal Palace, Date: 2022-03-09 00:00:00) (Current: nan, Enter for no change): 
Input new value for WDIR (HomeTeam: Fulham, AwayTeam: Chelsea, Date: 2023-12-01 00:00:00) (Current: None, Enter for no change): 
Input new value for WDIR (HomeTeam: West Ham, AwayTeam: Chelsea, Date: 2023-11-02 00:00:00) (Current: None, Enter for no change): 
Input new value for WDIR (HomeTeam: Arsenal, AwayTeam: Brentford, Date: 2023-11-02 00:00:00) (Current: None, Enter for no change): 
Input new value for WDIR (HomeTeam: Crystal Palace, AwayTeam: Brighton, Date: 2023-11-02 00:00:00) (Current: None, Enter for no change): 
Input new value for WDIR (HomeTeam: Fulham, AwayTeam: Nott'm Forest, Date: 2023-11-02 00:00:00) (Current: None, Enter for no change): 
Input new value for WDIR (HomeTeam: Leicester, AwayTeam: Spurs, Date: 2023-11-02 00:00:00) (Current: None, Enter for no change): 
Input new value for WDIR (HomeTeam: Southampton, AwayTeam: Wolves, 

In [14]:
# Undo the join: split the joined frame back into its player columns and its
# match columns, then drop the rows repeated by the merge.
# NOTE(review): joined_df comes from inner joins, so player or match rows
# without a counterpart are absent here — confirm nothing is lost before
# these frames replace the stored tables.
player_columns = joined_df.columns[joined_df.columns.isin(player_df.columns)].tolist()
match_columns = joined_df.columns[joined_df.columns.isin(match_df.columns)].tolist()

player_changes_df = joined_df.loc[:, player_columns].drop_duplicates()
match_changes_df = joined_df.loc[:, match_columns].drop_duplicates()

In [15]:
analytical_sandbox_db = '../data/analytical_sandboxes/analytical_sandbox_zone.db'

# Hand the corrected frames to duck_db_helper.create_table for the 'matches'
# and 'players' tables (presumably replacing the stored tables — TODO confirm
# against the helper). The connection is closed in a `finally` clause so it is
# released even when one of the writes raises.
conn = duckdb.connect(analytical_sandbox_db)
try:
    duck_db_helper.create_table('matches', match_changes_df, conn)
    duck_db_helper.create_table('players', player_changes_df, conn)
finally:
    conn.close()