In [None]:
import os
import sqlite3
import time
import warnings
from collections import defaultdict
from datetime import date
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns # Import seaborn
from dateutil import parser
from dateutil.relativedelta import relativedelta
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [None]:
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)

## Database connection

The data is stored in SQLite tables. In the following cells we read each table into a pandas DataFrame.

In [None]:
# Open a connection to the SQLite database file and create a cursor
# that is used for raw SQL queries in the cells below.
conn = sqlite3.connect("data/database.sqlite")
cur = conn.cursor()

In [None]:
def executeQuery(cur, query):
    """Execute ``query`` on the cursor ``cur`` and return all rows it produced."""
    cur.execute(query)
    rows = cur.fetchall()
    return rows

In [None]:
def read_table_to_df(table, connection=None):
    """Load every row and column of ``table`` into a DataFrame.

    Parameters
    ----------
    table : str
        Name of the table to read. It is concatenated into the SQL text
        unchanged, so only pass trusted table names.
    connection : optional
        Database connection handed to ``pd.read_sql_query``. When omitted,
        the module-level ``conn`` is used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        The result of ``SELECT * from <table>``.
    """
    if connection is None:
        # Fall back to the notebook-wide connection so existing calls keep working.
        connection = conn
    query = "SELECT * from " + table
    return pd.read_sql_query(query, connection)

In [None]:
# Fetch the names of all tables recorded in sqlite_master.
q_all_tables = """SELECT name FROM sqlite_master
    WHERE type='table';"""
all_tables = executeQuery(cur, q_all_tables)
# Drop the first returned entry by position.
# NOTE(review): presumably that entry is an SQLite-internal table — confirm;
# the query has no ORDER BY, so this slice relies on the returned row order.
useful_tables = all_tables[1:]
print(useful_tables)

In [None]:
# Read each database table into its own DataFrame.
df_matches = read_table_to_df("Match")
df_player = read_table_to_df("PLAYER")
df_team = read_table_to_df("Team")
df_player_attr = read_table_to_df("Player_Attributes")
df_country = read_table_to_df("Country")
df_league = read_table_to_df("League")
df_team_attr = read_table_to_df("Team_Attributes")

In [None]:
# Preview the first rows of the match table.
df_matches.head()

In [None]:
# Preview the first rows of the player table.
df_player.head()

In [None]:
# Preview the first rows of the team table.
df_team.head()

In [None]:
# Preview the first rows of the player attributes table.
df_player_attr.head()

In [None]:
# Preview the first rows of the country table.
df_country.head()

In [None]:
# Preview the first rows of the league table.
df_league.head()

In [None]:
# Preview the first rows of the team attributes table.
df_team_attr.head()

In [None]:
# Split the match table into column groups by column position.
# NOTE(review): the boundaries are positional and depend on the column order
# of the "Match" table — confirm against the schema. Columns 55-76 are not
# assigned to any of the groups below.
df_match_info = df_matches.iloc[:, :11]
df_match_player_positions = df_matches.iloc[:, 11:55]
df_match_stats = df_matches.iloc[:, 77:85]
df_match_betting = df_matches.iloc[:, 85:]
# Preview the match statistics group.
df_match_stats.head()

In [None]:
# Alternative font settings, kept for reference:
# font = {'family': 'Times New Roman', 'size': 12}
# font2 = {'family': 'Times New Roman', 'size': 14, 'weight': 'bold'}

# Text properties shared by the plotting helpers in this notebook.
# NOTE(review): presumably 'Helvetica' must be installed for matplotlib to use it — confirm.
font = {'family': 'Helvetica', 'size': 12}

In [None]:
def plot_match_bar(X, y, save_path='data/figs/match_bar_plot.pdf'):
    """Draw, save and show a bar chart of NaN percentages per column group.

    Parameters
    ----------
    X : array-like
        Bar positions, also used as the x tick labels.
    y : array-like
        Bar heights, aligned with ``X``; also used as y tick positions/labels.
    save_path : str, optional
        File the figure is written to. Defaults to the location that was
        previously hard-coded. Missing parent directories are created.

    Notes
    -----
    Text styling comes from the module-level ``font`` dict. The function
    returns nothing; the figure is displayed with ``plt.show()``.
    """
    X = np.array(X)
    y = np.array(y)

    fig, ax = plt.subplots(figsize=(8, 5), dpi=300)

    # Face and edge get the same colour, matching what Patch.set_color did
    # in the former per-bar loop.
    bar_color = '#CBCBCB'
    ax.bar(X, y, color=bar_color, edgecolor=bar_color)

    ax.set_xticks(X)
    ax.set_xticklabels(X, fontdict=font)

    # Ticks sit exactly at the bar heights so each value can be read off the axis.
    ax.set_yticks(y)
    ax.set_yticklabels(y, fontdict=font)

    ax.set_xlabel('Column group', fontdict=font)
    ax.set_ylabel('Percentage of NaN values', fontdict=font)
    ax.set_title('Percentage of NaN values by column groups', fontdict=font)

    # Saving fails with FileNotFoundError if the target directory is missing.
    out_dir = os.path.dirname(save_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    fig.savefig(save_path)
    plt.show()

In [None]:
def percentage_nan_df(df):
    """Return the percentage of missing cells in ``df``, rounded to two decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells are checked with ``isna()``.

    Returns
    -------
    float
        ``100 * (number of NaN cells) / (total number of cells)`` rounded to
        two decimal places, or ``nan`` when ``df`` contains no cells.
    """
    total_cells = df.size
    if total_cells == 0:
        # The ratio is undefined for a frame without cells; avoid dividing 0 by 0.
        return float('nan')
    total_nan = int(df.isna().sum().sum())
    percentage_of_nan = float(total_nan) / float(total_cells) * 100.0
    # round() on a Python float replaces the former format-then-parse round-trip.
    return round(percentage_of_nan, 2)

In [None]:
# Percentage of missing cells for each column group of the match table.
df_match_info_pc = percentage_nan_df(df_match_info)
df_match_player_positions_pc = percentage_nan_df(df_match_player_positions)
df_match_stats_pc = percentage_nan_df(df_match_stats)
df_match_betting_pc = percentage_nan_df(df_match_betting)

# Labels and matching values for the bar chart.
df_match_X = ['Match Info', 'Player Positions', 'Match Statistics', 'Betting']
df_match_y = [
    df_match_info_pc,
    df_match_player_positions_pc,
    df_match_stats_pc,
    df_match_betting_pc,
]
print(type(df_match_y[0]))
print(df_match_X)
print(df_match_y)

In [None]:
# Plot the NaN percentage of each match column group.
plot_match_bar(df_match_X, df_match_y)

In [None]:
def plot_player_attr_iqr(df_player, df_player_attr, save_path='data/figs/players_iqr.pdf'):
    """Draw, save and show box plots of player height, weight and overall rating.

    Parameters
    ----------
    df_player : pandas.DataFrame
        Must provide the columns ``'height'`` and ``'weight'``.
    df_player_attr : pandas.DataFrame
        Must provide the column ``'overall_rating'``.
    save_path : str, optional
        File the figure is written to. Defaults to the location that was
        previously hard-coded. Missing parent directories are created.

    Notes
    -----
    Text styling comes from the module-level ``font`` dict. The function
    returns nothing; the figure is displayed with ``plt.show()``.
    """
    # One (values, y-axis label) pair per panel, in left-to-right order.
    panels = [
        (df_player['height'], 'Height (cm)'),
        (df_player['weight'], 'Weight (lbs)'),
        (df_player_attr['overall_rating'], 'Overall Rating '),
    ]

    # One subplot per plotted attribute (three in total).
    fig, axs = plt.subplots(nrows=1, ncols=len(panels), figsize=(15, 5), dpi=300)

    gray_color = '#CBCBCB'
    for ax, (values, label) in zip(axs, panels):
        sns.boxplot(y=values, ax=ax, color=gray_color)
        ax.set_ylabel(label, fontdict=font)

    # The title names all three attributes shown. ``fontproperties`` is the
    # keyword documented for suptitle to receive a dict of font settings.
    fig.suptitle(
        'Interquartile Range (IQR) for Height, Weight and Overall Rating',
        fontproperties=font,
    )

    # Saving fails with FileNotFoundError if the target directory is missing.
    out_dir = os.path.dirname(save_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    fig.savefig(save_path)
    plt.show()

In [None]:
# Box plots of player height, weight and overall rating.
plot_player_attr_iqr(df_player, df_player_attr)

In [None]:
# Summary statistics (DataFrame.describe) for each loaded table.
des_matches = df_matches.describe()
des_player = df_player.describe()
des_team = df_team.describe()
des_player_attr = df_player_attr.describe()
des_country = df_country.describe()
des_league = df_league.describe()
des_team_attr = df_team_attr.describe()

In [None]:
# Transpose each summary so that every described column becomes one row.
tr_matches = des_matches.T
tr_player = des_player.T
tr_team = des_team.T
tr_player_attr = des_player_attr.T
tr_country = des_country.T
tr_league = des_league.T
tr_team_attr = des_team_attr.T

# Stack all per-table summaries into one table.
# NOTE(review): row labels are the original column names, so a name shared by
# several tables would appear more than once with no table marker — confirm
# this is acceptable for the exported table.
df_all_stats = pd.concat(
    [
        tr_matches,
        tr_player,
        tr_team,
        tr_player_attr,
        tr_country,
        tr_league,
        tr_team_attr,
    ],
    axis=0,
)

In [None]:
# Render the combined statistics table as LaTeX via the DataFrame's Styler.
styler = df_all_stats.style
stats_table = styler.to_latex()

# Write the LaTeX source to disk ('w' mode overwrites an existing file).
with open('data/stats_table.tex', 'w') as f:
    f.write(stats_table)