In [2]:
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

# Connect to the database
def extract_tables(table_list=None, db_path='database.sqlite'):
    '''Load tables from a SQLite file into pandas DataFrames.

    Parameters
    ----------
    table_list : list of str, optional
        Frame names to load, each spelled ``"df_<table name>"`` (for example
        ``"df_Match"``). When omitted or empty, every table listed in
        ``sqlite_master`` is loaded.
    db_path : str, optional
        Path of the SQLite database file. Defaults to ``'database.sqlite'``.

    Returns
    -------
    dict
        ``{"df_<table name>": DataFrame}`` for every table that was loaded.
    '''
    df_dict = dict()
    conn = sqlite3.connect(db_path)

    # try/finally: release the connection even when one of the queries raises.
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
        table_names = [row[0] for row in cursor.fetchall()]

        for table_name in table_names:
            df_name = f"df_{table_name}"

            if not table_list:
                print('no table list')
            elif df_name in table_list:
                print(f"Extracting table: {table_name}")
            else:
                # Table exists but was not requested.
                continue

            # Identifiers cannot be bound as SQL parameters, so quote the name
            # to keep tables with spaces or quotes in their name readable.
            quoted_name = '"' + table_name.replace('"', '""') + '"'
            df_dict[df_name] = pd.read_sql_query(f"SELECT * FROM {quoted_name}", conn)
    finally:
        conn.close()

    return df_dict

# Only the player-attribute and match tables are needed for this analysis.
df_dict = extract_tables(
    table_list=[
        'df_Player_Attributes',
        'df_Match',
    ],
)

Extracting table: Player_Attributes
Extracting table: Match


In [3]:
def clean_up_players_df(df_dict, max_categories=25):
    '''Prepare the player attributes table for further processing and analysis.

    The ``date`` column is parsed to datetimes, and every column with fewer
    than ``max_categories`` distinct values (missing counts as one value) is
    replaced by one-hot columns named ``<column>_<value>``. The one-hot
    columns are appended after the remaining columns, in original column order.

    Parameters
    ----------
    df_dict : dict
        Must contain the raw table under the key ``'df_Player_Attributes'``.
        That frame is not modified.
    max_categories : int, optional
        Columns with fewer distinct values than this are one-hot encoded.
        Defaults to 25.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input stored in ``df_dict`` is left untouched.
    '''
    raw_players = df_dict['df_Player_Attributes']

    # assign() builds a new frame, so re-running the cell always starts from
    # the same raw table instead of one already edited by a previous run.
    players = raw_players.assign(date=pd.to_datetime(raw_players['date']))

    categorical_cols = [
        col for col in players.columns
        if len(players[col].unique()) < max_categories
    ]

    # Encode every categorical column, then concatenate once rather than
    # growing the frame one column group at a time.
    dummy_frames = [
        pd.get_dummies(players[col], prefix=col) for col in categorical_cols
    ]
    return pd.concat(
        [players.drop(columns=categorical_cols)] + dummy_frames, axis=1
    )

# Build the cleaned player table, then preview its first five rows.
df_players_attributes = clean_up_players_df(df_dict=df_dict)
df_players_attributes.head(n=5)

Unnamed: 0,id,player_fifa_api_id,player_api_id,date,overall_rating,potential,crossing,finishing,heading_accuracy,short_passing,...,defensive_work_rate_9,defensive_work_rate__0,defensive_work_rate_ean,defensive_work_rate_es,defensive_work_rate_high,defensive_work_rate_low,defensive_work_rate_medium,defensive_work_rate_o,defensive_work_rate_ormal,defensive_work_rate_tocky
0,1,218353,505942,2016-02-18,67.0,71.0,49.0,44.0,71.0,61.0,...,0,0,0,0,0,0,1,0,0,0
1,2,218353,505942,2015-11-19,67.0,71.0,49.0,44.0,71.0,61.0,...,0,0,0,0,0,0,1,0,0,0
2,3,218353,505942,2015-09-21,62.0,66.0,49.0,44.0,71.0,61.0,...,0,0,0,0,0,0,1,0,0,0
3,4,218353,505942,2015-03-20,61.0,65.0,48.0,43.0,70.0,60.0,...,0,0,0,0,0,0,1,0,0,0
4,5,218353,505942,2007-02-22,61.0,65.0,48.0,43.0,70.0,60.0,...,0,0,0,0,0,0,1,0,0,0


In [4]:
# `null_counts` is deprecated (pandas 1.2) and removed in pandas 2.0;
# `show_counts` is the replacement and prints the same non-null counts.
df_players_attributes.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183978 entries, 0 to 183977
Data columns (total 68 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   id                          183978 non-null  int64         
 1   player_fifa_api_id          183978 non-null  int64         
 2   player_api_id               183978 non-null  int64         
 3   date                        183978 non-null  datetime64[ns]
 4   overall_rating              183142 non-null  float64       
 5   potential                   183142 non-null  float64       
 6   crossing                    183142 non-null  float64       
 7   finishing                   183142 non-null  float64       
 8   heading_accuracy            183142 non-null  float64       
 9   short_passing               183142 non-null  float64       
 10  volleys                     181265 non-null  float64       
 11  dribbling                   183142 non-

  df_players_attributes.info(verbose=True, null_counts=True)


In [5]:
def clean_up_matches_df(df_dict, max_missing_frac=0.1):
    '''Prepare the matches table for further processing and analysis.

    Steps, in order:
      1. parse ``date`` to datetimes;
      2. drop columns whose share of missing values exceeds
         ``max_missing_frac``, plus a fixed list of columns not used here;
      3. drop every row that still has a missing value;
      4. add 0/1 indicator columns ``home_win``, ``home_draw`` and
         ``home_loss`` computed from the two goal columns.

    Note that ``home_player_1`` etc. hold player ids for the given teams and
    can be used to join with the player attributes dataframe.

    Parameters
    ----------
    df_dict : dict
        Must contain the raw table under the key ``'df_Match'``. That frame
        is not modified.
    max_missing_frac : float, optional
        Columns with a larger fraction of nulls are dropped. Defaults to 0.1.

    Returns
    -------
    pandas.DataFrame
        A new frame that keeps the original row labels of the surviving rows.
    '''
    unused_cols = ['goal', 'shoton', 'shotoff', 'foulcommit', 'card', 'cross',
                   'corner', 'possession', 'country_id', 'league_id', 'stage',
                   'season']

    raw_matches = df_dict['df_Match']

    # assign() returns a new frame, so the raw table in df_dict stays intact
    # and the cell gives the same result on every run.
    matches = raw_matches.assign(date=pd.to_datetime(raw_matches['date']))

    # Decide which columns to drop before dropping anything, instead of
    # editing the frame while looping over its columns.
    missing_frac = matches.isnull().sum() / len(matches)
    too_sparse = set(missing_frac[missing_frac > max_missing_frac].index)
    drop_cols = [
        col for col in matches.columns
        if col in too_sparse or col in unused_cols
    ]

    matches = matches.drop(columns=drop_cols).dropna(axis=0)

    home_goals = matches['home_team_goal']
    away_goals = matches['away_team_goal']
    return matches.assign(
        home_win=np.where(home_goals > away_goals, 1, 0),
        home_draw=np.where(home_goals == away_goals, 1, 0),
        home_loss=np.where(home_goals < away_goals, 1, 0),
    )


# Build the cleaned match table, report its (rows, columns) and preview it.
df_matches = clean_up_matches_df(df_dict=df_dict)
print(df_matches.shape)
df_matches.head(n=5)

(21361, 76)


Unnamed: 0,id,date,match_api_id,home_team_api_id,away_team_api_id,home_team_goal,away_team_goal,home_player_X1,home_player_X2,home_player_X3,...,away_player_5,away_player_6,away_player_7,away_player_8,away_player_9,away_player_10,away_player_11,home_win,home_draw,home_loss
145,146,2009-02-27,493017,8203,9987,2,1,1.0,2.0,4.0,...,148314.0,37202.0,43158.0,9307.0,42153.0,32690.0,38782.0,1,0,0
153,154,2009-03-08,493025,9984,8342,1,3,1.0,2.0,4.0,...,38366.0,37983.0,39578.0,38336.0,52280.0,27423.0,38440.0,0,0,1
155,156,2009-03-07,493027,8635,10000,2,0,1.0,2.0,4.0,...,94030.0,37893.0,37981.0,131531.0,130027.0,38231.0,131530.0,1,0,0
162,163,2009-03-13,493034,8203,8635,2,1,1.0,2.0,4.0,...,164694.0,30949.0,38378.0,38383.0,38393.0,38253.0,37069.0,1,0,0
168,169,2009-03-14,493040,10000,9999,0,0,1.0,2.0,4.0,...,94284.0,45832.0,26669.0,33671.0,163670.0,37945.0,33622.0,0,1,0


In [42]:
def create_player_dict(df_players_attributes, player_api_id):
    '''Return ``{date: feature vector}`` for a single player.

    Parameters
    ----------
    df_players_attributes : pandas.DataFrame
        Player attributes table with the columns ``id``,
        ``player_fifa_api_id``, ``player_api_id`` and ``date``; every other
        column is treated as a feature of that player on that date.
    player_api_id : int
        Value of ``player_api_id`` identifying the player.

    Returns
    -------
    dict
        One entry per distinct date recorded for the player. Keys are the
        values produced by ``Series.unique()`` on the ``date`` column; each
        value is a list with the feature columns in frame order. The dict is
        empty when the player has no rows.

    Notes
    -----
    If the same date occurs more than once for the player, the first row in
    frame order is used.
    '''
    player_rows = (
        df_players_attributes
        .loc[df_players_attributes['player_api_id'] == player_api_id]
        .drop(columns=['id', 'player_fifa_api_id', 'player_api_id'])
        # A repeated date would make a label lookup return several rows.
        .drop_duplicates(subset='date', keep='first')
    )

    # unique() keeps first-appearance order, which matches the row order left
    # by drop_duplicates, so dates and feature rows line up under zip().
    player_dates = player_rows['date'].unique()
    feature_rows = player_rows.drop(columns='date').to_numpy().tolist()

    return dict(zip(player_dates, feature_rows))

def get_match_players(df_match, match_api_id):
    '''Return ``{column name: player_api_id}`` for the players of one match.

    Parameters
    ----------
    df_match : pandas.DataFrame
        Matches table. The columns from ``home_player_1`` through
        ``away_player_11`` (selected as a label slice, so they must be
        contiguous and in that order) hold the player ids.
    match_api_id : int
        Value of ``match_api_id`` identifying the match.

    Returns
    -------
    dict
        Column name mapped to the player id as an integer, taken from the
        first row matching ``match_api_id``.

    Raises
    ------
    IndexError
        If no row has the given ``match_api_id``.
    '''
    match_rows = df_match.loc[df_match['match_api_id'] == match_api_id]
    if match_rows.empty:
        raise IndexError(f"match_api_id {match_api_id!r} not found in df_match")

    # Cast only the player-id columns: converting the whole row to int would
    # also hit the date column and any non-numeric column in the frame.
    player_ids = match_rows.loc[:, 'home_player_1':'away_player_11'].iloc[0]
    return player_ids.astype(int).to_dict()

def get_match_players_position(df_match, match_api_id):
    '''Return ``{player_api_id: (x_position, y_position)}`` for one match.

    Parameters
    ----------
    df_match : pandas.DataFrame
        Matches table with, for ``i`` in 1..11 and both ``home`` and ``away``,
        the columns ``<side>_player_<i>``, ``<side>_player_X<i>`` and
        ``<side>_player_Y<i>``.
    match_api_id : int
        Value of ``match_api_id`` identifying the match.

    Returns
    -------
    dict
        Player id (int) mapped to an ``(x, y)`` tuple of ints, read from the
        first row matching ``match_api_id``. Entries are inserted home then
        away for each slot 1..11.

    Raises
    ------
    IndexError
        If no row has the given ``match_api_id``.
    '''
    match_rows = df_match.loc[df_match['match_api_id'] == match_api_id]
    if match_rows.empty:
        raise IndexError(f"match_api_id {match_api_id!r} not found in df_match")

    # Pull the row out once instead of re-extracting it for every cell read.
    match_row = match_rows.iloc[0]

    player_pos_dict = {}
    for i in range(1, 12):
        for side in ('home', 'away'):
            # int() works whether the cell is a numpy scalar or a plain float.
            player_id = int(match_row[f'{side}_player_{i}'])
            player_pos_dict[player_id] = (
                int(match_row[f'{side}_player_X{i}']),
                int(match_row[f'{side}_player_Y{i}']),
            )

    return player_pos_dict

def get_most_recents_stats(df_match, match_api_id, player_dict, allow_future=True):
    '''Return the feature vector whose date is closest to the match date.

    Parameters
    ----------
    df_match : pandas.DataFrame
        Matches table with ``match_api_id`` and a datetime ``date`` column.
    match_api_id : int
        Value of ``match_api_id`` identifying the match.
    player_dict : dict
        ``{date: feature vector}`` as returned by ``create_player_dict()``;
        keys may be ``pandas.Timestamp`` or ``numpy.datetime64`` values.
    allow_future : bool, optional
        When True (the default) the closest date is chosen even if it falls
        after the match. Pass False to consider only dates on or before the
        match date, which avoids using stats recorded after the match.

    Returns
    -------
    list
        The value stored in ``player_dict`` under the selected date. On a tie
        the key that comes first in ``player_dict`` wins.

    Raises
    ------
    IndexError
        If no row has the given ``match_api_id``.
    KeyError
        If ``player_dict`` has no eligible date.
    '''
    match_rows = df_match.loc[df_match['match_api_id'] == match_api_id]
    if match_rows.empty:
        raise IndexError(f"match_api_id {match_api_id!r} not found in df_match")
    match_date = match_rows['date'].iloc[0]

    candidate_dates = list(player_dict)
    if not allow_future:
        candidate_dates = [
            date for date in candidate_dates
            if pd.Timestamp(date) <= match_date
        ]
    if not candidate_dates:
        raise KeyError(
            f"no eligible stats date for match_api_id {match_api_id!r}"
        )

    # min() returns the first minimal element, matching a strict "<" scan.
    closest_date = min(
        candidate_dates,
        key=lambda date: abs(match_date - pd.Timestamp(date)),
    )
    return player_dict[closest_date]



In [None]:
# Build the {date: feature vector} lookup for player 67958 and print it.
player_dict = create_player_dict(
    df_players_attributes,
    player_api_id=67958,
)
print(player_dict)

In [26]:
# Player ids for every home/away slot of match 493017.
get_match_players(df_match=df_matches, match_api_id=493017)

{'home_player_1': 38327,
 'home_player_2': 67950,
 'home_player_3': 67958,
 'home_player_4': 67959,
 'home_player_5': 37112,
 'home_player_6': 36393,
 'home_player_7': 148286,
 'home_player_8': 67898,
 'home_player_9': 164352,
 'home_player_10': 38801,
 'home_player_11': 26502,
 'away_player_1': 37937,
 'away_player_2': 38293,
 'away_player_3': 148313,
 'away_player_4': 104411,
 'away_player_5': 148314,
 'away_player_6': 37202,
 'away_player_7': 43158,
 'away_player_8': 9307,
 'away_player_9': 42153,
 'away_player_10': 32690,
 'away_player_11': 38782}

In [27]:
# (x, y) position of each player in match 493017, keyed by player id.
get_match_players_position(df_match=df_matches, match_api_id=493017)

{38327: (1, 1),
 37937: (1, 1),
 67950: (2, 3),
 38293: (2, 3),
 67958: (4, 3),
 148313: (4, 3),
 67959: (6, 3),
 104411: (6, 3),
 37112: (8, 3),
 148314: (8, 3),
 36393: (2, 7),
 37202: (2, 7),
 148286: (4, 7),
 43158: (4, 7),
 67898: (6, 7),
 9307: (6, 7),
 164352: (8, 7),
 42153: (8, 7),
 38801: (4, 10),
 32690: (4, 10),
 26502: (6, 10),
 38782: (6, 10)}

In [44]:
# Length of the feature vector selected for match 493017.
len(
    get_most_recents_stats(
        df_match=df_matches,
        match_api_id=493017,
        player_dict=player_dict,
    )
)

64