In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the three raw training tables (games, recent form, player ratings).
# The first CSV column is used as the row index of each frame.
game_data_df, form_data_df, player_data_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../train_data/game_data.csv',
        '../train_data/form_data.csv',
        '../train_data/player_data.csv',
    )
)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Keep only the games whose temperature string ends in 'C'; get_game_predictors
# parses the temperature by slicing everything before the "C".
# The vectorised string accessor with na=False treats a missing temperature as
# "not valid" and drops the row, instead of failing when `.endswith` is called
# on a NaN float.
game_data_df = game_data_df[game_data_df['temp'].str.endswith('C', na=False)]

In [4]:
# Preview the first rows of each training table (games, form, players).
display(
    game_data_df.head(),
    form_data_df.head(),
    player_data_df.head(),
)

Unnamed: 0,game_id,away,home,temp,weather_desc,date,home market value,home age,away market value,away age,FTAG,FTHG
0,0,Leicester City,Arsenal,11C,Clear,11/08/17,570510000,23.8,226890000,25.5,3,4
1,194,Leicester City,Watford,3C,Clear,26/12/17,150570000,25.9,226890000,25.5,1,2
2,219,Leicester City,Chelsea,5C,Cloudy,13/01/18,577940000,23.7,226890000,25.5,0,0
3,344,Leicester City,Crystal Palace,8C,Cloudy,28/04/18,174980000,24.8,226890000,25.5,0,5
4,243,Leicester City,Everton,5C,Breezy,31/01/18,312080000,23.9,226890000,25.5,1,2


Unnamed: 0,goals_for,goals_against,team,game_id
0,1.0,2.0,Arsenal,0
1,5.0,2.0,Arsenal,0
2,1.0,1.0,Arsenal,0
3,2.0,1.0,Arsenal,0
4,3.0,1.0,Arsenal,0


Unnamed: 0,Acceleration,Age,Aggression,Agility,Balance,Ball control,Composure,Crossing,team,Curve,...,Shot power,Sliding tackle,Sprint speed,Stamina,Standing tackle,Strength,Vision,Volleys,game_id,position
0,40,35.0,17,49,34,22,70,19,Arsenal,13,...,21,12,44,32,13,65,53,17,0,XX
1,61,21.0,72,60,69,69,72,52,Arsenal,33,...,48,73,63,75,74,75,55,27,0,DF
2,74,31.0,74,73,74,77,76,78,Arsenal,74,...,71,81,72,83,82,65,66,56,0,DF
3,73,24.0,91,52,55,69,70,74,Arsenal,43,...,82,84,82,85,83,91,56,31,0,DF
4,96,22.0,72,84,77,79,72,76,Arsenal,63,...,52,81,94,87,77,62,63,41,0,DF


In [5]:
# function to get all of the form predictors needed
def get_form_predictors(form, home, away):
    """Compare the form rows of the home team with those of the away team.

    Parameters
    ----------
    form : DataFrame
        Must provide 'team', 'goals_for' and 'goals_against' columns.
    home, away : values compared against form['team'] to select each side's rows.

    Returns
    -------
    tuple
        (form_goal_diff, form_win_diff): the home-minus-away difference in
        average goal margin, and the home-minus-away difference in points per
        row (3 for a positive margin, 1 for a zero margin).

    Raises
    ------
    ZeroDivisionError
        If either team has no rows in `form`.
    """

    def summarise(team_name):
        # rows belonging to one side
        rows = form[form['team'] == team_name]

        # average goals scored minus average goals conceded
        avg_margin = np.mean(rows['goals_for']) - np.mean(rows['goals_against'])

        # per-row margin -> points: 3 when positive, 1 when exactly zero
        margins = rows['goals_for'] - rows['goals_against']
        points = sum(3 for m in margins if m > 0) + sum(1 for m in margins if m == 0)

        return avg_margin, points / len(margins)

    home_margin, home_points_rate = summarise(home)
    away_margin, away_points_rate = summarise(away)

    # return desired values
    return home_margin - away_margin, home_points_rate - away_points_rate

In [6]:
# function to get all of the game predictors needed
def get_game_predictors(game):
    """Extract the game-level predictors and the response from a one-game frame.

    Parameters
    ----------
    game : DataFrame
        Values are read from the row labelled 0 (the caller resets the index).
        Needs the columns 'temp', 'weather_desc', 'home market value',
        'away market value', 'FTHG' and 'FTAG'.

    Returns
    -------
    tuple
        (temp, weather, market_value_diff, goal_diff) where
        temp is the integer in front of the "C" of the temperature string,
        weather is the raw 'weather_desc' value,
        market_value_diff is home minus away market value,
        goal_diff is FTHG minus FTAG.

    Raises
    ------
    ValueError
        If the temperature string contains no "C", or a value cannot be
        converted to an integer.
    """

    def _as_int(value):
        # Market values may be strings with thousands separators ('570,510,000')
        # or numbers that pandas already parsed; accept both so callers do not
        # have to cast the columns to str first.
        if isinstance(value, str):
            return int(value.replace(',', ''))
        return int(value)

    # get temperature predictor: everything before the "C" suffix
    temp_text = game['temp'][0]
    temp = int(temp_text[:temp_text.index("C")])

    # get weather predictor
    weather = game['weather_desc'][0]

    # get difference in market value
    market_value_diff = (_as_int(game['home market value'][0])
                         - _as_int(game['away market value'][0]))

    # get response variable
    goal_diff = (game['FTHG'] - game['FTAG'])[0]

    # return desired values
    return temp, weather, market_value_diff, goal_diff


In [7]:
# function to go in and get all the player info we want for each team
def get_player_info(team):
    """Rate a squad's goalkeeping, defense, midfield and attack.

    Parameters
    ----------
    team : DataFrame
        One row per player, looked up by the labels 0..len-1 (the caller resets
        the index). Needs a 'position' column of strings and the attribute
        columns listed below.

    Returns
    -------
    tuple
        (team_goalie, team_defense, team_midfield, team_forward). Each value is
        the mean, over that section's attribute list, of the per-attribute mean
        across the players assigned to the section.

    Players are assigned by their position string: any string containing 'X' is
    a goalie; otherwise one containing 'M' is a midfielder; otherwise one
    containing 'D' is a defender; everything else is a forward.
    """
    goalie_attrs = ('GK diving', 'GK handling', 'GK positioning', 'GK reflexes')
    defense_attrs = ('Acceleration', 'Aggression', 'Heading accuracy',
                     'Interceptions', 'Jumping', 'Marking', 'Positioning',
                     'Sliding tackle', 'Sprint speed', 'Standing tackle',
                     'Strength')
    midfield_attrs = ('Balance', 'Acceleration', 'Crossing', 'Ball control',
                      'Dribbling', 'Long passing', 'Short passing',
                      'Positioning', 'Sprint speed', 'Stamina', 'Vision',
                      'Interceptions')
    forward_attrs = ('Acceleration', 'Finishing', 'Reactions', 'Shot power',
                     'Volleys', 'Sprint speed', 'Stamina', 'Strength',
                     'Composure', 'Jumping', 'Agility', 'Dribbling')

    # sort row numbers into the four sections of the pitch
    goalie_rows, defense_rows, midfield_rows, forward_rows = [], [], [], []
    codes = team['position']
    for row in range(len(codes)):
        code = codes[row]
        if 'X' in code:
            goalie_rows.append(row)
        elif 'M' in code:
            # checked before 'D' so that codes holding both letters are midfield
            midfield_rows.append(row)
        elif 'D' in code:
            defense_rows.append(row)
        else:
            forward_rows.append(row)

    def section_rating(rows, attrs):
        # mean of the per-attribute means for the selected players
        members = team.iloc[rows]
        return np.mean([np.mean(members[attr]) for attr in attrs])

    # get the overall rating for the team's goalie, defense, midfield, and forward
    team_goalie = section_rating(goalie_rows, goalie_attrs)
    team_defense = section_rating(defense_rows, defense_attrs)
    team_midfield = section_rating(midfield_rows, midfield_attrs)
    team_forward = section_rating(forward_rows, forward_attrs)

    # return desired values
    return team_goalie, team_defense, team_midfield, team_forward

In [8]:
# function to get all of the player predictors needed
def get_player_predictors(players, home, away):
    """Build the squad-versus-squad predictors for one game.

    Parameters
    ----------
    players : DataFrame
        Player rows with at least 'team', 'Overall', 'Age', plus the columns
        that get_player_info reads.
    home, away : values compared against players['team'] to select each squad.

    Returns
    -------
    tuple
        (HF_AD, HF_AG, HM_AM, HD_AF, HG_AF, best_player_diff,
         worst_player_diff, var_players_diff, age_diff). The first five are
        home-section minus away-section ratings (F=forward, D=defense,
        M=midfield, G=goalie); the remaining four are home-minus-away
        differences of the max, min and variance of 'Overall' and of mean 'Age'.
    """
    # get home and away players, renumbered so rows can be looked up from 0
    squad_home = players[players['team'] == home].reset_index()
    squad_away = players[players['team'] == away].reset_index()

    overall_home = squad_home['Overall']
    overall_away = squad_away['Overall']

    # best-vs-best, worst-vs-worst, and spread of the overall ratings
    best_player_diff = max(overall_home) - max(overall_away)
    worst_player_diff = min(overall_home) - min(overall_away)
    var_players_diff = np.var(overall_home) - np.var(overall_away)

    # get the difference in age
    age_diff = np.mean(squad_home['Age']) - np.mean(squad_away['Age'])

    # section ratings for each side
    home_goalie, home_defense, home_midfield, home_forward = get_player_info(squad_home)
    away_goalie, away_defense, away_midfield, away_forward = get_player_info(squad_away)

    # return desired values: section match-ups first, then the squad-wide gaps
    return (
        home_forward - away_defense,
        home_forward - away_goalie,
        home_midfield - away_midfield,
        home_defense - away_forward,
        home_goalie - away_forward,
        best_player_diff,
        worst_player_diff,
        var_players_diff,
        age_diff,
    )

In [9]:
def build_set(game_data, form_data, player_data):
    """Assemble the modelling table, one row per entry of game_data['game_id'].

    Parameters
    ----------
    game_data, form_data, player_data : DataFrame
        Each must carry a 'game_id' column; the rows for one game are selected
        by equality on it and handed to get_game_predictors,
        get_form_predictors and get_player_predictors respectively.

    Returns
    -------
    DataFrame
        Columns, in order: temp, weather, market_value_diff, form_goal_diff,
        form_win_diff, HF_AD, HF_AG, HM_AM, HD_AF, HG_AF, best_player_diff,
        worst_player_diff, variance_players_diff, age_diff, goal_diff.
    """
    column_order = [
        'temp', 'weather', 'market_value_diff',
        'form_goal_diff', 'form_win_diff',
        'HF_AD', 'HF_AG', 'HM_AM', 'HD_AF', 'HG_AF',
        'best_player_diff', 'worst_player_diff', 'variance_players_diff',
        'age_diff', 'goal_diff',
    ]
    # one growing list per output column
    collected = {name: [] for name in column_order}

    for game_id in game_data['game_id']:
        # get data for just this game
        players = player_data[player_data['game_id'] == game_id].reset_index()
        form = form_data[form_data['game_id'] == game_id].reset_index()
        game = game_data[game_data['game_id'] == game_id].reset_index()

        # get home and away team
        home = game['home'][0]
        away = game['away'][0]

        # game, form and player predictors, in that order
        temp, weather, market_value_diff, goal_diff = get_game_predictors(game)
        form_goal_diff, form_win_diff = get_form_predictors(form, home, away)
        (HF_AD, HF_AG, HM_AM, HD_AF, HG_AF,
         best_player_diff, worst_player_diff,
         var_players_diff, age_diff) = get_player_predictors(players, home, away)

        row = {
            'temp': temp,
            'weather': weather,
            'market_value_diff': market_value_diff,
            'form_goal_diff': form_goal_diff,
            'form_win_diff': form_win_diff,
            'HF_AD': HF_AD,
            'HF_AG': HF_AG,
            'HM_AM': HM_AM,
            'HD_AF': HD_AF,
            'HG_AF': HG_AF,
            'best_player_diff': best_player_diff,
            'worst_player_diff': worst_player_diff,
            'variance_players_diff': var_players_diff,
            'age_diff': age_diff,
            'goal_diff': goal_diff,
        }
        for name in column_order:
            collected[name].append(row[name])

    # attach the columns one by one so the frame keeps the order above
    return_df = pd.DataFrame()
    for name in column_order:
        return_df[name] = collected[name]

    return return_df

In [11]:
# Player attribute columns that the cleaning cells below convert to int.
int_cols = [
    'Acceleration', 'Aggression', 'Agility', 'Balance', 'Ball control',
    'Composure', 'Crossing', 'Curve', 'Dribbling', 'Finishing',
    'GK diving', 'GK handling', 'GK positioning', 'GK reflexes',
    'Heading accuracy', 'Interceptions', 'Jumping',
    'Long passing', 'Long shots', 'Marking', 'Positioning', 'Reactions',
    'Short passing', 'Shot power', 'Sliding tackle', 'Sprint speed',
    'Stamina', 'Standing tackle', 'Strength', 'Vision', 'Volleys',
]

In [12]:
# Convert every rating column to int. String entries are cut to their first two
# characters before the cast; non-string entries are cast as they are.
# NOTE(review): the two-character cut presumably strips a suffix after a
# two-digit rating; a three-digit string would be truncated -- confirm.
# The cleaning is mapped element-wise rather than looping over row numbers:
# this frame's index comes from the CSV's first column, so a row's position and
# its index label are not guaranteed to coincide, and mixing positional reads
# with label-based writes could update the wrong rows.
for col in int_cols:
    player_data_df[col] = (
        player_data_df[col]
        .map(lambda val: val[:2] if isinstance(val, str) else val)
        .astype(int)
    )

In [13]:
# Discard every form row that has a missing value in any column.
form_data_df = form_data_df.dropna(axis=0, how='any')

In [14]:
# Build the training feature table from the cleaned training frames.
data_train = build_set(
    game_data=game_data_df,
    form_data=form_data_df,
    player_data=player_data_df,
)

In [50]:
#data_train.to_csv('../train_data/data_train.csv')

In [51]:
#data_train

Get the testing dataset after 90 minutes.

In [40]:
# Read the three raw test tables (games, recent form, player ratings).
# The first CSV column is used as the row index of each frame.
game_data_test, form_data_test, player_data_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../test_data/game_data.csv',
        '../test_data/form_data.csv',
        '../test_data/player_data.csv',
    )
)

In [41]:
# Remove the existing FTHG/FTAG columns; the next cell renames the 90HG/90AG
# columns to those names for this dataset.
game_data_test = game_data_test.drop(columns=['FTHG', 'FTAG'])

In [42]:
# Rename the columns to the names get_game_predictors reads:
# 90HG/90AG become FTHG/FTAG, and the market-value columns get the
# space-separated names used by the training data.
game_data_test = game_data_test.rename(columns={
    "90HG": "FTHG",
    "90AG": "FTAG",
    "home_market_value": "home market value",
    "away_market_value": "away market value",
})

In [43]:
# Cast both market-value columns to strings in one step.
game_data_test = game_data_test.astype({
    'home market value': str,
    'away market value': str,
})

In [44]:
# Convert every rating column to int. String entries are cut to their first two
# characters before the cast; non-string entries are cast as they are.
# NOTE(review): the two-character cut presumably strips a suffix after a
# two-digit rating; a three-digit string would be truncated -- confirm.
# The cleaning is mapped element-wise rather than looping over row numbers:
# this frame's index comes from the CSV's first column, so a row's position and
# its index label are not guaranteed to coincide, and mixing positional reads
# with label-based writes could update the wrong rows.
for col in int_cols:
    player_data_test[col] = (
        player_data_test[col]
        .map(lambda val: val[:2] if isinstance(val, str) else val)
        .astype(int)
    )

In [45]:
# Discard every form row that has a missing value in any column.
form_data_test = form_data_test.dropna(axis=0, how='any')

In [46]:
# Build the test feature table from the prepared test frames.
data_test = build_set(
    game_data=game_data_test,
    form_data=form_data_test,
    player_data=player_data_test,
)

In [48]:
# Write the test feature table to disk.
data_test.to_csv(path_or_buf='../test_data/data_test.csv')

Now get the testing dataset after extra time.

In [52]:
# Re-read the three raw test tables for the extra-time dataset.
# The first CSV column is used as the row index of each frame.
game_data_test_et, form_data_test_et, player_data_test_et = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../test_data/game_data.csv',
        '../test_data/form_data.csv',
        '../test_data/player_data.csv',
    )
)

In [53]:
# Give the market-value columns the space-separated names that
# get_game_predictors reads; the FTHG/FTAG columns are left as they are.
game_data_test_et = game_data_test_et.rename(columns={
    "home_market_value": "home market value",
    "away_market_value": "away market value",
})

In [54]:
# Cast both market-value columns to strings in one step.
game_data_test_et = game_data_test_et.astype({
    'home market value': str,
    'away market value': str,
})

In [55]:
# Convert every rating column to int. String entries are cut to their first two
# characters before the cast; non-string entries are cast as they are.
# NOTE(review): the two-character cut presumably strips a suffix after a
# two-digit rating; a three-digit string would be truncated -- confirm.
# The cleaning is mapped element-wise rather than looping over row numbers:
# this frame's index comes from the CSV's first column, so a row's position and
# its index label are not guaranteed to coincide, and mixing positional reads
# with label-based writes could update the wrong rows.
for col in int_cols:
    player_data_test_et[col] = (
        player_data_test_et[col]
        .map(lambda val: val[:2] if isinstance(val, str) else val)
        .astype(int)
    )

In [56]:
# Discard every form row that has a missing value in any column.
form_data_test_et = form_data_test_et.dropna(axis=0, how='any')

In [57]:
# Build the extra-time test feature table from the prepared frames.
data_test_et = build_set(
    game_data=game_data_test_et,
    form_data=form_data_test_et,
    player_data=player_data_test_et,
)

In [59]:
# Write the extra-time test feature table to disk.
data_test_et.to_csv(path_or_buf='../test_data/data_test_extratime.csv')