In [None]:
import numpy as np
import pandas as pd

import scipy.stats as stats

# from scraping_utility import *

# Import and clean transfer data

## Import data from the top 5 leagues for 2015-2019

In [None]:
# Relative path of the directory holding the transfer data.
data_path = '../transfers-master/data/'

# Seasons of interest, most recent first (2019 down to 2015).
years = list(range(2019, 2014, -1))

# Identifiers of the five leagues covered by the analysis.
leagues = [
    'english_premier_league',
    'french_ligue_1',
    'german_bundesliga_1',
    'italian_serie_a',
    'spanish_primera_division',
]

# Scrape player stats from FBref.com

In [None]:
# Switch for the scraping cell below: when True the player stats are scraped again
# and the four *_stats.csv files are rewritten; when False scraping is skipped and
# the CSV files already on disk are loaded as-is.
scrape_new = False

In [None]:
if scrape_new:
    # The wildcard import of scraping_utility at the top of the notebook is commented
    # out, so without an import here this branch fails with a NameError on
    # scrape_all_players. Importing inside the branch keeps the scraper's
    # dependencies out of the default (scrape_new = False) run.
    # NOTE(review): assumes scrape_all_players is defined in scraping_utility -- confirm.
    from scraping_utility import scrape_all_players

    # NOTE(review): transfer_data is not defined in any cell above this one; it has
    # to be loaded before this branch can run -- confirm where it is meant to come from.
    shot_stats, pass_stats, time_stats, misc_stats = scrape_all_players(transfer_data)

    # Save each scraped table so later runs can skip the scrape and read the CSVs.
    shot_stats.to_csv('shot_stats.csv', index=False)
    pass_stats.to_csv('pass_stats.csv', index=False)
    time_stats.to_csv('time_stats.csv', index=False)
    misc_stats.to_csv('misc_stats.csv', index=False)

In [None]:
# Load the four stats tables from the CSV files in the working directory,
# in the order: shooting, passing, playing time, miscellaneous.
shot_stats, pass_stats, time_stats, misc_stats = (
    pd.read_csv(csv_name)
    for csv_name in ('shot_stats.csv', 'pass_stats.csv', 'time_stats.csv', 'misc_stats.csv')
)

# Merge all stats

I used the combination of a player's name, age, year of transfer, team, competition level, and minutes played as a key that uniquely identifies each transfer.

In [None]:
# Columns that together identify one row in every stats table.
merge_keys = ['age', 'player_name', 'year', 'squad', 'comp_level', 'minutes_90s']

# Start from the shooting table and join the remaining tables onto it one at a
# time (default inner join, so only rows present in every table survive).
huge_stats = shot_stats
for stats_table in (pass_stats, time_stats, misc_stats):
    huge_stats = huge_stats.merge(stats_table, on=merge_keys)

# Empty strings are treated as missing values.
huge_stats = huge_stats.replace('', np.nan)

# Persist the merged table.
huge_stats.to_csv('huge_stats.csv', index=False)

# Cleaning scraped data

## Select a smaller subset of stats to keep

In [None]:
# Subset of the merged stats columns kept for the analysis.
data_cols = [
    # row identifiers and playing-time weight
    'player_name', 'age', 'year', 'minutes_90s',
    # goals, penalties and shots on target
    'goals', 'pens_made', 'pens_att',
    'shots_on_target', 'shots_on_target_pct', 'shots_on_target_per90',
    'goals_per_shot_on_target', 'assists',
    # appearances and minutes
    'games', 'minutes', 'minutes_per_game', 'minutes_pct',
    'games_starts', 'games_subs', 'unused_subs',
    # team results while on the pitch
    'points_per_match', 'on_goals_for', 'on_goals_against',
    'plus_minus', 'plus_minus_per90',
    # discipline
    'cards_red', 'cards_yellow', 'fouls',
]

In [None]:
# Work on an explicit copy of the selected columns so the column assignment below
# modifies an independent frame instead of a slice of huge_stats (this is what
# avoids pandas' SettingWithCopyWarning).
data = huge_stats.loc[:, data_cols].copy()

# Minutes column has commas that need to be removed before it can be turned into a
# numeric datatype. regex=False makes the comma a literal on every pandas version.
data['minutes'] = data['minutes'].str.replace(',', '', regex=False)

# player_name is moved into the index so that only the remaining columns are
# converted to numbers, then restored as a regular column.
data = data.set_index('player_name').apply(pd.to_numeric).reset_index()

## Fill in missing values for goals per shot on target

In [None]:
# Penalties don't count in our goals per shot on target
data['in_play_goals'] = data.goals - data.pens_made

# Dividing by zero shots on target yields +/-inf (x/0) or NaN (0/0); both are
# treated as a ratio of 0.
data['goals_per_shot_on_target'] = (
    (data.in_play_goals / data.shots_on_target)
    .replace([np.inf, -np.inf], 0)
    .fillna(0)
)

# You can't score more than one goal per shot on target, so we'll assume those are
# errors in the data and fill them with the mean. The mean is taken over the
# plausible (<= 1) ratios only, so the erroneous values being replaced do not
# inflate their own replacement.
impossible_ratio = data.goals_per_shot_on_target > 1
plausible_mean = data.loc[~impossible_ratio, 'goals_per_shot_on_target'].mean()
data.loc[impossible_ratio, 'goals_per_shot_on_target'] = plausible_mean

## Many shots on target % values are missing. Let's impute them so we can calculate total shots

In [None]:
# A percentage above 100 is impossible: mark it as missing so it is imputed later.
impossible_pct = data['shots_on_target_pct'] > 100
# With no shots on target the percentage is set to 0.
no_shots_on_target = data['shots_on_target'] == 0

data.loc[impossible_pct, 'shots_on_target_pct'] = np.nan
data.loc[no_shots_on_target, 'shots_on_target_pct'] = 0

In [None]:
# I want to select only rows that have shots on target because I will use that to
# impute total shots. The explicit copy lets the assignments below modify this
# frame without pandas' SettingWithCopyWarning.
data = data[data.shots_on_target.notna()].copy()

# Fill null shots on target percentages with mean + random noise between
# -2 sigma and +2 sigma, where mean and sigma are those of the percentage column.
import random
random.seed(12)
sog_mean = data.shots_on_target_pct.mean()
# Spread of the percentage itself (not of the raw shots-on-target count).
sog_std = data.shots_on_target_pct.std()

missing_pct = data.shots_on_target_pct.isna()
if missing_pct.any():
    # One independent draw per missing row, so imputed rows do not all share a
    # single value.
    noise = np.array([random.uniform(-2, 2) for _ in range(int(missing_pct.sum()))])
    # Keep imputed values inside a valid percentage range. The lower bound is 1
    # rather than 0 because the rows imputed here are expected to have at least one
    # shot on target (rows with zero shots on target were given 0% in an earlier
    # cell), and a 0% value would make the total-shots division below blow up.
    data.loc[missing_pct, 'shots_on_target_pct'] = np.clip(sog_mean + sog_std * noise, 1, 100)

# The shots on target percentages are reported as between 0-100, I want them as decimals.
data['shots_on_target_pct'] /= 100

# total shots = shots on target / fraction on target; a zero fraction gives
# inf or NaN, which is mapped to 0 total shots before truncating to int.
data['shots_total'] = (
    (data.shots_on_target / data.shots_on_target_pct)
    .replace([np.inf, -np.inf], 0)
    .fillna(0)
    .astype(int)
)

### Combine rows for players with multiple entries in the same year, using [name, year, age] as an identifier

Some players may have played on multiple teams in one season. I'm interested in their cumulative stats from the season so let's combine them all.

Summing values is the reason I needed to calculate total shots: it is much easier to add up shots and shots on target than it is to combine shots on target percentages.

In [None]:
# Count-like columns (plus the grouping keys) that are kept after summing; ratio
# columns are left out here because a sum of ratios is not meaningful.
combined_cols = [
    'player_name', 'age', 'year', 'minutes_90s',
    'goals', 'in_play_goals', 'pens_made', 'pens_att',
    'shots_total', 'shots_on_target', 'assists',
    'games', 'minutes', 'games_starts', 'games_subs', 'unused_subs',
    'on_goals_for', 'on_goals_against', 'plus_minus',
    'cards_red', 'cards_yellow', 'fouls',
]

# One row per (player, season, age): sum every column within each group, turn the
# group keys back into columns, then keep only the columns listed above.
# NOTE: groupby's default dropna=True discards rows whose key (e.g. age) is NaN.
group_keys = ['player_name', 'year', 'age']
data_combined = data.groupby(group_keys).sum().reset_index().loc[:, combined_cols]

#### Recalculate all ratio features with new combined values

In [None]:
# Recompute the ratio features from the summed counts. A zero denominator yields
# +/-inf (x/0) or NaN (0/0); every ratio maps those cases to 0 so that no inf/NaN
# reaches the merged output.

# In-play goals scored per shot on target.
data_combined['goals_per_shot_on_target'] = (
    (data_combined.in_play_goals / data_combined.shots_on_target)
    .replace([np.inf, -np.inf], 0)
    .fillna(0)
)

# Fraction (0-1) of all shots that were on target.
data_combined['shots_on_target_pct'] = (
    (data_combined.shots_on_target / data_combined.shots_total)
    .replace([np.inf, -np.inf], 0)
    .fillna(0)
)

# Shots on target per minutes_90s unit; guarded like the ratios above because
# minutes_90s can be 0.
data_combined['shots_on_target_per90'] = (
    (data_combined.shots_on_target / data_combined.minutes_90s)
    .replace([np.inf, -np.inf], 0)
    .fillna(0)
)

# Merge scraped stats data with transfer data

In [None]:
# Attach transfer information to the per-season stats. The merge is an inner join
# on (player_name, year); if a pair matches more than once, only the first
# resulting row is kept.
# NOTE(review): transfer_data is not defined in any cell shown above -- confirm
# where it is loaded.
transfer_keys = ['player_name', 'year']
data_full = (
    data_combined
    .merge(transfer_data, on=transfer_keys)
    .drop_duplicates(transfer_keys)
)

# Save the final modelling table.
data_full.to_csv('data_full.csv', index=False)