## Create master database of player information (market values and stats)

In [259]:
import pandas as pd
import numpy as np
import time

In [260]:
# Load the base dataframe that the later tables are merged onto

combined = pd.read_csv(filepath_or_buffer="combined_table.csv")

In [302]:
# NOTE(review): this cell carries execution count In [302] and reads `market_values`
# before the cell that loads it from market_values.csv, so it fails on a fresh
# Restart & Run All. It shows every market_values row for pid 133964, transposed.
market_values[market_values['pid'] == 133964].T

Unnamed: 0,12345,13234,14254
teamid,6195,6195,6195
number,24,24,24
name,Lorenzo Insigne,Lorenzo Insigne,Lorenzo Insigne
pid,133964,133964,133964
main_pos,attacker,attacker,attacker
position,Left Wing,Left Wing,Left Wing
dob,1991-06-04,1991-06-04,1991-06-04
age,24,25,26
nat1,Italy,Italy,Italy
nat2,,,


In [261]:
# Load additional dataframes
# Each name on the left is bound to the CSV in the same position on the right.

(market_values,
 tm_by_year,
 tm_summary,
 s_stats,
 s_info,
 teams_list) = [pd.read_csv(csv_name) for csv_name in ("market_values.csv",
                                                       "career_by_year.csv",
                                                       "career_summary.csv",
                                                       "player_stats.csv",
                                                       "player_info.csv",
                                                       "teams.csv")]

In [262]:
# Build one row per player holding their 2016 totals and per-appearance rates
# (minutes, points, goals, clean sheets, goals conceded) from tm_by_year.

# Total team points = appearances x points-per-match, so it can be summed across rows
tm_by_year['total_points'] = tm_by_year['apps'] * tm_by_year['PPM']

# Sum the counting stats over every row a player has within a year
summed_cols = ['minutes','total_points','apps','assists','goals','conceded_goals','clean_sheets']
tm_by_year_summary = tm_by_year.groupby(['pid','year'])[summed_cols].sum().reset_index()

# Keep the 2016 rows only
tm_2016 = tm_by_year_summary[tm_by_year_summary['year'] == 2016].copy()

# Per-appearance rates: each 2016 total divided by the 2016 appearances
for rate_name, total_name in [('MPM','minutes'), ('PPM','total_points'), ('GPM','goals'),
                              ('CS_R','clean_sheets'), ('GAPM','conceded_goals')]:
    tm_2016[rate_name] = tm_2016[total_name] / tm_2016['apps']

# Drop the now-constant year column and tag the stats with the season.
# pid, CS_R and GAPM keep their names.
tm_2016 = tm_2016.drop('year', axis = 1).rename(columns = {
    'minutes': 'mins_2016',
    'total_points': 'points_2016',
    'apps': 'apps_2016',
    'assists': 'assists_2016',
    'goals': 'goals_2016',
    'conceded_goals': 'conceded_2016',
    'clean_sheets': 'clean_sheets_2016',
    'MPM': 'MPM_2016',
    'PPM': 'PPM_2016',
    'GPM': 'GPM_2016',
})

tm_2016.head(3)

Unnamed: 0,pid,mins_2016,points_2016,apps_2016,assists_2016,goals_2016,conceded_2016,clean_sheets_2016,MPM_2016,PPM_2016,GPM_2016,CS_R,GAPM
36,26,1050.0,20.01,11.0,,,18.0,1.0,95.454545,1.819091,,0.090909,1.636364
54,68,454.0,13.98,9.0,,1.0,,,50.444444,1.553333,0.111111,,
72,80,450.0,15.0,5.0,,,7.0,1.0,90.0,3.0,,0.2,1.4


In [263]:
# Pick, for each player, the league_name in which they made the most appearances in 2016

tm_club_summary = (tm_by_year
                   .groupby(['pid','league_name','year'])[['apps']]
                   .sum()
                   .reset_index())

# Within each pid the highest-apps row sorts first; every later row for that pid is discarded
tm_club_2016 = (tm_club_summary[tm_club_summary['year'] == 2016]
                .sort_values(['pid', 'apps'], ascending=[True, False])
                .drop_duplicates(subset=['pid'], keep='first'))

tm_club_2016.head(6)

Unnamed: 0,pid,league_name,year,apps
76,26,1.Bundesliga,2016.0,7.0
152,68,MLS,2016.0,6.0
187,80,1.Bundesliga,2016.0,3.0
251,107,MLS,2016.0,4.0
301,132,HET Liga,2016.0,1.0
411,488,EFL Trophy,2016.0,4.0


In [264]:
# Count how many Champions League appearances each player made in 2016

tm_ucl_summary = (tm_by_year
                  .groupby(['pid','league_name','year'])[['apps']]
                  .sum()
                  .reset_index())

in_2016 = tm_ucl_summary['year'] == 2016
in_ucl = tm_ucl_summary['league_name'] == "Champions League"

# Rows without an appearance count are dropped; the remaining count becomes ucl_2016_apps
tm_ucl_2016 = (tm_ucl_summary[in_2016 & in_ucl]
               .dropna(subset=['apps'])
               .rename(columns={'apps':'ucl_2016_apps'}))

In [265]:
# Attach each player's most-played 2016 league and their 2016 Champions League appearances.
# Left joins keep every tm_2016 row; players without a match get NaN.

tm_2016 = (tm_2016
           .merge(tm_club_2016[['pid','league_name']], how = 'left', on = 'pid')
           .merge(tm_ucl_2016[['pid','ucl_2016_apps']], how = 'left', on = 'pid'))

In [266]:
tm_2016.head()

Unnamed: 0,pid,mins_2016,points_2016,apps_2016,assists_2016,goals_2016,conceded_2016,clean_sheets_2016,MPM_2016,PPM_2016,GPM_2016,CS_R,GAPM,league_name,ucl_2016_apps
0,26,1050.0,20.01,11.0,,,18.0,1.0,95.454545,1.819091,,0.090909,1.636364,1.Bundesliga,2.0
1,68,454.0,13.98,9.0,,1.0,,,50.444444,1.553333,0.111111,,,MLS,
2,80,450.0,15.0,5.0,,,7.0,1.0,90.0,3.0,,0.2,1.4,1.Bundesliga,
3,107,34.0,1.0,4.0,,,,,8.5,0.25,,,,MLS,
4,132,18.0,1.0,1.0,,,,,18.0,1.0,,,,HET Liga,


In [267]:
market_values.head(5)

Unnamed: 0,teamid,number,name,pid,main_pos,position,dob,age,nat1,nat2,...,contractuntil,marketval,prevmarketval,team,season,teamurl,year,current_team,current_teamid,country
0,7831.0,1.0,Michael Theo,39657.0,keeper,Keeper,1981-02-11,34.0,Australia,Greece,...,2018-06-30,700000.0,750000.0,Brisbane Roar,15/16,brisbane-roar,2015.0,Brisbane Roar,7831.0,Australia
1,7831.0,21.0,Jamie Young,13435.0,keeper,Keeper,1985-08-25,29.0,England,Australia,...,,100000.0,50000.0,Brisbane Roar,15/16,brisbane-roar,2015.0,Brisbane Roar,7831.0,Australia
2,7831.0,36.0,Andre Jannese,403669.0,keeper,Keeper,1996-10-28,18.0,Australia,,...,,,,Brisbane Roar,15/16,brisbane-roar,2015.0,Brisbane Roar U21,26205.0,Australia
3,7831.0,13.0,Jade North,29583.0,defender,Centre-Back,1982-01-07,33.0,Australia,,...,,400000.0,350000.0,Brisbane Roar,15/16,brisbane-roar,2015.0,Brisbane Roar,7831.0,Australia
4,7831.0,33.0,Luke DeVere,80716.0,defender,Centre-Back,1989-11-05,25.0,Australia,France,...,2017-06-30,400000.0,800000.0,Brisbane Roar,15/16,brisbane-roar,2015.0,Brisbane Roar,7831.0,Australia


In [268]:
# Start the master table: every row of combined, plus the 2016 stats where pid matches

master = combined.merge(tm_2016, how = 'left', on = 'pid')

In [269]:
master.shape

(3138, 46)

In [270]:
master.head()

Unnamed: 0,teamid,number,name,pid,main_pos,position,dob,age,nat1,nat2,...,goals_2016,conceded_2016,clean_sheets_2016,MPM_2016,PPM_2016,GPM_2016,CS_R,GAPM,league_name,ucl_2016_apps
0,8054.0,1.0,Andrew Redmayne,51975,keeper,Keeper,1989-01-13,28.0,Australia,,...,,32.0,4.0,83.111111,1.337778,,0.444444,3.555556,A-League,
1,8054.0,5.0,Jordy Buijs,31111,defender,Centre-Back,1988-12-28,28.0,Netherlands,,...,1.0,,,86.8,1.9995,0.05,,,A-League,
2,8054.0,4.0,Alex Wilkinson,43128,defender,Centre-Back,1984-08-13,33.0,Australia,,...,,,,89.206897,2.478621,,,,A-League,
3,8054.0,22.0,Sebastian Ryall,58120,defender,Centre-Back,1989-07-18,28.0,Australia,,...,,,,53.933333,2.534667,,,,A-League,
4,8054.0,7.0,Michael Zullo,55444,defender,Left-Back,1988-11-09,28.0,Australia,Italy,...,,,,91.071429,2.535714,,,,A-League,


In [271]:
# Prepare the career columns of tm_summary for merging into master

career_cols = ['pid','MPG','PPM','apps','assists','clean_sheets','conceded_goals','goals',
               'intl_caps','intl_goals','minutes']

# The join key and the international columns keep their names;
# every other column gets a _career suffix.
unsuffixed = ('pid','intl_caps','intl_goals')

tm_career = tm_summary[career_cols].rename(
    columns = {name: name + '_career' for name in career_cols if name not in unsuffixed})

In [272]:
# Add the career columns to master (left join on pid keeps every master row)

master = master.merge(tm_career, how = 'left', on = 'pid')

In [273]:
master.shape

(3138, 56)

In [274]:
# Merge master with squawka match stats

# Columns to merge

s_stats.head()

Unnamed: 0,index,idn,attack,defense,possession,goalkeeping,score,match_id,start_year,season_code,goals,assists,team_id,state,age,sub,result,clean_sheet
0,0,67.0,0.0,-27.3,-1.43,0.0,-28.73,8445,2014,14/15,0,0,315.0,1,30.0,0,L,0
1,1,375.0,0.0,55.21,4.42,0.0,59.63,8445,2014,14/15,0,0,46.0,1,31.0,0,W,1
2,2,523.0,10.19,-8.28,-7.88,0.0,-5.97,8445,2014,14/15,0,0,46.0,1,24.0,0,W,1
3,3,728.0,0.0,14.45,17.12,0.0,31.57,8445,2014,14/15,0,0,315.0,1,33.0,0,L,0
4,4,807.0,0.0,-2.04,7.05,0.0,5.01,8445,2014,14/15,0,0,46.0,1,34.0,0,W,1


In [275]:
# Average each player's s_stats scores within a start_year, then keep only 2016

score_cols = ['attack','defense','possession','goalkeeping','score']
s_stats_summary = s_stats.groupby(['idn','start_year'])[score_cols].mean().reset_index()

# Filter to 2016, drop the now-constant year column and tag the scores with the season
s_stats_2016 = (s_stats_summary[s_stats_summary['start_year'] == 2016]
                .drop('start_year', axis = 1)
                .rename(columns = {'attack': 'attack_2016',
                                   'defense': 'defense_2016',
                                   'possession': 'poss_2016',
                                   'goalkeeping': 'gk_2016',
                                   'score': 'score_2016'}))

s_stats_2016.head(3)

Unnamed: 0,idn,attack_2016,defense_2016,poss_2016,gk_2016,score_2016
1,1.0,23.701562,-0.696562,-2.101875,0.0,20.903125
6,7.0,0.176667,14.289167,2.426389,0.0,16.892222
9,8.0,12.7704,5.6104,8.2936,0.0,26.6744


In [276]:
# Add the 2016 average scores to master (left join on idn keeps every master row)

master = master.merge(s_stats_2016, how = 'left', on = 'idn')

In [277]:
master.shape

(3138, 61)

In [278]:
s_info.shape

(297116, 17)

In [279]:
master.columns

Index([u'teamid', u'number', u'name', u'pid', u'main_pos', u'position', u'dob',
       u'age', u'nat1', u'nat2', u'height', u'foot', u'teamsince', u'prevteam',
       u'prevteamid', u'transferfee', u'contractuntil', u'marketval',
       u'prevmarketval', u'team', u'season', u'teamurl', u'year',
       u'current_team', u'current_teamid', u'country', u'marketval_2016',
       u'country_2016', u'name_dob', u'check', u'squawka_name_dob', u'idn',
       u'mins_2016', u'points_2016', u'apps_2016', u'assists_2016',
       u'goals_2016', u'conceded_2016', u'clean_sheets_2016', u'MPM_2016',
       u'PPM_2016', u'GPM_2016', u'CS_R', u'GAPM', u'league_name',
       u'ucl_2016_apps', u'MPG_career', u'PPM_career', u'apps_career',
       u'assists_career', u'clean_sheets_career', u'conceded_goals_career',
       u'goals_career', u'intl_caps', u'intl_goals', u'minutes_career',
       u'attack_2016', u'defense_2016', u'poss_2016', u'gk_2016',
       u'score_2016'],
      dtype='object')

In [280]:
# NOTE(review): this cell (In [280]) runs before `s_info_select` is assigned in the
# following cell, and its saved output already lists the `count` column that a later
# cell adds. It reflects out-of-order execution and fails on a fresh Restart & Run All.
s_info_select.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18882 entries, 0 to 297110
Data columns (total 6 columns):
idn         18882 non-null float64
height_s    17522 non-null float64
weight      17223 non-null float64
bmi         18882 non-null float64
age         18882 non-null float64
count       18882 non-null float64
dtypes: float64(6)
memory usage: 1.0 MB


In [281]:
# Keep only the physical-attribute columns of s_info and drop exact duplicate rows.
# height is renamed to height_s so it does not clash with master's own height column.

s_info_select = (s_info[['idn','height','weight','bmi','age']]
                 .rename(columns = {'height': 'height_s'})
                 .drop_duplicates())


In [282]:
s_info.head()

Unnamed: 0,index,age,bmi,country,dob,first_name,height,last_name,name,position,shirt_num,state,surname,team_id,team_name,weight,idn
0,0,27.0,21.2,Wales,1989-02-07,Neil,175.0,Taylor,Neil Taylor,Defender,0.0,playing,Neil Taylor,46.0,Swansea,65.0,817.0
1,1,32.0,23.0,Wales,1984-08-23,Ashley,183.0,Williams,Ashley Williams,Defender,0.0,playing,Ashley Williams,46.0,Swansea,77.0,810.0
2,2,25.0,22.2,England,1991-05-22,Kyle,185.0,Bartley,Kyle Bartley,Defender,0.0,bench,Kyle Bartley,46.0,Swansea,76.0,382.0
3,3,31.0,23.0,Poland,1985-04-18,Lukasz,190.0,Fabianski,Lukasz Fabianski,Goalkeeper,1.0,playing,Fabianski,46.0,Swansea,83.0,375.0
4,4,28.0,22.6,Korea Republic,1989-01-24,Sung-yueng,187.0,Ki,Ki Sung-yueng,Midfielder,4.0,playing,Ki,46.0,Swansea,79.0,3691.0


In [283]:
# Review duplicated entries

# Number of rows each idn still has after exact duplicates were removed
s_info_select['count'] = s_info_select.groupby('idn')['idn'].transform('count')

# Show a few of the players that still appear more than once, grouped together by idn
has_multiple_rows = s_info_select['count'] != 1
s_info_select.loc[has_multiple_rows].sort_values(by = 'idn').head()

Unnamed: 0,idn,height_s,weight,bmi,age,count
112874,1.0,187.0,80.0,22.9,29.0,3.0
96502,1.0,187.0,80.0,22.9,28.0,3.0
295809,1.0,187.0,78.0,22.3,30.0,3.0
46230,7.0,191.0,84.0,23.0,32.0,3.0
190322,7.0,191.0,84.0,23.0,33.0,3.0


Entries are duplicated because a player's recorded weight changes as they age. I want a dataframe that keeps, for each player, only the entry with the oldest age (i.e. the most recent one).

In [284]:
# One row per distinct player id; the measurement columns are filled in afterwards

s_info_unique_ids = s_info_select[['idn']].drop_duplicates()
s_info_unique_ids.head(2)

Unnamed: 0,idn
0,817.0
1,810.0


In [285]:
def _most_recent_value(value, column, info=None):
    """Return `column` from the highest-age row recorded for player id `value`.

    Parameters
    ----------
    value : player id, compared against the 'idn' column.
    column : name of the column to read ('height_s', 'weight' or 'bmi').
    info : optional DataFrame with 'idn', 'age' and `column`. When omitted,
        the notebook-level `s_info_select` table is used.

    Returns
    -------
    The value of `column` on the row with the largest age for that id, or
    NaN when the id has no rows (previously this raised a KeyError).
    """
    if info is None:
        info = s_info_select  # looked up at call time, as the original functions did

    player_rows = info[info['idn'] == value]
    if player_rows.empty:
        return np.nan

    # Stable sort: when two rows share the same age, the first one in table order wins
    newest_first = player_rows.sort_values(by = 'age', ascending = False, kind = 'mergesort')
    return newest_first[column].iloc[0]


def most_recent_height(value, info=None):
    """Return the height_s recorded at the highest age for player id `value`."""
    return _most_recent_value(value, 'height_s', info)


def most_recent_weight(value, info=None):
    """Return the weight recorded at the highest age for player id `value`."""
    return _most_recent_value(value, 'weight', info)


def most_recent_bmi(value, info=None):
    """Return the bmi recorded at the highest age for player id `value`."""
    return _most_recent_value(value, 'bmi', info)

# For every player keep the measurements from their highest-age row.
# One stable sort plus de-duplication replaces a full scan of s_info_select
# per player id, per column.
latest_measurements = (s_info_select
                       .sort_values(by = 'age', ascending = False, kind = 'mergesort')
                       .drop_duplicates(subset = 'idn', keep = 'first')
                       .set_index('idn'))

# Series.map looks each idn up in the de-duplicated table; ids without a row get NaN
for measure in ['height_s', 'weight', 'bmi']:
    s_info_unique_ids[measure] = s_info_unique_ids['idn'].map(latest_measurements[measure])

In [286]:
# Add each player's latest height_s / weight / bmi to master (left join on idn)
master = master.merge(s_info_unique_ids, how = 'left', on = 'idn')

In [287]:
master.head(30)

Unnamed: 0,teamid,number,name,pid,main_pos,position,dob,age,nat1,nat2,...,intl_goals,minutes_career,attack_2016,defense_2016,poss_2016,gk_2016,score_2016,height_s,weight,bmi
0,8054.0,1.0,Andrew Redmayne,51975,keeper,Keeper,1989-01-13,28.0,Australia,,...,0.0,7298.0,0.0,-9.395556,3.576667,0.0,-5.818889,194.0,84.0,22.3
1,8054.0,5.0,Jordy Buijs,31111,defender,Centre-Back,1988-12-28,28.0,Netherlands,,...,3.0,22132.0,5.535556,17.768889,4.896667,0.0,28.201111,182.0,80.0,24.2
2,8054.0,4.0,Alex Wilkinson,43128,defender,Centre-Back,1984-08-13,33.0,Australia,,...,0.0,21819.0,2.198077,24.034615,3.050385,0.0,29.283077,187.0,85.0,24.3
3,8054.0,22.0,Sebastian Ryall,58120,defender,Centre-Back,1989-07-18,28.0,Australia,,...,1.0,12415.0,0.242222,9.186667,1.485556,0.0,10.914444,180.0,75.0,23.1
4,8054.0,7.0,Michael Zullo,55444,defender,Left-Back,1988-11-09,28.0,Australia,Italy,...,0.0,9834.0,4.0736,23.5464,-9.1056,0.0,18.5144,170.0,63.0,21.8
5,8054.0,23.0,Rhyan Grant,108108,defender,Right-Back,1991-02-26,26.0,Australia,,...,,11269.0,12.59,23.530833,-11.0275,0.0,25.093333,174.0,74.0,24.4
6,8054.0,12.0,Aaron Calver,255162,defender,Right-Back,1996-01-12,21.0,Australia,,...,2.0,2045.0,-0.141667,29.253333,-3.883333,0.0,25.228333,186.0,76.0,22.0
7,8054.0,,Paulo Retre,257685,midfielder,Defensive Midfield,1993-03-04,24.0,Australia,Portugal,...,,2546.0,0.318,-8.046,0.334,0.0,-7.394,170.0,63.0,21.8
8,8054.0,6.0,Joshua Brillante,171307,midfielder,Central Midfield,1993-03-25,24.0,Australia,Italy,...,0.0,7624.0,5.723478,7.532174,3.655217,0.0,16.91087,177.0,73.0,23.3
9,8054.0,13.0,Brandon O'Neill,218371,midfielder,Central Midfield,1994-04-12,23.0,Australia,,...,0.0,4838.0,7.47375,6.725417,4.68375,0.0,18.882917,179.0,78.0,24.3


In [288]:
master.shape

(3138, 64)

In [289]:
master.columns

Index([u'teamid', u'number', u'name', u'pid', u'main_pos', u'position', u'dob',
       u'age', u'nat1', u'nat2', u'height', u'foot', u'teamsince', u'prevteam',
       u'prevteamid', u'transferfee', u'contractuntil', u'marketval',
       u'prevmarketval', u'team', u'season', u'teamurl', u'year',
       u'current_team', u'current_teamid', u'country', u'marketval_2016',
       u'country_2016', u'name_dob', u'check', u'squawka_name_dob', u'idn',
       u'mins_2016', u'points_2016', u'apps_2016', u'assists_2016',
       u'goals_2016', u'conceded_2016', u'clean_sheets_2016', u'MPM_2016',
       u'PPM_2016', u'GPM_2016', u'CS_R', u'GAPM', u'league_name',
       u'ucl_2016_apps', u'MPG_career', u'PPM_career', u'apps_career',
       u'assists_career', u'clean_sheets_career', u'conceded_goals_career',
       u'goals_career', u'intl_caps', u'intl_goals', u'minutes_career',
       u'attack_2016', u'defense_2016', u'poss_2016', u'gk_2016',
       u'score_2016', u'height_s', u'weight', u'bmi'],
    

In [293]:
master.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3138 entries, 0 to 3137
Data columns (total 64 columns):
teamid                   3138 non-null float64
number                   3012 non-null float64
name                     3138 non-null object
pid                      3138 non-null object
main_pos                 3138 non-null object
position                 3138 non-null object
dob                      3138 non-null object
age                      3138 non-null float64
nat1                     3138 non-null object
nat2                     1020 non-null object
height                   3122 non-null float64
foot                     2963 non-null object
teamsince                3128 non-null object
prevteam                 3128 non-null object
prevteamid               3073 non-null float64
transferfee              3138 non-null float64
contractuntil            3025 non-null object
marketval                3138 non-null float64
prevmarketval            3137 non-null float64
team       

In [295]:
# Clean columns

# pid is held as object dtype in master; convert the whole column in one vectorised
# call rather than element by element. Values that cannot be parsed become NaN.
master['pid'] = pd.to_numeric(master['pid'], errors = 'coerce')

In [298]:
# Convert date fields to datetime (values are parsed as year-month-day)

date_fields = ['dob','teamsince','contractuntil']

# Parse every column first, then write the parsed columns back onto master
parsed_dates = {col: pd.to_datetime(master[col], format="%Y-%m-%d") for col in date_fields}
for col, parsed in parsed_dates.items():
    master[col] = parsed

In [299]:
master.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3138 entries, 0 to 3137
Data columns (total 64 columns):
teamid                   3138 non-null float64
number                   3012 non-null float64
name                     3138 non-null object
pid                      3138 non-null float64
main_pos                 3138 non-null object
position                 3138 non-null object
dob                      3138 non-null datetime64[ns]
age                      3138 non-null float64
nat1                     3138 non-null object
nat2                     1020 non-null object
height                   3122 non-null float64
foot                     2963 non-null object
teamsince                3128 non-null datetime64[ns]
prevteam                 3128 non-null object
prevteamid               3073 non-null float64
transferfee              3138 non-null float64
contractuntil            3025 non-null datetime64[ns]
marketval                3138 non-null float64
prevmarketval            3137 non

## Save final data frame to CSV

In [300]:
# Write master to a CSV whose name embeds the current epoch time,
# so each run produces a new file instead of overwriting the previous one.

start_time = time.time()
# Epoch seconds with the decimal point stripped, e.g. 1500000000.12 -> "150000000012"
start_time_stamp = "".join(str(start_time).split("."))

matched_file_name = "master_table_" + start_time_stamp + ".csv"

master.to_csv(matched_file_name, index = False, encoding="utf-8")