# Create a dataset of player positions

I don't currently have player positions in the dataset.

Positions for players from before 1997 are covered by *A Complete Record*.

Players from 1997 onwards are covered by Soccerbase.

In [1]:
from pathlib import Path

import pandas as pd

# Load the Complete Record player stats straight from the project's GitHub repo
COMPLETE_RECORD_URL = 'https://raw.githubusercontent.com/petebrown/complete-record/main/output/player_stats.csv'

df = pd.read_csv(COMPLETE_RECORD_URL)

In [2]:
# Preview the first three rows to see which columns are available
df.head(n=3)

Unnamed: 0,page,row,surname,forename,date_of_birth,dob,dob_qtr,dob_yr,dob_yr_is_est,period,...,lc_gls,other_apps,other_sub,other_gls,total_apps,total_subs,total_gls,prev_club,next_club,is_loan
0,101,1,A'Court,Alan,30-Sep-34,1934-09-30,3.0,1934.0,0,1,...,0,0,0,0,54,0,11,Liverpool,Norwich City,0
1,101,2,Adams,Arthur,,,,,0,1,...,0,0,0,0,4,0,1,,,0
2,101,3,Adams,Edward,12-Nov-08,1908-11-12,4.0,1908.0,0,1,...,0,0,0,0,3,0,0,Bromborough Pool,Bromborough Pool,0


Positions are contained in the `position` column, but some are messy.

In [3]:
# List every distinct position code so the inconsistent ones stand out
df['position'].unique()

array(['OL', 'OR', 'G', 'F', 'RH', 'LB', 'WH', 'CF', 'CD', 'MF', 'IR',
       'CH', 'IF', 'HB', 'FB', 'IL', 'W', 'D', 'RB', 'w', 'LH', 'M', 'S',
       's'], dtype=object)

In [4]:
# Messy position codes and the clean code each one should become
_POSITION_FIXES = {
    'fR': 'IR',
    'I CF': 'CF',
    'I WH': 'WH',
    'M': 'MF',
    'RB·': 'RB',
    's': 'S',
    'w': 'W',
}


def fix_positions(pos):
    '''
    Return the cleaned-up version of a position code.

    Codes listed in the lookup of known messy values are swapped for their
    corrected form; anything else (including missing values) is handed back
    unchanged.
    '''
    return _POSITION_FIXES.get(pos, pos)

# Run every position code through the clean-up function
df['position'] = df['position'].map(fix_positions)

In [5]:
# Full names for the position abbreviations used in the Complete Record data
POSITION_NAMES = {
    # Goalkeeper and defence
    "G": "Goalkeeper",
    "D": "Defender",
    "HB": "Half Back",
    "FB": "Fullback",
    "RB": "Right Back",
    "LB": "Left Back",
    "CH": "Center Half",
    "CD": "Central Defender",
    # Half backs, wingers and midfield
    "WH": "Wing Half",
    "RH": "Right Half",
    "LH": "Left Half",
    "W": "Winger",
    "MF": "Midfielder",
    "OR": "Outside Right",
    "OL": "Outside Left",
    # Forwards
    "IF": "Inside Forward",
    "IR": "Inside Right",
    "IL": "Inside Left",
    "CF": "Center Forward",
    "F": "Forward",
    "S": "Striker",
}

# Translate each abbreviation; any code missing from the lookup becomes NaN
df["comp_rec_pos"] = df["position"].map(POSITION_NAMES)

# Assign `player_id` to each player

I don't currently have a unique identifier for each player.

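Player IDs are created by combining each player's surname and forename with as much of their date of birth as is known (the full date, the quarter and year, or the year only).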

In [6]:
# Keep only the name and date-of-birth fields for building player IDs
id_fields = ['surname', 'forename', 'date_of_birth', 'dob', 'dob_qtr', 'dob_yr', 'dob_yr_is_est']
plr_ids = df.loc[:, id_fields].copy()

In [7]:
def add_player_ids(df):
    '''
    Create a player_id from the surname, forename and date of birth, where available.

    The ID is the surname and forename followed by the most precise birth
    information available on the row:

    - date of birth known         -> surname + forename + dob
    - only quarter and year known -> surname + forename + quarter + year
    - only year known             -> surname + forename + year
    - nothing known               -> surname + forename

    Hyphens, apostrophes and spaces are then removed from the ID.

    Rows that have a birth quarter but no birth year (and no dob) match none of
    the rules above, so this function does not assign them an ID.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `surname`, `forename`, `dob`, `dob_qtr` and `dob_yr`.
        `dob` is assumed to hold text such as '1934-09-30' - confirm if the
        column is ever parsed as a datetime.
        The frame is modified in place: a `player_id` column is added or updated.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    '''

    def _number_text(col):
        # Year and quarter arrive as floats when the column has gaps, so they
        # stringify as e.g. "1934.0". Strip only that trailing ".0", and pass
        # `regex` explicitly so the result doesn't depend on the pandas
        # version's default (an implicit regex ".0" also matches "-0" and "30"
        # inside a date, and a literal ".0" replace would alter names).
        return col.astype(str).str.replace(r'\.0$', '', regex=True)

    name = df.surname + df.forename
    birth_yr = _number_text(df.dob_yr)
    birth_qtr = _number_text(df.dob_qtr)

    no_dob = df.dob.isnull()
    no_qtr = df.dob_qtr.isnull()
    no_yr = df.dob_yr.isnull()

    # Where nothing is known about the player's date of birth, use the name alone
    df.loc[no_dob & no_qtr & no_yr, 'player_id'] = name

    # Where only the year of birth is known, append the year
    df.loc[no_dob & no_qtr & ~no_yr, 'player_id'] = name + birth_yr

    # Where only the birth year and quarter are known, append quarter then year
    df.loc[no_dob & ~no_qtr & ~no_yr, 'player_id'] = name + birth_qtr + birth_yr

    # Where the date of birth is known it takes precedence over everything else
    df.loc[~no_dob, 'player_id'] = name + df.dob

    # Remove the separators that come from dates ("-") and names ("'" and spaces)
    df['player_id'] = df['player_id'].str.replace(r"[-' ]", '', regex=True)

    return df

# Add the player_id column to the name / date-of-birth table
plr_ids = plr_ids.pipe(add_player_ids)

In [8]:
# Show any rows that did not receive a player_id (an empty result means none are missing)
plr_ids.loc[plr_ids['player_id'].isna()]

Unnamed: 0,surname,forename,date_of_birth,dob,dob_qtr,dob_yr,dob_yr_is_est,player_id


# Add player positions to the dataset

In [9]:
# Build the positions table in one pass, working on a copy of `df`:
# add player IDs and a display name, keep the identifying columns plus the
# position, rename `dob`, then collapse repeated rows.
comp_rec_pos = (
    add_player_ids(df.copy())
    .assign(player_name=lambda frame: frame['forename'] + ' ' + frame['surname'])
    .loc[:, ['player_id', 'surname', 'forename', 'player_name', 'dob', 'comp_rec_pos']]
    .rename(columns={'dob': 'player_dob'})
    .drop_duplicates()
    .reset_index(drop=True)
)

In [10]:
# Preview the first five rows of the finished positions table
comp_rec_pos.head(n=5)

Unnamed: 0,player_id,surname,forename,player_name,player_dob,comp_rec_pos
0,ACourtAlan19340930,A'Court,Alan,Alan A'Court,1934-09-30,Outside Left
1,AdamsArthur,Adams,Arthur,Arthur Adams,,Outside Right
2,AdamsEdward19081112,Adams,Edward,Edward Adams,1908-11-12,Outside Right
3,AdkinsNigel19650311,Adkins,Nigel,Nigel Adkins,1965-03-11,Goalkeeper
4,AlcockEdward1914,Alcock,Edward,Edward Alcock,,Outside Left


In [11]:
# Write the positions table to disk. The output folder is created first so the
# export doesn't fail with a missing-directory error on a fresh checkout.
output_path = Path('./output/complete-record.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

comp_rec_pos.to_csv(output_path, index=False)