In [101]:
import pandas as pd
import numpy as np

In [102]:
data = pd.read_csv("england.csv")
data = data[(data.division != '3S')]
data = data[(data.division != '3N')]
data = data.astype({'division':int})
data = data[data.division == 1]
data.head()

Unnamed: 0,Date,Season,home,visitor,FT,hgoal,vgoal,division,tier,totgoal,goaldif,result
0,1888-09-08,1888,Bolton Wanderers,Derby County,3-6,3,6,1,1,9,-3,A
1,1888-09-08,1888,Everton,Accrington F.C.,2-1,2,1,1,1,3,1,H
2,1888-09-08,1888,Preston North End,Burnley,5-2,5,2,1,1,7,3,H
3,1888-09-08,1888,Stoke City,West Bromwich Albion,0-2,0,2,1,1,2,-2,A
4,1888-09-08,1888,Wolverhampton Wanderers,Aston Villa,1-1,1,1,1,1,2,0,D


# Obtain First Division from season 1980 

In [104]:
data_first_division = data[data.Season >= 1980]
data_first_division.head()

Unnamed: 0,Date,Season,home,visitor,FT,hgoal,vgoal,division,tier,totgoal,goaldif,result
118802,1980-08-16,1980,Birmingham City,Coventry City,3-1,3,1,1,1,4,2,H
118803,1980-08-16,1980,Brighton & Hove Albion,Wolverhampton Wanderers,2-0,2,0,1,1,2,2,H
118804,1980-08-16,1980,Leeds United,Aston Villa,1-2,1,2,1,1,3,-1,A
118805,1980-08-16,1980,Leicester City,Ipswich Town,0-1,0,1,1,1,1,-1,A
118806,1980-08-16,1980,Liverpool,Crystal Palace,3-0,3,0,1,1,3,3,H


# Remove columns unnecessary 

In [105]:
data_first_division = data_first_division.drop(["tier", "FT", "division", "totgoal", "goaldif"], axis = 1)
data_first_division.tail()

Unnamed: 0,Date,Season,home,visitor,hgoal,vgoal,result
198487,2020-07-26,2019,Leicester City,Manchester United,0,2,A
198488,2020-07-26,2019,Manchester City,Norwich City,5,0,H
198489,2020-07-26,2019,Newcastle United,Liverpool,1,3,A
198490,2020-07-26,2019,Southampton,Sheffield United,3,1,H
198491,2020-07-26,2019,West Ham United,Aston Villa,1,1,D


# Separate Date

In [106]:
date = data_first_division.Date.str.split('-',  n=-1, expand=True)
date.columns = ["year", "month", "day"]
date.head()

Unnamed: 0,year,month,day
118802,1980,8,16
118803,1980,8,16
118804,1980,8,16
118805,1980,8,16
118806,1980,8,16


In [107]:
data_first_division = data_first_division.drop("Date", axis = 1)
data_first_division = pd.concat([date, data_first_division], axis=1)
data_first_division.head()

Unnamed: 0,year,month,day,Season,home,visitor,hgoal,vgoal,result
118802,1980,8,16,1980,Birmingham City,Coventry City,3,1,H
118803,1980,8,16,1980,Brighton & Hove Albion,Wolverhampton Wanderers,2,0,H
118804,1980,8,16,1980,Leeds United,Aston Villa,1,2,A
118805,1980,8,16,1980,Leicester City,Ipswich Town,0,1,A
118806,1980,8,16,1980,Liverpool,Crystal Palace,3,0,H


# Create dictionary teams 

In [108]:
teams_set = set(data_first_division.home).union(set(data_first_division.visitor))
team_number = range(len(teams_set))
teams_dict = dict(zip(teams_set, team_number))
teams_dict

{'Watford': 0,
 'Leeds United': 1,
 'Everton': 2,
 'Millwall': 3,
 'Tottenham Hotspur': 4,
 'Norwich City': 5,
 'Fulham': 6,
 'Brighton & Hove Albion': 7,
 'Crystal Palace': 8,
 'Ipswich Town': 9,
 'Liverpool': 10,
 'Charlton Athletic': 11,
 'Swansea City': 12,
 'Cardiff City': 13,
 'Leicester City': 14,
 'Bolton Wanderers': 15,
 'Reading': 16,
 'Manchester City': 17,
 'Burnley': 18,
 'Aston Villa': 19,
 'Derby County': 20,
 'Queens Park Rangers': 21,
 'Southampton': 22,
 'Manchester United': 23,
 'Barnsley': 24,
 'Portsmouth': 25,
 'Bradford City': 26,
 'Hull City': 27,
 'Sheffield United': 28,
 'West Bromwich Albion': 29,
 'Chelsea': 30,
 'Sheffield Wednesday': 31,
 'Oxford United': 32,
 'Wimbledon': 33,
 'Newcastle United': 34,
 'Swindon Town': 35,
 'Wigan Athletic': 36,
 'Blackpool': 37,
 'Nottingham Forest': 38,
 'Notts County': 39,
 'Coventry City': 40,
 'AFC Bournemouth': 41,
 'Wolverhampton Wanderers': 42,
 'Sunderland': 43,
 'Blackburn Rovers': 44,
 'Arsenal': 45,
 'Birmingham

In [109]:
teams_df = pd.DataFrame({"team_name":list(teams_set), "team_number":team_number})
teams_df.head()

Unnamed: 0,team_name,team_number
0,Watford,0
1,Leeds United,1
2,Everton,2
3,Millwall,3
4,Tottenham Hotspur,4


In [110]:
teams_df.to_csv("teams.csv", index=False)

In [111]:
data_first_division.home = data_first_division.home.apply(teams_dict.get)
data_first_division.visitor = data_first_division.visitor.apply(teams_dict.get)
data_first_division.head()

Unnamed: 0,year,month,day,Season,home,visitor,hgoal,vgoal,result
118802,1980,8,16,1980,46,40,3,1,H
118803,1980,8,16,1980,7,42,2,0,H
118804,1980,8,16,1980,1,19,1,2,A
118805,1980,8,16,1980,14,9,0,1,A
118806,1980,8,16,1980,10,8,3,0,H


# Modify Result

In [112]:
result_dict = {"D":1, "H":2, "A":3}
data_first_division.result = data_first_division.result.apply(result_dict.get)
data_first_division.head()

Unnamed: 0,year,month,day,Season,home,visitor,hgoal,vgoal,result
118802,1980,8,16,1980,46,40,3,1,2
118803,1980,8,16,1980,7,42,2,0,2
118804,1980,8,16,1980,1,19,1,2,3
118805,1980,8,16,1980,14,9,0,1,3
118806,1980,8,16,1980,10,8,3,0,2


# Save DataFrame

In [113]:
data_first_division.to_csv("england-clean.csv", index = 0)

In [114]:
data_first_division.tail()

Unnamed: 0,year,month,day,Season,home,visitor,hgoal,vgoal,result
198487,2020,7,26,2019,14,23,0,2,3
198488,2020,7,26,2019,17,5,5,0,2
198489,2020,7,26,2019,34,10,1,3,3
198490,2020,7,26,2019,22,28,3,1,2
198491,2020,7,26,2019,48,19,1,1,1
