# Import necessary libraries

In [1]:
import time
from io import StringIO

import html5lib
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Lift the display limits so wide/long DataFrames are shown in full.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Uncomment to install html5lib if it is missing from the environment:
# !pip install html5lib

In [None]:
# The shooting, passing, goalkeeping, passing types, goal and shot creation, defense, possession,
# and miscellaneous statistics for English Premier League teams are scraped.
# The dataset covers these statistics for the seasons from 2013 to 2017.
# The BeautifulSoup library is used to scrape the data.
# The dataset is then used to test (validate) a classification model that predicts the result of football matches.

# Shooting table

In [2]:
# Season-less league landing page; the cells below pull the squad links from its standings table.
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [3]:
# Download the league page. The timeout prevents an indefinite hang, and
# raise_for_status() surfaces HTTP errors (e.g. a rate-limit response) here
# rather than as a confusing parse failure further down.
data = requests.get(standings_url, timeout=30)
data.raise_for_status()

In [4]:
# Parse the league page and collect the squad-page hrefs from the first stats table.
# Naming the parser keeps the result independent of which optional parsers are installed.
soup = BeautifulSoup(data.text, "html.parser")
standings_table = soup.select('table.stats_table')[0]
links = [a.get("href") for a in standings_table.find_all('a')]
# Anchors without an href return None, so guard before the substring test.
links = [href for href in links if href and '/squads/' in href]

In [5]:
# Prefix the host to each href to get the full squad-page URLs.
team_urls = [f"https://fbref.com{l}" for l in links]

In [6]:
# Fetch the first squad page as a worked example; bound the wait and fail on HTTP errors.
data = requests.get(team_urls[0], timeout=30)
data.raise_for_status()

In [7]:
# Take the first table whose text matches "Scores & Fixtures".
# pandas >= 2.1 deprecates raw HTML strings, so pass a file-like object instead.
matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")[0]

In [8]:
# Locate the squad's all-competitions shooting match-log link on the squad page.
# Naming the parser keeps the result independent of which optional parsers are installed.
soup = BeautifulSoup(data.text, "html.parser")
links = [a.get("href") for a in soup.find_all('a')]
links = [href for href in links if href and 'all_comps/shooting/' in href]

In [9]:
# Fetch the shooting match-log page; bound the wait and fail on HTTP errors.
data = requests.get(f"https://fbref.com{links[0]}", timeout=30)
data.raise_for_status()

In [10]:
# Take the first table whose text matches "Shooting".
# pandas >= 2.1 deprecates raw HTML strings, so pass a file-like object instead.
shooting = pd.read_html(StringIO(data.text), match="Shooting")[0]

In [11]:
# Preview the raw shooting log; its header has two levels (stat group over stat name).
shooting.head()

Unnamed: 0_level_0,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,Standard,Standard,Standard,Standard,Standard,Standard,Standard,Standard,Standard,Standard,Expected,Expected,Expected,Expected,Expected,Unnamed: 25_level_0
Unnamed: 0_level_1,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,Gls,Sh,SoT,SoT%,G/Sh,G/SoT,Dist,FK,PK,PKatt,xG,npxG,npxG/Sh,G-xG,np:G-xG,Match Report
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2,0,Crystal Palace,1,10,2,20.0,0.1,0.5,14.6,1,0,0,1.0,1.0,0.1,0.0,0.0,Match Report
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4,2,Leicester City,4,19,7,36.8,0.21,0.57,13.0,0,0,0,2.7,2.7,0.16,1.3,1.3,Match Report
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3,0,Bournemouth,3,14,6,42.9,0.21,0.5,14.8,0,0,0,1.3,1.3,0.1,1.7,1.7,Match Report
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2,1,Fulham,2,22,8,36.4,0.09,0.25,15.5,1,0,0,2.6,2.6,0.12,-0.6,-0.6,Match Report
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2,1,Aston Villa,2,22,8,36.4,0.09,0.25,16.3,1,0,0,2.4,2.4,0.12,-0.4,-0.4,Match Report


In [12]:
# Flatten the two-level header to the stat names only. get_level_values(-1) also
# accepts an already-flat Index, so re-running this cell no longer raises
# (droplevel() fails once only one level is left).
shooting.columns = shooting.columns.get_level_values(-1)

In [13]:
# Join the fixtures with selected shooting columns on the match date.
# Columns by position: Date, Sh, SoT, Dist, FK, PK, PKatt.
shooting_data = pd.merge(
    matches,
    shooting.iloc[:, [0, 11, 12, 16, 17, 18, 19]],
    on="Date",
)

In [14]:
# Check the merged frame: fixture columns followed by the selected shooting columns.
shooting_data.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,Sh,SoT,Dist,FK,PK,PKatt
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,Crystal Palace,1.0,1.2,44.0,25286.0,Martin Ødegaard,4-3-3,Anthony Taylor,Match Report,,10,2,14.6,1,0,0
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,Leicester City,2.7,0.5,50.0,60033.0,Martin Ødegaard,4-2-3-1,Darren England,Match Report,,19,7,13.0,0,0,0
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,Bournemouth,1.3,0.3,57.0,10423.0,Martin Ødegaard,4-2-3-1,Craig Pawson,Match Report,,14,6,14.8,0,0,0
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,2.6,0.8,71.0,60164.0,Martin Ødegaard,4-2-3-1,Jarred Gillett,Match Report,,22,8,15.5,1,0,0
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,2.4,0.4,59.0,60012.0,Martin Ødegaard,4-2-3-1,Robert Jones,Match Report,,22,8,16.3,1,0,0


# Shooting dataframe

In [15]:
# Season labels attached to the scraped rows, newest first.
years = list(reversed(range(2013, 2018)))
# Accumulator for the per-team shooting frames built by the scraping loop.
all_shooting_data = list()

In [16]:
# Reset the start page: the scraping loop below overwrites standings_url on every pass,
# so it has to be re-initialised before the loop is (re-)run.
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [17]:
# Scrape one league-season page per entry in `years` and, for every squad on it,
# join the squad's fixtures with its shooting match log (Premier League rows only).
#
# NOTE(review): `year` only labels the rows. The page actually scraped is chosen by
# starting at `standings_url` and following the page's `a.prev` link, so the label
# matches the scraped season only if the start page is the `years[0]` season. The
# saved preview shows 2022-08 fixtures tagged with season 2017 -- confirm the intent.
for year in years:
    data = requests.get(standings_url, timeout=30)
    data.raise_for_status()  # surface rate-limit/error pages instead of parsing them
    soup = BeautifulSoup(data.text, "html.parser")
    standings_table = soup.select('table.stats_table')[0]

    links = [a.get("href") for a in standings_table.find_all('a')]
    # Anchors without an href return None, so guard before the substring test.
    links = [href for href in links if href and '/squads/' in href]
    team_urls = [f"https://fbref.com{href}" for href in links]

    # Queue the page behind the `a.prev` link for the next pass of the outer loop.
    previous_season = soup.select("a.prev")[0].get("href")
    standings_url = f"https://fbref.com{previous_season}"

    for team_url in team_urls:
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
        try:
            data = requests.get(team_url, timeout=30)
            data.raise_for_status()
            # pandas >= 2.1 deprecates raw HTML strings; pass a file-like object.
            matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")[0]

            soup = BeautifulSoup(data.text, "html.parser")
            links = [a.get("href") for a in soup.find_all('a')]
            links = [href for href in links if href and 'all_comps/shooting/' in href]
            if not links:
                # Report the gap instead of aborting the whole scrape on links[0].
                print(f"Skipping {team_name} ({year}): no shooting log link found")
                continue

            data = requests.get(f"https://fbref.com{links[0]}", timeout=30)
            data.raise_for_status()
            shooting = pd.read_html(StringIO(data.text), match="Shooting")[0]
            shooting.columns = shooting.columns.droplevel()
            try:
                # Columns by position: Date, Sh, SoT, Dist, FK, PK, PKatt.
                shooting_data = matches.merge(shooting.iloc[:, [0, 11, 12, 16, 17, 18, 19]], on="Date")
            except ValueError as err:
                # Still best-effort, but say which team was dropped and why.
                print(f"Skipping {team_name} ({year}): merge failed ({err})")
                continue

            # assign() builds a new frame, avoiding SettingWithCopyWarning on the filtered slice.
            shooting_data = shooting_data[shooting_data["Comp"] == "Premier League"].assign(
                Season=year, Team=team_name
            )
            all_shooting_data.append(shooting_data)
        finally:
            # Throttle every iteration, including skipped ones.
            time.sleep(1)

In [18]:
# Number of per-team frames collected (one per team per season page scraped).
len(all_shooting_data)

100

In [19]:
# Stack the per-team frames into one table. The index is not reset, so row labels
# repeat across teams.
shooting_df = pd.concat(all_shooting_data)

In [20]:
# Normalise the column names to lower case.
shooting_df.columns = shooting_df.columns.str.lower()

In [21]:
# Preview the combined table, including the added season and team columns.
shooting_df.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,xg,xga,poss,attendance,captain,formation,referee,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,Crystal Palace,1.0,1.2,44.0,25286.0,Martin Ødegaard,4-3-3,Anthony Taylor,Match Report,,10.0,2.0,14.6,1.0,0.0,0.0,2017,Arsenal
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,Leicester City,2.7,0.5,50.0,60033.0,Martin Ødegaard,4-2-3-1,Darren England,Match Report,,19.0,7.0,13.0,0.0,0.0,0.0,2017,Arsenal
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,Bournemouth,1.3,0.3,57.0,10423.0,Martin Ødegaard,4-2-3-1,Craig Pawson,Match Report,,14.0,6.0,14.8,0.0,0.0,0.0,2017,Arsenal
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,2.6,0.8,71.0,60164.0,Martin Ødegaard,4-2-3-1,Jarred Gillett,Match Report,,22.0,8.0,15.5,1.0,0.0,0.0,2017,Arsenal
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,2.4,0.4,59.0,60012.0,Martin Ødegaard,4-2-3-1,Robert Jones,Match Report,,22.0,8.0,16.3,1.0,0.0,0.0,2017,Arsenal


In [22]:
# Persist the combined shooting table; the DataFrame index is written as the first column.
# NOTE(review): absolute, machine-specific path -- this cell fails on any other machine;
# consider a path relative to the project directory.
shooting_df.to_csv("/Users/ozguryildirim/Desktop/5-Ironhack-Bootcamp/20221031-20221104_9th_Week_FinalProject/csv_validation/shooting_validation.csv")


In [23]:
# (rows, columns) of the combined shooting table.
shooting_df.shape

(3292, 27)

# Passing table

In [24]:
# Season-less league landing page; the cells below pull the squad links from its standings table.
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [25]:
# Download the league page. The timeout prevents an indefinite hang, and
# raise_for_status() surfaces HTTP errors (e.g. a rate-limit response) here
# rather than as a confusing parse failure further down.
data = requests.get(standings_url, timeout=30)
data.raise_for_status()

In [26]:
# Parse the league page and collect the squad-page hrefs from the first stats table.
# Naming the parser keeps the result independent of which optional parsers are installed.
soup = BeautifulSoup(data.text, "html.parser")
standings_table = soup.select('table.stats_table')[0]
links = [a.get("href") for a in standings_table.find_all('a')]
# Anchors without an href return None, so guard before the substring test.
links = [href for href in links if href and '/squads/' in href]

In [27]:
# Prefix the host to each href to get the full squad-page URLs.
team_urls = [f"https://fbref.com{l}" for l in links]

In [28]:
# Fetch the first squad page as a worked example; bound the wait and fail on HTTP errors.
data = requests.get(team_urls[0], timeout=30)
data.raise_for_status()

In [29]:
# Take the first table whose text matches "Scores & Fixtures".
# pandas >= 2.1 deprecates raw HTML strings, so pass a file-like object instead.
matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")[0]

In [30]:
# Locate the squad's all-competitions passing match-log link on the squad page.
# Naming the parser keeps the result independent of which optional parsers are installed.
soup = BeautifulSoup(data.text, "html.parser")
links = [a.get("href") for a in soup.find_all('a')]
links = [href for href in links if href and 'all_comps/passing/' in href]

In [31]:
# Fetch the passing match-log page; bound the wait and fail on HTTP errors.
data = requests.get(f"https://fbref.com{links[0]}", timeout=30)
data.raise_for_status()

In [32]:
# Take the first table whose text matches "Passing".
# pandas >= 2.1 deprecates raw HTML strings, so pass a file-like object instead.
passing = pd.read_html(StringIO(data.text), match="Passing")[0]

In [33]:
# Preview the raw passing log; Cmp/Att/Cmp% repeat under the Total, Short, Medium and Long groups.
passing.head()

Unnamed: 0_level_0,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,Total,Total,Total,Total,Total,Short,Short,Short,Medium,Medium,Medium,Long,Long,Long,Unnamed: 24_level_0,Unnamed: 25_level_0,Unnamed: 26_level_0,Unnamed: 27_level_0,Unnamed: 28_level_0,Unnamed: 29_level_0,Unnamed: 30_level_0,Unnamed: 31_level_0,Unnamed: 32_level_0
Unnamed: 0_level_1,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,Cmp,Att,Cmp%,TotDist,PrgDist,Cmp,Att,Cmp%,Cmp,Att,Cmp%,Cmp,Att,Cmp%,Ast,xAG,xA,KP,1/3,PPA,CrsPA,Prog,Match Report
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2,0,Crystal Palace,378,469,80.6,6643,2559,171,201,85.1,156,176,88.6,34,60,56.7,1,0.6,0.8,7,34,10,0,32,Match Report
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4,2,Leicester City,407,499,81.6,6642,2321,212,232,91.4,148,173,85.5,31,61,50.8,3,1.5,1.0,14,29,7,2,22,Match Report
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3,0,Bournemouth,534,614,87.0,8331,2663,286,307,93.2,188,206,91.3,34,61,55.7,2,0.8,0.6,9,24,6,2,22,Match Report
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2,1,Fulham,534,637,83.8,9990,3197,221,252,87.7,246,268,91.8,62,91,68.1,2,2.1,1.3,16,42,17,3,47,Match Report
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2,1,Aston Villa,452,549,82.3,8154,3066,186,203,91.6,210,244,86.1,48,76,63.2,1,1.8,1.5,15,40,16,0,39,Match Report


In [34]:
# Flatten the two-level header to the stat names only. get_level_values(-1) also
# accepts an already-flat Index, so re-running this cell no longer raises
# (droplevel() fails once only one level is left).
# Names repeated across groups (Cmp, Att, Cmp%) become duplicates, which is why the
# following merge selects columns by position.
passing.columns = passing.columns.get_level_values(-1)

In [35]:
# Join the fixtures with selected passing columns on the match date.
# Columns by position: Date, Cmp, Att, TotDist, PrgDist (Total group),
# Ast, xAG, xA, KP, 1/3, PPA, CrsPA.
passing_data = pd.merge(
    matches,
    passing.iloc[:, [0, 10, 11, 13, 14, 24, 25, 26, 27, 28, 29, 30]],
    on="Date",
)

In [36]:
# Check the merged frame: fixture columns followed by the selected passing columns.
passing_data.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,Cmp,Att,TotDist,PrgDist,Ast,xAG,xA,KP,1/3,PPA,CrsPA
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,Crystal Palace,1.0,1.2,44.0,25286.0,Martin Ødegaard,4-3-3,Anthony Taylor,Match Report,,378,469,6643,2559,1,0.6,0.8,7,34,10,0
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,Leicester City,2.7,0.5,50.0,60033.0,Martin Ødegaard,4-2-3-1,Darren England,Match Report,,407,499,6642,2321,3,1.5,1.0,14,29,7,2
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,Bournemouth,1.3,0.3,57.0,10423.0,Martin Ødegaard,4-2-3-1,Craig Pawson,Match Report,,534,614,8331,2663,2,0.8,0.6,9,24,6,2
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,2.6,0.8,71.0,60164.0,Martin Ødegaard,4-2-3-1,Jarred Gillett,Match Report,,534,637,9990,3197,2,2.1,1.3,16,42,17,3
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,2.4,0.4,59.0,60012.0,Martin Ødegaard,4-2-3-1,Robert Jones,Match Report,,452,549,8154,3066,1,1.8,1.5,15,40,16,0


# Passing dataframe

In [37]:
# Season labels attached to the scraped rows, newest first.
years = list(reversed(range(2013, 2018)))
# Accumulator for the per-team passing frames built by the scraping loop.
all_passing_data = list()

In [38]:
# Reset the start page: the scraping loop below overwrites standings_url on every pass,
# so it has to be re-initialised before the loop is (re-)run.
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [39]:
# Scrape one league-season page per entry in `years` and, for every squad on it,
# join the squad's fixtures with its passing match log (Premier League rows only).
#
# NOTE(review): `year` only labels the rows. The page actually scraped is chosen by
# starting at `standings_url` and following the page's `a.prev` link, so the label
# matches the scraped season only if the start page is the `years[0]` season. The
# saved preview shows 2022-08 fixtures tagged with season 2017 -- confirm the intent.
for year in years:
    data = requests.get(standings_url, timeout=30)
    data.raise_for_status()  # surface rate-limit/error pages instead of parsing them
    soup = BeautifulSoup(data.text, "html.parser")
    standings_table = soup.select('table.stats_table')[0]

    links = [a.get("href") for a in standings_table.find_all('a')]
    # Anchors without an href return None, so guard before the substring test.
    links = [href for href in links if href and '/squads/' in href]
    team_urls = [f"https://fbref.com{href}" for href in links]

    # Queue the page behind the `a.prev` link for the next pass of the outer loop.
    previous_season = soup.select("a.prev")[0].get("href")
    standings_url = f"https://fbref.com{previous_season}"

    for team_url in team_urls:
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
        try:
            data = requests.get(team_url, timeout=30)
            data.raise_for_status()
            # pandas >= 2.1 deprecates raw HTML strings; pass a file-like object.
            matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")[0]

            soup = BeautifulSoup(data.text, "html.parser")
            links = [a.get("href") for a in soup.find_all('a')]
            links = [href for href in links if href and 'all_comps/passing/' in href]
            if not links:
                # Report the gap instead of aborting the whole scrape on links[0].
                print(f"Skipping {team_name} ({year}): no passing log link found")
                continue

            data = requests.get(f"https://fbref.com{links[0]}", timeout=30)
            data.raise_for_status()
            passing = pd.read_html(StringIO(data.text), match="Passing")[0]
            passing.columns = passing.columns.droplevel()
            try:
                # Columns by position: Date, Cmp, Att, TotDist, PrgDist (Total group),
                # Ast, xAG, xA, KP, 1/3, PPA, CrsPA.
                passing_data = matches.merge(
                    passing.iloc[:, [0, 10, 11, 13, 14, 24, 25, 26, 27, 28, 29, 30]], on="Date"
                )
            except ValueError as err:
                # Still best-effort, but say which team was dropped and why.
                print(f"Skipping {team_name} ({year}): merge failed ({err})")
                continue

            # assign() builds a new frame, avoiding SettingWithCopyWarning on the filtered slice.
            passing_data = passing_data[passing_data["Comp"] == "Premier League"].assign(
                Season=year, Team=team_name
            )
            all_passing_data.append(passing_data)
        finally:
            # Throttle every iteration, including skipped ones.
            time.sleep(1)

In [40]:
# Number of per-team frames collected (one per team per season page scraped).
len(all_passing_data)

100

In [41]:
# Stack the per-team frames into one table. The index is not reset, so row labels
# repeat across teams.
passing_df = pd.concat(all_passing_data)

In [42]:
# Normalise the column names to lower case.
passing_df.columns = passing_df.columns.str.lower()

In [43]:
# Preview the combined table, including the added season and team columns.
passing_df.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,xg,xga,poss,attendance,captain,formation,referee,match report,notes,cmp,att,totdist,prgdist,ast,xag,xa,kp,1/3,ppa,crspa,season,team
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,Crystal Palace,1.0,1.2,44.0,25286.0,Martin Ødegaard,4-3-3,Anthony Taylor,Match Report,,378.0,469.0,6643.0,2559.0,1.0,0.6,0.8,7.0,34.0,10.0,0.0,2017,Arsenal
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,Leicester City,2.7,0.5,50.0,60033.0,Martin Ødegaard,4-2-3-1,Darren England,Match Report,,407.0,499.0,6642.0,2321.0,3.0,1.5,1.0,14.0,29.0,7.0,2.0,2017,Arsenal
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,Bournemouth,1.3,0.3,57.0,10423.0,Martin Ødegaard,4-2-3-1,Craig Pawson,Match Report,,534.0,614.0,8331.0,2663.0,2.0,0.8,0.6,9.0,24.0,6.0,2.0,2017,Arsenal
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,2.6,0.8,71.0,60164.0,Martin Ødegaard,4-2-3-1,Jarred Gillett,Match Report,,534.0,637.0,9990.0,3197.0,2.0,2.1,1.3,16.0,42.0,17.0,3.0,2017,Arsenal
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,2.4,0.4,59.0,60012.0,Martin Ødegaard,4-2-3-1,Robert Jones,Match Report,,452.0,549.0,8154.0,3066.0,1.0,1.8,1.5,15.0,40.0,16.0,0.0,2017,Arsenal


In [44]:
# Persist the combined passing table; the DataFrame index is written as the first column.
# NOTE(review): absolute, machine-specific path -- this cell fails on any other machine;
# consider a path relative to the project directory.
passing_df.to_csv("/Users/ozguryildirim/Desktop/5-Ironhack-Bootcamp/20221031-20221104_9th_Week_FinalProject/csv_validation/passing_validation.csv")


In [45]:
# (rows, columns) of the combined passing table.
passing_df.shape

(3292, 32)

# Goalkeeping table

In [2]:
# Season-less league landing page; the cells below pull the squad links from its standings table.
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [3]:
# Download the league page. The timeout prevents an indefinite hang, and
# raise_for_status() surfaces HTTP errors (e.g. a rate-limit response) here
# rather than as a confusing parse failure further down.
data = requests.get(standings_url, timeout=30)
data.raise_for_status()

In [4]:
# Parse the league page and collect the squad-page hrefs from the first stats table.
# Naming the parser keeps the result independent of which optional parsers are installed.
soup = BeautifulSoup(data.text, "html.parser")
standings_table = soup.select('table.stats_table')[0]
links = [a.get("href") for a in standings_table.find_all('a')]
# Anchors without an href return None, so guard before the substring test.
links = [href for href in links if href and '/squads/' in href]

In [5]:
# Prefix the host to each href to get the full squad-page URLs.
team_urls = [f"https://fbref.com{l}" for l in links]

In [6]:
# Fetch the first squad page as a worked example; bound the wait and fail on HTTP errors.
data = requests.get(team_urls[0], timeout=30)
data.raise_for_status()

In [7]:
# Take the first table whose text matches "Scores & Fixtures".
# pandas >= 2.1 deprecates raw HTML strings, so pass a file-like object instead.
matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")[0]

In [8]:
# Locate the squad's all-competitions goalkeeping match-log link on the squad page.
# Naming the parser keeps the result independent of which optional parsers are installed.
soup = BeautifulSoup(data.text, "html.parser")
links = [a.get("href") for a in soup.find_all('a')]
links = [href for href in links if href and 'all_comps/keeper/' in href]

In [9]:
# Fetch the goalkeeping match-log page; bound the wait and fail on HTTP errors.
data = requests.get(f"https://fbref.com{links[0]}", timeout=30)
data.raise_for_status()

In [10]:
# Take the first table whose text matches "Goalkeeping".
# pandas >= 2.1 deprecates raw HTML strings, so pass a file-like object instead.
goalkeeping = pd.read_html(StringIO(data.text), match="Goalkeeping")[0]

In [11]:
# Preview the raw goalkeeping log; names such as Att, Launch% and AvgLen repeat under several groups.
goalkeeping.head()

Unnamed: 0_level_0,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,Performance,Performance,Performance,Performance,Performance,Performance,Performance,Penalty Kicks,Penalty Kicks,Penalty Kicks,Penalty Kicks,Launched,Launched,Launched,Passes,Passes,Passes,Passes,Goal Kicks,Goal Kicks,Goal Kicks,Crosses,Crosses,Crosses,Sweeper,Sweeper,Unnamed: 36_level_0
Unnamed: 0_level_1,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,SoTA,GA,Saves,Save%,CS,PSxG,PSxG+/-,PKatt,PKA,PKsv,PKm,Cmp,Att,Cmp%,Att,Thr,Launch%,AvgLen,Att,Launch%,AvgLen,Opp,Stp,Stp%,#OPA,AvgDist,Match Report
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2,0,Crystal Palace,2,0,2,100.0,1,0.3,0.3,0,0,0,0,8,15,53.3,30,4,46.7,36.5,2,50.0,43.5,16,2,12.5,1,15.7,Match Report
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4,2,Leicester City,2,2,1,0.0,0,0.5,-0.5,0,0,0,0,5,13,38.5,23,3,43.5,37.3,3,100.0,72.3,6,0,0.0,1,17.0,Match Report
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3,0,Bournemouth,1,0,1,100.0,1,0.1,0.1,0,0,0,0,5,13,38.5,23,2,39.1,33.6,4,100.0,72.3,12,0,0.0,0,7.0,Match Report
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2,1,Fulham,3,1,2,66.7,0,1.1,0.1,0,0,0,0,3,10,30.0,22,5,31.8,30.8,8,37.5,34.6,10,2,20.0,0,14.5,Match Report
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2,1,Aston Villa,3,1,2,66.7,0,0.2,-0.8,0,0,0,0,3,13,23.1,22,9,40.9,37.6,5,80.0,61.8,9,0,0.0,0,,Match Report


In [12]:
# Flatten the two-level header to the stat names only. get_level_values(-1) also
# accepts an already-flat Index, so re-running this cell no longer raises
# (droplevel() fails once only one level is left).
# Names repeated across groups (Att, Launch%, AvgLen, ...) become duplicates, which
# is why the following merge selects columns by position.
goalkeeping.columns = goalkeeping.columns.get_level_values(-1)

In [13]:
# Join the fixtures with selected goalkeeping columns on the match date.
# Columns by position: Date, Saves, CS, PKsv, Att (Launched group),
# Launch% and AvgLen (Passes group), Stp, #OPA.
goalkeeping_data = pd.merge(
    matches,
    goalkeeping.iloc[:, [0, 12, 14, 19, 22, 26, 27, 32, 34]],
    on="Date",
)

In [14]:
# Check the merged frame: fixture columns followed by the selected goalkeeping columns.
goalkeeping_data.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,Saves,CS,PKsv,Att,Launch%,AvgLen,Stp,#OPA
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,Crystal Palace,1.0,1.2,44.0,25286.0,Martin Ødegaard,4-3-3,Anthony Taylor,Match Report,,2,1,0,15,46.7,36.5,2,1
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,Leicester City,2.7,0.5,50.0,60033.0,Martin Ødegaard,4-2-3-1,Darren England,Match Report,,1,0,0,13,43.5,37.3,0,1
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,Bournemouth,1.3,0.3,57.0,10423.0,Martin Ødegaard,4-2-3-1,Craig Pawson,Match Report,,1,1,0,13,39.1,33.6,0,0
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,2.6,0.8,71.0,60164.0,Martin Ødegaard,4-2-3-1,Jarred Gillett,Match Report,,2,0,0,10,31.8,30.8,2,0
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,2.4,0.4,59.0,60012.0,Martin Ødegaard,4-2-3-1,Robert Jones,Match Report,,2,0,0,13,40.9,37.6,0,0


# Goalkeeping dataframe

In [15]:
# Season labels attached to the scraped rows, newest first.
years = list(reversed(range(2013, 2018)))
# Accumulator for the per-team goalkeeping frames built by the scraping loop.
all_goalkeeping_data = list()

In [16]:
# Reset the start page: the scraping loop below overwrites standings_url on every pass,
# so it has to be re-initialised before the loop is (re-)run.
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [17]:
# Scrape one league-season page per entry in `years` and, for every squad on it,
# join the squad's fixtures with its goalkeeping match log (Premier League rows only).
#
# NOTE(review): `year` only labels the rows. The page actually scraped is chosen by
# starting at `standings_url` and following the page's `a.prev` link, so the label
# matches the scraped season only if the start page is the `years[0]` season. The
# saved preview shows 2022-08 fixtures tagged with season 2017 -- confirm the intent.
for year in years:
    data = requests.get(standings_url, timeout=30)
    data.raise_for_status()  # surface rate-limit/error pages instead of parsing them
    soup = BeautifulSoup(data.text, "html.parser")
    standings_table = soup.select('table.stats_table')[0]

    links = [a.get("href") for a in standings_table.find_all('a')]
    # Anchors without an href return None, so guard before the substring test.
    links = [href for href in links if href and '/squads/' in href]
    team_urls = [f"https://fbref.com{href}" for href in links]

    # Queue the page behind the `a.prev` link for the next pass of the outer loop.
    previous_season = soup.select("a.prev")[0].get("href")
    standings_url = f"https://fbref.com{previous_season}"

    for team_url in team_urls:
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
        try:
            data = requests.get(team_url, timeout=30)
            data.raise_for_status()
            # pandas >= 2.1 deprecates raw HTML strings; pass a file-like object.
            matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")[0]

            soup = BeautifulSoup(data.text, "html.parser")
            links = [a.get("href") for a in soup.find_all('a')]
            links = [href for href in links if href and 'all_comps/keeper/' in href]
            if not links:
                # Report the gap instead of aborting the whole scrape on links[0].
                print(f"Skipping {team_name} ({year}): no goalkeeping log link found")
                continue

            data = requests.get(f"https://fbref.com{links[0]}", timeout=30)
            data.raise_for_status()
            goalkeeping = pd.read_html(StringIO(data.text), match="Goalkeeping")[0]
            goalkeeping.columns = goalkeeping.columns.droplevel()
            try:
                # Columns by position: Date, Saves, CS, PKsv, Att (Launched group),
                # Launch% and AvgLen (Passes group), Stp, #OPA.
                goalkeeping_data = matches.merge(
                    goalkeeping.iloc[:, [0, 12, 14, 19, 22, 26, 27, 32, 34]], on="Date"
                )
            except ValueError as err:
                # Still best-effort, but say which team was dropped and why.
                print(f"Skipping {team_name} ({year}): merge failed ({err})")
                continue

            # assign() builds a new frame, avoiding SettingWithCopyWarning on the filtered slice.
            goalkeeping_data = goalkeeping_data[goalkeeping_data["Comp"] == "Premier League"].assign(
                Season=year, Team=team_name
            )
            all_goalkeeping_data.append(goalkeeping_data)
        finally:
            # Throttle every iteration, including skipped ones.
            time.sleep(1)

In [18]:
# Number of per-team frames collected (one per team per season page scraped).
len(all_goalkeeping_data)

100

In [19]:
# Stack the per-team frames into one table. The index is not reset, so row labels
# repeat across teams.
goalkeeping_df = pd.concat(all_goalkeeping_data)

In [20]:
# Normalise the column names to lower case.
goalkeeping_df.columns = goalkeeping_df.columns.str.lower()

In [21]:
# Preview the combined table, including the added season and team columns.
goalkeeping_df.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,xg,xga,poss,attendance,captain,formation,referee,match report,notes,saves,cs,pksv,att,launch%,avglen,stp,#opa,season,team
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,Crystal Palace,1.0,1.2,44.0,25286.0,Martin Ødegaard,4-3-3,Anthony Taylor,Match Report,,2.0,1.0,0.0,15.0,46.7,36.5,2.0,1.0,2017,Arsenal
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,Leicester City,2.7,0.5,50.0,60033.0,Martin Ødegaard,4-2-3-1,Darren England,Match Report,,1.0,0.0,0.0,13.0,43.5,37.3,0.0,1.0,2017,Arsenal
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,Bournemouth,1.3,0.3,57.0,10423.0,Martin Ødegaard,4-2-3-1,Craig Pawson,Match Report,,1.0,1.0,0.0,13.0,39.1,33.6,0.0,0.0,2017,Arsenal
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,2.6,0.8,71.0,60164.0,Martin Ødegaard,4-2-3-1,Jarred Gillett,Match Report,,2.0,0.0,0.0,10.0,31.8,30.8,2.0,0.0,2017,Arsenal
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,2.4,0.4,59.0,60012.0,Martin Ødegaard,4-2-3-1,Robert Jones,Match Report,,2.0,0.0,0.0,13.0,40.9,37.6,0.0,0.0,2017,Arsenal


In [22]:
# Persist the combined goalkeeping table; the DataFrame index is written as the first column.
# NOTE(review): absolute, machine-specific path -- this cell fails on any other machine;
# consider a path relative to the project directory.
goalkeeping_df.to_csv("/Users/ozguryildirim/Desktop/5-Ironhack-Bootcamp/20221031-20221104_9th_Week_FinalProject/csv_validation/goalkeeping_validation.csv")

In [23]:
# (rows, columns) of the combined goalkeeping table.
goalkeeping_df.shape

(3292, 29)

# Passing types table

In [24]:
# Season-less league landing page; the cells below pull the squad links from its standings table.
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [25]:
# Download the league page. The timeout prevents an indefinite hang, and
# raise_for_status() surfaces HTTP errors (e.g. a rate-limit response) here
# rather than as a confusing parse failure further down.
data = requests.get(standings_url, timeout=30)
data.raise_for_status()

In [26]:
# Parse the league page and collect the squad-page hrefs from the first stats table.
# Naming the parser keeps the result independent of which optional parsers are installed.
soup = BeautifulSoup(data.text, "html.parser")
standings_table = soup.select('table.stats_table')[0]
links = [a.get("href") for a in standings_table.find_all('a')]
# Anchors without an href return None, so guard before the substring test.
links = [href for href in links if href and '/squads/' in href]

In [27]:
# Prefix the host to each href to get the full squad-page URLs.
team_urls = [f"https://fbref.com{l}" for l in links]

In [28]:
# Fetch the first squad page as a worked example; bound the wait and fail on HTTP errors.
data = requests.get(team_urls[0], timeout=30)
data.raise_for_status()

In [29]:
# Take the first table whose text matches "Scores & Fixtures".
# pandas >= 2.1 deprecates raw HTML strings, so pass a file-like object instead.
matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")[0]

In [30]:
# Locate the squad's all-competitions pass-types match-log link on the squad page.
# Naming the parser keeps the result independent of which optional parsers are installed.
soup = BeautifulSoup(data.text, "html.parser")
links = [a.get("href") for a in soup.find_all('a')]
links = [href for href in links if href and 'all_comps/passing_types/' in href]

In [31]:
# Fetch the pass-types match-log page; bound the wait and fail on HTTP errors.
data = requests.get(f"https://fbref.com{links[0]}", timeout=30)
data.raise_for_status()

In [32]:
# Take the first table whose text matches "Pass Types".
# pandas >= 2.1 deprecates raw HTML strings, so pass a file-like object instead.
passing_types = pd.read_html(StringIO(data.text), match="Pass Types")[0]

In [33]:
# Preview the raw pass-types log; its header has two levels (stat group over stat name).
passing_types.head()

Unnamed: 0_level_0,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,Unnamed: 10_level_0,Pass Types,Pass Types,Pass Types,Pass Types,Pass Types,Pass Types,Pass Types,Pass Types,Corner Kicks,Corner Kicks,Corner Kicks,Outcomes,Outcomes,Outcomes,Unnamed: 25_level_0
Unnamed: 0_level_1,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,Att,Live,Dead,FK,TB,Sw,Crs,TI,CK,In,Out,Str,Cmp,Off,Blocks,Match Report
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2,0,Crystal Palace,469,429,38,16,0,4,11,14,5,5,0,0,378,2,6,Match Report
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4,2,Leicester City,499,458,39,12,2,2,17,15,6,5,0,0,407,2,10,Match Report
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3,0,Bournemouth,614,576,35,13,0,2,16,13,4,4,0,0,534,3,9,Match Report
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2,1,Fulham,637,587,49,11,2,3,19,17,9,6,0,0,534,1,13,Match Report
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2,1,Aston Villa,549,502,46,14,1,2,20,15,10,7,0,0,452,1,7,Match Report


In [34]:
# Flatten the two-level header to the stat names only. get_level_values(-1) also
# accepts an already-flat Index, so re-running this cell no longer raises
# (droplevel() fails once only one level is left).
passing_types.columns = passing_types.columns.get_level_values(-1)

In [35]:
# Join the fixtures with selected pass-type columns on the match date.
# Columns by position: Date, then the contiguous run Live ... Blocks (positions 11-24).
passing_types_data = pd.merge(
    matches,
    passing_types.iloc[:, [0] + list(range(11, 25))],
    on="Date",
)

In [36]:
# Check the merged frame: fixture columns followed by the selected pass-type columns.
passing_types_data.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,Live,Dead,FK,TB,Sw,Crs,TI,CK,In,Out,Str,Cmp,Off,Blocks
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,Crystal Palace,1.0,1.2,44.0,25286.0,Martin Ødegaard,4-3-3,Anthony Taylor,Match Report,,429,38,16,0,4,11,14,5,5,0,0,378,2,6
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,Leicester City,2.7,0.5,50.0,60033.0,Martin Ødegaard,4-2-3-1,Darren England,Match Report,,458,39,12,2,2,17,15,6,5,0,0,407,2,10
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,Bournemouth,1.3,0.3,57.0,10423.0,Martin Ødegaard,4-2-3-1,Craig Pawson,Match Report,,576,35,13,0,2,16,13,4,4,0,0,534,3,9
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,2.6,0.8,71.0,60164.0,Martin Ødegaard,4-2-3-1,Jarred Gillett,Match Report,,587,49,11,2,3,19,17,9,6,0,0,534,1,13
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,2.4,0.4,59.0,60012.0,Martin Ødegaard,4-2-3-1,Robert Jones,Match Report,,502,46,14,1,2,20,15,10,7,0,0,452,1,7


# Passing types dataframe

In [37]:
# Season labels attached to the scraped rows, newest first.
years = list(reversed(range(2013, 2018)))
# Accumulator for the per-team pass-type frames built by the scraping loop.
all_passing_types_data = list()

In [38]:
# Reset the start page: the scraping loop below overwrites standings_url on every pass,
# so it has to be re-initialised before the loop is (re-)run.
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [39]:
# Scrape one league-season page per entry in `years` and, for every squad on it,
# join the squad's fixtures with its pass-types match log (Premier League rows only).
#
# NOTE(review): `year` only labels the rows. The page actually scraped is chosen by
# starting at `standings_url` and following the page's `a.prev` link, so the label
# matches the scraped season only if the start page is the `years[0]` season. The
# saved preview shows 2022-08 fixtures tagged with season 2017 -- confirm the intent.
for year in years:
    data = requests.get(standings_url, timeout=30)
    data.raise_for_status()  # surface rate-limit/error pages instead of parsing them
    soup = BeautifulSoup(data.text, "html.parser")
    standings_table = soup.select('table.stats_table')[0]

    links = [a.get("href") for a in standings_table.find_all('a')]
    # Anchors without an href return None, so guard before the substring test.
    links = [href for href in links if href and '/squads/' in href]
    team_urls = [f"https://fbref.com{href}" for href in links]

    # Queue the page behind the `a.prev` link for the next pass of the outer loop.
    previous_season = soup.select("a.prev")[0].get("href")
    standings_url = f"https://fbref.com{previous_season}"

    for team_url in team_urls:
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
        try:
            data = requests.get(team_url, timeout=30)
            data.raise_for_status()
            # pandas >= 2.1 deprecates raw HTML strings; pass a file-like object.
            matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")[0]

            soup = BeautifulSoup(data.text, "html.parser")
            links = [a.get("href") for a in soup.find_all('a')]
            links = [href for href in links if href and 'all_comps/passing_types/' in href]
            if not links:
                # Report the gap instead of aborting the whole scrape on links[0].
                print(f"Skipping {team_name} ({year}): no pass types log link found")
                continue

            data = requests.get(f"https://fbref.com{links[0]}", timeout=30)
            data.raise_for_status()
            passing_types = pd.read_html(StringIO(data.text), match="Pass Types")[0]
            passing_types.columns = passing_types.columns.droplevel()
            try:
                # Columns by position: Date, then the contiguous run Live ... Blocks.
                passing_types_data = matches.merge(
                    passing_types.iloc[:, [0, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]],
                    on="Date",
                )
            except ValueError as err:
                # Still best-effort, but say which team was dropped and why.
                print(f"Skipping {team_name} ({year}): merge failed ({err})")
                continue

            # assign() builds a new frame, avoiding SettingWithCopyWarning on the filtered slice.
            passing_types_data = passing_types_data[passing_types_data["Comp"] == "Premier League"].assign(
                Season=year, Team=team_name
            )
            all_passing_types_data.append(passing_types_data)
        finally:
            # Throttle every iteration, including skipped ones.
            time.sleep(1)

In [40]:
# Number of per-team frames collected (one per team per season page scraped).
len(all_passing_types_data)

100

In [41]:
# Stack the per-team frames into one table. The index is not reset, so row labels
# repeat across teams.
passing_types_df = pd.concat(all_passing_types_data)

In [42]:
# Normalise the column names to lower case.
passing_types_df.columns = passing_types_df.columns.str.lower()

In [43]:
# Preview the combined table, including the added season and team columns.
passing_types_df.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,xg,xga,poss,attendance,captain,formation,referee,match report,notes,live,dead,fk,tb,sw,crs,ti,ck,in,out,str,cmp,off,blocks,season,team
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,Crystal Palace,1.0,1.2,44.0,25286.0,Martin Ødegaard,4-3-3,Anthony Taylor,Match Report,,429.0,38.0,16.0,0.0,4.0,11.0,14.0,5.0,5.0,0.0,0.0,378.0,2.0,6.0,2017,Arsenal
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,Leicester City,2.7,0.5,50.0,60033.0,Martin Ødegaard,4-2-3-1,Darren England,Match Report,,458.0,39.0,12.0,2.0,2.0,17.0,15.0,6.0,5.0,0.0,0.0,407.0,2.0,10.0,2017,Arsenal
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,Bournemouth,1.3,0.3,57.0,10423.0,Martin Ødegaard,4-2-3-1,Craig Pawson,Match Report,,576.0,35.0,13.0,0.0,2.0,16.0,13.0,4.0,4.0,0.0,0.0,534.0,3.0,9.0,2017,Arsenal
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,2.6,0.8,71.0,60164.0,Martin Ødegaard,4-2-3-1,Jarred Gillett,Match Report,,587.0,49.0,11.0,2.0,3.0,19.0,17.0,9.0,6.0,0.0,0.0,534.0,1.0,13.0,2017,Arsenal
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,2.4,0.4,59.0,60012.0,Martin Ødegaard,4-2-3-1,Robert Jones,Match Report,,502.0,46.0,14.0,1.0,2.0,20.0,15.0,10.0,7.0,0.0,0.0,452.0,1.0,7.0,2017,Arsenal


In [44]:
# Persist the combined pass-types table; the DataFrame index is written as the first column.
# NOTE(review): absolute, machine-specific path -- this cell fails on any other machine;
# consider a path relative to the project directory.
passing_types_df.to_csv("/Users/ozguryildirim/Desktop/5-Ironhack-Bootcamp/20221031-20221104_9th_Week_FinalProject/csv_validation/passing_types_validation.csv")


In [45]:
# (rows, columns) of the combined pass-types table.
passing_types_df.shape

(3292, 35)

# Goal and shot creation table

In [46]:
# Season-less league landing page; the cells below pull the squad links from its standings table.
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [47]:
# Download the league page. The timeout prevents an indefinite hang, and
# raise_for_status() surfaces HTTP errors (e.g. a rate-limit response) here
# rather than as a confusing parse failure further down.
data = requests.get(standings_url, timeout=30)
data.raise_for_status()

In [48]:
# Parse the league page and collect the squad-page hrefs from the first stats table.
# Naming the parser keeps the result independent of which optional parsers are installed.
soup = BeautifulSoup(data.text, "html.parser")
standings_table = soup.select('table.stats_table')[0]
links = [a.get("href") for a in standings_table.find_all('a')]
# Anchors without an href return None, so guard before the substring test.
links = [href for href in links if href and '/squads/' in href]

In [49]:
# Prefix the host to each href to get the full squad-page URLs.
team_urls = [f"https://fbref.com{l}" for l in links]

In [50]:
# Fetch the first squad page as a worked example; bound the wait and fail on HTTP errors.
data = requests.get(team_urls[0], timeout=30)
data.raise_for_status()

In [51]:
# Take the first table whose text matches "Scores & Fixtures".
# pandas >= 2.1 deprecates raw HTML strings, so pass a file-like object instead.
matches = pd.read_html(StringIO(data.text), match="Scores & Fixtures")[0]

In [52]:
# Locate the squad's all-competitions goal/shot-creation match-log link on the squad page.
# Naming the parser keeps the result independent of which optional parsers are installed.
soup = BeautifulSoup(data.text, "html.parser")
links = [a.get("href") for a in soup.find_all('a')]
links = [href for href in links if href and 'all_comps/gca/' in href]

In [53]:
# Fetch the goal/shot-creation match-log page; bound the wait and fail on HTTP errors.
data = requests.get(f"https://fbref.com{links[0]}", timeout=30)
data.raise_for_status()

In [54]:
# Take the first table whose text matches "Goal and Shot Creation".
# pandas >= 2.1 deprecates raw HTML strings, so pass a file-like object instead.
goal_shot_create = pd.read_html(StringIO(data.text), match="Goal and Shot Creation")[0]

In [55]:
# Preview the raw log; the "SCA Types" and "GCA Types" groups reuse the same stat names
# (PassLive, PassDead, Drib, Sh, Fld, Def).
goal_shot_create.head()

Unnamed: 0_level_0,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,SCA Types,SCA Types,SCA Types,SCA Types,SCA Types,SCA Types,SCA Types,GCA Types,GCA Types,GCA Types,GCA Types,GCA Types,GCA Types,GCA Types,Unnamed: 24_level_0
Unnamed: 0_level_1,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,SCA,PassLive,PassDead,Drib,Sh,Fld,Def,GCA,PassLive,PassDead,Drib,Sh,Fld,Def,Match Report
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2,0,Crystal Palace,19,11,2,3,1,1,1,2,1,1,0,0,0,0,Match Report
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4,2,Leicester City,33,23,4,1,3,2,0,6,6,0,0,0,0,0,Match Report
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3,0,Bournemouth,25,18,1,2,3,0,1,5,4,0,0,1,0,0,Match Report
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2,1,Fulham,36,26,1,3,2,2,2,3,2,0,0,1,0,0,Match Report
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2,1,Aston Villa,38,25,5,0,6,2,0,4,4,0,0,0,0,0,Match Report


In [56]:
# Flatten the two-level header to the stat names only. get_level_values(-1) also
# accepts an already-flat Index, so re-running this cell no longer raises
# (droplevel() fails once only one level is left).
# The SCA and GCA type columns share names, so they become duplicates after
# flattening; the following merge therefore selects columns by position.
goal_shot_create.columns = goal_shot_create.columns.get_level_values(-1)

In [57]:
# Join the fixtures with the creation stats on the match date.
# Columns by position: Date, then positions 10-23 (SCA with its six type columns,
# GCA with its six type columns).
goal_shot_create_data = pd.merge(
    matches,
    goal_shot_create.iloc[:, [0] + list(range(10, 24))],
    on="Date",
)


In [58]:
# Check the merged frame: fixture columns followed by the SCA and GCA columns.
goal_shot_create_data.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,SCA,PassLive,PassDead,Drib,Sh,Fld,Def,GCA,PassLive.1,PassDead.1,Drib.1,Sh.1,Fld.1,Def.1
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,Crystal Palace,1.0,1.2,44.0,25286.0,Martin Ødegaard,4-3-3,Anthony Taylor,Match Report,,19,11,2,3,1,1,1,2,1,1,0,0,0,0
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,Leicester City,2.7,0.5,50.0,60033.0,Martin Ødegaard,4-2-3-1,Darren England,Match Report,,33,23,4,1,3,2,0,6,6,0,0,0,0,0
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,Bournemouth,1.3,0.3,57.0,10423.0,Martin Ødegaard,4-2-3-1,Craig Pawson,Match Report,,25,18,1,2,3,0,1,5,4,0,0,1,0,0
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,2.6,0.8,71.0,60164.0,Martin Ødegaard,4-2-3-1,Jarred Gillett,Match Report,,36,26,1,3,2,2,2,3,2,0,0,1,0,0
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,2.4,0.4,59.0,60012.0,Martin Ødegaard,4-2-3-1,Robert Jones,Match Report,,38,25,5,0,6,2,0,4,4,0,0,0,0,0


# Goal and shot creation dataframe

In [59]:
# Season labels attached to the scraped rows, newest first.
years = list(reversed(range(2013, 2018)))
# Accumulator for the per-team goal/shot-creation frames built by the scraping loop.
all_goal_shot_create_data = list()

In [60]:
# Reset the start page: the scraping loop below overwrites standings_url on every pass,
# so it has to be re-initialised before the loop is (re-)run.
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [61]:
import time

# Scrape the goal-and-shot-creation match logs of every team, one standings page per entry
# in `years`. The first page fetched is whatever `standings_url` currently holds; each pass
# then follows that page's "a.prev" link. The cell therefore rebinds `standings_url`, so the
# cell that sets it must be re-run before this loop is run again.
# NOTE(review): `year` only labels the rows. The head() output displayed further down pairs
# 2022 match dates with Season 2017 -- confirm the labels match the seasons actually fetched.
for year in years:
    data = requests.get(standings_url)
    soup = BeautifulSoup(data.text)
    standings_table = soup.select('table.stats_table')[0]

    # Anchors without an href yield None; guard before the substring test
    # (the team-page filter below already does the same).
    links = [l.get("href") for l in standings_table.find_all('a')]
    links = [l for l in links if l and '/squads/' in l]
    team_urls = [f"https://fbref.com{l}" for l in links]

    # Read the previous-season link now: `soup` is rebound to team pages in the inner loop.
    previous_season = soup.select("a.prev")[0].get("href")
    standings_url = f"https://fbref.com{previous_season}"

    for team_url in team_urls:
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
        data = requests.get(team_url)
        matches = pd.read_html(data.text, match="Scores & Fixtures")[0]
        soup = BeautifulSoup(data.text)
        links = [l.get("href") for l in soup.find_all('a')]
        links = [l for l in links if l and 'all_comps/gca/' in l]
        data = requests.get(f"https://fbref.com{links[0]}")
        goal_shot_create = pd.read_html(data.text, match="Goal and Shot Creation")[0]
        goal_shot_create.columns = goal_shot_create.columns.droplevel()
        try:
            # Position 0 is Date (join key); positions 10-23 are the stat columns.
            goal_shot_create_data = matches.merge(goal_shot_create.iloc[:,[0,10,11,12,13,14,15,16,17,18,19,20,21,22,23]], on = "Date")
        except ValueError:
            # Skip teams whose tables cannot be merged, but keep the pause so a
            # skipped team does not send the next requests back-to-back.
            time.sleep(1)
            continue
        # .copy() makes the filtered rows an independent frame, so the column
        # assignments below do not write to a slice (SettingWithCopyWarning).
        goal_shot_create_data = goal_shot_create_data[goal_shot_create_data["Comp"] == "Premier League"].copy()

        goal_shot_create_data["Season"] = year
        goal_shot_create_data["Team"] = team_name
        all_goal_shot_create_data.append(goal_shot_create_data)
        time.sleep(1)

In [62]:
len(all_goal_shot_create_data)

100

In [63]:
goal_shot_create_df = pd.concat(all_goal_shot_create_data)

In [64]:
# Normalise every column label to lower case.
goal_shot_create_df.columns = list(map(lambda label: label.lower(), goal_shot_create_df.columns))

In [65]:
goal_shot_create_df.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,xg,xga,poss,attendance,captain,formation,referee,match report,notes,sca,passlive,passdead,drib,sh,fld,def,gca,passlive.1,passdead.1,drib.1,sh.1,fld.1,def.1,season,team
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,Crystal Palace,1.0,1.2,44.0,25286.0,Martin Ødegaard,4-3-3,Anthony Taylor,Match Report,,19.0,11.0,2.0,3.0,1.0,1.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,2017,Arsenal
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,Leicester City,2.7,0.5,50.0,60033.0,Martin Ødegaard,4-2-3-1,Darren England,Match Report,,33.0,23.0,4.0,1.0,3.0,2.0,0.0,6.0,6.0,0.0,0.0,0.0,0.0,0.0,2017,Arsenal
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,Bournemouth,1.3,0.3,57.0,10423.0,Martin Ødegaard,4-2-3-1,Craig Pawson,Match Report,,25.0,18.0,1.0,2.0,3.0,0.0,1.0,5.0,4.0,0.0,0.0,1.0,0.0,0.0,2017,Arsenal
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,2.6,0.8,71.0,60164.0,Martin Ødegaard,4-2-3-1,Jarred Gillett,Match Report,,36.0,26.0,1.0,3.0,2.0,2.0,2.0,3.0,2.0,0.0,0.0,1.0,0.0,0.0,2017,Arsenal
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,2.4,0.4,59.0,60012.0,Martin Ødegaard,4-2-3-1,Robert Jones,Match Report,,38.0,25.0,5.0,0.0,6.0,2.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,2017,Arsenal


In [66]:
# Persist the combined goal-and-shot-creation frame as CSV.
# NOTE(review): the destination is an absolute path on one user's machine, so this cell
# fails elsewhere unless that directory exists -- consider a configurable output directory.
# NOTE(review): index=False is not passed, so the row index is written as the first CSV
# column; the frame comes from pd.concat without ignore_index -- confirm that is intended.
goal_shot_create_df.to_csv("/Users/ozguryildirim/Desktop/5-Ironhack-Bootcamp/20221031-20221104_9th_Week_FinalProject/csv_validation/goal_shot_create_validation.csv")

In [67]:
goal_shot_create_df.shape

(3292, 35)

# Defense table

In [68]:
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [69]:
data = requests.get(standings_url)

In [70]:
# Parse the standings page and collect the squad links from the first stats table.
soup = BeautifulSoup(data.text)
standings_table = soup.select('table.stats_table')[0]
links = standings_table.find_all('a')
links = [l.get("href") for l in links]
# .get("href") returns None for anchors without an href; guard so the substring
# test cannot raise TypeError (same guard as the team-page link filter).
links = [l for l in links if l and '/squads/' in l]

In [71]:
team_urls = [f"https://fbref.com{l}" for l in links]

In [72]:
data = requests.get(team_urls[0])

In [73]:
matches = pd.read_html(data.text, match="Scores & Fixtures")[0]

In [74]:
# Parse the team page and keep only the hrefs pointing at the all-competitions defense log.
soup = BeautifulSoup(data.text)
links = [
    href
    for href in (anchor.get("href") for anchor in soup.find_all('a'))
    if href and 'all_comps/defense/' in href
]

In [75]:
data = requests.get(f"https://fbref.com{links[0]}")

In [76]:
defense = pd.read_html(data.text, match="Defensive Actions")[0]

In [77]:
defense.head()

Unnamed: 0_level_0,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,Tackles,Tackles,Tackles,Tackles,Tackles,Vs Dribbles,Vs Dribbles,Vs Dribbles,Vs Dribbles,Blocks,Blocks,Blocks,Unnamed: 22_level_0,Unnamed: 23_level_0,Unnamed: 24_level_0,Unnamed: 25_level_0,Unnamed: 26_level_0
Unnamed: 0_level_1,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,Tkl,TklW,Def 3rd,Mid 3rd,Att 3rd,Tkl,Att,Tkl%,Past,Blocks,Sh,Pass,Int,Tkl+Int,Clr,Err,Match Report
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2,0,Crystal Palace,29,16,17,10,2,13,29,44.8,16,17,6,11,9,38,24,0,Match Report
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4,2,Leicester City,8,6,5,1,2,5,8,62.5,3,9,1,8,8,16,12,0,Match Report
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3,0,Bournemouth,14,8,5,5,4,5,12,41.7,7,11,0,11,10,24,20,1,Match Report
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2,1,Fulham,15,11,2,8,5,6,10,60.0,4,17,2,15,7,22,5,1,Match Report
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2,1,Aston Villa,10,7,7,2,1,4,9,44.4,5,4,0,4,8,18,13,0,Match Report


In [78]:
# Flatten the two-row header to its bottom row (Date, ..., Tkl, TklW, ...).
# get_level_values(-1) gives the same result as droplevel() on the two-level header
# and is a no-op on an already-flat header, so this cell can safely be re-run.
defense.columns = defense.columns.get_level_values(-1)

In [79]:
# Join the defensive stats onto the fixtures table by match date.
# Position 0 is Date (the join key); positions 17 (Tkl%) and 21 (Pass) are left out.
defense_data = pd.merge(
    matches,
    defense.iloc[:, [0, *range(10, 17), 18, 19, 20, *range(22, 26)]],
    on="Date",
)


In [80]:
defense_data.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,Tkl,TklW,Def 3rd,Mid 3rd,Att 3rd,Tkl.1,Att,Past,Blocks,Sh,Int,Tkl+Int,Clr,Err
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,Crystal Palace,1.0,1.2,44.0,25286.0,Martin Ødegaard,4-3-3,Anthony Taylor,Match Report,,29,16,17,10,2,13,29,16,17,6,9,38,24,0
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,Leicester City,2.7,0.5,50.0,60033.0,Martin Ødegaard,4-2-3-1,Darren England,Match Report,,8,6,5,1,2,5,8,3,9,1,8,16,12,0
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,Bournemouth,1.3,0.3,57.0,10423.0,Martin Ødegaard,4-2-3-1,Craig Pawson,Match Report,,14,8,5,5,4,5,12,7,11,0,10,24,20,1
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,2.6,0.8,71.0,60164.0,Martin Ødegaard,4-2-3-1,Jarred Gillett,Match Report,,15,11,2,8,5,6,10,4,17,2,7,22,5,1
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,2.4,0.4,59.0,60012.0,Martin Ødegaard,4-2-3-1,Robert Jones,Match Report,,10,7,7,2,1,4,9,5,4,0,8,18,13,0


# Defense dataframe

In [81]:
# Season labels, newest first, and the accumulator for the per-team frames.
years = [2017, 2016, 2015, 2014, 2013]
all_defense_data = list()

In [82]:
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [83]:
import time

# Scrape the defensive-action match logs of every team, one standings page per entry in
# `years`. The first page fetched is whatever `standings_url` currently holds; each pass
# then follows that page's "a.prev" link. The cell therefore rebinds `standings_url`, so the
# cell that sets it must be re-run before this loop is run again.
# NOTE(review): `year` only labels the rows. The head() output displayed further down pairs
# 2022 match dates with Season 2017 -- confirm the labels match the seasons actually fetched.
for year in years:
    data = requests.get(standings_url)
    soup = BeautifulSoup(data.text)
    standings_table = soup.select('table.stats_table')[0]

    # Anchors without an href yield None; guard before the substring test
    # (the team-page filter below already does the same).
    links = [l.get("href") for l in standings_table.find_all('a')]
    links = [l for l in links if l and '/squads/' in l]
    team_urls = [f"https://fbref.com{l}" for l in links]

    # Read the previous-season link now: `soup` is rebound to team pages in the inner loop.
    previous_season = soup.select("a.prev")[0].get("href")
    standings_url = f"https://fbref.com{previous_season}"

    for team_url in team_urls:
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
        data = requests.get(team_url)
        matches = pd.read_html(data.text, match="Scores & Fixtures")[0]
        soup = BeautifulSoup(data.text)
        links = [l.get("href") for l in soup.find_all('a')]
        links = [l for l in links if l and 'all_comps/defense/' in l]
        data = requests.get(f"https://fbref.com{links[0]}")
        defense = pd.read_html(data.text, match="Defensive Actions")[0]
        defense.columns = defense.columns.droplevel()
        try:
            # Position 0 is Date (join key); positions 17 and 21 are deliberately left out.
            defense_data = matches.merge(defense.iloc[:,[0,10,11,12,13,14,15,16,18,19,20,22,23,24,25]], on = "Date")
        except ValueError:
            # Skip teams whose tables cannot be merged, but keep the pause so a
            # skipped team does not send the next requests back-to-back.
            time.sleep(1)
            continue
        # .copy() makes the filtered rows an independent frame, so the column
        # assignments below do not write to a slice (SettingWithCopyWarning).
        defense_data = defense_data[defense_data["Comp"] == "Premier League"].copy()

        defense_data["Season"] = year
        defense_data["Team"] = team_name
        all_defense_data.append(defense_data)
        time.sleep(1)

In [84]:
len(all_defense_data)

100

In [85]:
defense_df = pd.concat(all_defense_data)

In [86]:
# Normalise every column label to lower case.
defense_df.columns = list(map(lambda label: label.lower(), defense_df.columns))

In [87]:
defense_df.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,xg,xga,poss,attendance,captain,formation,referee,match report,notes,tkl,tklw,def 3rd,mid 3rd,att 3rd,tkl.1,att,past,blocks,sh,int,tkl+int,clr,err,season,team
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,Crystal Palace,1.0,1.2,44.0,25286.0,Martin Ødegaard,4-3-3,Anthony Taylor,Match Report,,29.0,16.0,17.0,10.0,2.0,13.0,29.0,16.0,17.0,6.0,9.0,38,24.0,0.0,2017,Arsenal
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,Leicester City,2.7,0.5,50.0,60033.0,Martin Ødegaard,4-2-3-1,Darren England,Match Report,,8.0,6.0,5.0,1.0,2.0,5.0,8.0,3.0,9.0,1.0,8.0,16,12.0,0.0,2017,Arsenal
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,Bournemouth,1.3,0.3,57.0,10423.0,Martin Ødegaard,4-2-3-1,Craig Pawson,Match Report,,14.0,8.0,5.0,5.0,4.0,5.0,12.0,7.0,11.0,0.0,10.0,24,20.0,1.0,2017,Arsenal
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,2.6,0.8,71.0,60164.0,Martin Ødegaard,4-2-3-1,Jarred Gillett,Match Report,,15.0,11.0,2.0,8.0,5.0,6.0,10.0,4.0,17.0,2.0,7.0,22,5.0,1.0,2017,Arsenal
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,2.4,0.4,59.0,60012.0,Martin Ødegaard,4-2-3-1,Robert Jones,Match Report,,10.0,7.0,7.0,2.0,1.0,4.0,9.0,5.0,4.0,0.0,8.0,18,13.0,0.0,2017,Arsenal


In [88]:
# Persist the combined defense frame as CSV.
# NOTE(review): the destination is an absolute path on one user's machine, so this cell
# fails elsewhere unless that directory exists -- consider a configurable output directory.
# NOTE(review): index=False is not passed, so the row index is written as the first CSV
# column; the frame comes from pd.concat without ignore_index -- confirm that is intended.
defense_df.to_csv("/Users/ozguryildirim/Desktop/5-Ironhack-Bootcamp/20221031-20221104_9th_Week_FinalProject/csv_validation/defense_validation.csv")

In [89]:
defense_df.shape

(3292, 35)

# Possession table

In [90]:
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [91]:
data = requests.get(standings_url)

In [92]:
# Parse the standings page and collect the squad links from the first stats table.
soup = BeautifulSoup(data.text)
standings_table = soup.select('table.stats_table')[0]
links = standings_table.find_all('a')
links = [l.get("href") for l in links]
# .get("href") returns None for anchors without an href; guard so the substring
# test cannot raise TypeError (same guard as the team-page link filter).
links = [l for l in links if l and '/squads/' in l]

In [93]:
team_urls = [f"https://fbref.com{l}" for l in links]

In [94]:
data = requests.get(team_urls[0])

In [95]:
matches = pd.read_html(data.text, match="Scores & Fixtures")[0]

In [96]:
# Parse the team page and keep only the hrefs pointing at the all-competitions possession log.
soup = BeautifulSoup(data.text)
links = [
    href
    for href in (anchor.get("href") for anchor in soup.find_all('a'))
    if href and 'all_comps/possession/' in href
]

In [97]:
data = requests.get(f"https://fbref.com{links[0]}")

In [98]:
possession = pd.read_html(data.text, match="Possession")[0]

In [99]:
possession.head()

Unnamed: 0_level_0,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,Touches,Touches,Touches,Touches,Touches,Touches,Touches,Dribbles,Dribbles,Dribbles,Dribbles,Dribbles,Receiving,Receiving,Unnamed: 25_level_0
Unnamed: 0_level_1,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,Poss,Touches,Def Pen,Def 3rd,Mid 3rd,Att 3rd,Att Pen,Live,Succ,Att,Succ%,Mis,Dis,Rec,Prog,Match Report
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2,0,Crystal Palace,44.0,598,78,220,251,135,29,598,13,17,76.5,19,15,376,31,Match Report
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4,2,Leicester City,50.0,585,32,137,229,221,34,585,11,17,64.7,12,7,403,22,Match Report
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3,0,Bournemouth,57.0,731,48,186,381,172,17,731,12,27,44.4,16,9,527,22,Match Report
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2,1,Fulham,71.0,743,43,159,352,241,48,743,13,29,44.8,15,15,525,43,Match Report
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2,1,Aston Villa,59.0,653,39,194,270,197,44,653,12,23,52.2,18,9,445,38,Match Report


In [100]:
# Flatten the two-row header to its bottom row (Date, ..., Poss, Touches, ...).
# get_level_values(-1) gives the same result as droplevel() on the two-level header
# and is a no-op on an already-flat header, so this cell can safely be re-run.
possession.columns = possession.columns.get_level_values(-1)

In [101]:
# Join the possession stats onto the fixtures table by match date.
# Position 0 is Date (the join key); positions 10 (Poss), 17 (Live) and 20 (Succ%) are left out.
possession_data = pd.merge(
    matches,
    possession.iloc[:, [0, *range(11, 17), 18, 19, *range(21, 25)]],
    on="Date",
)


In [102]:
possession_data.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,Touches,Def Pen,Def 3rd,Mid 3rd,Att 3rd,Att Pen,Succ,Att,Mis,Dis,Rec,Prog
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,Crystal Palace,1.0,1.2,44.0,25286.0,Martin Ødegaard,4-3-3,Anthony Taylor,Match Report,,598,78,220,251,135,29,13,17,19,15,376,31
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,Leicester City,2.7,0.5,50.0,60033.0,Martin Ødegaard,4-2-3-1,Darren England,Match Report,,585,32,137,229,221,34,11,17,12,7,403,22
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,Bournemouth,1.3,0.3,57.0,10423.0,Martin Ødegaard,4-2-3-1,Craig Pawson,Match Report,,731,48,186,381,172,17,12,27,16,9,527,22
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,2.6,0.8,71.0,60164.0,Martin Ødegaard,4-2-3-1,Jarred Gillett,Match Report,,743,43,159,352,241,48,13,29,15,15,525,43
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,2.4,0.4,59.0,60012.0,Martin Ødegaard,4-2-3-1,Robert Jones,Match Report,,653,39,194,270,197,44,12,23,18,9,445,38


# Possession dataframe

In [103]:
# Season labels, newest first, and the accumulator for the per-team frames.
years = [2017, 2016, 2015, 2014, 2013]
all_possession_data = list()

In [104]:
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [105]:
import time

# Scrape the possession match logs of every team, one standings page per entry in `years`.
# The first page fetched is whatever `standings_url` currently holds; each pass then follows
# that page's "a.prev" link. The cell therefore rebinds `standings_url`, so the cell that
# sets it must be re-run before this loop is run again.
# NOTE(review): `year` only labels the rows. The head() output displayed further down pairs
# 2022 match dates with Season 2017 -- confirm the labels match the seasons actually fetched.
for year in years:
    data = requests.get(standings_url)
    soup = BeautifulSoup(data.text)
    standings_table = soup.select('table.stats_table')[0]

    # Anchors without an href yield None; guard before the substring test
    # (the team-page filter below already does the same).
    links = [l.get("href") for l in standings_table.find_all('a')]
    links = [l for l in links if l and '/squads/' in l]
    team_urls = [f"https://fbref.com{l}" for l in links]

    # Read the previous-season link now: `soup` is rebound to team pages in the inner loop.
    previous_season = soup.select("a.prev")[0].get("href")
    standings_url = f"https://fbref.com{previous_season}"

    for team_url in team_urls:
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
        data = requests.get(team_url)
        matches = pd.read_html(data.text, match="Scores & Fixtures")[0]
        soup = BeautifulSoup(data.text)
        links = [l.get("href") for l in soup.find_all('a')]
        links = [l for l in links if l and 'all_comps/possession/' in l]
        data = requests.get(f"https://fbref.com{links[0]}")
        possession = pd.read_html(data.text, match="Possession")[0]
        possession.columns = possession.columns.droplevel()
        try:
            # Position 0 is Date (join key); positions 10, 17 and 20 are deliberately left out.
            possession_data = matches.merge(possession.iloc[:,[0,11,12,13,14,15,16,18,19,21,22,23,24]], on = "Date")
        except ValueError:
            # Skip teams whose tables cannot be merged, but keep the pause so a
            # skipped team does not send the next requests back-to-back.
            time.sleep(1)
            continue
        # .copy() makes the filtered rows an independent frame, so the column
        # assignments below do not write to a slice (SettingWithCopyWarning).
        possession_data = possession_data[possession_data["Comp"] == "Premier League"].copy()

        possession_data["Season"] = year
        possession_data["Team"] = team_name
        all_possession_data.append(possession_data)
        time.sleep(1)

In [106]:
len(all_possession_data)

100

In [107]:
possession_df = pd.concat(all_possession_data)

In [108]:
# Normalise every column label to lower case.
possession_df.columns = list(map(lambda label: label.lower(), possession_df.columns))

In [109]:
possession_df.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,xg,xga,poss,attendance,captain,formation,referee,match report,notes,touches,def pen,def 3rd,mid 3rd,att 3rd,att pen,succ,att,mis,dis,rec,prog,season,team
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,Crystal Palace,1.0,1.2,44.0,25286.0,Martin Ødegaard,4-3-3,Anthony Taylor,Match Report,,598.0,78.0,220.0,251.0,135.0,29.0,13.0,17.0,19.0,15.0,376.0,31.0,2017,Arsenal
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,Leicester City,2.7,0.5,50.0,60033.0,Martin Ødegaard,4-2-3-1,Darren England,Match Report,,585.0,32.0,137.0,229.0,221.0,34.0,11.0,17.0,12.0,7.0,403.0,22.0,2017,Arsenal
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,Bournemouth,1.3,0.3,57.0,10423.0,Martin Ødegaard,4-2-3-1,Craig Pawson,Match Report,,731.0,48.0,186.0,381.0,172.0,17.0,12.0,27.0,16.0,9.0,527.0,22.0,2017,Arsenal
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,2.6,0.8,71.0,60164.0,Martin Ødegaard,4-2-3-1,Jarred Gillett,Match Report,,743.0,43.0,159.0,352.0,241.0,48.0,13.0,29.0,15.0,15.0,525.0,43.0,2017,Arsenal
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,2.4,0.4,59.0,60012.0,Martin Ødegaard,4-2-3-1,Robert Jones,Match Report,,653.0,39.0,194.0,270.0,197.0,44.0,12.0,23.0,18.0,9.0,445.0,38.0,2017,Arsenal


In [110]:
# Persist the combined possession frame as CSV.
# NOTE(review): the destination is an absolute path on one user's machine, so this cell
# fails elsewhere unless that directory exists -- consider a configurable output directory.
# NOTE(review): index=False is not passed, so the row index is written as the first CSV
# column; the frame comes from pd.concat without ignore_index -- confirm that is intended.
possession_df.to_csv("/Users/ozguryildirim/Desktop/5-Ironhack-Bootcamp/20221031-20221104_9th_Week_FinalProject/csv_validation/possession_validation.csv")

In [111]:
possession_df.shape

(3292, 33)

# Miscellaneous table

In [2]:
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [3]:
data = requests.get(standings_url)

In [4]:
# Parse the standings page and collect the squad links from the first stats table.
soup = BeautifulSoup(data.text)
standings_table = soup.select('table.stats_table')[0]
links = standings_table.find_all('a')
links = [l.get("href") for l in links]
# .get("href") returns None for anchors without an href; guard so the substring
# test cannot raise TypeError (same guard as the team-page link filter).
links = [l for l in links if l and '/squads/' in l]

In [5]:
team_urls = [f"https://fbref.com{l}" for l in links]

In [6]:
data = requests.get(team_urls[0])

In [7]:
matches = pd.read_html(data.text, match="Scores & Fixtures")[0]

In [8]:
# Parse the team page and keep only the hrefs pointing at the all-competitions misc log.
soup = BeautifulSoup(data.text)
links = [
    href
    for href in (anchor.get("href") for anchor in soup.find_all('a'))
    if href and 'all_comps/misc/' in href
]

In [9]:
data = requests.get(f"https://fbref.com{links[0]}")

In [10]:
miscellaneous = pd.read_html(data.text, match="Miscellaneous Stats")[0]

In [11]:
miscellaneous.head()

Unnamed: 0_level_0,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,For Arsenal,Performance,Performance,Performance,Performance,Performance,Performance,Performance,Performance,Performance,Performance,Performance,Performance,Performance,Aerial Duels,Aerial Duels,Aerial Duels,Unnamed: 26_level_0
Unnamed: 0_level_1,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,CrdY,CrdR,2CrdY,Fls,Fld,Off,Crs,Int,TklW,PKwon,PKcon,OG,Recov,Won,Lost,Won%,Match Report
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2,0,Crystal Palace,2,0,0,11,16,2,11,9,16,0,0,0,55,14,10,58.3,Match Report
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4,2,Leicester City,1,0,0,15,9,2,17,8,6,0,0,1,50,19,14,57.6,Match Report
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3,0,Bournemouth,1,0,0,10,12,3,16,10,8,0,0,0,56,14,12,53.8,Match Report
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2,1,Fulham,2,0,0,6,12,1,19,7,11,0,0,0,53,11,22,33.3,Match Report
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2,1,Aston Villa,2,0,0,10,14,1,20,8,7,0,0,0,50,10,10,50.0,Match Report


In [12]:
# Flatten the two-row header to its bottom row (Date, ..., CrdY, CrdR, ...).
# get_level_values(-1) gives the same result as droplevel() on the two-level header
# and is a no-op on an already-flat header, so this cell can safely be re-run.
miscellaneous.columns = miscellaneous.columns.get_level_values(-1)

In [13]:
# Join the miscellaneous stats onto the fixtures table by match date.
# Position 0 is Date (the join key); positions 18 (TklW) and 25 (Won%) are left out.
miscellaneous_data = pd.merge(
    matches,
    miscellaneous.iloc[:, [0, *range(10, 18), *range(19, 25)]],
    on="Date",
)


In [14]:
miscellaneous_data.head()

Unnamed: 0,Date,Time,Comp,Round,Day,Venue,Result,GF,GA,Opponent,xG,xGA,Poss,Attendance,Captain,Formation,Referee,Match Report,Notes,CrdY,CrdR,2CrdY,Fls,Fld,Off,Crs,Int,PKwon,PKcon,OG,Recov,Won,Lost
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,Crystal Palace,1.0,1.2,44.0,25286.0,Martin Ødegaard,4-3-3,Anthony Taylor,Match Report,,2,0,0,11,16,2,11,9,0,0,0,55,14,10
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,Leicester City,2.7,0.5,50.0,60033.0,Martin Ødegaard,4-2-3-1,Darren England,Match Report,,1,0,0,15,9,2,17,8,0,0,1,50,19,14
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,Bournemouth,1.3,0.3,57.0,10423.0,Martin Ødegaard,4-2-3-1,Craig Pawson,Match Report,,1,0,0,10,12,3,16,10,0,0,0,56,14,12
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,2.6,0.8,71.0,60164.0,Martin Ødegaard,4-2-3-1,Jarred Gillett,Match Report,,2,0,0,6,12,1,19,7,0,0,0,53,11,22
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,2.4,0.4,59.0,60012.0,Martin Ødegaard,4-2-3-1,Robert Jones,Match Report,,2,0,0,10,14,1,20,8,0,0,0,50,10,10


# Miscellaneous dataframe

In [15]:
# Season labels, newest first, and the accumulator for the per-team frames.
years = [2017, 2016, 2015, 2014, 2013]
all_miscellaneous_data = list()

In [16]:
standings_url = "https://fbref.com/en/comps/9/Premier-League-Stats"

In [17]:
import time

# Scrape the miscellaneous-stats match logs of every team, one standings page per entry in
# `years`. The first page fetched is whatever `standings_url` currently holds; each pass
# then follows that page's "a.prev" link. The cell therefore rebinds `standings_url`, so the
# cell that sets it must be re-run before this loop is run again.
# NOTE(review): `year` only labels the rows. The head() output displayed further down pairs
# 2022 match dates with Season 2017 -- confirm the labels match the seasons actually fetched.
for year in years:
    data = requests.get(standings_url)
    soup = BeautifulSoup(data.text)
    standings_table = soup.select('table.stats_table')[0]

    # Anchors without an href yield None; guard before the substring test
    # (the team-page filter below already does the same).
    links = [l.get("href") for l in standings_table.find_all('a')]
    links = [l for l in links if l and '/squads/' in l]
    team_urls = [f"https://fbref.com{l}" for l in links]

    # Read the previous-season link now: `soup` is rebound to team pages in the inner loop.
    previous_season = soup.select("a.prev")[0].get("href")
    standings_url = f"https://fbref.com{previous_season}"

    for team_url in team_urls:
        team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")
        data = requests.get(team_url)
        matches = pd.read_html(data.text, match="Scores & Fixtures")[0]
        soup = BeautifulSoup(data.text)
        links = [l.get("href") for l in soup.find_all('a')]
        links = [l for l in links if l and 'all_comps/misc/' in l]
        data = requests.get(f"https://fbref.com{links[0]}")
        miscellaneous = pd.read_html(data.text, match="Miscellaneous Stats")[0]
        miscellaneous.columns = miscellaneous.columns.droplevel()
        try:
            # Position 0 is Date (join key); position 18 is deliberately left out.
            miscellaneous_data = matches.merge(miscellaneous.iloc[:,[0,10,11,12,13,14,15,16,17,19,20,21,22,23,24]], on = "Date")
        except ValueError:
            # Skip teams whose tables cannot be merged, but keep the pause so a
            # skipped team does not send the next requests back-to-back.
            time.sleep(1)
            continue
        # .copy() makes the filtered rows an independent frame, so the column
        # assignments below do not write to a slice (SettingWithCopyWarning).
        miscellaneous_data = miscellaneous_data[miscellaneous_data["Comp"] == "Premier League"].copy()

        miscellaneous_data["Season"] = year
        miscellaneous_data["Team"] = team_name
        all_miscellaneous_data.append(miscellaneous_data)
        time.sleep(1)

In [18]:
len(all_miscellaneous_data)

100

In [19]:
miscellaneous_df = pd.concat(all_miscellaneous_data)

In [20]:
# Normalise every column label to lower case.
miscellaneous_df.columns = list(map(lambda label: label.lower(), miscellaneous_df.columns))

In [21]:
miscellaneous_df.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,xg,xga,poss,attendance,captain,formation,referee,match report,notes,crdy,crdr,2crdy,fls,fld,off,crs,int,pkwon,pkcon,og,recov,won,lost,season,team
0,2022-08-05,20:00,Premier League,Matchweek 1,Fri,Away,W,2.0,0.0,Crystal Palace,1.0,1.2,44.0,25286.0,Martin Ødegaard,4-3-3,Anthony Taylor,Match Report,,2.0,0.0,0.0,11.0,16.0,2.0,11.0,9.0,0.0,0.0,0.0,55.0,14.0,10.0,2017,Arsenal
1,2022-08-13,15:00,Premier League,Matchweek 2,Sat,Home,W,4.0,2.0,Leicester City,2.7,0.5,50.0,60033.0,Martin Ødegaard,4-2-3-1,Darren England,Match Report,,1.0,0.0,0.0,15.0,9.0,2.0,17.0,8.0,0.0,0.0,1.0,50.0,19.0,14.0,2017,Arsenal
2,2022-08-20,17:30,Premier League,Matchweek 3,Sat,Away,W,3.0,0.0,Bournemouth,1.3,0.3,57.0,10423.0,Martin Ødegaard,4-2-3-1,Craig Pawson,Match Report,,1.0,0.0,0.0,10.0,12.0,3.0,16.0,10.0,0.0,0.0,0.0,56.0,14.0,12.0,2017,Arsenal
3,2022-08-27,17:30,Premier League,Matchweek 4,Sat,Home,W,2.0,1.0,Fulham,2.6,0.8,71.0,60164.0,Martin Ødegaard,4-2-3-1,Jarred Gillett,Match Report,,2.0,0.0,0.0,6.0,12.0,1.0,19.0,7.0,0.0,0.0,0.0,53.0,11.0,22.0,2017,Arsenal
4,2022-08-31,19:30,Premier League,Matchweek 5,Wed,Home,W,2.0,1.0,Aston Villa,2.4,0.4,59.0,60012.0,Martin Ødegaard,4-2-3-1,Robert Jones,Match Report,,2.0,0.0,0.0,10.0,14.0,1.0,20.0,8.0,0.0,0.0,0.0,50.0,10.0,10.0,2017,Arsenal


In [22]:
# Persist the combined miscellaneous-stats frame as CSV.
# NOTE(review): the destination is an absolute path on one user's machine, so this cell
# fails elsewhere unless that directory exists -- consider a configurable output directory.
# NOTE(review): index=False is not passed, so the row index is written as the first CSV
# column; the frame comes from pd.concat without ignore_index -- confirm that is intended.
miscellaneous_df.to_csv("/Users/ozguryildirim/Desktop/5-Ironhack-Bootcamp/20221031-20221104_9th_Week_FinalProject/csv_validation/miscellaneous_validation.csv")

In [23]:
miscellaneous_df.shape

(3292, 35)