In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import re
import requests

In [2]:
def convert_num_nam(string):
    "Given a string that is a concatenation of a jersey number and the name, return the two substring seperate"

    num = [letter for letter in string if letter.isdigit()]
    num = "".join(map(str, num))
    nam = [letter for letter in string if not letter.isdigit()]
    nam = "".join(map(str, nam))

    return num, nam
    
def get_field_df(soup):
    field_ratings = []
    field_players = []
    field_numbers = []
    for field_player in soup.find_all(class_=re.compile("LineupPlayerContainer")):
        rating_element = field_player.find_all(class_=re.compile("PlayerRatingStyled"))
        if len(rating_element) == 0:
            field_ratings.append("nan")
        else:
            field_ratings.append(rating_element[0].find_all("span")[0].get_text())
        
        player_text = field_player.find(class_=re.compile("LineupPlayerText"))
        num, name = convert_num_nam(player_text.get_text())
        field_players.append(name)
        field_numbers.append(num)

    return pd.DataFrame({"number": field_numbers, "name": field_players, "rating": field_ratings})
    
def get_bench_df(soup):
    bench_ratings = []
    bench_players = []
    bench_numbers = []
    for subsoup in soup.find_all(class_=re.compile("LeftBenchItem ")):  # the space is very important in the regex
        if 'PlayerRating' in str(subsoup):
            # extract name and number
            num, name = convert_num_nam((subsoup.find_all('span')[-2].get_text()))
            bench_players.append(name)
            bench_numbers.append(num)

            # get rating
            rating_element = subsoup.find(class_=re.compile("PlayerRatingStyled"))
            bench_ratings.append(rating_element.find('span').get_text())

    return pd.DataFrame({"number": bench_numbers, "name": bench_players, "rating": bench_ratings})

In [3]:
page = requests.get("https://www.fotmob.com/match/3602683/matchfacts/fc-utrecht-vs-go-ahead-eagles")
soup = BeautifulSoup(page.content, 'html.parser')

In [None]:
bench_matchcard_urls = []

for bench_item in soup.find_all(class_=re.compile("LeftBenchItemOuter")):
    if 'PlayerRating' in str(bench_item):
        bench_matchcard_urls.append(f"https://www.fotmob.com{bench_item.find('a')['href']}")

In [4]:
bench_matchcard_urls

['https://www.fotmob.com/match/3602683/matchfacts/player-match-card/594983',
 'https://www.fotmob.com/match/3602683/matchfacts/player-match-card/304668',
 'https://www.fotmob.com/match/3602683/matchfacts/player-match-card/562846',
 'https://www.fotmob.com/match/3602683/matchfacts/player-match-card/902105',
 'https://www.fotmob.com/match/3602683/matchfacts/player-match-card/17761',
 'https://www.fotmob.com/match/3602683/matchfacts/player-match-card/957556',
 'https://www.fotmob.com/match/3602683/matchfacts/player-match-card/467480']

In [70]:
bmu_data = []
for bmu in bench_matchcard_urls:
    matchcard = requests.get(bmu)
    mc_soup = BeautifulSoup(matchcard.content, 'html.parser')
    name = mc_soup.find(class_=re.compile("jalymf-PlayerName")).get_text()
    bmu_data.append(mc_soup.find_all(class_=re.compile("RowContainer e1fnykti"))[1:])

In [28]:
columns = [row.find_all('span')[0].get_text() for row in bmu_data[0]]

In [75]:
bmu_data[0][1].find_all('span')

[<span>Minutes played</span>, <span>30</span>]

In [69]:
test = {'tim': [24], 'birgit': [24]}
test['rob'] = [25]
test

{'tim': [24], 'birgit': [24], 'rob': [25]}

In [64]:
matchcard_data = []
for bmud in bmu_data:
    for index, row in enumerate(bmud):
        spans = row.find_all('span')
        # check if we have data for column i
        if len(spans) < 2:
            data = 'Nan'
        else:
            data = spans[1].get_text()

        # first iteration, creating the nested list
        if len(matchcard_data) < 21:
            matchcard_data.append([data])
        # 2nd iteration and beyond
            
        else:
            print(index, data)
            matchcard_data[index].append(data)

    

matchcard_data

0 30
1 0
2 0
3 1
4 4/7 (57%)
5 0
6 0.03
7 0.03
8 Nan
9 1
10 12
11 0/1 (0%)
12 1
13 Nan
14 0
15 1
16 Nan
17 1/2 (50%)
18 3/4 (75%)
19 1
20 0
0 30
1 0
2 0
3 1
4 11/19 (58%)
5 0
6 0.05
7 Nan
8 1
9 32
10 1/1 (100%)
11 0/2 (0%)
12 0/4 (0%)
13 0
14 Nan
15 0
16 1
17 3
18 Nan
19 2/4 (50%)
20 3/4 (75%)
21 1


IndexError: list index out of range

In [42]:
for c, d in zip(columns, matchcard_data):
    print(c, d)

Minutes played 30
Goals 0
Assists 0
Total shots 2
Accurate passes 6/7 (86%)
Key passes 2
Expected goals (xG) 0.25
Expected assists (xA) 0.05
Attack NaN
Shot accuracy 0/2 (0%)
Big chance missed 1
Touches 15
Successful dribbles 1/2 (50%)
Dispossessed 2
Defence NaN
Tackles won 0
Duels NaN
Ground duels won 1/5 (20%)
Aerial duels won 1/1 (100%)
Was fouled 0
Fouls committed 1


# Scratchpad part 2

In [7]:
import pandas as pd

In [54]:
df = pd.DataFrame({'players': ['Tim', 'Birgit', 'Rob', 'Mariëlle'], 'grades': [4, 7, 5, 8], 'possession': [0, 0, 0, 0]})
df

Unnamed: 0,players,grades,possession
0,Tim,4,0
1,Birgit,7,0
2,Rob,5,0
3,Mariëlle,8,0


In [55]:
row_index = df.index[(df.players == 'Tim') & (df.grades == 2)]
if len(row_index) > 0:
    df.at[row_index[0], 'possession'] = 3
else:
    print('CAREFUL! PLAYER NOT FOUND FOR THESE VARIABLE VALUES:')
df

CAREFUL! PLAYER NOT FOUND FOR THESE VARIABLE VALUES:


Unnamed: 0,players,grades,possession
0,Tim,4,0
1,Birgit,7,0
2,Rob,5,0
3,Mariëlle,8,0


But what happens if we have a novel column name? 
This is important, as not all match cards contain the same variables

In [56]:
df.at[1, 'possession_2'] = 3
df

Unnamed: 0,players,grades,possession,possession_2
0,Tim,4,0,
1,Birgit,7,0,3.0
2,Rob,5,0,
3,Mariëlle,8,0,


In [61]:
pd.concat([df, pd.DataFrame({'players': ['Erik'], 'grades': [8]})], ignore_index = True)

Unnamed: 0,players,grades,possession,possession_2
0,Tim,4,0.0,
1,Birgit,7,0.0,3.0
2,Rob,5,0.0,
3,Mariëlle,8,0.0,
4,Erik,8,,


# Scratchpad Part 3

In [10]:
from bs4 import BeautifulSoup
import re
import requests

In [11]:
page = requests.get('https://www.fotmob.com/match/3602683/matchfacts')
soup = BeautifulSoup(page.content, 'html.parser')

In [12]:
matchcard_urls = []

# for field players
lineup = soup.find_all(class_=re.compile("TeamContainer"))
for squad in lineup:
    for rated_player_url in [a for a in squad.find_all('a') if 'match-card' in str(a)]:
        matchcard_urls.append(f"https://www.fotmob.com{rated_player_url['href']}")

# for bench players
for bench_item in soup.find_all(class_=re.compile("LeftBenchItemOuter")):
    if 'PlayerRating' in str(bench_item):
        matchcard_urls.append(f"https://www.fotmob.com{bench_item.find('a')['href']}")
matchcard_urls

['https://www.fotmob.com/match/3602683/matchfacts/player-match-card/949807',
 'https://www.fotmob.com/match/3602683/matchfacts/player-match-card/121642',
 'https://www.fotmob.com/match/3602683/matchfacts/player-match-card/264860',
 'https://www.fotmob.com/match/3602683/matchfacts/player-match-card/574234',
 'https://www.fotmob.com/match/3602683/matchfacts/player-match-card/970563',
 'https://www.fotmob.com/match/3602683/matchfacts/player-match-card/209372',
 'https://www.fotmob.com/match/3602683/matchfacts/player-match-card/1013279',
 'https://www.fotmob.com/match/3602683/matchfacts/player-match-card/614453',
 'https://www.fotmob.com/match/3602683/matchfacts/player-match-card/438572',
 'https://www.fotmob.com/match/3602683/matchfacts/player-match-card/873270',
 'https://www.fotmob.com/match/3602683/matchfacts/player-match-card/670030',
 'https://www.fotmob.com/match/3602683/matchfacts/player-match-card/655095',
 'https://www.fotmob.com/match/3602683/matchfacts/player-match-card/583138'

# Scratchpad part 4

In [13]:
from bs4 import BeautifulSoup
import pandas as pd
import requests

In [32]:
variable_names = ['hashtag', 'name']

for url in matchcard_urls[:2]:
    # scrape page
    mc_page = requests.get(url)
    mc_soup = BeautifulSoup(mc_page.content, 'html.parser')

    # find variable values
    overlay = mc_soup.find(class_ = re.compile("BGOverlay"))
    rowcontainers = overlay.find_all(class_ = re.compile("RowContainer"))[1:]  # exclude the first element, as it is the 'Top stats' string
    variable_dict = {rc.find_all('span')[0].get_text(): (rc.find_all('span')[1].get_text() if len(rc.find_all('span')) > 1 else 'NaN') for rc in rowcontainers}

In [40]:
df = pd.DataFrame(columns = [k for k in variable_dict])
df

Unnamed: 0,FotMob rating,Minutes played,Goals,Assists,Total shots,Accurate passes,Key passes,Expected goals (xG),Expected goals on target (xGOT),Expected assists (xA),...,Blocks,Clearances,Headed clearance,Interceptions,Recoveries,Duels,Ground duels won,Aerial duels won,Was fouled,Fouls committed


In [41]:
for k in variable_dict:
    df.at[0, k] = variable_dict[k]
df

Unnamed: 0,FotMob rating,Minutes played,Goals,Assists,Total shots,Accurate passes,Key passes,Expected goals (xG),Expected goals on target (xGOT),Expected assists (xA),...,Blocks,Clearances,Headed clearance,Interceptions,Recoveries,Duels,Ground duels won,Aerial duels won,Was fouled,Fouls committed
0,7.9,90,0,0,1,87/97 (90%),1,0.21,0.04,0.02,...,1,3,1,1,11,,3/4 (75%),1/3 (33%),2,1


# Scratchpad Part 5 - The sequel

In [43]:
import pandas as pd

In [44]:
ROOT = "C:/Users/timjo/OneDrive - TU Eindhoven/Silva_Ducis/Scriptie/footballmsc"
df = pd.read_pickle(f"{ROOT}/data/raw/fotmob.pkl")

In [45]:
df

Unnamed: 0,hashtag,name,FotMob rating,Minutes played,Saves,Goals,Goals conceded,xGOT faced,Accurate passes,Accurate long balls,...,Big chance missed,Clearance off the line,Last man tackle,Own goal,Conceded penalty,Error led to goal,Penalties won,Errors led to goal,Crosses,Penalties miss
0,#gaehee,Warner Hahn,6.3,90,4,0,1,1.60,20/28 (71%),5/13 (38%),...,,,,,,,,,,
1,#gaehee,Bas Kuipers,7.4,90,,0,,,24/34 (71%),2/5 (40%),...,,,,,,,,,,
2,#gaehee,Justin Bakker,6.9,90,,0,,,55/67 (82%),12/19 (63%),...,,,,,,,,,,
3,#gaehee,Gerrit Nauber,7.4,90,,0,,,49/59 (83%),3/6 (50%),...,,,,,,,,,,
4,#gaehee,Mats Deijl,7.1,90,,0,,,24/31 (77%),3/9 (33%),...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
841,#azpsv,Yukinari Sugawara,6.0,19,,0,,,14/17 (82%),0/1 (0%),...,,,,,,,,,,
842,#azpsv,Bruma,7.1,35,,0,,,14/19 (74%),1/2 (50%),...,1,,,,,,,,,
843,#azpsv,Ritsu Doan,7.6,35,,1,,,6/6 (100%),,...,,,,,,,,,,
844,#azpsv,Yorbe Vertessen,8.1,28,,1,,,3/4 (75%),,...,,,,,,,,,,
