# UEFA European Championship 

# Imports 

In [1]:
import csv
import io
import os
import re
import unicodedata

import chardet
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Step 1 : Get Wiki Pages For Matches and Squads 

In [2]:
# Overview article; the navbox links to tournaments and squads are read from it below.
overview_url = "https://en.wikipedia.org/wiki/UEFA_European_Championship"
page = requests.get(overview_url)

In [3]:
page.content

b'\n<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>UEFA European Championship - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"Xr7LAApAICgAAE-5pAgAAACR","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"UEFA_European_Championship","wgTitle":"UEFA European Championship","wgCurRevisionId":956295717,"wgRevisionId":956295717,"wgArticleId":249510,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 Turkish-language sources (tr)","CS1 maint: location","Articles with short description","EngvarB from August 2018","Use dmy dates f

In [4]:
soup = BeautifulSoup(page.content, 'html.parser')

In [5]:
soup.prettify()

'<!DOCTYPE html>\n<html class="client-nojs" dir="ltr" lang="en">\n <head>\n  <meta charset="utf-8"/>\n  <title>\n   UEFA European Championship - Wikipedia\n  </title>\n  <script>\n   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"Xr7LAApAICgAAE-5pAgAAACR","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"UEFA_European_Championship","wgTitle":"UEFA European Championship","wgCurRevisionId":956295717,"wgRevisionId":956295717,"wgArticleId":249510,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 Turkish-language sources (tr)","CS1 maint: location","Articles with short description","EngvarB from August 2018

In [6]:
# Every anchor inside the first "odd" navbox list is turned into an absolute URL.
tournament_cell = soup.find_all('td',{'class':'navbox-list navbox-odd hlist'})[0]
tournaments = []
for anchor in tournament_cell.find_all('a'):
    tournaments.append('https://en.wikipedia.org'+anchor['href'])
# The trailing three links are discarded (15 pages remain in the recorded run).
tournaments = tournaments[0:-3]

In [7]:
# Squads pages are linked from the second "even" navbox list of the overview page.
squads_cell = soup.find_all('td',{'class':'navbox-list navbox-even hlist'})[1]
squads = []
for anchor in squads_cell.find_all('a'):
    squads.append('https://en.wikipedia.org'+anchor['href'])

# Step 2 : Prepare Folders for all Tournaments 

In [8]:
def create_folders(folders_names, root_path='C:/Users/Essam/Desktop/Datasets/UEFA European Championship Dataset'):
    """Create one sub-folder per entry of ``folders_names`` under ``root_path``.

    Parameters
    ----------
    folders_names : iterable of str
        Folder names to create (e.g. ``'Euro_1960'``).
    root_path : str, optional
        Directory that receives the sub-folders. Defaults to the dataset
        location the notebook was originally written for.

    Returns
    -------
    None
    """
    for folder in folders_names:
        # makedirs + exist_ok: re-running the cell, or running it before the
        # root directory exists, must not raise.
        os.makedirs(os.path.join(root_path, folder), exist_ok=True)
    return
def save_data(folder,data,name, root_path='C:/Users/Essam/Desktop/Datasets/UEFA European Championship Dataset'):
    """Write ``data`` as a CSV file (without the index column).

    Parameters
    ----------
    folder : str or None
        Optional sub-folder of ``root_path``; ``None`` writes directly into
        ``root_path``.
    data : pandas.DataFrame
        Frame to serialise; only ``data.to_csv`` is called on it.
    name : str
        File name, including the ``.csv`` extension.
    root_path : str, optional
        Base directory. Defaults to the dataset location the notebook was
        originally written for.

    Returns
    -------
    None
    """
    if folder is not None:
        root_path = os.path.join(root_path, folder)
    # Create the destination on demand so saving does not depend on
    # create_folders() having been run first.
    os.makedirs(root_path, exist_ok=True)
    sv = os.path.join(root_path, name)
    data.to_csv(sv, index=False)
    return

In [9]:
# One label per tournament year: every fourth year from 1960 up to, but excluding, 2020.
folders_names = []
for year in range(1960,2020,4):
    folders_names.append('Euro_{}'.format(year))
#create_folders(folders_names)

# Step 3 : Get Squads For All Tournaments

In [10]:
# Exploratory check: download the first squads page and print its HTML to see
# which tags the squads scraper has to look for.
# NOTE: this rebinds the notebook-level `page` and `soup`, so from here on they
# no longer refer to the overview page fetched in Step 1.
path = squads[0] 
page = requests.get(path)
soup = BeautifulSoup(page.content, 'html.parser')
soup.prettify()

'<!DOCTYPE html>\n<html class="client-nojs" dir="ltr" lang="en">\n <head>\n  <meta charset="utf-8"/>\n  <title>\n   1960 European Nations\' Cup squads - Wikipedia\n  </title>\n  <script>\n   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"Xr9PsgpAICMAACCEiwIAAADG","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"1960_European_Nations\'_Cup_squads","wgTitle":"1960 European Nations\' Cup squads","wgCurRevisionId":941559640,"wgRevisionId":941559640,"wgArticleId":10961412,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Use dmy dates from June 2013","UEFA European Championship squads","1960 European Nations\'

In [76]:
def scrape_squads(path,idx): 
    """Scrape every squad table from one tournament's squads page.

    The page is walked in document order. The text of the most recent
    ``<h2>``/``<h3>`` is remembered as the country, and each ``<table>`` whose
    class list is exactly ``sortable wikitable plainrowheaders`` is parsed into
    a DataFrame.

    Parameters
    ----------
    path : str
        URL of the squads page (fetched with ``requests.get``).
    idx : int
        Position of the tournament in the notebook-level ``folders_names``
        list; the part after the last ``'_'`` becomes the ``Year`` column.

    Returns
    -------
    pandas.DataFrame
        All squad tables of the page concatenated row-wise, with the extra
        columns ``DateofBirth(age)``, ``Year`` and ``Country``. An empty frame
        is returned when the page holds no matching table.
    """
    page = requests.get(path)
    soup = BeautifulSoup(page.content, 'html.parser')
    total_squads = []
    country = None
    for item in soup.find_all():
        if item.name == 'h2' or item.name == 'h3':
            country = item.get_text()
            continue
        if item.name != 'table':
            continue
        # .get() instead of ['class']: a table without a class attribute must
        # be skipped, not raise KeyError.
        if item.get('class') != ['sortable', 'wikitable', 'plainrowheaders']:
            continue

        # Literal HTML strings are deprecated as read_html input (pandas 2.1+);
        # a file-like wrapper works on old and new versions alike.
        tmp = pd.read_html(io.StringIO(item.prettify()))[0]

        # The third <td> of every player row is taken as the birth-date text.
        # Only that cell is kept, so rows with unusually many cells cannot
        # overflow a fixed-size buffer.
        birth_dates = []
        for row in item.find_all('tr',{'class':'nat-fs-player'}):
            cells = row.find_all('td')
            text = cells[2].get_text().rstrip("\n") if len(cells) > 2 else ""
            birth_dates.append(None if text == "" else text)

        if len(tmp.columns) > 6:
            tmp = tmp.rename(columns={'No.': "ShirtNumber", 'Pos.': "Position",'Player': "PlayerName(Captain)",3: "DateofBirth(age)",4: "Caps",5: "Goals",6: "Club"})
        else:
            tmp = tmp.rename(columns={'No.': "ShirtNumber", 'Pos.': "Position",'Player': "PlayerName(Captain)",3: "DateofBirth(age)",4: "Caps",5: "Club"})

        # Length must equal the number of parsed rows; pandas raises otherwise.
        tmp["DateofBirth(age)"] = birth_dates
        tmp['Year'] = folders_names[idx].split('_')[-1]
        # Strip a trailing "[edit]"-style suffix; a table that appears before
        # any heading gets no country instead of crashing on None.
        tmp['Country'] = country.split('[')[0] if country is not None else None
        total_squads.append(tmp)

    if not total_squads:
        # pd.concat([]) raises; an empty frame keeps the caller's loop going.
        return pd.DataFrame()
    return pd.concat(total_squads,axis=0)

In [77]:
# Scrape each squads page; its position in `squads` is passed as the tournament index.
final_squads = []
for j, item in enumerate(squads):
    final_squads.append(scrape_squads(item, j))

In [78]:
final_squads = pd.concat(final_squads,axis=0)

In [79]:
final_squads

Unnamed: 0,ShirtNumber,Position,PlayerName(Captain),Date of birth (age),Caps,Club,DateofBirth(age),Year,Country,Goals
0,,GK,Justín Javorek,,0.0,ČH Bratislava,(1932-08-23)23 August 1932 (aged 27),1960,Czechoslovakia,
1,,GK,Viliam Schrojf,,9.0,ŠK Slovan Bratislava,(1931-08-02)2 August 1931 (aged 28),1960,Czechoslovakia,
2,,DF,Ladislav Novák ( captain ),,48.0,Dukla Prague,(1931-12-05)5 December 1931 (aged 28),1960,Czechoslovakia,
3,,DF,Ján Popluhár,,14.0,ŠK Slovan Bratislava,(1935-08-12)12 August 1935 (aged 24),1960,Czechoslovakia,
4,,DF,František Šafránek,,15.0,Dukla Prague,(1931-01-02)2 January 1931 (aged 29),1960,Czechoslovakia,
...,...,...,...,...,...,...,...,...,...,...
18,19.0,FW,Tamás Priskin,,56.0,Slovan Bratislava,(1986-09-27)27 September 1986 (aged 29),2016,Hungary,17.0
19,20.0,DF,Richárd Guzmics,,14.0,Wisła Kraków,(1987-04-16)16 April 1987 (aged 29),2016,Hungary,1.0
20,21.0,DF,Barnabás Bese,,1.0,MTK Budapest,(1994-05-06)6 May 1994 (aged 22),2016,Hungary,0.0
21,22.0,GK,Péter Gulácsi,,3.0,RB Leipzig,(1990-05-06)6 May 1990 (aged 26),2016,Hungary,0.0


In [80]:
# Final column order; columns not listed here are dropped by the reindex.
columns_titles = [
    "ShirtNumber",
    "Position",
    "PlayerName(Captain)",
    "DateofBirth(age)",
    "Caps",
    "Goals",
    "Club",
    'Country',
    'Year',
]

final_squads = final_squads.reindex(columns=columns_titles)
#save_data(None,final_squads,'Uefa Euro Cup All Players.csv')

In [82]:
save_data(None,final_squads,'Uefa Euro Cup All Players.csv')

# Step 4 : Get Matches Data

In [92]:
def left_part(football_box): 
    """Return the text of each ``<div>`` inside the box's left-floated block.

    The block is the first ``div`` with class ``mobile-float-reset fleft``;
    the caller treats the first entry as the date and the second, if present,
    as the kick-off time.
    """
    left_block = football_box.find('div',{'class':'mobile-float-reset fleft'})
    return [div.get_text() for div in left_block.find_all('div')]
def right_part(football_box): 
    """Return the text of each ``<div>`` inside the box's right-floated block.

    The block is the first ``div`` with class ``mobile-float-reset fright``;
    the caller reads the venue from the first entry and, when present, the
    attendance from the second.
    """
    right_block = football_box.find('div',{'class':'mobile-float-reset fright'})
    return [div.get_text() for div in right_block.find_all('div')]
def middle_part(football_box): 
    """Extract teams, score and special win condition from a footballbox.

    The first three ``<th>`` cells of the box's ``table.fevent`` are read as
    home team, score and away team. Team names are NFKD-normalised.

    * A score cell of exactly ``'w/o'`` or ``'Cancelled'`` yields no scores
      and a "wins due to disqualification" condition for the home side.
    * A score cell containing ``'('`` marks a match decided after normal
      time. If a ``tr.fgoals`` row carries a ``<th>``, its text (the last one
      found) is taken as the shoot-out result; otherwise the match counts as
      won after extra time.

    The winner's name is always prepended to the win condition. Previously
    that only happened when the table had a fourth ``<th>``, so extra-time
    results came back as a dangling ``" win after extra time "``.

    Parameters
    ----------
    football_box : bs4 Tag (or any object offering ``find`` / ``find_all``)
        The ``div.footballbox`` element of one match.

    Returns
    -------
    tuple
        ``(home_name, away_name, home_score, away_score, win_condition)``;
        entries that cannot be determined are ``None``.

    Raises
    ------
    ValueError, IndexError
        If the score (or shoot-out) text is not two integers separated by a
        dash.
    """
    item = football_box.find('table',{'class':'fevent'})
    cells = [th.get_text() for th in item.find_all('th')]

    # En dash and minus sign are what the original handled; a plain hyphen is
    # accepted as well so such a score does not crash.
    dash = r'[–−-]'

    home_name = None
    away_name = None
    home_score = None
    away_score = None
    win_condition = None

    if len(cells) > 0:
        home_name = unicodedata.normalize("NFKD", cells[0])
    if len(cells) > 2:
        away_name = unicodedata.normalize("NFKD", cells[2])
    if len(cells) < 2:
        return (home_name,away_name,home_score,away_score,win_condition)

    score_text = cells[1]
    if score_text == 'w/o' or score_text == 'Cancelled':
        # Match not played: no score to parse.
        win_condition = home_name + " wins due to disqualification of other team"
        return (home_name,away_name,home_score,away_score,win_condition)

    pen = None
    beyond_normal_time = '(' in score_text
    if beyond_normal_time:
        # Drop the parenthesised suffix before parsing the score.
        score_text = score_text.split('(')[0]
        for row in item.find_all('tr',{'class':'fgoals'}):
            pen_cell = row.find('th')
            if pen_cell is not None:
                pen = pen_cell.get_text()

    parts = re.split(dash, score_text)
    home_score = int(parts[0])
    away_score = int(parts[1])

    if beyond_normal_time:
        if pen is not None:
            pen_parts = re.split(dash, pen)
            home_won = int(pen_parts[0]) > int(pen_parts[1])
            suffix = " win on Penalities " + pen
        else:
            home_won = home_score > away_score
            suffix = " win after extra time "
        winner = home_name if home_won else away_name
        # With fewer than three header cells the away name is unknown; keep
        # the bare suffix rather than failing on None + str.
        win_condition = suffix if winner is None else winner + suffix

    return (home_name,away_name,home_score,away_score,win_condition)

In [104]:
def scrape_matches(path,idx): 
    """Scrape every match (footballbox) from one tournament page.

    All tags below ``div#mw-content-text`` are visited in document order. The
    text of the latest ``<h3>`` (cut at the first ``'['``) is remembered as the
    stage, and each ``<div>`` whose first class is ``footballbox`` becomes one
    row.

    Parameters
    ----------
    path : str
        URL of the tournament page (fetched with ``requests.get``).
    idx : int
        Position of the tournament in the notebook-level ``folders_names``
        list; the part after the last ``'_'`` becomes the ``Year`` column.

    Returns
    -------
    pandas.DataFrame
        One row per match with the columns Date, Time, HomeTeamName,
        AwayTeamName, HomeTeamGoals, AwayTeamGoals, Stage,
        SpecialWinConditions, Stadium, City, Attendance and Year.
    """
    page = requests.get(path)
    soup = BeautifulSoup(page.content, 'html.parser')
    data = [[] for i in range(11)]
    stage = None

    # Ask for descendant tags directly: iterating the container's children
    # also yields bare text nodes, which have no find_all().
    content = soup.find('div',{'id':'mw-content-text'})
    elements = content.find_all() if content is not None else []

    for it in elements:
        if it.name == 'h3':
            stage = it.get_text()
            if '[' in stage:
                stage = stage.split('[')[0]
            continue
        if it.name != 'div' or not it.has_attr('class'):
            continue
        classes = it['class']
        if not classes or classes[0] != "footballbox":
            continue

        home_name,away_name,home_score,away_score,win_condition = middle_part(it)
        date_and_time = [unicodedata.normalize("NFKD", word) for word in left_part(it)]

        # First entry is the date, the second (if any) the kick-off time.
        data[0].append(date_and_time[0])
        data[1].append(date_and_time[1] if len(date_and_time) > 1 else None)
        data[2].append(home_name)
        data[3].append(away_name)
        data[4].append(home_score)
        data[5].append(away_score)
        data[6].append(stage)

        # middle_part may hand back the bare extra-time marker; name the winner.
        if win_condition == " win after extra time ":
            if home_score > away_score:
                win_condition = home_name + win_condition
            else:
                win_condition = away_name + win_condition
        data[7].append(win_condition)

        # Venue details exist only for matches that were actually played.
        stadium = None
        city = None
        attendance = None
        if win_condition is None or 'disqualification' not in win_condition:
            tmp = right_part(it)
            venue = tmp[0].split(',')
            stadium = venue[0]
            city = venue[1] if len(venue) > 1 else None
            if len(tmp) > 1 and tmp[1].split(':')[0] == 'Attendance':
                str_attendance = tmp[1].split(':')[-1]
                if '[' in str_attendance:
                    # Drop a trailing footnote marker such as "[12]".
                    str_attendance = str_attendance.split('[')[0]
                str_attendance = str_attendance.replace(',','')
                attendance = int(str_attendance)

        # Exactly one value per column per match. Skipping the attendance
        # append when the figure is missing would shift every later
        # attendance onto the wrong match.
        data[8].append(stadium)
        data[9].append(city)
        data[10].append(attendance)

    res = pd.DataFrame(data).T
    res['Year'] = folders_names[idx].split('_')[-1]
    res = res.rename(columns={0: "Date", 1: "Time",2: "HomeTeamName",3: "AwayTeamName",4:"HomeTeamGoals",5: "AwayTeamGoals",6: "Stage",7: "SpecialWinConditions",8: "Stadium",9: "City",10: "Attendance"})
    return res

In [105]:
# Scrape each tournament page; the 1-based counter is printed as a progress marker.
final_Matches = []
for j, item in enumerate(tournaments, start=1):
    final_Matches.append(scrape_matches(item, j - 1))
    print(j)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15


In [106]:
final_Matches = pd.concat(final_Matches,axis=0)

In [107]:
final_Matches

Unnamed: 0,Date,Time,HomeTeamName,AwayTeamName,HomeTeamGoals,AwayTeamGoals,Stage,SpecialWinConditions,Stadium,City,Attendance,Year
0,6 July 1960 (1960-07-06),20:00,France,Yugoslavia,4,5,Semi-finals,,Parc des Princes,Paris,26370,1960
1,6 July 1960 (1960-07-06),21:30,Czechoslovakia,Soviet Union,0,3,Semi-finals,,Stade Vélodrome,Marseille,25184,1960
2,9 July 1960 (1960-07-09),21:30,Czechoslovakia,France,2,0,Third place play-off,,Stade Vélodrome,Marseille,9438,1960
3,10 July 1960 (1960-07-10),21:30,Soviet Union,Yugoslavia,2,1,Final,Soviet Union win after extra time,Parc des Princes,Paris,17966,1960
0,17 June 1964 (1964-06-17),20:00,Spain,Hungary,2,1,Semi-finals,Spain win after extra time,Santiago Bernabéu,Madrid,34713,1964
...,...,...,...,...,...,...,...,...,...,...,...,...
46,2 July 2016 (2016-07-02),21:00,Germany,Italy,1,1,Quarter-finals,Germany win on Penalities 6–5,Nouveau Stade de Bordeaux,Bordeaux,38764,2016
47,3 July 2016 (2016-07-03),21:00,France,Iceland,5,2,Quarter-finals,,Stade de France,Saint-Denis,76833,2016
48,6 July 2016 (2016-07-06),21:00,Portugal,Wales,2,0,Semi-finals,,Parc Olympique Lyonnais,Décines-Charpieu,55679,2016
49,7 July 2016 (2016-07-07),21:00,Germany,France,0,2,Semi-finals,,Stade Vélodrome,Marseille,64078,2016


In [108]:
save_data(None,final_Matches,'Uefa Euro Cup All Matches.csv')

# Step 5 : Get All-Time Statistics

In [110]:
path = 'https://en.wikipedia.org/wiki/List_of_UEFA_European_Championship_records_and_statistics'
page = requests.get(path)
soup = BeautifulSoup(page.content, 'html.parser')

In [115]:
# Parse the first "wikitable" of the page. The markup is wrapped in StringIO
# because passing literal HTML to read_html is deprecated since pandas 2.1.
stats = pd.read_html(io.StringIO(soup.find('table',{'class':'wikitable'}).prettify()))[0]

In [117]:
stats
save_data(None,stats,'Uefa Euro Cup General Stats.csv')

In [118]:
path = 'https://en.wikipedia.org/wiki/National_team_appearances_in_the_UEFA_European_Championship'
page = requests.get(path)
soup = BeautifulSoup(page.content, 'html.parser')

In [125]:
stats = soup.find_all('table',{'class':['wikitable','sortable']})[5]

In [127]:
# Convert the selected table tag into a DataFrame. The markup is wrapped in
# StringIO because passing literal HTML to read_html is deprecated since pandas 2.1.
# NOTE: `stats` changes from a tag to a DataFrame here, so this cell cannot be
# re-run without first re-running the cell that selects the table.
stats = pd.read_html(io.StringIO(stats.prettify()))[0]

In [128]:
stats

Unnamed: 0,Rank,Team,Part.,Pld,W,D,L,GF,GA,GD,Pts,Pts/match
0,1,Germany [a],12,49,26,12,11,72,48,+24,90,1.84
1,2,France,9,39,20,9,10,62,44,+18,69,1.77
2,3,Spain,10,40,19,11,10,55,36,+19,68,1.7
3,4,Italy,9,38,16,16,6,39,27,+12,64,1.68
4,5,Portugal,7,35,18,9,8,49,31,+18,63,1.8
5,6,Netherlands,9,35,17,8,10,57,37,+20,59,1.69
6,7,Czech Republic [b],9,32,13,6,13,42,43,−1,45,1.41
7,8,Russia [c],11,33,12,7,14,38,45,−7,43,1.3
8,9,England,9,31,10,11,10,40,35,+5,41,1.32
9,10,Croatia,5,18,8,5,5,23,20,+3,29,1.61


In [129]:
save_data(None,stats,'Uefa Euro Cup Participated Teams General Statistics.csv')