# Africa Cup of Nations Dataset 

# Imports 

In [31]:
import requests 
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re 
import os 
import csv 
import unicodedata
import chardet

# Step 1 : Get Wikipedia Page of every Tournament 

In [3]:
page = requests.get("https://en.wikipedia.org/wiki/Africa_Cup_of_Nations_records_and_statistics")

In [4]:
page.content

b'\n<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>Africa Cup of Nations records and statistics - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"XrgzEgpAMNAAAPwzZBcAAAAA","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Africa_Cup_of_Nations_records_and_statistics","wgTitle":"Africa Cup of Nations records and statistics","wgCurRevisionId":946002327,"wgRevisionId":946002327,"wgArticleId":38206129,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["All articles with unsourced statements","Articles with unsourced statements fr

In [5]:
soup = BeautifulSoup(page.content, 'html.parser')

In [6]:
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Africa Cup of Nations records and statistics - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"XrgzEgpAMNAAAPwzZBcAAAAA","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Africa_Cup_of_Nations_records_and_statistics","wgTitle":"Africa Cup of Nations records and statistics","wgCurRevisionId":946002327,"wgRevisionId":946002327,"wgArticleId":38206129,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["All articles with unsourced statements","Articles with unsourced stat

In [7]:
table = soup.find_all('table')

In [8]:
table = table[-1]

In [9]:
pages = table.find_all('a',{"href":re.compile("\/\wiki/[0-9].*\_Africa_Cup_of_Nations$")})

In [10]:
# Exclude the future tournaments 
pages = pages[0:-3]

In [11]:
# remove everything but link to page 
pages = ['https://en.wikipedia.org'+page['href']for page in pages]

In [12]:
pages

['https://en.wikipedia.org/wiki/1957_Africa_Cup_of_Nations',
 'https://en.wikipedia.org/wiki/1959_Africa_Cup_of_Nations',
 'https://en.wikipedia.org/wiki/1962_Africa_Cup_of_Nations',
 'https://en.wikipedia.org/wiki/1963_Africa_Cup_of_Nations',
 'https://en.wikipedia.org/wiki/1965_Africa_Cup_of_Nations',
 'https://en.wikipedia.org/wiki/1968_Africa_Cup_of_Nations',
 'https://en.wikipedia.org/wiki/1970_Africa_Cup_of_Nations',
 'https://en.wikipedia.org/wiki/1972_Africa_Cup_of_Nations',
 'https://en.wikipedia.org/wiki/1974_Africa_Cup_of_Nations',
 'https://en.wikipedia.org/wiki/1976_Africa_Cup_of_Nations',
 'https://en.wikipedia.org/wiki/1978_Africa_Cup_of_Nations',
 'https://en.wikipedia.org/wiki/1980_Africa_Cup_of_Nations',
 'https://en.wikipedia.org/wiki/1982_Africa_Cup_of_Nations',
 'https://en.wikipedia.org/wiki/1984_Africa_Cup_of_Nations',
 'https://en.wikipedia.org/wiki/1986_Africa_Cup_of_Nations',
 'https://en.wikipedia.org/wiki/1988_Africa_Cup_of_Nations',
 'https://en.wikipedia.o

# Step 2 : Prepare the folders (folder for every tournament)

In [44]:
def create_folders(folders_names):
    root_path = 'C:/Users/Essam/Desktop/Datasets/Africa Cup of Nations Dataset'
    for folder in folders_names:
        os.mkdir(os.path.join(root_path,folder))
    return 
def save_data(folder,data,name):
    root_path = 'C:/Users/Essam/Desktop/Datasets/Africa Cup of Nations Dataset'
    if folder is not None : 
       root_path = os.path.join(root_path,folder)
    sv = os.path.join(root_path,name)
    data.to_csv(sv,index=False)
    return 

In [13]:
folders_names = [page.split('/')[-1].split('_')[0] for page in pages] 
#create_folders(folders_names)
print(folders_names)

['1957', '1959', '1962', '1963', '1965', '1968', '1970', '1972', '1974', '1976', '1978', '1980', '1982', '1984', '1986', '1988', '1990', '1992', '1994', '1996', '1998', '2000', '2002', '2004', '2006', '2008', '2010', '2012', '2013', '2015', '2017', '2019']


# Step 3 : Scraping squads of all teams in every tournament 

In [13]:
squads_pages = [page+'_squads' for page in pages] 

In [15]:
# test to investigate the source of page and get knowledge about required tags 
path = squads_pages[0] 
page = requests.get(path)
soup = BeautifulSoup(page.content, 'html.parser')
soup.prettify()

'<!DOCTYPE html>\n<html class="client-nojs" dir="ltr" lang="en">\n <head>\n  <meta charset="utf-8"/>\n  <title>\n   1957 Africa Cup of Nations squads - Wikipedia\n  </title>\n  <script>\n   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"XrpIawpAMNIAAmVcJ60AAAAK","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"1957_Africa_Cup_of_Nations_squads","wgTitle":"1957 Africa Cup of Nations squads","wgCurRevisionId":948170935,"wgRevisionId":948170935,"wgArticleId":37096421,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Africa Cup of Nations squads","1957 African Cup of Nations"],"wgPageContentLanguage":"en","wg

In [36]:
def scrape_squads(page_path):
    page = requests.get(page_path)
    soup = BeautifulSoup(page.content, 'html.parser')
    # Intialize list of 7 lists to store the attributes of players 
    # list 0 : shirt number , list 1 : position , list 2 : Date of birth (age) , list 3 : Caps
    # list 4 : Goals , list 5 : Club , list 6 : Player Name (Captain)
    data = [[] for i in range(7)]
    for item in soup.find_all('tr',{'class':'nat-fs-player'}):
        i = 0 
        for it in item.find_all('td'):
            data[i].append(it.get_text().rstrip("\n"))
            i+=1
        data[i].append(item.find('th').get_text().rstrip("\n"))
    
    shirt = np.array([None if item =="" else item for item in data[0]])
    position = np.array([None if item =="" else item[1:] for item in data[1]]) 
    date = np.array([None if item =="" else item for item in data[2]])
    caps = np.array([None if item =="" else item for item in data[3]])
    goals = np.array([None if item =="" else item for item in data[4]])
    club = np.array([None if item =="" else item for item in data[5]])
    name =  np.array([None if item =="" else item for item in data[6]])
    res = pd.DataFrame([shirt,position,name,date,caps,goals,club]).T
    res.rename(columns={0: " Shirt Number ", 1: " Position ",2: "Player Name (Captain) ",3: " Date of Birth (age) ",4: " Caps ",5: "Goals",6: " Club "},inplace=True)
    return res 

In [39]:
j = 0 
for page in squads_pages: 
    df = scrape_squads(page)
    save_data(folders_names[j],df,'squads.csv')
    j+=1
    print(j)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32


# Step 4 : Scraping Matches data for all Tournaments

In [13]:
page = pages[0] 
page = requests.get(page)
soup = BeautifulSoup(page.content, 'html.parser')

In [269]:
def left_part(football_box): 
    item = football_box.find('div',{'class':'mobile-float-reset fleft'}) 
    date = []
    for it in item.find_all('div'):
        date.append(it.get_text())
    return date
def right_part(football_box): 
    item = football_box.find('div',{'class':'mobile-float-reset fright'}) 
    location = []
    for it in item.find_all('div'):
        location.append(it.get_text())
    return location
def middle_part(football_box): 
    item = football_box.find('table',{'class':'fevent'}) 
    i = 0 
    home_name = None 
    away_name = None 
    home_score = None 
    away_score = None 
    win_condition = None 
    pen = None 
    # Home - Score - Away 
    for it in item.find_all('th'):
        c = it.get_text()
        i+=1
        if i == 1 : 
            home_name = c 
            home_name = unicodedata.normalize("NFKD", home_name)
        elif i == 2: 
             tmp = c 
             if tmp == 'w/o' or tmp == 'Cancelled': 
                win_condition = home_name + " wins due to disqualification of other team"
                continue 
             if '(' in c : 
                 tmp = c.split('(')[0]
                 for it in item.find_all('tr',{'class':'fgoals'}): 
                     if it.find('th')!=None: 
                            pen = it.find('th').get_text()
                            win_condition =" win on Penalities " + pen 
                 if win_condition is None : 
                    win_condition  = " win after extra time "

             tmp = tmp.split('–')
             if len(tmp) == 1:
                tmp = tmp[0].split('−')
                
             home_score = int(tmp[0])
             away_score = int(tmp[1])
        elif i == 3: 
            away_name = c 
            away_name = unicodedata.normalize("NFKD", away_name)
        elif i == 4:
            if win_condition is not None : 
    
               if 'extra' in win_condition : 
                    if home_score > away_score : 
                        win_condition = home_name + win_condition 
                    else : 
                        win_condition = away_name + win_condition 
               elif 'Pen' in win_condition : 
                     pen = pen.split('–')
                     home_tmp = int(pen[0])
                     away_tmp = int(pen[1])
                     if home_tmp > away_tmp : 
                        win_condition = home_name + win_condition 
                     else:
                        win_condition = away_name + win_condition 
                
    return (home_name,away_name,home_score,away_score,win_condition)

In [282]:
def scrape_matches(path): 
    data = [[] for i in range(11)]
    page = requests.get(path)
    soup = BeautifulSoup(page.content, 'html.parser')
    stage = None 
    for item in soup.find('div',{'id':'mw-content-text'}):
        
        for it in item.find_all(): 
            if it.name == 'h3' :
                stage = it.get_text()
                if '[' in stage : 
                    stage = stage.split('[')[0]
                    
                
            if it.name == 'div':

                if it.has_attr('class'):
                        
                    if it['class'][0]=="footballbox":
                         home_name,away_name,home_score,away_score,win_condition= middle_part(it)
                         date_and_time = left_part(it)
                         date_and_time = [unicodedata.normalize("NFKD", word) for word in date_and_time]
                         # append 1st item of tmp in date , 2nd (if exist) in time 
                         data[0].append(date_and_time[0])
                         if len(date_and_time) == 1 : 
                            data[1].append(None)
                         else:
                            data[1].append(date_and_time[1])   
                         data[2].append(home_name)
                         data[3].append(away_name)
                         data[4].append(home_score)
                         data[5].append(away_score)
                         data[6].append(stage)
                         data[7].append(win_condition)
                         # check if the match played or not before get stadium and city and attendance
                         if win_condition is None or 'disqualification' not in win_condition : 
                            tmp = right_part(it)
                            data[8].append(tmp[0].split(',')[0])
                            data[9].append(tmp[0].split(',')[1])
                            if len(tmp)>1:
                                if tmp[1].split(':')[0]=='Attendance':
                                    str_attendance = tmp[1].split(':')[-1] 
                                    if '[' in str_attendance: 
                                        str_attendance = str_attendance.split('[')[0]
                                    str_attendance = str_attendance.replace(',','')
                                    attendance=int(str_attendance)
                                    data[10].append(attendance)
                         else :
                              data[8].append(None)
                              data[9].append(None)
                              data[10].append(None)
    res = pd.DataFrame(data).T
    res.rename(columns={0: " Date ", 1: " Time ",2: " Home Team Name ",3: " Away Team Name ",4:" Home Team Goals ",5: " Away Team Goals ",6: " Stage ",7: " Win Conditions ",8: " Stadium ",9: " City ",10: " Attendance "},inplace=True)
    
    return res

In [285]:
j = 0 
df = None 
for page in pages: 
    df = scrape_matches(page)
    save_data(folders_names[j],df,'Matches.csv')
    j+=1
    print(j)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32


# Step 5 : Get General Statistics for all Tournaments  

In [26]:
def scrape_general_stats(path1="https://en.wikipedia.org/wiki/Africa_Cup_of_Nations_records_and_statistics",path2="https://en.wikipedia.org/wiki/National_team_appearances_in_the_Africa_Cup_of_Nations"):
    page = requests.get(path1)
    soup = BeautifulSoup(page.content, 'html.parser')
    item = soup.find('table',{'class':'wikitable'})
    all_tournaments =pd.read_html(item.prettify())[0]
    page = requests.get(path2)
    soup = BeautifulSoup(page.content, 'html.parser')
    item = soup.find('table',{'class':'wikitable sortable'})
    all_nations  = pd.read_html(item.prettify())[0]
    save_data(None,all_tournaments,'General Statistics For each Tournaments.csv')
    save_data(None,all_nations,'General Statistics For each Participated Team.csv')
    return


In [35]:
scrape_general_stats()

# Step 6 : Combine Data of All Tournaments 

In [39]:
root_path = 'C:/Users/Essam/Desktop/Datasets/Africa Cup of Nations Dataset'
matches_data = [] 
players_data = []
for folder in os.listdir(root_path): 
    if folder  in folders_names: 
        pth = os.path.join(root_path,folder)
        i = 0 
        for item in os.listdir(pth):
            tmp = os.path.join(pth,item)
            if i ==0 : 
                match = pd.read_csv(tmp)
                Year = pd.DataFrame([folder for j in range(match.shape[0])])
                match = pd.concat([Year, match], axis=1) 
                match.rename(columns={0: " Year "},inplace=True)
                matches_data.append(match)
            else:
                with open(tmp, 'rb') as f:
                       result = chardet.detect(f.read())  # or readline if the file is large
                player = pd.read_csv(tmp,encoding=result['encoding'])
                Year = pd.DataFrame([folder for j in range(player.shape[0])])
                player = pd.concat([Year, player], axis=1) 
                player.rename(columns={0: " Year ","No.":" Shirt Number ","Pos.":" Player Position ","Player":" Player Name "},inplace=True)
                players_data.append(player)
            i+=1

In [40]:
total_players = pd.concat(players_data, axis=0) 
total_matches = pd.concat(matches_data, axis=0)

In [41]:
total_players

Unnamed: 0,Year,Shirt Number,Player Position,Player Name,Date of birth (age),Caps,Goals,Club,Country
0,1957,,GK,Ali Bakr,,,,Zamalek,Egypt
1,1957,,GK,"Paraskos ""Brascos"" Trimeritis",,,,El-Qanah,Egypt
2,1957,,GK,Abdel-Galil Hameida,,,,Al-Ahly,Egypt
3,1957,,DF,Mosaad Daoud,,,,El-Olympi,Egypt
4,1957,,DF,El-Sayed El-Arabi,,,,Teram,Egypt
...,...,...,...,...,...,...,...,...,...
547,2019,19,FW,Joseph Mendes,(1991-03-30)30 March 1991 (aged 28),0,0,Ajaccio,Guinea-Bissau
548,2019,20,MF,Sori Mané,(1996-04-03)3 April 1996 (aged 23),7,0,Cova da Piedade,Guinea-Bissau
549,2019,21,DF,Nanú,(1994-05-17)17 May 1994 (aged 25),0,0,Marítimo,Guinea-Bissau
550,2019,22,DF,Mamadu Candé,(1990-08-29)29 August 1990 (aged 28),14,0,Santa Clara,Guinea-Bissau


In [42]:
total_matches 

Unnamed: 0,Year,Date,Time,Home Team Name,Away Team Name,Home Team Goals,Away Team Goals,Stage,Win Conditions,Stadium,City,Attendance
0,1957,10 February 1957,,Sudan,Egypt,1.0,2.0,Semifinals,,Municipal Stadium,Khartoum,30000.0
1,1957,10 February 1957,,Ethiopia,South Africa,,,Semifinals,Ethiopia wins due to disqualification of othe...,,,
2,1957,16 February 1957,,Egypt,Ethiopia,4.0,0.0,Final,,Municipal Stadium,Khartoum,30000.0
0,1959,22-May-59,,Egypt,Ethiopia,4.0,0.0,Final Tournament,,Prince Farouk Stadium,Cairo,30000.0
1,1959,25-May-59,,Sudan,Ethiopia,1.0,0.0,Final Tournament,,Prince Farouk Stadium,Cairo,20000.0
...,...,...,...,...,...,...,...,...,...,...,...,...
47,2019,11 July 2019 (2019-07-11),21:00,Madagascar,Tunisia,0.0,3.0,Quarter-finals,,Al Salam Stadium,Cairo,7568.0
48,2019,14 July 2019 (2019-07-14),18:00,Senegal,Tunisia,1.0,0.0,Semi-finals,win after extra time,30 June Stadium,Cairo,9143.0
49,2019,14 July 2019 (2019-07-14),21:00,Algeria,Nigeria,2.0,1.0,Semi-finals,,Cairo International Stadium,Cairo,49775.0
50,2019,17 July 2019 (2019-07-17),21:00,Tunisia,Nigeria,0.0,1.0,Third place play-off,,Al Salam Stadium,Cairo,6340.0


In [45]:
save_data(None,total_matches,'Africa Cup of Nations Matches.csv')
save_data(None,total_players,'Africa Cup of Nations Players.csv')