### Scraping match data from all FIFA World Cups

In [27]:
# import packages 
import pandas as pd 
import numpy as np 
import seaborn as sns 
from bs4 import BeautifulSoup 
import requests 

In [28]:
def _parse_goals(score_text):
    """Return ``(home_goals, away_goals)`` parsed from a score cell's text.

    Trailing annotations are ignored: anything from the first ``(`` (for
    example ``" (a.e.t.)"`` on matches decided after extra time) or the first
    ``[`` (footnote markers such as ``"[a]"``) is cut off before parsing.
    An empty score or a walkover (``"w/o"``) has no goals to count and
    yields ``(0, 0)``.

    Raises:
        ValueError: If either side of the score is not an integer.
        IndexError: If the text contains no dash separating two scores.
    """
    core = score_text.split('(')[0].split('[')[0]
    # The page uses an en dash between the two scores; normalise it so a
    # single split handles both dash forms.
    core = core.replace('–', '-').strip()
    if not core or core == 'w/o':
        return 0, 0
    score_parts = core.split('-')
    return int(score_parts[0]), int(score_parts[1])


def get_matches(year):
    """Scrape the match results of one FIFA World Cup from Wikipedia.

    Downloads ``https://en.wikipedia.org/wiki/<year>_FIFA_World_Cup`` and
    reads every ``<div class="footballbox">`` element on the page.

    Args:
        year: Tournament year, substituted into the Wikipedia page URL.

    Returns:
        A ``pandas.DataFrame`` with one row per match and the columns
        ``home``, ``score``, ``away``, ``total_goals`` and ``year``.
        ``score`` holds the raw cell text; ``total_goals`` is the sum of the
        two parsed scores (0 for walkovers and empty scores). If the page
        contains no match boxes, an empty frame with the same columns is
        returned.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within 30 seconds.
    """
    web = r'https://en.wikipedia.org/wiki/{}_FIFA_World_Cup'.format(year)
    response = requests.get(web, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page as if it were
    # the article.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'lxml')
    matches = soup.find_all('div', class_='footballbox')

    home = []
    score = []
    away = []
    total_goals = []

    for match in matches:
        score_text = match.find('th', class_='fscore').get_text()
        home.append(match.find('th', class_='fhome').get_text())
        score.append(score_text)
        away.append(match.find('th', class_='faway').get_text())

        home_goals, away_goals = _parse_goals(score_text)
        total_goals.append(home_goals + away_goals)

    # Build the frame once, after the loop. This is cheaper than rebuilding
    # it per match and guarantees a defined result when no matches are found.
    matches_df = pd.DataFrame(
        {'home': home, 'score': score, 'away': away, 'total_goals': total_goals}
    )
    matches_df['year'] = year
    return matches_df

In [5]:
# Tournament years from 1930 through 2018: a four-year cycle, with 1942 and
# 1946 absent from the list.
years = [y for y in range(1930, 2019, 4) if y not in (1942, 1946)]

# results: historical data -- one DataFrame per tournament, stacked into a
# single table and written to disk.
fifa = list(map(get_matches, years))
fifa_wc_df = pd.concat(fifa, ignore_index=True)
fifa_wc_df.to_csv(r"D:\data\datasets\fifa_worldcup_historical_data.csv", index=False)

# The 2022 tournament is scraped and saved to its own file.
Qatar2022 = get_matches(2022)
Qatar2022.to_csv(r'D:\data\datasets\Qatar2022.csv', index=False)

In [6]:
# Preview the first rows of the 2022 scrape as a quick sanity check.
Qatar2022.head()

Unnamed: 0,home,score,away,total_goals,year
0,Qatar,0–2,Ecuador,2,2022
1,Senegal,0–2,Netherlands,2,2022
2,Qatar,1–3,Senegal,4,2022
3,Netherlands,1–1,Ecuador,2,2022
4,Ecuador,1–2,Senegal,3,2022
