In [1]:
import asyncio
import re
import string
from urllib.parse import quote_plus

import aiohttp
import nest_asyncio
import pandas as pd
import requests
from aiohttp.client import ClientSession
from bs4 import BeautifulSoup
nest_asyncio.apply()

In [2]:
# Load the cleaned dataset; later cells read its `club` column to build the
# list of clubs to look up.
df = pd.read_csv("../raw_data/clean_data.csv")

In [3]:
# Base URL of the Transfermarkt quick-search endpoint; the search term is
# appended directly after `query=`.
url = "https://www.transfermarkt.com/schnellsuche/ergebnis/schnellsuche?query="
# Request headers carrying a desktop-browser User-Agent string.
# NOTE(review): presumably needed because the site rejects the default client
# User-Agent -- confirm.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
}

In [4]:
async def fetch(session, url, headers, semaphore):
    """Download ``url`` and return the response body as text.

    The semaphore is acquired before the request is issued and released once
    the body has been read, so it bounds the number of requests in flight.

    Parameters
    ----------
    session : object exposing ``get(url, headers=...)`` as an async context manager
        Typically an ``aiohttp.ClientSession``.
    url : str
        Address to request.
    headers : dict
        HTTP headers sent with the request.
    semaphore : asyncio.Semaphore
        Concurrency limiter shared between callers.

    Returns
    -------
    str
        The decoded response body.
    """
    async with semaphore, session.get(url, headers=headers) as reply:
        body = await reply.text()
    return body

In [5]:
# Distinct club names from the dataset, in order of first appearance, wrapped
# in a Series so they get a fresh 0..n-1 index.
clubs = pd.Series(df["club"].unique())

In [6]:
# Build one quick-search URL per club. `quote_plus` turns spaces into "+"
# (as before) and additionally percent-encodes characters that would otherwise
# corrupt the query string, such as "&", "#", "+" or "%", as well as
# non-ASCII letters.
clubs_url = clubs.map(lambda x: url + quote_plus(x.lower()))
clubs_url

0       https://www.transfermarkt.com/schnellsuche/erg...
1       https://www.transfermarkt.com/schnellsuche/erg...
2       https://www.transfermarkt.com/schnellsuche/erg...
3       https://www.transfermarkt.com/schnellsuche/erg...
4       https://www.transfermarkt.com/schnellsuche/erg...
                              ...                        
3735    https://www.transfermarkt.com/schnellsuche/erg...
3736    https://www.transfermarkt.com/schnellsuche/erg...
3737    https://www.transfermarkt.com/schnellsuche/erg...
3738    https://www.transfermarkt.com/schnellsuche/erg...
3739    https://www.transfermarkt.com/schnellsuche/erg...
Length: 3740, dtype: object

In [7]:
async def scrape_page(url, headers, club_logo_selector, club_name_selector, semaphore, session=None):
    """Scrape one Transfermarkt quick-search page for its first club hit.

    Parameters
    ----------
    url : str
        Full search URL to request.
    headers : dict
        HTTP headers sent with the request.
    club_logo_selector : str
        CSS selector for the element whose ``src`` attribute is the logo URL.
    club_name_selector : str
        CSS selector for the element whose ``alt`` attribute is the club name.
    semaphore : asyncio.Semaphore
        Passed to ``fetch`` to cap the number of requests in flight.
    session : aiohttp.ClientSession, optional
        Session to reuse. When omitted, a temporary session is opened and
        closed for this single call, which is what the function always did
        before this parameter existed.

    Returns
    -------
    dict
        ``{'club_name': ..., 'club_logo': ...}``. A field that cannot be found
        is the string ``'None'`` (not the ``None`` object); a later cell
        filters on that literal.

    Raises
    ------
    Whatever ``fetch`` raises (for example ``aiohttp.ClientError``); errors are
    not handled here so the caller decides what a failed request means.
    """
    if session is None:
        # Backward-compatible path for callers that do not supply a session.
        async with aiohttp.ClientSession() as own_session:
            return await scrape_page(
                url, headers, club_logo_selector, club_name_selector,
                semaphore, session=own_session,
            )

    html = await fetch(session, url, headers, semaphore)
    soup = BeautifulSoup(html, 'html.parser')

    # Logo: first element matching the selector. `.get` avoids a KeyError when
    # the element exists but carries no `src` attribute.
    logo_element = soup.select_one(club_logo_selector)
    club_logo = logo_element.get('src', 'None') if logo_element is not None else 'None'

    # Name: taken from the `alt` text of the first matching element.
    name_element = soup.select_one(club_name_selector)
    if name_element is not None and 'alt' in name_element.attrs:
        club_name = name_element['alt'].strip()
    else:
        club_name = 'None'

    return {'club_name': club_name, 'club_logo': club_logo}

async def main(names_url, max_concurrent_requests=20):
    """Scrape every URL in ``names_url`` concurrently and tabulate the results.

    Parameters
    ----------
    names_url : iterable of str
        Search URLs, one per club. Row order of the result follows this order.
    max_concurrent_requests : int, default 20
        Upper bound on simultaneous requests. The previous hard-coded value of
        4000 was larger than the number of URLs, so the semaphore never
        throttled anything.
        NOTE(review): 20 is a conservative guess, not a documented site limit
        -- tune as needed.

    Returns
    -------
    pandas.DataFrame
        Columns ``club_name`` and ``club_logo``, one row per input URL. Rows
        whose request failed with a network error or timeout hold the string
        ``'None'`` in both columns, the same marker used for "no hit found".
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    }

    # Both fields come from the same <img>: `src` is the logo, `alt` the name.
    club_logo_selector = 'td.zentriert.suche-vereinswappen img'
    club_name_selector = 'td.zentriert.suche-vereinswappen img'

    semaphore = asyncio.Semaphore(max_concurrent_requests)

    # One shared session for the whole batch so connections can be pooled and
    # reused instead of opening and tearing down a session per URL.
    async with aiohttp.ClientSession() as session:
        tasks = [
            scrape_page(url, headers, club_logo_selector, club_name_selector, semaphore, session=session)
            for url in names_url
        ]
        # return_exceptions=True keeps one failed request from discarding the
        # results of every other request.
        outcomes = await asyncio.gather(*tasks, return_exceptions=True)

    rows = []
    failed = 0
    for outcome in outcomes:
        if isinstance(outcome, (aiohttp.ClientError, asyncio.TimeoutError)):
            # Network-level failure: keep the row so positions still line up
            # with the input, but mark it as missing.
            failed += 1
            rows.append({'club_name': 'None', 'club_logo': 'None'})
        elif isinstance(outcome, BaseException):
            # Anything else is unexpected (e.g. a programming error): surface it.
            raise outcome
        else:
            rows.append(outcome)

    if failed:
        print(f"{failed} of {len(rows)} requests failed and were recorded as 'None'")

    # Explicit columns so an empty input still yields the expected schema.
    return pd.DataFrame(rows, columns=['club_name', 'club_logo'])

if __name__ == "__main__":
    import time

    # The URLs to scrape: one quick-search link per club.
    names_url = clubs_url

    # Time the whole scrape using wall-clock timestamps.
    start_time = time.time()
    result_df = asyncio.run(main(names_url))
    end_time = time.time()

    print(f"Total time taken: {end_time - start_time} seconds")

# Show the scraped table as the cell's output.
result_df

Total time taken: 126.5225601196289 seconds


Unnamed: 0,club_name,club_logo
0,Manchester City,https://tmssl.akamaized.net/images/wappen/smal...
1,Tottenham Hotspur,https://tmssl.akamaized.net/images/wappen/smal...
2,Liverpool FC,https://tmssl.akamaized.net/images/wappen/smal...
3,Manchester United,https://tmssl.akamaized.net/images/wappen/smal...
4,Aston Villa,https://tmssl.akamaized.net/images/wappen/smal...
...,...,...
3735,Daytona Rush SC,https://tmssl.akamaized.net/images/wappen/smal...
3736,,
3737,OVF Alliance,https://tmssl.akamaized.net/images/wappen/smal...
3738,Austin FC Academy,https://tmssl.akamaized.net/images/wappen/smal...


In [8]:
# Attach the club name each search URL was built from. The assignment aligns
# on index: `clubs` and `result_df` both carry a default 0..n-1 index, and
# `asyncio.gather` returns results in input order, so row i of the scrape is
# paired with club i.
result_df["fm_name"] = clubs

In [9]:
# `clubs_df` is a second name for the same DataFrame object as `result_df`
# (no copy is made), so changes through either name affect both.
clubs_df = result_df
clubs_df

Unnamed: 0,club_name,club_logo,fm_name
0,Manchester City,https://tmssl.akamaized.net/images/wappen/smal...,Man City
1,Tottenham Hotspur,https://tmssl.akamaized.net/images/wappen/smal...,Tottenham
2,Liverpool FC,https://tmssl.akamaized.net/images/wappen/smal...,Liverpool
3,Manchester United,https://tmssl.akamaized.net/images/wappen/smal...,Man Utd
4,Aston Villa,https://tmssl.akamaized.net/images/wappen/smal...,Aston Villa
...,...,...,...
3735,Daytona Rush SC,https://tmssl.akamaized.net/images/wappen/smal...,Daytona Rush
3736,,,Eastside FC
3737,OVF Alliance,https://tmssl.akamaized.net/images/wappen/smal...,OVF Alliance
3738,Austin FC Academy,https://tmssl.akamaized.net/images/wappen/smal...,Austin FC Academy


In [10]:
# Spot-check one known club. `.iloc[0]` takes the first matching row by
# position; the previous `[0]` was a label lookup that only worked because
# the matching row happens to have index label 0.
clubs_df.loc[clubs_df.fm_name == 'Man City', 'club_logo'].iloc[0]

'https://tmssl.akamaized.net/images/wappen/small/281.png?lm=1467356331'

In [11]:
# Rows where the scrape found no club: `club_name` holds the literal
# string 'None' in that case.
clubs_df[clubs_df["club_name"] == 'None']

Unnamed: 0,club_name,club_logo,fm_name
48,,,FC København
81,,,Krylja Sovetov
85,,,Dinamo Moscow
98,,,Bld. Erzurumspor
107,,,Hapoel Be'er-Sheva
...,...,...,...
3717,,,Whitecaps Academy
3726,,,Juventus Sports Club
3728,,,Minn. Utd Academy
3730,,,Inter Miami Academy


In [12]:
# Persist the club/logo table as CSV; index=False leaves the row index out of
# the file.
clubs_df.to_csv('../raw_data/clubs_logo.csv', index=False)