In [20]:
import asyncio
import re
import string
from urllib.parse import quote_plus

import aiohttp
import nest_asyncio
import pandas as pd
import requests
from aiohttp.client import ClientSession
from bs4 import BeautifulSoup
nest_asyncio.apply()

# First scraping: Transfermarkt search for the names in clean_data.csv

In [2]:
# Load the cleaned player table; its `name` column supplies the search terms used below.
df = pd.read_csv("../raw_data/clean_data.csv")

In [3]:
# Transfermarkt quick-search endpoint; the search term is appended after "query=".
url = "https://www.transfermarkt.com/schnellsuche/ergebnis/schnellsuche?query="
# Browser-style User-Agent header for the HTTP requests.
# NOTE(review): main() builds its own headers dict, so this global appears
# unused by the scraping cells visible in this notebook.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
}

In [13]:
# Restrict this batch to the first 20,000 entries of the `name` column.
names = pd.Series(df["name"])[:20000]

In [14]:
%%time
names_url = names.map(lambda x: url + x.lower().replace(" ", "+"))
names_url

CPU times: user 16.8 ms, sys: 4.52 ms, total: 21.3 ms
Wall time: 20.9 ms


0        https://www.transfermarkt.com/schnellsuche/erg...
1        https://www.transfermarkt.com/schnellsuche/erg...
2        https://www.transfermarkt.com/schnellsuche/erg...
3        https://www.transfermarkt.com/schnellsuche/erg...
4        https://www.transfermarkt.com/schnellsuche/erg...
                               ...                        
19995    https://www.transfermarkt.com/schnellsuche/erg...
19996    https://www.transfermarkt.com/schnellsuche/erg...
19997    https://www.transfermarkt.com/schnellsuche/erg...
19998    https://www.transfermarkt.com/schnellsuche/erg...
19999    https://www.transfermarkt.com/schnellsuche/erg...
Name: name, Length: 20000, dtype: object

In [16]:
async def fetch(session, url, headers, semaphore):
    """Download `url` through `session` and return the response body as text.

    The semaphore is held for the duration of the request, so it bounds how
    many downloads run at the same time. The HTTP status is not inspected:
    whatever body the server sends back is returned.
    """
    # Take the concurrency slot first, then open the request; both context
    # managers are exited (in reverse order) once the body has been read.
    async with semaphore, session.get(url, headers=headers) as response:
        body = await response.text()
    return body

def _first_text(soup, selector, default='None'):
    """Return the stripped text of the first element matching `selector`, or `default`."""
    element = soup.select_one(selector)
    return element.text.strip() if element else default


def _first_attr(soup, selector, attribute, default='None'):
    """Return `attribute` of the first element matching `selector`, or `default`.

    `default` is also returned when the element exists but does not carry the
    attribute, instead of raising KeyError.
    """
    element = soup.select_one(selector)
    if not element:
        return default
    return element.get(attribute, default)


async def scrape_page(url, headers, club_selector, image_selector, year_selector, market_value_selector, name_selector, semaphore):
    """Fetch one search-result page and extract the first match for each selector.

    Args:
        url: Address of the page to download.
        headers: HTTP headers sent with the request.
        club_selector: CSS selector of the element whose `alt` attribute is the club.
        image_selector: CSS selector of the element whose `src` attribute is the profile image.
        year_selector: CSS selector of the element whose text is stored as `year`.
        market_value_selector: CSS selector of the element whose text is the market value.
        name_selector: CSS selector of the element whose text is the player name.
        semaphore: Semaphore handed to `fetch` to bound concurrent downloads.

    Returns:
        A dict with the keys 'name', 'club', 'profile_image', 'year' and
        'market_value'. A field whose element (or attribute) is missing holds
        the string 'None'.
    """
    # One session per page, as before; it is closed as soon as the HTML has
    # been downloaded so the connection is not held open during parsing.
    # NOTE(review): sharing a single session across pages would allow
    # connection reuse, but requires the caller to supply it.
    async with aiohttp.ClientSession() as session:
        html = await fetch(session, url, headers, semaphore)

    soup = BeautifulSoup(html, 'html.parser')

    return {
        'name': _first_text(soup, name_selector),
        'club': _first_attr(soup, club_selector, 'alt'),
        'profile_image': _first_attr(soup, image_selector, 'src'),
        'year': _first_text(soup, year_selector),
        'market_value': _first_text(soup, market_value_selector),
    }

async def main(names_url, max_concurrent_requests=3000):
    """Scrape every search URL in `names_url` concurrently and tabulate the results.

    Args:
        names_url: Iterable of search-result URLs (for example a pandas Series).
        max_concurrent_requests: Upper bound on simultaneous downloads. The
            default keeps the value that used to be hard-coded.

    Returns:
        A DataFrame with one row per URL, in input order, with the columns
        name, club, profile_image, year and market_value. A URL whose scrape
        raised an exception is represented by a row of 'None' placeholders.

    Raises:
        ValueError: If `max_concurrent_requests` is smaller than 1 (a zero
            limit would block every request forever).
    """
    if max_concurrent_requests < 1:
        raise ValueError("max_concurrent_requests must be at least 1")

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    }

    # CSS selectors for the club, image, year, market value and name elements
    club_selector = 'a[title^=""] img.tiny_wappen'
    image_selector = 'img.bilderrahmen-fixed'
    year_selector = 'td.zentriert:nth-of-type(4)'
    market_value_selector = 'td.rechts.hauptlink'
    name_selector = 'td.hauptlink a'

    semaphore = asyncio.Semaphore(max_concurrent_requests)

    tasks = [scrape_page(url, headers, club_selector, image_selector, year_selector, market_value_selector, name_selector, semaphore) for url in names_url]

    # return_exceptions=True keeps a single failed page (timeout, connection
    # reset, ...) from raising out of gather and discarding every other result.
    results = await asyncio.gather(*tasks, return_exceptions=True)

    placeholder = dict.fromkeys(('name', 'club', 'profile_image', 'year', 'market_value'), 'None')
    records = []
    failures = 0
    for result in results:
        if isinstance(result, BaseException):
            failures += 1
            records.append(dict(placeholder))
        else:
            records.append(result)

    if failures:
        print(f"{failures} of {len(records)} pages failed and were recorded as 'None' rows")

    # Convert the list of per-page records to a DataFrame
    return pd.DataFrame(records)

if __name__ == "__main__":
    import time

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments during a long scrape.
    start_time = time.perf_counter()

    # asyncio.run is invoked while the notebook's own event loop is running;
    # this relies on nest_asyncio.apply() from the import cell.
    result_df = asyncio.run(main(names_url))

    end_time = time.perf_counter()

    print(f"Total time taken: {end_time - start_time} seconds")
result_df

Total time taken: 283.84322214126587 seconds


Unnamed: 0,name,club,profile_image,year,market_value
0,Kevin De Bruyne,Manchester City,https://img.a.transfermarkt.technology/portrai...,32,€70.00m
1,Harry Kane,Bayern Munich,https://img.a.transfermarkt.technology/portrai...,30,€110.00m
2,Mohamed Salah,Liverpool FC,https://img.a.transfermarkt.technology/portrai...,31,€65.00m
3,Sadio Mané,Al-Nassr FC,https://img.a.transfermarkt.technology/portrai...,31,€25.00m
4,,,,,
...,...,...,...,...,...
19995,Bogdan Jeler,Minaur Baia Mare,https://img.a.transfermarkt.technology/portrai...,20,-
19996,Alexandru Caia,AS Vointa Lupac,https://img.a.transfermarkt.technology/portrai...,20,-
19997,Fabian Østigård Ness,IK Start,https://img.a.transfermarkt.technology/portrai...,24,€150k
19998,,,,,


In [17]:
# Keep the first batch under its own name; `result_df` is reassigned by the second scrape.
part_1 = result_df

In [18]:
# Display the first batch of scraped rows.
part_1

Unnamed: 0,name,club,profile_image,year,market_value
0,Kevin De Bruyne,Manchester City,https://img.a.transfermarkt.technology/portrai...,32,€70.00m
1,Harry Kane,Bayern Munich,https://img.a.transfermarkt.technology/portrai...,30,€110.00m
2,Mohamed Salah,Liverpool FC,https://img.a.transfermarkt.technology/portrai...,31,€65.00m
3,Sadio Mané,Al-Nassr FC,https://img.a.transfermarkt.technology/portrai...,31,€25.00m
4,,,,,
...,...,...,...,...,...
19995,Bogdan Jeler,Minaur Baia Mare,https://img.a.transfermarkt.technology/portrai...,20,-
19996,Alexandru Caia,AS Vointa Lupac,https://img.a.transfermarkt.technology/portrai...,20,-
19997,Fabian Østigård Ness,IK Start,https://img.a.transfermarkt.technology/portrai...,24,€150k
19998,,,,,


In [19]:
# Persist the first batch to disk; index=False leaves the row index out of the file.
part_1.to_csv('../raw_data/names_1.csv', index=False)

# Second scraping: Transfermarkt search for the `kor_names` in korean_players.csv

In [21]:
# Load the Korean players table. NOTE: this rebinds `df`, replacing the frame
# that was loaded for the first scrape.
df = pd.read_csv("../raw_data/korean_players.csv")

In [22]:
# Preview the loaded table.
df

Unnamed: 0,name,club,nat,position,dob,age,height,weight,wage,last trans. fee,...,winger,division_rating,nat_rating,club_rating,either_left,either_right,left,right,current_age,kor_names
0,Son Heung-Min,Tottenham,KOR,"M/AM (RL), ST (C)",1992-07-08,28.0,183.0,77.0,450000.0,24000000.0,...,1,93.8,1540.35,1837.0,1,0,0,0,31,아들 Heung-Min
1,Hwang In-Beom,Rubin,KOR,M (C),1996-09-20,23.0,177.0,70.0,64500.0,2500000.0,...,0,79.8,1540.35,1471.0,0,0,0,1,27,황은 인간
2,Kim Hyun-Woo,Slaven Belupo,KOR,"D (C), DM",1999-03-07,21.0,186.0,80.0,1800.0,0.0,...,0,40.8,1540.35,1314.0,0,0,0,1,24,김현우
3,Kim Do-Hyun,Aluminij,KOR,M (C),1994-04-09,26.0,181.0,77.0,900.0,0.0,...,0,65.0,1540.35,1297.0,0,0,0,1,29,김 도현
4,Lee Sanghyeok,Pardubice,KOR,M (L),2000-01-19,20.0,172.0,63.0,1000.0,0.0,...,1,78.4,1540.35,1156.0,0,0,1,0,23,리 상 게이오크
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1412,Joo Chil-Sung,Ogden City,KOR,M (RL),1999-03-12,21.0,171.0,66.0,9.0,0.0,...,1,40.8,1540.35,1156.0,0,0,0,1,24,Joo Chil-Sung
1413,Song Tae-Jin,Mississippi Brilla,KOR,AM (C),2002-08-05,17.0,172.0,64.0,9.0,0.0,...,0,40.8,1540.35,1156.0,0,0,1,0,21,송 태진
1414,Joseph Brody,Earthquakes Academy,KOR,ST (C),2004-09-07,15.0,177.0,58.0,9.0,0.0,...,0,40.8,1540.35,1156.0,0,0,0,1,19,조셉 브로디
1415,Lee Se-Min,FC Delco,KOR,ST (C),2003-01-29,17.0,189.0,82.0,9.0,0.0,...,0,45.9,1540.35,1156.0,0,0,0,1,20,이 세민


In [23]:
# Transfermarkt quick-search endpoint; the search term is appended after "query=".
url = "https://www.transfermarkt.com/schnellsuche/ergebnis/schnellsuche?query="
# Browser-style User-Agent header for the HTTP requests.
# NOTE(review): main() builds its own headers dict, so this global appears
# unused by the scraping cells visible in this notebook.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
}

In [24]:
# Search terms for this pass come from the `kor_names` column.
names = pd.Series(df["kor_names"])

In [25]:
%%time
names_url = names.map(lambda x: url + x.lower().replace(" ", "+"))
names_url

CPU times: user 1.72 ms, sys: 3.01 ms, total: 4.73 ms
Wall time: 5.51 ms


0       https://www.transfermarkt.com/schnellsuche/erg...
1       https://www.transfermarkt.com/schnellsuche/erg...
2       https://www.transfermarkt.com/schnellsuche/erg...
3       https://www.transfermarkt.com/schnellsuche/erg...
4       https://www.transfermarkt.com/schnellsuche/erg...
                              ...                        
1412    https://www.transfermarkt.com/schnellsuche/erg...
1413    https://www.transfermarkt.com/schnellsuche/erg...
1414    https://www.transfermarkt.com/schnellsuche/erg...
1415    https://www.transfermarkt.com/schnellsuche/erg...
1416    https://www.transfermarkt.com/schnellsuche/erg...
Name: kor_names, Length: 1417, dtype: object

In [26]:
async def fetch(session, url, headers, semaphore):
    """Download `url` through `session` and return the response body as text.

    The semaphore is held for the duration of the request, so it bounds how
    many downloads run at the same time. The HTTP status is not inspected:
    whatever body the server sends back is returned.
    """
    # Take the concurrency slot first, then open the request; both context
    # managers are exited (in reverse order) once the body has been read.
    async with semaphore, session.get(url, headers=headers) as response:
        body = await response.text()
    return body

def _first_text(soup, selector, default='None'):
    """Return the stripped text of the first element matching `selector`, or `default`."""
    element = soup.select_one(selector)
    return element.text.strip() if element else default


def _first_attr(soup, selector, attribute, default='None'):
    """Return `attribute` of the first element matching `selector`, or `default`.

    `default` is also returned when the element exists but does not carry the
    attribute, instead of raising KeyError.
    """
    element = soup.select_one(selector)
    if not element:
        return default
    return element.get(attribute, default)


async def scrape_page(url, headers, club_selector, image_selector, year_selector, market_value_selector, name_selector, semaphore):
    """Fetch one search-result page and extract the first match for each selector.

    Args:
        url: Address of the page to download.
        headers: HTTP headers sent with the request.
        club_selector: CSS selector of the element whose `alt` attribute is the club.
        image_selector: CSS selector of the element whose `src` attribute is the profile image.
        year_selector: CSS selector of the element whose text is stored as `year`.
        market_value_selector: CSS selector of the element whose text is the market value.
        name_selector: CSS selector of the element whose text is the player name.
        semaphore: Semaphore handed to `fetch` to bound concurrent downloads.

    Returns:
        A dict with the keys 'name', 'club', 'profile_image', 'year' and
        'market_value'. A field whose element (or attribute) is missing holds
        the string 'None'.
    """
    # One session per page, as before; it is closed as soon as the HTML has
    # been downloaded so the connection is not held open during parsing.
    # NOTE(review): sharing a single session across pages would allow
    # connection reuse, but requires the caller to supply it.
    async with aiohttp.ClientSession() as session:
        html = await fetch(session, url, headers, semaphore)

    soup = BeautifulSoup(html, 'html.parser')

    return {
        'name': _first_text(soup, name_selector),
        'club': _first_attr(soup, club_selector, 'alt'),
        'profile_image': _first_attr(soup, image_selector, 'src'),
        'year': _first_text(soup, year_selector),
        'market_value': _first_text(soup, market_value_selector),
    }

async def main(names_url, max_concurrent_requests=3000):
    """Scrape every search URL in `names_url` concurrently and tabulate the results.

    Args:
        names_url: Iterable of search-result URLs (for example a pandas Series).
        max_concurrent_requests: Upper bound on simultaneous downloads. The
            default keeps the value that used to be hard-coded.

    Returns:
        A DataFrame with one row per URL, in input order, with the columns
        name, club, profile_image, year and market_value. A URL whose scrape
        raised an exception is represented by a row of 'None' placeholders.

    Raises:
        ValueError: If `max_concurrent_requests` is smaller than 1 (a zero
            limit would block every request forever).
    """
    if max_concurrent_requests < 1:
        raise ValueError("max_concurrent_requests must be at least 1")

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    }

    # CSS selectors for the club, image, year, market value and name elements
    club_selector = 'a[title^=""] img.tiny_wappen'
    image_selector = 'img.bilderrahmen-fixed'
    year_selector = 'td.zentriert:nth-of-type(4)'
    market_value_selector = 'td.rechts.hauptlink'
    name_selector = 'td.hauptlink a'

    semaphore = asyncio.Semaphore(max_concurrent_requests)

    tasks = [scrape_page(url, headers, club_selector, image_selector, year_selector, market_value_selector, name_selector, semaphore) for url in names_url]

    # return_exceptions=True keeps a single failed page (timeout, connection
    # reset, ...) from raising out of gather and discarding every other result.
    results = await asyncio.gather(*tasks, return_exceptions=True)

    placeholder = dict.fromkeys(('name', 'club', 'profile_image', 'year', 'market_value'), 'None')
    records = []
    failures = 0
    for result in results:
        if isinstance(result, BaseException):
            failures += 1
            records.append(dict(placeholder))
        else:
            records.append(result)

    if failures:
        print(f"{failures} of {len(records)} pages failed and were recorded as 'None' rows")

    # Convert the list of per-page records to a DataFrame
    return pd.DataFrame(records)

if __name__ == "__main__":
    import time

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments during a long scrape.
    start_time = time.perf_counter()

    # asyncio.run is invoked while the notebook's own event loop is running;
    # this relies on nest_asyncio.apply() from the import cell.
    result_df = asyncio.run(main(names_url))

    end_time = time.perf_counter()

    print(f"Total time taken: {end_time - start_time} seconds")
result_df

Total time taken: 77.23081684112549 seconds


Unnamed: 0,name,club,profile_image,year,market_value
0,,,,,
1,,,,,
2,Hyun-woo Kim,Daejeon Hana Citizen,https://img.a.transfermarkt.technology/portrai...,24,€300k
3,,,,,
4,,,,,
...,...,...,...,...,...
1412,,,,,
1413,,,,,
1414,,,,,
1415,,,,,


In [27]:
# Rows for which the scraper stored the 'None' placeholder as the player name.
result_df.loc[result_df["name"] == 'None']

Unnamed: 0,name,club,profile_image,year,market_value
0,,,,,
1,,,,,
3,,,,,
4,,,,,
5,,,,,
...,...,...,...,...,...
1412,,,,,
1413,,,,,
1414,,,,,
1415,,,,,
