In [1]:
import asyncio
import logging
import re
import string
import time
from urllib.parse import quote_plus

import aiohttp
import nest_asyncio
import pandas as pd
import requests
from aiohttp.client import ClientSession
from bs4 import BeautifulSoup
nest_asyncio.apply()

# First scraping pass: player names from `clean_data.csv`

In [2]:
# Load the source table for the first scraping pass; its `name` column is used below.
df = pd.read_csv("../raw_data/clean_data.csv")

In [3]:
# Base URL of the Transfermarkt quick search; the search term is appended after "query=".
url = "https://www.transfermarkt.com/schnellsuche/ergebnis/schnellsuche?query="
# Browser-like User-Agent header.
# NOTE(review): main() builds its own identical headers dict, so this
# module-level one looks unused in the visible cells -- confirm before removing.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
}

In [13]:
# Restrict the first pass to the first 20000 entries of the `name` column.
names = pd.Series(df["name"]).iloc[:20000]

In [14]:
%%time
names_url = names.map(lambda x: url + x.lower().replace(" ", "+"))
names_url

CPU times: user 16.8 ms, sys: 4.52 ms, total: 21.3 ms
Wall time: 20.9 ms


0        https://www.transfermarkt.com/schnellsuche/erg...
1        https://www.transfermarkt.com/schnellsuche/erg...
2        https://www.transfermarkt.com/schnellsuche/erg...
3        https://www.transfermarkt.com/schnellsuche/erg...
4        https://www.transfermarkt.com/schnellsuche/erg...
                               ...                        
19995    https://www.transfermarkt.com/schnellsuche/erg...
19996    https://www.transfermarkt.com/schnellsuche/erg...
19997    https://www.transfermarkt.com/schnellsuche/erg...
19998    https://www.transfermarkt.com/schnellsuche/erg...
19999    https://www.transfermarkt.com/schnellsuche/erg...
Name: name, Length: 20000, dtype: object

In [16]:
async def fetch(session, url, headers, semaphore):
    """Return the body of ``url`` as text.

    A slot of ``semaphore`` is acquired first, then the page is requested via
    ``session.get(url, headers=headers)`` and its body is read while both the
    slot and the response are still held.
    """
    async with semaphore, session.get(url, headers=headers) as response:
        body = await response.text()
    return body

async def scrape_page(url, headers, club_selector, image_selector, year_selector, market_value_selector, name_selector, semaphore):
    """Download one page and extract the first match of each CSS selector.

    Parameters
    ----------
    url : str
        Page to download.
    headers : dict
        HTTP headers sent with the request.
    club_selector, image_selector : str
        CSS selectors; the ``alt`` / ``src`` attribute of the first matching
        element is returned.
    year_selector, market_value_selector, name_selector : str
        CSS selectors; the stripped text of the first matching element is
        returned.
    semaphore : asyncio.Semaphore
        Forwarded to ``fetch`` to bound the number of concurrent requests.

    Returns
    -------
    dict
        Keys ``name``, ``club``, ``profile_image``, ``year``, ``market_value``.
        A field holds the string ``'None'`` when its selector matches nothing
        or the wanted attribute is absent. All fields are ``'None'`` when the
        request itself fails, so one bad page no longer aborts the whole
        ``asyncio.gather`` batch.
    """
    missing = 'None'  # placeholder string for every field that cannot be filled

    def _first_attr(soup, selector, attribute):
        # Attribute of the first element matching ``selector``, else the placeholder.
        element = soup.select_one(selector)
        if element is None:
            return missing
        # .get avoids a KeyError when the element exists but lacks the attribute.
        return element.get(attribute, missing)

    def _first_text(soup, selector):
        # Stripped text of the first element matching ``selector``, else the placeholder.
        element = soup.select_one(selector)
        return missing if element is None else element.text.strip()

    # NOTE(review): a new ClientSession is opened for every page; sharing one
    # session across pages would need the caller to supply it.
    try:
        async with aiohttp.ClientSession() as session:
            html = await fetch(session, url, headers, semaphore)
    except (aiohttp.ClientError, asyncio.TimeoutError) as exc:
        # Report the failure and return a placeholder row instead of raising.
        logging.getLogger(__name__).warning("Request failed for %s: %r", url, exc)
        return dict.fromkeys(('name', 'club', 'profile_image', 'year', 'market_value'), missing)

    soup = BeautifulSoup(html, 'html.parser')
    return {
        'name': _first_text(soup, name_selector),
        'club': _first_attr(soup, club_selector, 'alt'),
        'profile_image': _first_attr(soup, image_selector, 'src'),
        'year': _first_text(soup, year_selector),
        'market_value': _first_text(soup, market_value_selector),
    }

async def main(names_url, max_concurrent_requests=3000):
    """Scrape every URL in ``names_url`` concurrently and tabulate the results.

    Parameters
    ----------
    names_url : iterable of str
        Search-result URLs, one per player.
    max_concurrent_requests : int, optional
        Upper bound on requests in flight at once (size of the semaphore
        handed to ``scrape_page``). Defaults to 3000, the value that used to
        be hard-coded here; pass a smaller number to be gentler on the server.

    Returns
    -------
    pandas.DataFrame
        One row per URL, in input order, with columns ``name``, ``club``,
        ``profile_image``, ``year`` and ``market_value``. The columns are
        present even when ``names_url`` is empty.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    }

    # CSS selectors for the fields taken from each search-result page.
    # NOTE(review): `[title^=""]` uses an empty prefix -- confirm it matches as intended.
    club_selector = 'a[title^=""] img.tiny_wappen'
    image_selector = 'img.bilderrahmen-fixed'
    year_selector = 'td.zentriert:nth-of-type(4)'  # TODO (from original): confirm this is the right selector for the year
    market_value_selector = 'td.rechts.hauptlink'  # TODO (from original): confirm this is the right selector for market value
    name_selector = 'td.hauptlink a'

    semaphore = asyncio.Semaphore(max_concurrent_requests)

    # One coroutine per URL; gather returns the results in the same order.
    tasks = [
        scrape_page(page_url, headers, club_selector, image_selector, year_selector,
                    market_value_selector, name_selector, semaphore)
        for page_url in names_url
    ]
    results = await asyncio.gather(*tasks)

    # Explicit columns keep the schema stable when there are no results at all.
    columns = ['name', 'club', 'profile_image', 'year', 'market_value']
    return pd.DataFrame(results, columns=columns)

if __name__ == "__main__":
    # Run the whole scrape and time it with a monotonic clock, which is not
    # affected by system clock adjustments. asyncio.run works inside the
    # notebook because nest_asyncio.apply() was called in the import cell.
    start_time = time.perf_counter()

    result_df = asyncio.run(main(names_url))

    end_time = time.perf_counter()

    print(f"Total time taken: {end_time - start_time} seconds")
result_df

Total time taken: 283.84322214126587 seconds


Unnamed: 0,name,club,profile_image,year,market_value
0,Kevin De Bruyne,Manchester City,https://img.a.transfermarkt.technology/portrai...,32,€70.00m
1,Harry Kane,Bayern Munich,https://img.a.transfermarkt.technology/portrai...,30,€110.00m
2,Mohamed Salah,Liverpool FC,https://img.a.transfermarkt.technology/portrai...,31,€65.00m
3,Sadio Mané,Al-Nassr FC,https://img.a.transfermarkt.technology/portrai...,31,€25.00m
4,,,,,
...,...,...,...,...,...
19995,Bogdan Jeler,Minaur Baia Mare,https://img.a.transfermarkt.technology/portrai...,20,-
19996,Alexandru Caia,AS Vointa Lupac,https://img.a.transfermarkt.technology/portrai...,20,-
19997,Fabian Østigård Ness,IK Start,https://img.a.transfermarkt.technology/portrai...,24,€150k
19998,,,,,


In [17]:
# Second name for the first-pass results (plain assignment: same object, not a copy).
part_1 = result_df

In [18]:
# Display the first-pass results.
part_1

Unnamed: 0,name,club,profile_image,year,market_value
0,Kevin De Bruyne,Manchester City,https://img.a.transfermarkt.technology/portrai...,32,€70.00m
1,Harry Kane,Bayern Munich,https://img.a.transfermarkt.technology/portrai...,30,€110.00m
2,Mohamed Salah,Liverpool FC,https://img.a.transfermarkt.technology/portrai...,31,€65.00m
3,Sadio Mané,Al-Nassr FC,https://img.a.transfermarkt.technology/portrai...,31,€25.00m
4,,,,,
...,...,...,...,...,...
19995,Bogdan Jeler,Minaur Baia Mare,https://img.a.transfermarkt.technology/portrai...,20,-
19996,Alexandru Caia,AS Vointa Lupac,https://img.a.transfermarkt.technology/portrai...,20,-
19997,Fabian Østigård Ness,IK Start,https://img.a.transfermarkt.technology/portrai...,24,€150k
19998,,,,,


In [19]:
# Write the first-pass results to CSV, leaving out the index column.
part_1.to_csv('../raw_data/names_1.csv', index=False)

# Second scraping pass: `RUS_names` from `russian_players.csv`

In [2]:
# Load the table for the second scraping pass; its `RUS_names` column is used below.
# Note: this rebinds `df`, so the first-pass table is no longer reachable under that name.
df = pd.read_csv("../raw_data/russian_players.csv")

In [3]:
# Display the loaded table.
df

Unnamed: 0,name_x,club_x,profile_image,year,market_value,name_y,club_y,nat,position,dob,...,striker,winger,division_rating,nat_rating,club_rating,either_left,either_right,left,right,RUS_names
0,,,,,,Georgy Djikia,Spartak Moscow,RUS,D (C),21/11/1993 (26 years old),...,0,0,79.8,1497.63,1530.0,0,0,1,0,Georgiy Djikia
1,Oleg Shatov,,https://img.a.transfermarkt.technology/portrai...,33.0,-,Oleg Shatov,Rubin,RUS,"M (C), AM (RLC)",29/7/1990 (29 years old),...,0,0,79.8,1497.63,1471.0,0,0,0,1,Oleg Shatov
2,,,,,,Alexandr Yerokhin,Zenit,RUS,"M (RC), AM (LC)",13/10/1989 (30 years old),...,0,0,79.8,1497.63,1685.0,0,0,0,1,Aleksandr Yerokhin
3,1P Agency,,,,,Dmitry Barinov,Lokomotiv Moscow,RUS,"DM, M (C)",11/9/1996 (23 years old),...,0,0,79.8,1497.63,1544.0,0,0,0,1,Dmitriy Barinov
4,,,,,,Alexey Sutormin,Zenit,RUS,"WB (R), M (RL), AM (RLC)",10/1/1994 (26 years old),...,0,0,79.8,1497.63,1685.0,0,0,0,1,Aleksey Sutormin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275,,,,,,Alexey Miranchuk,Atalanta,RUS,M/AM (RC),17/10/1995 (24 years old),...,0,0,89.5,1497.63,1726.0,0,0,1,0,Aleksey Miranchuk
276,,,,,,Alexandr Kokorin,Fiorentina,RUS,"AM (RC), ST (C)",19/3/1991 (29 years old),...,1,0,89.5,1497.63,1630.0,0,0,0,1,Aleksandr Kokorin
277,,,,,,Alexandr Zhirov,SVS 1916,RUS,D (C),24/1/1991 (29 years old),...,0,0,77.1,1497.63,1156.0,0,0,1,0,Aleksandr Zhirov
278,,,,,,Aleksei Wsewolodsky,Alvarado,RUS,AM (R),19/3/2004 (16 years old),...,0,1,71.4,1497.63,1156.0,0,0,0,1,Aleksei Wsewolodsky


In [4]:
# Base URL of the Transfermarkt quick search; the search term is appended after "query=".
url = "https://www.transfermarkt.com/schnellsuche/ergebnis/schnellsuche?query="
# Browser-like User-Agent header.
# NOTE(review): main() builds its own identical headers dict, so this
# module-level one looks unused in the visible cells -- confirm before removing.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
}

In [5]:
# Search terms for the second pass: the `RUS_names` column.
names = pd.Series(df["RUS_names"])

In [6]:
%%time
names_url = names.map(lambda x: url + x.lower().replace(" ", "+"))
names_url

CPU times: user 493 µs, sys: 50 µs, total: 543 µs
Wall time: 535 µs


0      https://www.transfermarkt.com/schnellsuche/erg...
1      https://www.transfermarkt.com/schnellsuche/erg...
2      https://www.transfermarkt.com/schnellsuche/erg...
3      https://www.transfermarkt.com/schnellsuche/erg...
4      https://www.transfermarkt.com/schnellsuche/erg...
                             ...                        
275    https://www.transfermarkt.com/schnellsuche/erg...
276    https://www.transfermarkt.com/schnellsuche/erg...
277    https://www.transfermarkt.com/schnellsuche/erg...
278    https://www.transfermarkt.com/schnellsuche/erg...
279    https://www.transfermarkt.com/schnellsuche/erg...
Name: RUS_names, Length: 280, dtype: object

In [7]:
async def fetch(session, url, headers, semaphore):
    """Return the body of ``url`` as text.

    A slot of ``semaphore`` is acquired first, then the page is requested via
    ``session.get(url, headers=headers)`` and its body is read while both the
    slot and the response are still held.
    """
    async with semaphore, session.get(url, headers=headers) as response:
        body = await response.text()
    return body

async def scrape_page(url, headers, club_selector, image_selector, year_selector, market_value_selector, name_selector, semaphore):
    """Download one page and extract the first match of each CSS selector.

    Parameters
    ----------
    url : str
        Page to download.
    headers : dict
        HTTP headers sent with the request.
    club_selector, image_selector : str
        CSS selectors; the ``alt`` / ``src`` attribute of the first matching
        element is returned.
    year_selector, market_value_selector, name_selector : str
        CSS selectors; the stripped text of the first matching element is
        returned.
    semaphore : asyncio.Semaphore
        Forwarded to ``fetch`` to bound the number of concurrent requests.

    Returns
    -------
    dict
        Keys ``name``, ``club``, ``profile_image``, ``year``, ``market_value``.
        A field holds the string ``'None'`` when its selector matches nothing
        or the wanted attribute is absent. All fields are ``'None'`` when the
        request itself fails, so one bad page no longer aborts the whole
        ``asyncio.gather`` batch.
    """
    missing = 'None'  # placeholder string for every field that cannot be filled

    def _first_attr(soup, selector, attribute):
        # Attribute of the first element matching ``selector``, else the placeholder.
        element = soup.select_one(selector)
        if element is None:
            return missing
        # .get avoids a KeyError when the element exists but lacks the attribute.
        return element.get(attribute, missing)

    def _first_text(soup, selector):
        # Stripped text of the first element matching ``selector``, else the placeholder.
        element = soup.select_one(selector)
        return missing if element is None else element.text.strip()

    # NOTE(review): a new ClientSession is opened for every page; sharing one
    # session across pages would need the caller to supply it.
    try:
        async with aiohttp.ClientSession() as session:
            html = await fetch(session, url, headers, semaphore)
    except (aiohttp.ClientError, asyncio.TimeoutError) as exc:
        # Report the failure and return a placeholder row instead of raising.
        logging.getLogger(__name__).warning("Request failed for %s: %r", url, exc)
        return dict.fromkeys(('name', 'club', 'profile_image', 'year', 'market_value'), missing)

    soup = BeautifulSoup(html, 'html.parser')
    return {
        'name': _first_text(soup, name_selector),
        'club': _first_attr(soup, club_selector, 'alt'),
        'profile_image': _first_attr(soup, image_selector, 'src'),
        'year': _first_text(soup, year_selector),
        'market_value': _first_text(soup, market_value_selector),
    }

async def main(names_url, max_concurrent_requests=3000):
    """Scrape every URL in ``names_url`` concurrently and tabulate the results.

    Parameters
    ----------
    names_url : iterable of str
        Search-result URLs, one per player.
    max_concurrent_requests : int, optional
        Upper bound on requests in flight at once (size of the semaphore
        handed to ``scrape_page``). Defaults to 3000, the value that used to
        be hard-coded here; pass a smaller number to be gentler on the server.

    Returns
    -------
    pandas.DataFrame
        One row per URL, in input order, with columns ``name``, ``club``,
        ``profile_image``, ``year`` and ``market_value``. The columns are
        present even when ``names_url`` is empty.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    }

    # CSS selectors for the fields taken from each search-result page.
    # NOTE(review): `[title^=""]` uses an empty prefix -- confirm it matches as intended.
    club_selector = 'a[title^=""] img.tiny_wappen'
    image_selector = 'img.bilderrahmen-fixed'
    year_selector = 'td.zentriert:nth-of-type(4)'  # TODO (from original): confirm this is the right selector for the year
    market_value_selector = 'td.rechts.hauptlink'  # TODO (from original): confirm this is the right selector for market value
    name_selector = 'td.hauptlink a'

    semaphore = asyncio.Semaphore(max_concurrent_requests)

    # One coroutine per URL; gather returns the results in the same order.
    tasks = [
        scrape_page(page_url, headers, club_selector, image_selector, year_selector,
                    market_value_selector, name_selector, semaphore)
        for page_url in names_url
    ]
    results = await asyncio.gather(*tasks)

    # Explicit columns keep the schema stable when there are no results at all.
    columns = ['name', 'club', 'profile_image', 'year', 'market_value']
    return pd.DataFrame(results, columns=columns)

if __name__ == "__main__":
    # Run the whole scrape and time it with a monotonic clock, which is not
    # affected by system clock adjustments. asyncio.run works inside the
    # notebook because nest_asyncio.apply() was called in the import cell.
    start_time = time.perf_counter()

    result_df = asyncio.run(main(names_url))

    end_time = time.perf_counter()

    print(f"Total time taken: {end_time - start_time} seconds")
result_df

Total time taken: 4.088274955749512 seconds


Unnamed: 0,name,club,profile_image,year,market_value
0,,,,,
1,Oleg Shatov,,https://img.a.transfermarkt.technology/portrai...,33,-
2,,,,,
3,Dmitriy Barinov,Lokomotiv Moscow,https://img.a.transfermarkt.technology/portrai...,27,€8.00m
4,Aleksey Sutormin,Zenit St. Petersburg,https://img.a.transfermarkt.technology/portrai...,29,€1.80m
...,...,...,...,...,...
275,Aleksey Miranchuk,Atalanta BC,https://img.a.transfermarkt.technology/portrai...,28,€9.00m
276,Aleksandr Kokorin,Aris Limassol,https://img.a.transfermarkt.technology/portrai...,32,€1.00m
277,Aleksandr Zhirov,Baltika Kaliningrad,https://img.a.transfermarkt.technology/portrai...,32,€600k
278,,,,,


In [8]:
# Rows for which the scraper stored the 'None' placeholder as the name.
result_df.loc[result_df["name"] == 'None']

Unnamed: 0,name,club,profile_image,year,market_value
0,,,,,
2,,,,,
9,,,,,
13,,,,,
20,,,,,
...,...,...,...,...,...
254,,,,,
259,,,,,
271,,,,,
273,,,,,
