In [9]:
#!pip install bs4

In [2]:
import pandas as pd
import asyncio
import nest_asyncio
nest_asyncio.apply()
import aiohttp
from bs4 import BeautifulSoup
from tqdm import tqdm

In [3]:

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 YaBrowser/25.6.0.0 Safari/537.36'
}

page_min = 2859
page_max = 5917  # протестируй сначала на маленьком диапазоне!
my_range = range(page_min, page_max + 1)

final_dict = {
    'title': [],
    'out_year': [],
    'genres': [],
    'country': [],
    'duration': [],
    'description': [],
    'url_movie': [],
    'image_movie': [],
    'rating_votes': [],
    'film_rating_kp': [],
    'film_rating_imdb': []
}

sem = asyncio.Semaphore(20)  # ограничим число одновременных запросов


async def fetch(session, url):
    async with sem:
        async with session.get(url, headers=headers) as response:
            return await response.text()


async def parse_page(session, page):
    url = f'https://lux.kinogo.biz/page/{page}/'
    try:
        html = await fetch(session, url)
        soup = BeautifulSoup(html, 'html.parser')
        blocks = soup.find_all("div", class_="shortstory")

        for block in blocks:
            # Название
            h2 = block.find("h2")
            final_dict['title'].append(h2.get_text(strip=True) if h2 else None)

            # Год
            year_block = block.find("div", class_="shortstory__info-wrapper")
            year_a = year_block.find("a") if year_block else None
            final_dict['out_year'].append(year_a.get_text(strip=True) if year_a else None)

            # URL
            a_tag = block.find("a")
            final_dict['url_movie'].append(a_tag.get("href") if a_tag else None)

            # Постер
            poster_block = block.find("div", class_="shortstory__poster")
            img_tag = poster_block.find("img") if poster_block else None
            img_src = img_tag.get("data-src") if img_tag else None
            final_dict['image_movie'].append(f"https://lux.kinogo.biz{img_src}" if img_src else None)

            # Жанры
            genre_b = block.find('b', string='Жанр:')
            if genre_b:
                genre_links = genre_b.find_next_siblings('a')
                genres = [a.get_text(strip=True) for a in genre_links if a.get_text() not in ['Фильмы', 'Новинки']]
                final_dict['genres'].append(genres)
            else:
                final_dict['genres'].append(None)

            # Страна
            country_b = block.find('b', string='Страна:')
            if country_b:
                country_links = country_b.find_next_siblings('a')
                countries = [a.get_text(strip=True) for a in country_links]
                final_dict['country'].append(countries)
            else:
                final_dict['country'].append(None)

            # Продолжительность
            duration_b = block.find('b', string='Продолжительность:')
            final_dict['duration'].append(duration_b.next_sibling.strip() if duration_b and duration_b.next_sibling else None)

            # Описание
            excerpt = block.find("div", class_="excerpt")
            final_dict['description'].append(excerpt.get_text(strip=True) if excerpt else None)

            # Рейтинг сайта
            rating_div = block.find("div", class_="rating__votes")
            final_dict['rating_votes'].append(rating_div.get_text(strip=True) if rating_div else None)

            # КиноПоиск
            rating_kp = block.find("span", class_="kp")
            final_dict['film_rating_kp'].append(rating_kp.get_text(strip=True) if rating_kp else None)

            # IMDb
            rating_imdb = block.find("span", class_="imdb")
            final_dict['film_rating_imdb'].append(rating_imdb.get_text(strip=True) if rating_imdb else None)

    except Exception as e:
        print(f"Ошибка при обработке страницы {page}: {e}")


async def main():
    async with aiohttp.ClientSession() as session:
        tasks = [parse_page(session, page) for page in tqdm(my_range, desc="Парсинг")]
        await asyncio.gather(*tasks)

# Сохраняем результат
    df = pd.DataFrame(final_dict)
    df.to_csv("movies_dataset_async_20.csv", index=False)


if __name__ == "__main__":
    await main()

Парсинг: 100%|██████████| 3059/3059 [00:00<00:00, 715741.15it/s]


In [4]:
df = pd.read_csv('/Users/stanislawtsurkan/Downloads/movies_dataset_async_20.csv')

In [5]:
df.shape

(30590, 11)

In [6]:
df.head()

Unnamed: 0,title,out_year,genres,country,duration,description,url_movie,image_movie,rating_votes,film_rating_kp,film_rating_imdb
0,Том Сойер (2000),2000,[],['США'],86 мин.,"Заголовок: «Том Сойер» — мультфильм, в котором...",https://lux.kinogo.biz/84680-tom-sojer.html,https://lux.kinogo.biz/uploads/mini/minifull/7...,0/5 (0 гол.),KP 7.0,IMDB 5.9
1,Гражданин Дуэйн (2006),2006,['Комедия'],['Канада'],90 мин.,Гражданин Дуэйн – это легкая и увлекательная к...,https://lux.kinogo.biz/84764-grazhdanin-dujejn...,https://lux.kinogo.biz/uploads/mini/minifull/d...,0/5 (0 гол.),,IMDB 5.2
2,Чистюля Аояма (2017),2017,['Аниме'],['Япония'],24 мин,"«Чистюля Аояма» (2017) – японское аниме, полно...",https://lux.kinogo.biz/85779-chistjulja-aojama...,https://lux.kinogo.biz/uploads/mini/minifull/a...,4/5 (3 гол.),KP 6.4,IMDB 6.5
3,Вечность вечного 4: Ревущая тревога (2011),2011,['Аниме'],['Япония'],48 мин.,"""Вечность вечного 4: Ревущая тревога"" (оригина...",https://lux.kinogo.biz/86617-vechnost-vechnogo...,https://lux.kinogo.biz/uploads/mini/minifull/0...,0/5 (0 гол.),KP 6.6,IMDB 6.8
4,Умираю! Экода (2019),2019,['Аниме'],['Япония'],5 мин,"«Умираю! Экода» — это аниме 2019 года, предста...",https://lux.kinogo.biz/87322-umiraju-jekoda.html,https://lux.kinogo.biz/uploads/mini/minifull/5...,2.3/5 (3 гол.),,IMDB 6.0
