# Kinopoisk Reviews Parser

In [None]:
import os
import re
from datetime import datetime as dt 
import pandas as pd 
from tqdm.notebook import tqdm

from kinopoisk_reviews_scrapper import KinopoiskReviewsScrapper

## Сбор временных файлов // Parsing temp files 

Задаём список ссылок на фильмы. Ссылки должны вести на стартовую страницу фильма // Set a list of film links. Each link should lead to the film's start page

In [None]:
# Film page URLs to process. Fill this list before running the cells below.
# Each URL must contain `film/<digits>`: the join step extracts the movie id
# from the URL with that pattern.
urls = []

Запускаем парсинг // Start parsing 

In [None]:
# Progress bar over the input URLs; its total is known up front.
urls_bar = tqdm(desc='URLs progress', unit='url', total=len(urls))

# Progress bars for pages and reviews; both are created without a total.
pages_bar = tqdm(unit='page')
reviews_bar = tqdm(unit='review')

In [None]:
# Create the scrapper and run it over every URL, passing in the three
# progress bars created in the previous cell.
# NOTE(review): `bad_urls` is presumably the list of URLs that could not be
# processed -- confirm against KinopoiskReviewsScrapper.collect_urls.
parser = KinopoiskReviewsScrapper()
bad_urls = parser.collect_urls(urls, urls_bar, pages_bar, reviews_bar)

## Объединение временных файлов // Join temp files 

Собираем список временных папок, связанных с текущим парсингом // Gather the list of temp folders related to the current parsing run

In [None]:
# Extract the numeric movie id from every URL. The pattern is a raw string so
# that `\d` reaches the regex engine instead of being an invalid string escape.
# URLs without a `film/<digits>` part are skipped rather than crashing with an
# AttributeError on `None.group`; such URLs cannot have produced a temp folder.
movie_ids = tuple(
    match.group(1)
    for match in (re.search(r'film/(\d+)', url) for url in urls)
    if match is not None
)

# Keep the directories in the current working directory whose name starts with
# one of the movie ids. `str.startswith` accepts a tuple of prefixes and
# returns False for an empty tuple, so no folders are selected when there are
# no ids.
# NOTE(review): this is a plain prefix match, so id "123" also matches a folder
# named "1234..." -- confirm against the scrapper's folder naming scheme.
temp_folders = [
    entry
    for entry in os.listdir()
    if os.path.isdir(entry) and entry.startswith(movie_ids)
]

Открываем файлы во временных папках // Open files in temp folders 

In [None]:
# One concatenated DataFrame per temp folder; folders with no files are skipped.
folder_datas = []
for temp_dir in temp_folders:
    # List the folder once, then read each entry as a records-oriented JSON file.
    file_paths = [os.path.join(temp_dir, name) for name in os.listdir(temp_dir)]
    frames = []
    for file_path in file_paths:
        frames.append(pd.read_json(file_path, orient='records'))
    if frames:
        folder_datas.append(pd.concat(frames))

Собираем временные данные в один файл и сохраняем его // Gather temp data into one file and save it 

In [None]:
# Fail early with a clear message: pd.concat on an empty list raises an opaque
# "No objects to concatenate" ValueError.
if not folder_datas:
    raise ValueError('No temp data found for the given URLs; nothing to save')

# Flatten the nested `data` records into columns and drop reviews that were
# collected more than once (same `comment_id`).
total_data = pd.json_normalize(pd.concat(folder_datas)['data'])
total_data = total_data.drop_duplicates(subset='comment_id')
print('Total reviews count in file:', total_data.shape[0])

# Build a date-stamped file name. If it is already taken, append the first
# free numeric suffix (_1, _2, ...). Probing each candidate with
# os.path.exists guarantees an existing file is never overwritten, even when
# earlier copies were deleted or unrelated files share the base name.
today_date = dt.today().strftime('%y%m%d')
base_name = f'kinopoisk_db_{today_date}'
total_filename = f'{base_name}.csv'
copies_count = 0
while os.path.exists(total_filename):
    copies_count += 1
    total_filename = f'{base_name}_{copies_count}.csv'

total_data.to_csv(total_filename, index=False)
print('Saved to:', total_filename)