In [1]:
import json
import codecs
import logging
from collections import defaultdict

import regex
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool as ThreadPool

# On Windows, download the prebuilt pycurl wheel from https://www.lfd.uci.edu/~gohlke/pythonlibs/#pycurl and install it:
# ! pip install --user ./pycurl-7.43.0.3-cp37-cp37m-win_amd64.whl

### Get All Movie Links

In [2]:
# Listing-page URL template: {0} is filled with the year and {1} with the page number.
base_url = 'https://www.kinopoisk.ru/lists/navigator/{0}/?page={1}&tab=all'
# Film hrefs collected by the crawl; being a set, an href added repeatedly is stored once.
MOVIE_URLS = set()

In [3]:
# Paste the Cookie header value copied from
# https://www.kinopoisk.ru/lists/navigator/2021/ between the triple quotes.
raw_cookie_header = '''
# SET YOUR COOKIE HERE
'''.replace('\n', '')

# Turn "name=value; name2=value2" into a dict. Only the first '=' separates
# the name from the value, so values that contain '=' themselves stay intact.
cookies = {
    cookie_name: cookie_value
    for cookie_name, _separator, cookie_value in (
        cookie_pair.partition('=') for cookie_pair in raw_cookie_header.split('; ')
    )
}

In [4]:
# Crawl the per-year listing pages and collect every film href into MOVIE_URLS.

# Compile the class-name patterns once instead of on every page / film item.
film_item_pattern = regex.compile('desktop-seo-selection-film-item selection-list__film')
film_link_pattern = regex.compile('selection-film-item-meta__link')

for year in [2021, 2020, 2019, 2018, 2017, 2016, 2015]:
    for page_idx in range(1, 30):
        page_url = base_url.format(year, page_idx)
        try:
            # Without a timeout a single stalled connection blocks the cell forever.
            result = requests.request('GET', page_url, cookies=cookies, timeout=30)
        except requests.RequestException as e:
            # Same best-effort policy as process_movie: log the failure and move on.
            logging.error('%s %s', page_url, e)
            continue
        soup = BeautifulSoup(result.content.decode(), 'lxml')

        films_div = soup.find_all('div', {'class': film_item_pattern})
        for film_div in films_div:
            # find() returns None instead of raising when the item has no link,
            # so one malformed film item no longer aborts the whole crawl.
            film_link = film_div.find('a', {'class': film_link_pattern})
            if film_link is None or 'href' not in film_link.attrs:
                continue
            MOVIE_URLS.add(film_link.attrs['href'])

In [5]:
# Freeze the collected hrefs into a list so they can be sliced and chunked.
MOVIE_URLS = list(MOVIE_URLS)

# Report how many hrefs were gathered and show the first few as a sanity check.
url_count = len(MOVIE_URLS)
sample_urls = MOVIE_URLS[:5]
print('Total movie urls: %d' % url_count)
print('Examples: \n%s' % '\n'.join(sample_urls))

Total movie urls: 7180
Examples: 
/film/957899/
/film/1297198/
/film/839818/
/film/843787/
/film/679565/


In [6]:
# Reviews-page URL template: {0} is filled with a film href from MOVIE_URLS.
base_reviews_url = 'https://www.kinopoisk.ru{0}reviews/ord/date/status/all/perpage/200/'

In [7]:
def text_with_newlines(elem):
    """Join the text found under ``elem``, emitting a newline for every ``br`` node.

    Walks everything yielded by ``elem.recursiveChildGenerator()``. String nodes
    contribute their whitespace-stripped text, nodes named ``'br'`` contribute
    ``'\\n'``, and every other node contributes nothing itself.

    Args:
        elem: object exposing ``recursiveChildGenerator()``.

    Returns:
        str: the concatenated text.
    """
    pieces = []
    for node in elem.recursiveChildGenerator():
        if isinstance(node, str):
            pieces.append(node.strip())
            continue
        if node.name == 'br':
            pieces.append('\n')
    return ''.join(pieces)

In [8]:
def process_movie(movie_href, timeout=30):
    """Download one movie's reviews page and extract its reviews.

    Best-effort: this function never raises. Failures are logged with
    ``logging.error`` and whatever was collected so far is returned.

    Args:
        movie_href: film href that is substituted into ``base_reviews_url``.
        timeout: seconds to wait for the HTTP response, so that one stalled
            request cannot block a worker thread indefinitely.

    Returns:
        list[dict]: one record per successfully parsed review, with keys
        ``'content'`` (the value of ``soup.extract()`` for the page),
        ``'name'`` (``movie_href``), ``'sentiment'`` and ``'text'``.
        When the page has no review blocks, a single record holding only
        ``'content'`` and ``'name'`` is returned. On a request/parsing
        failure the list is empty.
    """
    results = []
    try:
        result = requests.request(
            'GET', base_reviews_url.format(movie_href), cookies=cookies, timeout=timeout
        )
        soup = BeautifulSoup(result.content.decode(), 'lxml')

        reviews = soup.find_all('div', {'class': 'reviewItem userReview'})
        if not reviews:
            # Keep a marker record so the caller can see the page had no reviews.
            results.append({'content': soup.extract(), 'name': movie_href})
            return results

        # Loop-invariant: compile the pattern once per page, not once per review.
        response_pattern = regex.compile('response .*')
        for review in reviews:
            try:
                review_content = review.find('div', {'class': response_pattern})
                # The second class token is used as the sentiment label.
                sentiment = review_content.attrs['class'][1]
                text = review_content.find('span', {'class': '_reachbanner_'})
                text = text_with_newlines(text).replace('\n', '')
            except (AttributeError, IndexError, KeyError) as e:
                # A single malformed review must not discard the rest of the page.
                logging.error('%s %s', movie_href, e)
                continue

            results.append({
                'content': soup.extract(),
                'name': movie_href,
                'sentiment': sentiment,
                'text': text,
            })
    except Exception as e:
        logging.error('%s %s', movie_href, e)
    return results

In [9]:
def chunks(l, n):
    """Lazily split the sequence ``l`` into consecutive slices of length ``n``.

    The last slice is shorter when ``len(l)`` is not a multiple of ``n``.

    Args:
        l: a sliceable sequence with a length.
        n: slice size, also used as the step between slice starts.

    Yields:
        Successive slices ``l[start:start + n]``.
    """
    starts = range(0, len(l), n)
    for begin in starts:
        end = begin + n
        yield l[begin:end]

In [10]:
# Scrape the reviews of every collected movie in parallel and dump them to a
# tab-separated file, one line per review: href, sentiment, text.
pool_size = 24

all_reviews = []

total_reviews = 0
class_counters = defaultdict(int)
# Round up so the final, partially filled chunk is counted by the progress bar.
total_chunks = (len(MOVIE_URLS) + pool_size - 1) // pool_size

# A single thread pool is reused for all chunks instead of creating one per chunk.
with open('./artifacts/data.csv', 'w', encoding='utf-8', errors='ignore') as file, \
        ThreadPool(pool_size) as pool:
    for urls_chunk in tqdm(chunks(MOVIE_URLS, pool_size), total=total_chunks):
        movies_reviews = pool.map(process_movie, urls_chunk)
        for movie_reviews in movies_reviews:
            all_reviews += movie_reviews
            for review in movie_reviews:
                # Records without 'text' only mark pages that had no reviews.
                if 'text' in review:
                    # Tabs / carriage returns inside the text would break the
                    # tab-separated row layout, so flatten them to spaces.
                    row_text = review['text'].replace('\t', ' ').replace('\r', ' ')
                    file.write('{0}\t{1}\t{2}\n'.format(review['name'], review['sentiment'], row_text))
                    total_reviews += 1
                    class_counters[review['sentiment']] += 1
        # Persist progress after every chunk so an interruption loses little.
        file.flush()

print('Total {0:d} reviews'.format(total_reviews))
# '.format' (not ',format'): the counters are substituted into the message
# instead of being printed after a literal '{0}'.
print('Class balance: \n{0}'.format(json.dumps(class_counters, indent=4)))

300it [02:56,  1.70it/s]                         

Total 11986 reviews
Class balance: 
{0} {
    "good": 7384,
    "bad": 2311,
    "neutral": 2291
}



