In [13]:
import requests
import grab
import json
import logging
import codecs
from collections import defaultdict
from lxml.etree import ElementTree, fromstring
from bs4 import BeautifulSoup
from tqdm import tqdm
from multiprocessing.dummy import Pool as ThreadPool
from IPython.display import HTML

# In case of Windows System go to https://www.lfd.uci.edu/~gohlke/pythonlibs/#pycurl and download binary
# ! pip install --user ./pycurl‑7.43.0.3‑cp37‑cp37m‑win_amd64.whl

### Get All Movie Links

In [8]:
url = 'https://www.kinopoisk.ru/top/lists/186/filtr/all/sort/order/page/%d/perpage/200/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko)' + 
                  'Chrome/61.0.3163.100 Safari/537.36',
}
g = grab.Grab(headers=headers)

MOVIE_URLS = set()

for x in [1,2,3,4,5]:
    try:
        g.go(url % x)
        MOVIE_URLS.update([x for x in g.xpath_list('//a/@href') if x.startswith('/film/') and x.count('/')==3])
    except Exception as e:
        logging.error(str(e))
        logging.error('Error on %s' % x)

MOVIE_URLS = list(MOVIE_URLS)
print('Total movie urls: %d' % len(MOVIE_URLS))
print('Examples: \n%s' % '\n'.join(MOVIE_URLS[:5]))

  del sys.path[0]


Total movie urls: 1040
Examples: 
/film/3563/
/film/783486/
/film/252626/
/film/688832/
/film/103785/


In [9]:
def process_movie(x):
    results = []
    g = grab.Grab(headers=headers)
    name = x.split('/')[2]
    try:
        i = 1
        while True:
            g.go('https://www.kinopoisk.ru%sord/rating/perpage/200/page/%d/#list' % (x, i), timeout=30)
            reviews = g.xpath_list("//div[@class='reviewItem userReview']")
            if not reviews:
                break
            for y in reviews:
                d = {'name': name}
                if 'bad' in y.getchildren()[2].get('class', ''):
                    d['sentiment'] = 'negative'
                elif 'good' in y.getchildren()[2].get('class', ''):
                    d['sentiment'] = 'positive'
                else:
                    d['sentiment'] = 'neutral'
                d['text'] = y.getchildren()[2].getchildren()[2].text_content().replace('\n', ' ')\
                    .replace('\t', ' ').replace('\r', ' ').strip()
                results.append(d)
            else:
                break
            i += 1
    except Exception as e:
        logging.error('%s %s' % (x, str(e)))
    return results

In [10]:
def chunks(l, n):
    for i in range(0, len(l), n):
        yield l[i:i + n]

In [11]:
%%time
pool_size = 16
with codecs.open('./data/data_mp.csv', 'w', encoding='utf-8') as f_out:
    total_reviews = 0
    class_counters = defaultdict(int)
    for x in tqdm(chunks(MOVIE_URLS, pool_size), total=len(MOVIE_URLS) // pool_size):
        pool = ThreadPool(pool_size)
        res = pool.map(process_movie, x)
        pool.close()
        pool.join()
        for y in res:
            for z in y:
                f_out.write('%s\t%s\t%s\n' % (z['name'], z['sentiment'], z['text']))
                total_reviews += len(res)
                class_counters[z['sentiment']] += 1
print('Total %d reviews' % total_reviews)
print('Class balance: \n%s' % json.dumps(class_counters, indent=4))

  if __name__ == '__main__':
100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 65/65 [00:49<00:00,  1.70it/s]


Total 55968 reviews
Class balance: 
{
    "positive": 2945,
    "neutral": 258,
    "negative": 295
}
Wall time: 49.9 s
