# requirements

In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from tqdm import trange, tqdm
from tqdm.notebook import tqdm_notebook as tn

# scraping

Set initial url

In [2]:
# Base URL of the mobile-phones catalogue listing; per-page URLs are built
# further down by appending 'page=<n>' to it.
url = "https://rozetka.com.ua/ua/mobile-phones/c80003/"

Find number of pages (due to pagination)

In [3]:
# Fetch the first catalogue page; its pagination links are parsed in the next cell.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() surfaces HTTP errors here instead of as a confusing
# parse failure later on.
response = requests.get(url, timeout=30)
response.raise_for_status()
# Name the parser explicitly so the parse tree does not depend on which
# optional parsers (lxml, html5lib) happen to be installed.
soup = BeautifulSoup(response.content, "html.parser")

In [4]:
# The last pagination link is taken to point at the final page. Its href is
# expected to end with a "page=<N>/" path segment, so the second-to-last
# '/'-separated piece is "page=<N>".
pagination_links = soup.find_all('a', {"class": "pagination__link"})
if pagination_links:
    last_page_segment = pagination_links[-1]['href'].split('/')[-2]
    # Take what follows '=' instead of str.strip('page='), which removes a
    # *set of characters* from both ends rather than a prefix.
    number_of_pages = int(last_page_segment.split('=')[-1])
else:
    # No pagination links on the page: treat the catalogue as a single page.
    number_of_pages = 1

Set a special delimiter for the CSV file (the comments themselves contain commas, semicolons and other symbols).

In [None]:
# Field separator (three tab characters) used both when writing
# ./data/data.csv and when reading it back with pandas.
DELIMITER = '\t\t\t'

In [8]:
# Start the output file from scratch with a single header row.
# Mode "w" (instead of "a+") keeps this cell idempotent: re-running the
# notebook no longer appends a second header to a file left by an earlier run.
# UTF-8 is set explicitly because the file receives Cyrillic text and
# pandas reads CSV files as UTF-8 by default.
with open('./data/data.csv', "w", encoding="utf-8") as f:
    f.write(f'smartphone_title{DELIMITER}comment_text{DELIMITER}score\n')

Scraping and parsing

In [9]:
# Scrape every catalogue page, follow each phone to its comments page and
# append one row per comment (title, text, star score) to ./data/data.csv.

# Text that follows the product name in the <title> of a comments page.
TITLE_SUFFIX = ' – відгуки покупців | ROZETKA'
# Seconds to wait for a response before giving up on a request.
REQUEST_TIMEOUT = 30

for page_number in trange(1, number_of_pages + 1):
    # Build the URL of this catalogue page and download it.
    page_url = url + f'page={page_number}'
    response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
    page_soup = BeautifulSoup(response.content, "html.parser")
    # Every phone on the page lives in its own grid cell.
    phones_cells = page_soup.find_all(
        "li",
        {"class": "catalog-grid__cell catalog-grid__cell_type_slim ng-star-inserted"})

    for phone_cell in tn(phones_cells):
        # The first link inside the cell leads to the product page.
        link = phone_cell.find('a')
        if link is None or not link.get('href'):
            continue
        smartphone_cell_comments_url = link['href'] + 'comments/'

        # A single failing product page should not abort the whole run.
        try:
            comments_response = requests.get(smartphone_cell_comments_url,
                                             timeout=REQUEST_TIMEOUT)
        except requests.RequestException as error:
            tqdm.write(f'Skipping {smartphone_cell_comments_url}: {error}')
            continue

        # Use a separate name so the catalogue-page soup is not overwritten.
        comments_soup = BeautifulSoup(comments_response.content, "html.parser")
        comments = comments_soup.find_all(
            "li",
            {"class": "product-comments__list-item ng-star-inserted"})
        # find_all returns an empty list (never None) when nothing matches.
        if not comments:
            continue

        # Drop the fixed suffix from the page title. str.strip(suffix) would
        # remove any of the suffix's *characters* from both ends and could
        # eat legitimate leading or trailing characters of the product name.
        title_tag = comments_soup.find('title')
        smartphone_title = title_tag.text if title_tag is not None else ''
        if smartphone_title.endswith(TITLE_SUFFIX):
            smartphone_title = smartphone_title[:-len(TITLE_SUFFIX)]
        smartphone_title = smartphone_title.strip()

        # Open the file once per phone instead of once per comment; UTF-8 is
        # set explicitly because titles and comments contain Cyrillic text.
        with open('./data/data.csv', "a", encoding="utf-8") as f:
            for comment in comments:
                # Comment text is optional; fall back to an empty string.
                paragraph = comment.find('p')
                comment_text = paragraph.text if paragraph is not None else ''

                # The rating is the number of stars drawn in the filled colour.
                score = 0
                for star in comment.find_all('rz-gradient-star'):
                    path = star.find('path')
                    if path is not None and path.get('fill') == "#ffa900":
                        score += 1

                # repr() escapes newlines in the comment, so every record
                # stays on a single line of the file.
                f.write(f'{smartphone_title}{DELIMITER}'
                        f'{comment_text!r}{DELIMITER}'
                        f'{score}\n')

  0%|                                                    | 0/67 [00:00<?, ?it/s]

  0%|          | 0/60 [00:00<?, ?it/s]

  1%|▋                                           | 1/67 [00:39<43:16, 39.33s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

  3%|█▎                                          | 2/67 [01:12<38:53, 35.90s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

  4%|█▉                                          | 3/67 [01:47<37:42, 35.35s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

  6%|██▋                                         | 4/67 [02:23<37:23, 35.61s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

  7%|███▎                                        | 5/67 [02:54<35:11, 34.05s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

  9%|███▉                                        | 6/67 [03:20<31:49, 31.31s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 10%|████▌                                       | 7/67 [03:46<29:35, 29.59s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 12%|█████▎                                      | 8/67 [04:12<27:46, 28.25s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 13%|█████▉                                      | 9/67 [04:38<26:40, 27.60s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 15%|██████▍                                    | 10/67 [05:03<25:32, 26.89s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 16%|███████                                    | 11/67 [05:28<24:37, 26.39s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 18%|███████▋                                   | 12/67 [05:53<23:45, 25.91s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 19%|████████▎                                  | 13/67 [06:18<22:57, 25.51s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 21%|████████▉                                  | 14/67 [06:42<22:13, 25.16s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 22%|█████████▋                                 | 15/67 [07:07<21:37, 24.95s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 24%|██████████▎                                | 16/67 [07:31<21:07, 24.85s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 25%|██████████▉                                | 17/67 [07:58<21:07, 25.35s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 27%|███████████▌                               | 18/67 [08:23<20:35, 25.22s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 28%|████████████▏                              | 19/67 [08:48<20:07, 25.16s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 30%|████████████▊                              | 20/67 [09:15<20:10, 25.76s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 31%|█████████████▍                             | 21/67 [09:40<19:30, 25.45s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 33%|██████████████                             | 22/67 [10:05<18:59, 25.33s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 34%|██████████████▊                            | 23/67 [10:35<19:42, 26.88s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 36%|███████████████▍                           | 24/67 [11:00<18:45, 26.17s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 37%|████████████████                           | 25/67 [11:24<17:58, 25.68s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 39%|████████████████▋                          | 26/67 [11:49<17:15, 25.27s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 40%|█████████████████▎                         | 27/67 [12:14<16:49, 25.24s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 42%|█████████████████▉                         | 28/67 [12:40<16:33, 25.47s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 43%|██████████████████▌                        | 29/67 [13:04<15:59, 25.25s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 45%|███████████████████▎                       | 30/67 [13:32<15:56, 25.86s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 46%|███████████████████▉                       | 31/67 [13:56<15:18, 25.52s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 48%|████████████████████▌                      | 32/67 [14:21<14:40, 25.16s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 49%|█████████████████████▏                     | 33/67 [14:46<14:11, 25.05s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 51%|█████████████████████▊                     | 34/67 [15:11<13:50, 25.18s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 52%|██████████████████████▍                    | 35/67 [15:35<13:16, 24.89s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 54%|███████████████████████                    | 36/67 [15:59<12:45, 24.69s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 55%|███████████████████████▋                   | 37/67 [16:24<12:17, 24.60s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 57%|████████████████████████▍                  | 38/67 [16:48<11:50, 24.50s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 58%|█████████████████████████                  | 39/67 [17:13<11:30, 24.65s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 60%|█████████████████████████▋                 | 40/67 [17:38<11:10, 24.83s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 61%|██████████████████████████▎                | 41/67 [18:03<10:45, 24.82s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 63%|██████████████████████████▉                | 42/67 [18:29<10:24, 24.97s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 64%|███████████████████████████▌               | 43/67 [18:53<09:56, 24.84s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 66%|████████████████████████████▏              | 44/67 [19:18<09:32, 24.90s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 67%|████████████████████████████▉              | 45/67 [19:42<09:04, 24.73s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 69%|█████████████████████████████▌             | 46/67 [20:08<08:42, 24.87s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 70%|██████████████████████████████▏            | 47/67 [20:31<08:11, 24.56s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 72%|██████████████████████████████▊            | 48/67 [20:56<07:44, 24.43s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 73%|███████████████████████████████▍           | 49/67 [21:21<07:22, 24.59s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 75%|████████████████████████████████           | 50/67 [21:44<06:54, 24.38s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 76%|████████████████████████████████▋          | 51/67 [22:10<06:36, 24.75s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 78%|█████████████████████████████████▎         | 52/67 [22:35<06:12, 24.82s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 79%|██████████████████████████████████         | 53/67 [23:01<05:53, 25.22s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 81%|██████████████████████████████████▋        | 54/67 [23:26<05:24, 24.99s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 82%|███████████████████████████████████▎       | 55/67 [23:50<04:57, 24.79s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 84%|███████████████████████████████████▉       | 56/67 [24:14<04:30, 24.59s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 85%|████████████████████████████████████▌      | 57/67 [24:39<04:05, 24.58s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 87%|█████████████████████████████████████▏     | 58/67 [25:03<03:39, 24.40s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 88%|█████████████████████████████████████▊     | 59/67 [25:28<03:17, 24.65s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 90%|██████████████████████████████████████▌    | 60/67 [25:52<02:51, 24.56s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 91%|███████████████████████████████████████▏   | 61/67 [26:16<02:26, 24.40s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 93%|███████████████████████████████████████▊   | 62/67 [26:41<02:01, 24.37s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 94%|████████████████████████████████████████▍  | 63/67 [27:05<01:37, 24.40s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 96%|█████████████████████████████████████████  | 64/67 [27:30<01:13, 24.48s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 97%|█████████████████████████████████████████▋ | 65/67 [27:54<00:48, 24.37s/it]

  0%|          | 0/60 [00:00<?, ?it/s]

 99%|██████████████████████████████████████████▎| 66/67 [28:19<00:24, 24.49s/it]

  0%|          | 0/40 [00:00<?, ?it/s]

100%|███████████████████████████████████████████| 67/67 [28:36<00:00, 25.62s/it]


In [10]:
# Load the scraped data back for inspection. A separator longer than one
# character is only supported by the Python parsing engine; requesting it
# explicitly avoids the ParserWarning pandas emits when it has to fall back
# from the default C engine.
pd.read_csv('./data/data.csv', delimiter=DELIMITER, engine='python', encoding='utf-8')

  pd.read_csv('./data/data_2.csv',delimiter='\t\t\t')


Unnamed: 0,smartphone_title,comment_text,score
0,Мобільний телефон Samsung Galaxy M33 5G 6/128G...,'Гарний телефон! Дуже круто виглядає! Розмір з...,5
1,Мобільний телефон Samsung Galaxy M33 5G 6/128G...,"'Сегодня забрал телефон, как оказалось телефон...",1
2,Мобільний телефон Samsung Galaxy M33 5G 6/128G...,'Дуже добрий смартфон и гарне обслуговування у...,5
3,Мобільний телефон Samsung Galaxy M33 5G 6/128G...,"'Сучасний, не дорогий, за такими характеристик...",5
4,Мобільний телефон Samsung Galaxy M33 5G 6/128G...,'В целом телефон соответствует своей стоимости...,4
...,...,...,...
5303,Смартфон iHunt Titan P4000 Pro 2021 Red,'Вже як тиждень користуються даними смартфоном...,5
5304,Смартфон iHunt Titan P4000 Pro 2021 Red,"'Модель телефона новая, очень классно работает...",5
5305,Смартфон iHunt Titan P4000 Pro 2021 Red,"'Замечательный телефон, данную марку выбрал по...",5
5306,Смартфон iHunt Titan P4000 Pro 2021 Red,'Попробую может зайдёт)))',4
