Parsemachine lesson4 #4

Merged: 2 commits, Nov 30, 2020
45 changes: 45 additions & 0 deletions parsing/4/bot.py
@@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
import datetime
import os

from telegram import InlineKeyboardButton, InlineKeyboardMarkup
from telegram.ext import Updater, CommandHandler, CallbackQueryHandler

from main import OUT_FILENAME, OUT_XLSX_FILENAME


# Telegram bot TOKEN; you can get one from @BotFather
TELEGRAM_TOKEN = ''


def start_handler(update, context):
    # /start: offer the two export formats as inline buttons.
    text = 'Hi! In which format would you like the parsing results?'
    keyboard = [
        [InlineKeyboardButton(text='Get JSON', callback_data='get_json')],
        [InlineKeyboardButton(text='Get XLSX', callback_data='get_xlsx')],
    ]
    markup = InlineKeyboardMarkup(keyboard)
    update.message.reply_text(text=text, reply_markup=markup)


def callback_handler(update, context):
    query = update.callback_query
    query.answer()  # acknowledge the button press so the client stops the loading spinner
    filenames = {
        'get_json': OUT_FILENAME,
        'get_xlsx': OUT_XLSX_FILENAME,
    }
    filename = filenames.get(query.data)
    if filename:
        # Use the file's mtime as a freshness stamp for the caption.
        modified_at = datetime.datetime.fromtimestamp(os.path.getmtime(filename)).strftime('%Y-%m-%d %H:%M:%S')
        caption = 'Parsing results from {}.'.format(modified_at)
        with open(filename, 'rb') as f:
            context.bot.send_document(query.message.chat.id, document=f, caption=caption)


# Register the handlers and start long polling (python-telegram-bot v12/v13 API).
updater = Updater(TELEGRAM_TOKEN, use_context=True)
updater.dispatcher.add_handler(CommandHandler('start', start_handler))
updater.dispatcher.add_handler(CallbackQueryHandler(callback_handler))

updater.start_polling()
updater.idle()
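
A side note on the hardcoded TELEGRAM_TOKEN above: a common alternative (a minimal sketch, assuming you are free to choose the environment variable name) is to read the token from the environment so it never lands in the repository:

import os

# Hypothetical variant: the TELEGRAM_TOKEN environment variable name is an
# assumption, not part of this PR.
TELEGRAM_TOKEN = os.environ.get('TELEGRAM_TOKEN', '')
if not TELEGRAM_TOKEN:
    raise RuntimeError('Set the TELEGRAM_TOKEN environment variable before starting the bot')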
111 changes: 111 additions & 0 deletions parsing/4/main.py
@@ -0,0 +1,111 @@
import json

import requests
import xlsxwriter
from bs4 import BeautifulSoup


PAGES_COUNT = 10                # how many catalog pages to crawl
OUT_FILENAME = 'out.json'       # JSON output, also served by bot.py
OUT_XLSX_FILENAME = 'out.xlsx'  # XLSX output, also served by bot.py


def dump_to_json(filename, data, **kwargs):
    kwargs.setdefault('ensure_ascii', False)
    kwargs.setdefault('indent', 1)

    # With ensure_ascii=False the output contains non-ASCII characters, so
    # write the file as UTF-8 explicitly rather than relying on the platform
    # default encoding.
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, **kwargs)


def dump_to_xlsx(filename, data):
    if not data:
        return

    with xlsxwriter.Workbook(filename) as workbook:
        ws = workbook.add_worksheet()
        bold = workbook.add_format({'bold': True})

        # Fixed columns first, then one column per characteristic of the
        # first item (all items are assumed to share the same spec names).
        headers = ['Product name', 'Price', 'Link']
        headers.extend(data[0]['techs'].keys())

        for col, h in enumerate(headers):
            ws.write_string(0, col, h, cell_format=bold)

for row, item in enumerate(data, start=1):
ws.write_string(row, 0, item['name'])
ws.write_string(row, 1, item['amount'])
ws.write_string(row, 2, item['url'])
for prop_name, prop_value in item['techs'].items():
col = headers.index(prop_name)
ws.write_string(row, col, prop_value)


def get_soup(url, **kwargs):
    # Fetch a page and return its parsed soup, or None on a non-200 response.
    response = requests.get(url, **kwargs)
    if response.status_code != 200:
        return None
    return BeautifulSoup(response.text, features='html.parser')


def crawl_products(pages_count):
urls = []
fmt = 'https://parsemachine.com/sandbox/catalog/?page={page}'

for page_n in range(1, 1 + pages_count):
print('page: {}'.format(page_n))

page_url = fmt.format(page=page_n)
        soup = get_soup(page_url)
        if soup is None:
            break  # stop as soon as a catalog page fails to load

        # Each product card's title links to the product detail page.
        for tag in soup.select('.product-card .title'):
            href = tag.attrs['href']
            url = 'https://parsemachine.com{}'.format(href)
            urls.append(url)

return urls


def parse_products(urls):
data = []

for url in urls:
print('\tproduct: {}'.format(url))

        soup = get_soup(url)
        if soup is None:
            break  # give up on the first product page that fails to load

        name = soup.select_one('#product_name').text.strip()
        amount = soup.select_one('#product_amount').text.strip()

        # Characteristics table: the first cell in each row is the property
        # name, the second is its value.
        techs = {}
        for row in soup.select('#characteristics tbody tr'):
            cols = [c.text.strip() for c in row.select('td')]
            techs[cols[0]] = cols[1]

        item = {
            'name': name,
            'amount': amount,
            'techs': techs,
            'url': url,
        }
        data.append(item)

return data


def main():
urls = crawl_products(PAGES_COUNT)
data = parse_products(urls)
dump_to_json(OUT_FILENAME, data)
dump_to_xlsx(OUT_XLSX_FILENAME, data)


if __name__ == '__main__':
main()
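
How the two files fit together: main.py crawls the sandbox catalog and writes out.json and out.xlsx, and bot.py then serves whichever file the user picks. A quick smoke test of the parser alone (a sketch, assuming main.py is importable from the current directory; the file name 'sample.json' is a throwaway choice):

# smoke_test.py -- hypothetical helper, not part of this PR
from main import crawl_products, parse_products, dump_to_json

urls = crawl_products(1)           # crawl only the first catalog page
data = parse_products(urls[:3])    # parse just the first few products
dump_to_json('sample.json', data)  # inspect the result by hand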