In [9]:
# -*- coding: utf-8 -*-
#  File: parser.ipynb
#  Project: 'OTUS.PRO Homework #5'
#  Created by Gennady Matveev (gm@og.ly) on 22-05-2022.

<IPython.core.display.Javascript object>

# **$Homework$** **$5$**  
**OTUS Machine Learning Professional**

![Parsing and NLP](https://docs.google.com/uc?export=download&id=10d8UpDr67Ib2_GdlzCK2xVyuVrpNevsh)

#### Open notebook(s) on mybinder.org

### Import libraries

In [None]:
%load_ext nb_black
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
import time
import re

%config InlineBackend.figure_format = 'retina'
from IPython.core.display import display, HTML

# Inject a CSS rule that sets the notebook's `.container` element to 80% width.
display(HTML("<style>.container { width:80% !important; }</style>"))

### Utility functions

In [None]:
# Convert a day-of-year number to a (month, day of month) pair
def convert_day(day: int) -> tuple[int, int]:
    """Convert a 1-based day-of-year number into ``(month, day_of_month)``.

    The month lengths are fixed to a non-leap year (February has 28 days),
    so the valid input range is 1..365.

    Args:
        day: Day of the year; 1 is January 1st and 365 is December 31st.

    Returns:
        A tuple ``(month, day_of_month)`` where ``month`` is in 1..12.

    Raises:
        ValueError: If ``day`` is outside 1..365. Without this check a
            non-positive day would yield the invalid pair ``(0, 0)`` and a
            day past the end of the year would fail with an ``IndexError``
            from the month table.
    """
    month_days = (31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31)
    days_in_year = sum(month_days)
    if not 1 <= day <= days_in_year:
        raise ValueError(f"day must be between 1 and {days_in_year}, got {day!r}")

    remaining = day
    for month, length in enumerate(month_days, start=1):
        # The first month that can hold the remaining days is the answer.
        if remaining <= length:
            return (month, remaining)
        remaining -= length
    # The range check above guarantees the loop returns before running out.
    raise AssertionError("unreachable: day was validated against the year length")


# Get claps number
def get_claps(claps_str: str) -> int:
    """Parse a claps label such as ``"123"`` or ``"1.2K"`` into an integer.

    Args:
        claps_str: Text of the claps button. A label containing a single
            ``K`` (e.g. ``"1.2K"``) is interpreted as thousands.

    Returns:
        The number of claps, or 0 when the input is ``None``, empty,
        whitespace-only, or not a string at all.

    Raises:
        ValueError: If the numeric part of a non-empty label cannot be
            parsed as a number.
    """
    # Anything that is not text carries no count. This covers None and,
    # presumably, a bs4 Tag handed over when the button holds markup instead
    # of plain text (bs4's NavigableString is a str subclass and passes).
    if not isinstance(claps_str, str):
        return 0
    label = claps_str.strip()
    if not label:
        return 0

    parts = label.split("K")
    value = float(parts[0])
    if len(parts) == 2:
        # round() instead of int(): int() truncates, so a product that lands
        # just below a whole number would come out one too low.
        return round(value * 1000)
    return int(value)


# Get article cover image
def get_img(img_url: str, dest_folder: str, dest_filename: str) -> str:
    """Download ``img_url`` and store it as ``dest_folder/dest_filename.<ext>``.

    The extension is whatever follows the last ``.`` in ``img_url``; when that
    piece is longer than four characters, ``jpg`` is used instead. The request
    is made with redirects disabled and the response body is written as-is.

    Args:
        img_url: Address of the image to fetch.
        dest_folder: Directory the file is written into.
        dest_filename: File name without extension.

    Returns:
        The file name (with extension) that was written, without the folder.
    """
    suffix = img_url.rsplit(".", 1)[-1]
    suffix = suffix if len(suffix) <= 4 else "jpg"
    saved_name = f"{dest_filename}.{suffix}"
    target_path = f"{dest_folder}/{saved_name}"
    # The file is opened before the request is sent, as in the original flow.
    with open(target_path, "wb") as out_file:
        reply = requests.get(img_url, allow_redirects=False)
        out_file.write(reply.content)
    return saved_name

### Main parser

In [None]:
def _parse_article(article, publication: str, date: str):
    """Turn one archive-page article card into a row for the result table.

    Args:
        article: The ``div`` element of a single article card, as returned by
            ``soup.find_all`` in ``parse_medium``.
        publication: Name of the publication the card was found under.
        date: Archive date of the page, already formatted as ``YYYY-MM-DD``.

    Returns:
        A list ``[url, author, title, subtitle, claps, responses,
        reading_time, publication, date]``, or ``None`` when the card has no
        title element.
    """
    title = article.find("h3", class_="graf--title")
    if title is None:
        return None
    title = title.contents[0]

    author = article.find(
        "a",
        class_="ds-link ds-link--styleSubtle link link--darken link--accent u-accentColor--textNormal u-accentColor--textDarken",
    ).contents[0]

    subtitle = article.find("h4", class_="graf--subtitle")
    subtitle = subtitle.contents[0] if subtitle is not None else ""

    # Cover-image download via get_img() is intentionally not performed here.

    links = article.find_all("a")
    # NOTE(review): relies on the 4th link of the card being the article
    # link - confirm against the current page markup.
    article_url = links[3]["href"].split("?")[0]

    # Always assign claps: a card without a second button has no clap count,
    # and must neither reuse the previous card's value nor leave it unbound.
    buttons = article.find_all("button")
    claps = get_claps(buttons[1].contents[0]) if len(buttons) > 1 else 0

    reading_time = article.find("span", class_="readingTime")
    reading_time = (
        0 if reading_time is None else int(reading_time["title"].split(" ")[0])
    )

    # Only cards with exactly 7 links are read for a responses count; the
    # count is taken as the first space-separated token of the 7th link.
    # A non-numeric token counts as 0 so it cannot break the final integer
    # conversion of the whole column.
    responses = 0
    if len(links) == 7:
        first_token = str(links[6].contents[0]).split(" ")[0]
        if first_token.isdigit():
            responses = int(first_token)

    return [
        article_url,
        author,
        title,
        subtitle,
        claps,
        responses,
        reading_time,
        publication,
        date,
    ]


def parse_medium(urls: dict[str, str], dest_folder: str) -> pd.DataFrame:
    """Scrape the daily archive pages of the given publications for 2021.

    Every day of the year is visited once, in random order. For each
    publication the day's archive page is requested, and one row is collected
    per article card that has a title.

    Args:
        urls: Mapping of publication name to an archive URL template with
            three positional format fields: year, month and day.
        dest_folder: Despite the name, this is passed straight to
            ``DataFrame.to_csv`` and is therefore the path of the CSV file
            to write.

    Returns:
        A DataFrame with the columns ``url``, ``author``, ``title``,
        ``subtitle``, ``claps``, ``responses``, ``reading_time``,
        ``publication`` and ``date``. The same frame is saved to
        ``dest_folder`` without the index.
    """
    year = 2021
    rows = []
    # All 365 days of the year, shuffled.
    selected_days = random.sample(range(1, 366), 365)
    n = len(selected_days)
    for i, d in enumerate(selected_days, start=1):
        month, day = convert_day(d)
        date = "{0}-{1:02d}-{2:02d}".format(year, month, day)
        print(f"{i} / {n} ; {date}")
        for publication, url in urls.items():
            # Random pause before each request (presumably to avoid
            # hammering the server).
            time.sleep(random.choice([0.5, 1, 1.5, 2, 2.5]))
            day_url = url.format(year, month, day)
            response = requests.get(day_url, allow_redirects=True)
            # Skip days with no publications: the request ended up at a URL
            # that no longer starts with the requested day's address.
            if not response.url.startswith(day_url):
                continue
            soup = BeautifulSoup(response.content, "html.parser")
            articles = soup.find_all(
                "div",
                class_="postArticle postArticle--short js-postArticle js-trackPostPresentation js-trackPostScrolls",
            )
            for article in articles:
                row = _parse_article(article, publication, date)
                if row is not None:
                    rows.append(row)

    medium_df = pd.DataFrame(
        rows,
        columns=[
            "url",
            "author",
            "title",
            "subtitle",
            "claps",
            "responses",
            "reading_time",
            "publication",
            "date",
        ],
    )
    medium_df["responses"] = medium_df["responses"].astype(int)
    # Save to csv file
    medium_df.to_csv(dest_folder, index=False)
    return medium_df

### Parse it
#### Takes about one hour - exercise caution before proceeding

In [None]:
urls = {
    "Towards Data Science": "https://towardsdatascience.com/archive/{0}/{1:02d}/{2:02d}",
}

destination_folder = "./data/medium_df.csv"

parse_medium(urls=urls, dest_folder=destination_folder)