In [None]:
from typing import List, Iterable

import csv
import time
import json
from datetime import datetime
from dateutil.parser import parse as parse_date

import bs4
import requests
from attrs import frozen
from lxml import etree
import feedparser
from snscrape.modules.telegram import TelegramChannelScraper
from facebook_scraper import get_posts as get_facebook_posts
from ensta import Host as InstaClient
from tqdm.notebook import tqdm

In [None]:
@frozen
class TextEntry:
    """One scraped text together with who wrote it and where/when it appeared.

    Declared with ``attrs.frozen``. Every ``parse_*`` function in this
    notebook returns a list of these records.
    """
    # Short label of the person the text is attributed to (callers pass e.g. 'Navalny').
    author: str
    # Body of the post.
    text: str
    # Publication time as produced by the individual parser.
    date: datetime
    # Link to the individual post.
    source: str
    # Identifier of the feed the post came from: a blog URL, or a
    # 't.me/<name>', 'facebook.com/<name>' or 'instagram.com/<name>' label.
    source_type: str

In [None]:
# Account handles and blog URLs for every person whose posts are scraped.
# Naming convention: <PERSON>_<PLATFORM>_NAME for handles, <PERSON>_<PLATFORM>_URL for sites.

NAVALNY_BLOG_URL = r'https://navalny.com'
# Number of listing pages requested from the blog (pages 1..NUM_PAGES_IN_BLOG).
NUM_PAGES_IN_BLOG = 298
NAVALNY_TELEGRAM_NAME = r'navalny'
NAVALNY_TWITTER_NAME = r'navalny'
NAVALNY_FB_NAME = r'navalny'
NAVALNY_INSTA_NAME = r'navalny'

VOLKOV_LJ_URL = r'https://leonwolf.livejournal.com'
VOLKOV_TELEGRAM_NAME = r'leonid_volkov'
VOLKOV_TWITTER_NAME = r'leonidvolkov'
VOLKOV_FB_NAME = r'leonid.m.volkov'
VOLKOV_INSTA_NAME = r'leonidvolkov'

SOBOL_LJ_URL = r'https://sobollubov.livejournal.com'
SOBOL_TELEGRAM_NAME = r'TeamSobol'
SOBOL_TWITTER_NAME = r'SobolLubov'
# Backward-compatible alias for the original, misspelled constant name.
# Prefer SOBOL_TWITTER_NAME in new code.
SOBOK_TWITTER_NAME = SOBOL_TWITTER_NAME
SOBOL_FB_NAME = r'soboll.ru'
SOBOL_INSTA_NAME = r'sobollubov'

PEVCHIKH_TWITTER_NAME = r'pevchikh'
PEVCHIKH_INSTA_NAME = r'maria_pevchikh'

ALBUROV_FB_NAME = r'alburov'
ALBUROV_TWITTER_NAME = r'alburov'
ALBUROV_INSTA_NAME = r'alburov'

ZHDANOV_TELEGRAM_NAME = r'ioannzh'
ZHDANOV_TWITTER_NAME = r'ioannZH'
ZHDANOV_FB_NAME = r'zhdanovivan'
ZHDANOV_INSTA_NAME = r'ioannzh'

YARMYSH_TELEGRAM_NAME = r'Kira_Yarmysh'
YARMYSH_TWITTER_NAME = r'Kira_Yarmysh'
YARMYSH_INSTA_NAME = r'kira_yarmysh'

In [None]:
# Read the saved LiveJournal cookie records; the file is only needed for the read itself.
with open('./credentials/lj_cookies.json') as file:
    lj_cookies_json = json.load(file)

# Rebuild a requests cookie jar from the records so later LiveJournal requests can send them.
lj_cookies = requests.cookies.RequestsCookieJar()
for cookie in lj_cookies_json:
    lj_cookies.set(
        cookie['name'],
        cookie['value'],
        domain=cookie['domain'],
        path=cookie['path'],
    )

In [None]:
# Load the Instagram login from a local JSON file so it never appears in the notebook.
with open('credentials/insta_credentials.json') as file:
    credentials = json.load(file)

insta_username = credentials['username']
insta_password = credentials['password']

# Single client instance shared by every parse_insta_feed call below.
insta_client = InstaClient(insta_username, insta_password)

In [None]:
def get_lj_page(url: str, num_attempts: int = 10) -> bs4.BeautifulSoup:
    """Download a LiveJournal page and parse it, retrying on failure.

    Args:
        url: Address of the page to fetch.
        num_attempts: Maximum number of GET requests to make before giving up.

    Returns:
        The parsed page.

    Raises:
        RuntimeError: If no attempt produced an HTTP 200 response.
    """
    last_error = None
    for attempt in range(num_attempts):
        if attempt:
            # Pause between attempts only: not before the first, not after the last.
            time.sleep(0.2)
        try:
            # Without a timeout a stalled connection would hang the notebook forever.
            response = requests.get(url, cookies=lj_cookies, timeout=30)
        except requests.RequestException as error:
            # Network-level failures (timeouts, resets) count as a failed attempt
            # instead of aborting the whole scrape.
            last_error = error
            continue
        if response.status_code == 200:
            # Name the parser explicitly so results do not depend on which parsers
            # happen to be installed. lxml is already a dependency of this notebook
            # and is what Beautiful Soup prefers when it is left to guess.
            return bs4.BeautifulSoup(response.text, 'lxml')
    raise RuntimeError(f'Failed to get {url}') from last_error


def iter_lj_articles(lj_url: str) -> Iterable[bs4.element.Tag]:
    """Yield every ``<article>`` element of a LiveJournal blog, page by page.

    Starts at ``lj_url`` and follows the "previous page" navigation link until
    a page has no articles, has no usable link, or points at a page that was
    already visited.

    Args:
        lj_url: Address of the first page of the journal.

    Yields:
        Article tags in page order, excluding the "no entries" placeholder.

    Raises:
        RuntimeError: Propagated from ``get_lj_page`` when a page cannot be fetched.
    """
    visited_urls = set()
    next_url = lj_url
    # The visited set stops a navigation link that loops back from causing an
    # endless stream of requests.
    while next_url and next_url not in visited_urls:
        visited_urls.add(next_url)
        page = get_lj_page(next_url)
        articles = [
            article
            for article in page.find_all('article')
            # LiveJournal renders an empty listing as an <article> with this class.
            if 'j-e-no-entries-message' not in article.get('class', [])
        ]
        if not articles:
            break
        yield from articles
        next_link_element = page.find('li', class_='j-nav-item j-page-nav-item j-page-nav-item-prev')
        # A nav item without an <a>, or an <a> without href, ends the walk
        # instead of raising AttributeError or requesting None.
        next_anchor = next_link_element.find('a') if next_link_element is not None else None
        next_url = next_anchor.get('href') if next_anchor is not None else None


def parse_lj(author: str, lj_url: str) -> List[TextEntry]:
    """Scrape all posts of a LiveJournal blog into ``TextEntry`` records.

    Articles that lack a ``<time datetime=...>`` value, a linked title or a
    text body are skipped. One malformed article must not abort a long scrape
    and discard everything collected so far.

    Args:
        author: Label stored in ``TextEntry.author``.
        lj_url: Address of the journal's first page; also stored as ``source_type``.

    Returns:
        One entry per usable article, in the order the pages list them.

    Raises:
        ValueError: If a ``datetime`` attribute is not in ``%Y-%m-%d`` form.
        RuntimeError: Propagated from ``get_lj_page`` when a page cannot be fetched.
    """
    result = []

    for article in tqdm(iter_lj_articles(lj_url), desc=f'Parsing {lj_url}'):
        date_elem = article.find('time')
        title_elem = article.find('h3', class_='entryunit__title')
        text_elem = article.find('div', class_='entryunit__text')

        # Resolve the nested pieces defensively: any of them may be absent.
        date_str = date_elem.get('datetime') if date_elem is not None else None
        link_elem = title_elem.find('a') if title_elem is not None else None
        if not date_str or link_elem is None or text_elem is None:
            continue

        result.append(TextEntry(
            author=author,
            date=datetime.strptime(date_str, '%Y-%m-%d'),
            source=link_elem.get('href'),
            source_type=lj_url,
            text=text_elem.text,
        ))

    return result

In [None]:
def clean_xml(xml_string: str) -> str:
    """Strip markup from an HTML/XML fragment and return its text content.

    The fragment is parsed with lxml's HTML parser and all text nodes of the
    resulting document are joined with newlines.

    Args:
        xml_string: Markup to clean. May be empty.

    Returns:
        The text nodes joined by ``"\\n"``, or ``''`` when there is nothing to parse.
    """
    # lxml cannot build a tree from an empty document (depending on the version
    # it raises or yields no root), so answer directly for blank input.
    if not xml_string or not xml_string.strip():
        return ''
    tree = etree.fromstring(xml_string, etree.HTMLParser())
    if tree is None:
        return ''
    collect_text_nodes = etree.XPath("//text()")
    return "\n".join(collect_text_nodes(tree))

In [None]:
def parse_rss_feed(author: str, source_type: str, url: str) -> List[TextEntry]:
    """Turn every entry of an RSS feed into a ``TextEntry``.

    Args:
        author: Label stored in ``TextEntry.author``.
        source_type: Label stored in ``TextEntry.source_type``.
        url: Feed location handed to ``feedparser.parse``.

    Returns:
        One entry per feed item. The text is the item title followed by the
        markup-stripped description.
    """
    entries = feedparser.parse(url)['entries']
    return [
        TextEntry(
            author=author,
            source=entry['link'],
            source_type=source_type,
            date=parse_date(entry['published']),
            # Title first, then the description with its markup removed.
            text=f"{entry['title']}\n{clean_xml(entry['description'])}",
        )
        for entry in entries
    ]

In [None]:
def parse_telegram_channel(author: str, channel_name: str) -> List[TextEntry]:
    """Collect the text posts of a public Telegram channel.

    Args:
        author: Label stored in ``TextEntry.author``.
        channel_name: Channel handle passed to ``TelegramChannelScraper``.

    Returns:
        One entry per post that has text content; posts without text are dropped.
    """
    channel_label = f't.me/{channel_name}'
    items = TelegramChannelScraper(channel_name).get_items()

    entries = []
    for item in tqdm(items, desc=f'Parsing telegram {channel_name}'):
        post_url = item.url
        content = item.content
        if content:
            entries.append(TextEntry(
                author=author,
                source=post_url,
                text=content,
                source_type=channel_label,
                date=item.date,
            ))
    return entries

In [None]:
def parse_fb_feed(author: str, username: str, cookie_path: str = './credentials/fb_cookies.json', pages: int = 100) -> List[TextEntry]:
    """Collect the text posts of a Facebook account.

    Posts without text are dropped. Posts for which the scraper could not
    determine a timestamp are dropped as well: storing ``None`` in
    ``TextEntry.date`` would only fail later, when the date is formatted for
    export, after all scraping has already been done.

    Args:
        author: Label stored in ``TextEntry.author``.
        username: Account name passed to ``facebook_scraper.get_posts``.
        cookie_path: Cookie file handed to the scraper as ``cookies``.
        pages: Number of result pages the scraper is asked for.

    Returns:
        One entry per post that has both text and a timestamp.
    """
    posts = get_facebook_posts(username, cookies=cookie_path, pages=pages, options={"posts_per_page": 200})
    source_type = f'facebook.com/{username}'

    result = []
    for post in tqdm(posts, desc=f'Parsing facebook {username}'):
        date = post['time']
        text = post['post_text']
        if not text or date is None:
            continue
        result.append(TextEntry(
            author=author,
            date=date,
            text=text,
            source=post['post_url'],
            source_type=source_type,
        ))

    return result

In [None]:
def parse_insta_feed(author: str, username: str, client: InstaClient) -> List[TextEntry]:
    """Collect the captions of an Instagram account's own posts.

    Args:
        author: Label stored in ``TextEntry.author``.
        username: Account whose posts are requested from ``client``.
        client: Logged-in client used to list the posts.

    Returns:
        One entry per post that has a caption and whose user name equals
        ``username``; ``None`` items yielded by the client are ignored.
    """
    feed_label = f'instagram.com/{username}'

    entries = []
    for item in tqdm(client.posts(username), desc=f'Parsing instagram {username}'):
        if item is None:
            continue
        caption = item.caption_text
        # Keep only captioned posts that belong to the requested account.
        if caption and item.user.username == username:
            entries.append(TextEntry(
                author=author,
                text=caption,
                source=item.share_url,
                source_type=feed_label,
                # fromtimestamp without tz gives a naive datetime in the machine's local time.
                date=datetime.fromtimestamp(item.taken_at),
            ))
    return entries

In [None]:
# Scrape the Navalny blog listing pages 1..NUM_PAGES_IN_BLOG into TextEntry records.
# The list is rebuilt from scratch on every run, so the cell is safe to re-execute.
navalny_blog = []

for page_no in tqdm(range(1, NUM_PAGES_IN_BLOG + 1), desc='Parsing Navalny blog'):
    # Without a timeout a stalled connection would hang the notebook forever.
    response = requests.get(f'{NAVALNY_BLOG_URL}?p={page_no}', timeout=30)
    # Name the parser explicitly so results do not depend on which parsers
    # happen to be installed. lxml is already a dependency of this notebook
    # and is what Beautiful Soup prefers when it is left to guess.
    soup = bs4.BeautifulSoup(response.text, 'lxml')
    posts = soup.find_all('div', class_='b-post')
    for post in posts:
        title_element = post.find('h2', class_='b-title')
        link = title_element.find('a').get('href')
        # The href is appended to the site root to form the post address.
        post_url = f'{NAVALNY_BLOG_URL}{link}'
        text = post.find('div', class_='b-post__content').text.strip()
        date_str = post.find('div', class_='b-post__info__item').text.strip()
        date = datetime.strptime(date_str, '%d.%m.%Y, %H:%M')
        navalny_blog.append(TextEntry(author='Navalny', text=text, source=post_url, source_type=NAVALNY_BLOG_URL, date=date))

In [None]:
# Run every configured scraper, one result list per (person, platform) pair.
# Each call is its own statement, so after a failure the lists already built
# stay available in the kernel and only the remaining calls need re-running.
# NOTE(review): none of the *_TWITTER_NAME constants is used by the calls below.

# Navalny: Telegram, Facebook, Instagram.
navalny_telegram = parse_telegram_channel('Navalny', NAVALNY_TELEGRAM_NAME)
navalny_fb = parse_fb_feed('Navalny', NAVALNY_FB_NAME)
navalny_insta = parse_insta_feed('Navalny', NAVALNY_INSTA_NAME, insta_client)

# Volkov: LiveJournal, Telegram, Facebook, Instagram.
volkov_lj = parse_lj('Volkov', VOLKOV_LJ_URL)
volkov_telegram = parse_telegram_channel('Volkov', VOLKOV_TELEGRAM_NAME)
volkov_fb = parse_fb_feed('Volkov', VOLKOV_FB_NAME)
volkov_insta = parse_insta_feed('Volkov', VOLKOV_INSTA_NAME, insta_client)

# Sobol: LiveJournal, Telegram, Facebook, Instagram.
sobol_lj = parse_lj('Sobol', SOBOL_LJ_URL)
sobol_telegram = parse_telegram_channel('Sobol', SOBOL_TELEGRAM_NAME)
sobol_fb = parse_fb_feed('Sobol', SOBOL_FB_NAME)
sobol_insta = parse_insta_feed('Sobol', SOBOL_INSTA_NAME, insta_client)

# Pevchikh: Instagram only.
pevchikh_insta = parse_insta_feed('Pevchikh', PEVCHIKH_INSTA_NAME, insta_client)

# Alburov: Facebook, Instagram.
alburov_fb = parse_fb_feed('Alburov', ALBUROV_FB_NAME)
alburov_insta = parse_insta_feed('Alburov', ALBUROV_INSTA_NAME, insta_client)

# Zhdanov: Telegram, Facebook, Instagram.
zhdanov_telegram = parse_telegram_channel('Zhdanov', ZHDANOV_TELEGRAM_NAME)
zhdanov_fb = parse_fb_feed('Zhdanov', ZHDANOV_FB_NAME)
zhdanov_insta = parse_insta_feed('Zhdanov', ZHDANOV_INSTA_NAME, insta_client)

# Yarmysh: Telegram, Instagram.
yarmysh_telegram = parse_telegram_channel('Yarmysh', YARMYSH_TELEGRAM_NAME)
yarmysh_insta = parse_insta_feed('Yarmysh', YARMYSH_INSTA_NAME, insta_client)

In [None]:
# Flatten the per-source lists into a single list, preserving the order below.
all_texts = [
    entry
    for source_entries in (
        navalny_blog,
        navalny_telegram,
        navalny_fb,
        navalny_insta,

        volkov_lj,
        volkov_telegram,
        volkov_fb,
        volkov_insta,

        sobol_lj,
        sobol_telegram,
        sobol_fb,
        sobol_insta,

        pevchikh_insta,

        alburov_fb,
        alburov_insta,

        zhdanov_telegram,
        zhdanov_fb,
        zhdanov_insta,

        yarmysh_telegram,
        yarmysh_insta,
    )
    for entry in source_entries
]

In [None]:
# Export every collected entry to fbk_archive.csv.
# newline='' is what the csv module requires of the file object; without it,
# newlines embedded in post texts and row endings on Windows are written incorrectly.
# encoding='utf-8' makes the Cyrillic text independent of the machine's locale.
with open('fbk_archive.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=['author', 'text', 'date', 'url', 'source_type'])
    # Write the column names first so the file is self-describing.
    writer.writeheader()
    for entry in all_texts:
        writer.writerow({
            'author': entry.author,
            'text': entry.text,
            # Only the calendar date is kept; the time of day is dropped.
            'date': entry.date.strftime('%Y-%m-%d'),
            'url': entry.source,
            'source_type': entry.source_type,
        })