# **Scraping a Hungarian financial forum (*forum.portfolio.hu*)**

In [5]:
import requests
from bs4 import BeautifulSoup as bs
import unicodedata
import pandas as pd
import re
import os
from tqdm import tqdm

## **Get all topics**

In [1]:
# Prefix of the paginated topic-list URL; a page number is appended after 'o='.
BASE_TOPIC_URL = 'https://forum.portfolio.hu/topics?o='

In [3]:
# Fetch the first topic-list page. The timeout keeps the cell from hanging
# forever, and raise_for_status() stops an HTTP error page from being parsed
# as if it were a valid listing.
result = requests.get(BASE_TOPIC_URL + '1', timeout = 30)
result.raise_for_status()
soup = bs(result.content, 'lxml', from_encoding = 'utf-8')

In [4]:
# Collect every <div class="topic"> element of the parsed listing page.
topics_soup = soup.find_all('div', {'class': 'topic'})

In [61]:
def _topic_link(soup):
    """Return the ``<a>`` tag nested in the topic's ``div.topicname`` element."""
    return soup.find('div', {'class': 'topicname'}).find('a')


def get_topic_name(soup):
    """Return the topic title: the link text with surrounding whitespace removed."""
    return _topic_link(soup).text.strip()


def get_topic_url(soup):
    """Return the topic link's ``href`` exactly as it appears in the listing."""
    return _topic_link(soup)['href']


def get_topic_id(soup):
    """Return the last path segment of the topic URL as a string.

    A trailing slash is ignored, so ``.../25564/`` yields ``'25564'`` instead
    of an empty string.
    """
    return get_topic_url(soup).rstrip('/').split('/')[-1]


def get_topic_comment_count(soup):
    """Return the stripped text of ``div.commentcount`` (a string such as '47 243')."""
    return soup.find('div', {'class': 'commentcount'}).text.strip()

In [62]:
def get_topic_data(soup):
    """Bundle the id, name, URL and comment count of one topic element into a dict."""
    return {
        'topic_id': get_topic_id(soup),
        'topic_name': get_topic_name(soup),
        'topic_url': get_topic_url(soup),
        'topic_comment_count': get_topic_comment_count(soup),
    }

get_topic_data(topics_soup[0])

{'topic_id': '25564',
 'topic_name': 'Szerinted mennyit keres egy szlovák autóipari munkás? Igen, többet',
 'topic_url': 'https://forum.portfolio.hu/topics/szerinted-mennyit-keres-egy-szlovak-autoipari-munkas-igen-tobbet/25564',
 'topic_comment_count': '4'}

Find the number of the last topic-list page

In [40]:
def get_last_topic(soup):
    """Return the highest page number found among the ``li.page-item`` elements.

    Only items whose stripped text consists entirely of digits are counted, so
    arrows, ellipses or mixed labels are skipped instead of crashing ``int()``.

    Returns 1 when no numeric pagination item exists (assumed to mean a
    single-page listing).
    """
    page_numbers = []
    for item in soup.find_all('li', {'class': 'page-item'}):
        label = item.text.strip()
        if re.fullmatch(r'\d+', label):
            page_numbers.append(int(label))
    return max(page_numbers, default=1)

# Show the highest page number found in the pagination of the parsed listing page.
get_last_topic(soup)

707

Iterate over all topic-list pages and collect every topic

In [63]:
# Walk every topic-list page and collect one dict per topic.
# - A Session is used for all requests to the same host.
# - Each request has a timeout; pages that fail (network error or non-2xx
#   status) are recorded in `failed_topic_pages` instead of aborting the run.
# - Loop-local names are used so the global `soup` is not overwritten.
topic_data = []
failed_topic_pages = []
last_topic_page = get_last_topic(soup)

with requests.Session() as session:
    for page_num in tqdm(range(1, last_topic_page + 1)):
        try:
            response = session.get(f'{BASE_TOPIC_URL}{page_num}', timeout = 30)
            response.raise_for_status()
        except requests.RequestException:
            failed_topic_pages.append(page_num)
            continue

        page_soup = bs(response.content, 'lxml', from_encoding = 'utf-8')
        page_topics = page_soup.find_all('div', {'class': 'topic'})
        topic_data.extend(get_topic_data(topic) for topic in page_topics)

100%|██████████| 707/707 [05:08<00:00,  2.29it/s]


In [72]:
# Persist the scraped topics. The output directory is created first because
# DataFrame.to_csv does not create missing parent directories.
topic_output_dir = os.path.join('data', 'output')
os.makedirs(topic_output_dir, exist_ok = True)

topic_data = pd.DataFrame(topic_data)
topic_data.to_csv(os.path.join(topic_output_dir, 'topic_data.csv'), index = False, encoding = 'utf-8')
topic_data

Unnamed: 0,topic_id,topic_name,topic_url,topic_comment_count
0,18567,ORBÁN TAKARODJ !!!,https://forum.portfolio.hu/topics/orban-takaro...,47 243
1,20627,Tréder Topik :o),https://forum.portfolio.hu/topics/treder-topik...,190 818
2,38715,CoronaVirus,https://forum.portfolio.hu/topics/coronavirus/...,47 081
3,27644,BLUE topic,https://forum.portfolio.hu/topics/blue-topic/2...,9 499
4,38930,ELLENZÉKI összefogás HAZUGSÁGAI,https://forum.portfolio.hu/topics/ellenzeki-os...,3 492
...,...,...,...,...
28254,2627,Autóbalesetben elhunyt Kulcsár Anita,https://forum.portfolio.hu/topics/autobalesetb...,6
28255,2619,Amerikai óriás alap 5% fölött a MOL-ban,https://forum.portfolio.hu/topics/amerikai-ori...,4
28256,2610,Portfolio.hu Tőzsdeklub: Szárnyalás előtt a ve...,https://forum.portfolio.hu/topics/portfoliohu-...,2
28257,1852,"Tovább zuhant a Richter, történelmi zárócsúcs ...",https://forum.portfolio.hu/topics/tovabb-zuhan...,2


In [9]:
# Reload the saved topics. The comment count is stored as text such as '47 243',
# so the column is forced to str; otherwise type inference could turn some
# values into ints and break string handling downstream.
topic_data = pd.read_csv(
    os.path.join('data', 'output', 'topic_data.csv'),
    index_col = 'topic_id',
    encoding = 'utf-8',
    dtype = {'topic_comment_count': str},
)

In [10]:
# Sum the comment counts over all topics. The counts use whitespace as a
# thousands separator, so all whitespace (including non-breaking spaces) is
# removed before converting to int. astype(str) keeps this working even if
# some values were already parsed as numbers.
total_comments = (
    topic_data['topic_comment_count']
    .astype(str)
    .str.replace(r'\s+', '', regex = True)
    .astype(int)
    .sum()
)
print('Összesen:', total_comments, 'bejegyzés')

Összesen: 7799513 bejegyzés


## **Get all posts from a topic**

In [85]:
# Fetch one page of a sample topic to develop the comment parsers against.
# The timeout avoids an indefinite hang; raise_for_status() avoids parsing an
# HTTP error page as if it were a topic page.
url = 'https://forum.portfolio.hu/topics/otp-reszvenyesek-ide/5224?oldal=1&limit=100'
result = requests.get(url, timeout = 30)
result.raise_for_status()
soup = bs(result.content, 'lxml', from_encoding = 'utf-8')
content = soup.find_all('div', {'class': 'comment'})

In [90]:
def get_topic_stat(soup):
    """Extract the founder, start text and title from a topic page header.

    Returns a tuple ``(topic_founder, topic_start, topic_title)``:
    - topic_founder: text of the first link inside ``div.pull-left``;
    - topic_start: full text of ``div.pull-left`` with the founder name and
      the 'Topiknyitó:' label removed (whitespace is not stripped);
    - topic_title: text of the ``h1`` inside the header's ``div.comment``.
    """
    head = soup.find('div', {'class': 'topic-head'})
    details = head.find('div', {'class': 'pull-left'})

    topic_founder = details.find('a').text
    topic_start = details.text.replace(topic_founder, '').replace('Topiknyitó:', '')
    topic_title = head.find('div', {'class': 'comment'}).find('h1').text

    return topic_founder, topic_start, topic_title

In [None]:
def get_first_page(soup):
    """Return the page number that the 'első' pagination link points to.

    The link is looked up among the ``a.page-link`` elements of
    ``div.topic-navigation``, and the number is read from the ``oldal=``
    query parameter of its ``href``.

    Raises:
        IndexError: if there is no 'első' link (or no navigation block).
        ValueError: if the link's href has no ``oldal=<digits>`` parameter.
    """
    navigation = soup.find('div', {'class': 'topic-navigation'})
    links = navigation.find_all('a', {'class': 'page-link'}) if navigation is not None else []

    # Strip the label so surrounding whitespace in the markup does not hide the link.
    first_links = [link for link in links if link.text.strip() == 'első']
    if not first_links:
        raise IndexError("no 'első' pagination link found in div.topic-navigation")

    href = first_links[0]['href']
    match = re.search(r'oldal=(\d+)', href)
    if match is None:
        raise ValueError(f"no 'oldal=<number>' parameter in pagination link: {href!r}")
    return int(match.group(1))

In [91]:
# Page number behind the 'első' pagination link of the sample topic page.
first_page = get_first_page(soup)
# (founder, start text, title) tuple read from the topic header.
topic_stat = get_topic_stat(soup)

In [93]:
# Field extractors for a single comment element. Each one raises (typically
# AttributeError) when the element it looks for is absent.

def get_text(x):
    """Return the text of the comment's ``div.text``."""
    return x.find('div', {'class': 'text'}).text


def get_username(x):
    """Return the text of the comment's ``a.username``."""
    return x.find('a', {'class': 'username'}).text


def get_date(x):
    """Return the text of the comment's ``span.date``."""
    return x.find('span', {'class': 'date'}).text


def get_id(x):
    """Return the text of the comment's ``div.id``."""
    return x.find('div', {'class': 'id'}).text


def get_prev_id(x):
    """Return the text of the link inside the comment's ``div.prevcomment``."""
    prev_block = x.find('div', {'class': 'prevcomment'})
    return prev_block.find('a').text


def get_prev_name(x):
    """Return whatever follows the previous-comment id in ``div.prevcomment``'s text."""
    prev_text = x.find('div', {'class': 'prevcomment'}).text
    pieces = prev_text.split(get_prev_id(x))
    return pieces[-1]


def get_like(x):
    """Return the text of ``span.vote`` whose ``data-vote`` is '1'."""
    return x.find('span', {'class': 'vote', 'data-vote': '1'}).text


def get_unlike(x):
    """Return the text of ``span.vote`` whose ``data-vote`` is '-1'."""
    return x.find('span', {'class': 'vote', 'data-vote': '-1'}).text

In [94]:
def get_user_ranks(comment):
    """Parse the rank values embedded in the comment's avatar markup.

    The ``data-content`` attribute of ``div.avatar`` holds an HTML fragment.
    Every ``<span>`` in it that carries attributes contributes one entry: the
    key is the first class of its ``<i>`` child without the 'icon-' prefix,
    the value is the span text converted to int.
    """
    popover_html = comment.find('div', {'class': 'avatar'})['data-content']

    rank_data = {}
    for span in bs(popover_html, 'lxml').find_all('span'):
        if not span.attrs:
            continue
        rank_name = span.find('i')['class'][0].replace('icon-', '')
        rank_data[rank_name] = int(span.text)
    return rank_data

In [95]:
def _value_or_none(extractor, comment, cast = None):
    """Run ``extractor(comment)`` (optionally through ``cast``); return None on failure.

    Catches ``Exception`` rather than using a bare ``except`` so that
    KeyboardInterrupt / SystemExit still stop a long scraping run.
    """
    try:
        value = extractor(comment)
        return value if cast is None else cast(value)
    except Exception:
        return None


def get_comment_data(comment):
    """Collect all fields of one comment element into a flat dict.

    Every field is extracted best-effort: a field whose element is missing or
    malformed is stored as None. 'like' and 'unlike' are converted to int.
    The user's rank values (see ``get_user_ranks``) are merged into the same
    dict; if they cannot be parsed, no rank keys are added.
    """
    data = {
        'text': _value_or_none(get_text, comment),
        'username': _value_or_none(get_username, comment),
        'date': _value_or_none(get_date, comment),
        'id': _value_or_none(get_id, comment),
        'like': _value_or_none(get_like, comment, int),
        'unlike': _value_or_none(get_unlike, comment, int),
        'prev_id': _value_or_none(get_prev_id, comment),
        'prev_name': _value_or_none(get_prev_name, comment),
    }

    try:
        user_data = get_user_ranks(comment)
    except Exception:
        user_data = {}

    data.update(user_data)
    return data

In [96]:
def get_page_data(url, timeout = 30):
    """Download one topic page and return its comments as a DataFrame.

    Args:
        url: Address of the topic page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A DataFrame with one row per comment, or ``False`` when the page could
        not be fetched (network error, timeout or a non-200 status).
    """
    try:
        result = requests.get(url, timeout = timeout)
    except requests.RequestException:
        # Report a network failure the same way as a bad status code.
        return False

    if result.status_code != 200:
        return False

    soup = bs(result.content, 'lxml', from_encoding = 'utf-8')
    content = soup.find_all('div', {'class': 'comment'})

    # The first div.comment is skipped; get_topic_stat reads the topic title
    # from a div.comment inside the header, so it is presumably that element.
    return pd.DataFrame([get_comment_data(comment) for comment in content[1:]])

get_page_data('https://forum.portfolio.hu/topics/otp-reszvenyesek-ide/5224?oldal=2&limit=100')

Unnamed: 0,text,username,date,id,like,unlike,prev_id,prev_name,expert,activity,popularity
0,"\nVera! \nNyugi, várjuk ki a nap végét.\n",gentlemanus,2006. 07. 27. 10:13,#200,0,0,#198,\n ...,4.0,2.0,4.0
1,"\nSzemi, megtisztelnél a figyelmeddel, Ricsitő...",sailor,2006. 07. 27. 10:10,#199,0,0,#198,\n ...,4.0,4.0,4.0
2,\n...jelenleg nem úgy tűnik...\n,Vera76,2006. 07. 27. 10:05,#198,0,0,#197,\n ...,3.0,3.0,1.0
3,\nremélem ez a nap már pozitiv lesz!\n,koi,2006. 07. 27. 09:31,#197,0,0,,,1.0,1.0,1.0
4,\nJó reggelt! Vajon ma merre indul az oti? A v...,Vera76,2006. 07. 27. 09:09,#196,0,0,,,3.0,3.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
95,"\nLényegében én is erre gondoltam gentle, csak...",egyetleneim,2006. 07. 25. 12:43,#105,0,0,#101,\n ...,5.0,3.0,4.0
96,\nvaloszinü szeptember ????mert attol kezdödik...,koi,2006. 07. 25. 12:01,#104,0,0,#102,\n ...,1.0,1.0,1.0
97,\nszerintetek az emberek hány százaléka fogja ...,koi,2006. 07. 25. 11:59,#103,0,0,#101,\n ...,1.0,1.0,1.0
98,"\nSztem azert, mert altalaban a nyar pangosabb...",akpo,2006. 07. 25. 11:57,#102,0,0,#98,\n ...,5.0,3.0,4.0


In [97]:
def fill_url(x, topic_path = 'otp-reszvenyesek-ide/5224', limit = 100):
    """Build the URL of page ``x`` of a topic.

    Args:
        x: Page number placed in the ``oldal`` query parameter.
        topic_path: '<slug>/<id>' part of the topic URL; defaults to the
            topic used throughout this notebook.
        limit: Value of the ``limit`` query parameter.
    """
    return f'https://forum.portfolio.hu/topics/{topic_path}?oldal={x}&limit={limit}'

In [132]:
def get_all_msg(pages):
    """Download every page in ``pages`` and store each as ``Data/data_<page>.csv``.

    Pages for which ``get_page_data`` does not return a DataFrame are skipped
    and listed in ``Data/bad_page.csv`` (column 'bad_page').
    """
    os.makedirs('Data', exist_ok = True)

    bad = []
    for page in tqdm(pages):
        data = get_page_data(fill_url(page))

        if not isinstance(data, pd.DataFrame):
            bad.append(page)
            # Nothing to write for a failed page; without this `continue`
            # the to_csv call below would be made on `False` and crash.
            continue

        data.to_csv(os.path.join('Data', f'data_{page}.csv'), sep = ';', encoding = 'utf-8', index = False)

    pd.DataFrame(bad, columns = ['bad_page']).to_csv(os.path.join('Data', 'bad_page.csv'), sep = ';', encoding = 'utf-8', index = False)

In [None]:
# Download the topic pages; use e.g. range(1, 11) for a quick trial run.
#range(1, 11)
# NOTE(review): range(1, first_page) stops at first_page - 1, so the page
# numbered `first_page` itself is never downloaded - confirm this is intended.
get_all_msg(range(1, first_page))

In [146]:
# Inspect the pages that get_all_msg recorded as failed (column 'bad_page').
pd.read_csv(os.path.join('Data', 'bad_page.csv'), sep = ';', encoding = 'utf-8', header = 0)

Unnamed: 0,bad_page


In [164]:
# Merge the per-page CSV files into one file.
# The frames are collected in a list and concatenated once; calling pd.concat
# inside the loop re-copies all previously loaded rows on every iteration.
page_frames = []

for page in tqdm(range(1, first_page)):
    page_path = os.path.join('Data', f'data_{page}.csv')
    if not os.path.exists(page_path):
        # No file exists for a page that could not be downloaded.
        continue
    page_frames.append(pd.read_csv(page_path, sep = ';', encoding = 'utf-8', header = 0))

# pd.concat raises on an empty list, so fall back to an empty frame.
data = pd.concat(page_frames) if page_frames else pd.DataFrame()

data.to_csv(os.path.join('Data', 'data.csv'), sep = ';', encoding = 'utf-8', index = False)

100%|██████████| 11283/11283 [1:30:28<00:00,  2.08it/s]


# **ASWAN help**

In [1]:
import aswan

The dash_html_components package is deprecated. Please replace
`import dash_html_components as html` with `from dash import html`
  import dash_html_components as html
The dash_core_components package is deprecated. Please replace
`import dash_core_components as dcc` with `from dash import dcc`
  import dash_core_components as dcc


In [105]:
# Set up an aswan project rooted at the 'portfolio-forum' directory.
# NOTE(review): `config` and `project` are assigned again by the IMDb example
# cell further down, so whichever cell ran last determines their value.
config = aswan.AswanConfig.default_from_dir('portfolio-forum')

# Handle for the production table named 'comment'.
comment_table = config.get_prod_table('comment')

project = aswan.Project(config)

In [2]:
# IMDb example project: configuration rooted at the "imdb-env" directory.
config = aswan.AswanConfig.default_from_dir("imdb-env")


# Production tables for people and for movies.
celeb_table = config.get_prod_table("person")
movie_table = config.get_prod_table("movie")

project = aswan.Project(config) # this creates the env directories by default

@project.register_handler
class CelebHandler(aswan.UrlHandler):
    """Handler for IMDb person pages; registered with `project` via the decorator."""

    url_root = "https://www.imdb.com"

    def parse_soup(self, soup):
        """Return the person's name and date of birth read from the parsed page.

        'name' is the stripped text of the first ``h1 > span``; 'dob' is the
        ``datetime`` attribute of the ``<time>`` tag inside ``#name-born-info``.
        """
        return {
            "name": soup.find("h1").find("span").text.strip(),
            "dob": soup.find("div", id="name-born-info").find("time")["datetime"],
        }

@project.register_handler
class MovieHandler(aswan.UrlHandler):
    """Handler for IMDb title pages; registered with `project` via the decorator."""

    url_root = "https://www.imdb.com"
    def parse_soup(self, soup):
        """Queue the first three cast links for CelebHandler and return movie fields.

        Returns a dict with 'title' (page title without the " - IMDb" suffix),
        'summary' (stripped text of ``div.summary_text``) and 'year' (int read
        from the link inside ``span#titleYear``).
        """

        # Only the first three "primary_photo" cells of the cast table are followed.
        for cast in soup.find("table", class_="cast_list").find_all("td", class_="primary_photo")[:3]:
            link = cast.find("a")["href"]
            self.register_link_to_handler(link, CelebHandler)
        
        return {
            "title": soup.find("title").text.replace(" - IMDb", "").strip(),
            "summary": soup.find("div", class_="summary_text").text.strip(),
            "year": int(soup.find("span", id="titleYear").find("a").text),
        }


# all this registering can be done simpler :)
# Register both production tables with the project as "t2" tables.
project.register_t2_table(celeb_table)
project.register_t2_table(movie_table)

@project.register_t2_integrator
class MovieIntegrator(aswan.FlexibleDfParser):
    """Integrator tied to MovieHandler whose target table is `movie_table`."""

    handlers = [MovieHandler]

    def url_parser(self, url):
        """Return ``{"id": <last path segment of url>}``."""
        return {"id": url.split("/")[-1]}

    def get_t2_table(self):
        """Return the table this integrator targets (`movie_table`)."""
        return movie_table

@project.register_t2_integrator
class CelebIntegrator(aswan.FlexibleDfParser):
    """Integrator tied to CelebHandler whose target table is `celeb_table`."""

    handlers = [CelebHandler]
    def get_t2_table(self):
        """Return the table this integrator targets (`celeb_table`)."""
        return celeb_table

def add_init_urls():
    """Seed the project with two IMDb title URLs and one IMDb name URL."""
    project.add_urls_to_handler(
        MovieHandler,
        [
            "https://www.imdb.com/title/tt1045772",
            "https://www.imdb.com/title/tt2543164",
        ],
    )
    project.add_urls_to_handler(
        CelebHandler,
        ["https://www.imdb.com/name/nm0000190"],
    )

add_init_urls()

In [3]:
# Start the project run with the monitor process enabled.
# NOTE(review): the saved output of this cell ends with
# "AttributeError: Can't pickle local object 'create_engine.<locals>.connect'",
# i.e. the recorded run did not complete.
project.run(with_monitor_process=True)

2021-09-19 20:04.56 [info     ] running function reset_surls   env=prod function_batch=run_prep
2021-09-19 20:04.56 [info     ] function reset_surls returned None env=prod function_batch=run_prep
2021-09-19 20:04.56 [info     ] running function _expire_surls env=prod function_batch=run_prep
2021-09-19 20:04.56 [info     ] function _expire_surls returned None env=prod function_batch=run_prep
2021-09-19 20:04.56 [info     ] running function _register_starter_urls env=prod function_batch=run_prep
2021-09-19 20:04.56 [info     ] function _register_starter_urls returned 0 env=prod function_batch=run_prep
2021-09-19 20:04.56 [info     ] running function _create_scheduler env=prod function_batch=run_prep
2021-09-19 20:05.16 [info     ] ray dashboard: http://127.0.0.1:8265
2021-09-19 20:05.16 [info     ] launched ray with resources    CPU=4.0 memory=1360151348.0 node:146.110.61.245=1.0 object_store_memory=680075673.0
2021-09-19 20:05.16 [info     ] function _create_scheduler returned None env=

AttributeError: Can't pickle local object 'create_engine.<locals>.connect'