In [1]:
from urllib.request import urlopen, Request
from lxml import html
import pandas as pd
from datetime import datetime
import re
import concurrent.futures
from bs4 import BeautifulSoup
import requests

In [2]:
# Browser-like User-Agent header for HTTP requests.
# The value is assembled with implicit string concatenation: a backslash
# continuation placed inside the literal would make the next line's leading
# indentation part of the header value.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'
    )
}

In [3]:
def page_content_print_time(thread_id):
    """Scrape one thread with ``page_content`` and print how long it took.

    Parameters
    ----------
    thread_id
        Identifier of the thread, forwarded unchanged to ``page_content``.

    Returns
    -------
    tuple
        ``(df_posts, df_users)`` -- the post and user frames produced by
        ``page_content``. The thread frame it also returns is discarded so
        that this wrapper keeps its two-value return shape.
    """
    # Local import keeps this cell self-contained. Plain timing is used
    # instead of the ``%time`` magic: a name assigned by a statement run
    # through ``%time`` does not become a local of the enclosing function.
    import time

    started = time.perf_counter()
    # page_content returns three frames: (threads, posts, users).
    _df_thread, df_posts, df_users = page_content(thread_id)
    print('Wall time: %.2f s' % (time.perf_counter() - started))
    return df_posts, df_users

In [4]:
def thread_ids(pages, verbose=True, excluded_ids=(12069815,)):
    """Collect thread ids from the forum's listing pages.

    Parameters
    ----------
    pages : iterable of int
        Listing page numbers to fetch (substituted into the ``page=%d`` URL).
    verbose : bool, default True
        When true, print a progress line and the current time after every
        tenth fetched page.
    excluded_ids : iterable of int, default ``(12069815,)``
        Thread ids to leave out of the result. Every occurrence is dropped,
        and an id that was never found is simply ignored.

    Returns
    -------
    list of int
        Thread ids in the order they were found; duplicates among
        non-excluded ids are kept.
    """
    print('Fetching pages...')
    base_main_url = 'https://www.fxp.co.il/forumdisplay.php?f=46&page=%d'
    # Capture digits only: a greedy ``.*`` can run on to a later ``" id=`` on
    # the same line and hand non-numeric text to int().
    thread_link = re.compile(r'showthread\.php\?t=(\d+)" id=')
    found_ids = []
    for count, page_number in enumerate(pages, start=1):
        # The context manager closes the response even if decoding fails.
        with requests.get(base_main_url % page_number) as response:
            contents = response.content.decode("utf-8")
        found_ids += [int(x) for x in thread_link.findall(contents)]
        if verbose and (count % 10 == 0):
            print('Fetched %s pages' % (count))
            print(datetime.now().time())
    # Filter instead of list.remove(): remove() raises ValueError when the id
    # is absent (for example when ``pages`` is empty).
    skipped = set(excluded_ids)
    return [thread_id for thread_id in found_ids if thread_id not in skipped]


In [5]:
def page_content(thread_id):
    """Download every page of one thread and collect its posts and users.

    Pages are requested in increasing order starting at 1. A response with a
    non-empty redirect history is treated as the end of the thread
    (presumably the forum redirects out-of-range page numbers -- confirm).
    Page 1 is always parsed; a redirected page after the first is not parsed,
    so its rows are not added to the result.

    Parameters
    ----------
    thread_id
        Thread identifier, substituted into the ``showthread.php?t=`` URL.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_thread, df_posts, df_users)``: a one-row frame with
        ``thread_id`` and ``title``, then the post rows and the user rows of
        all parsed pages as returned by ``thread_single_page_content``
        (row indexes restart on every page, as before).
    """
    base_url = 'https://www.fxp.co.il/showthread.php?t=%s' % (thread_id)
    post_frames = []
    user_frames = []
    title = None
    page = 1
    while True:
        url = base_url if page == 1 else '%s&page=%s' % (base_url, page)
        # The context manager releases the connection once the body is read.
        with requests.get(url) as response_page:
            redirected = bool(response_page.history)
            content_page = response_page.content.decode("utf-8")
        if page > 1 and redirected:
            # Stop before parsing: this response is not page ``page``.
            break
        if page == 1:
            title = BeautifulSoup(content_page, 'lxml').title.getText()
        df_post_page, df_user_page = thread_single_page_content(thread_id, content_page, page)
        post_frames.append(df_post_page)
        user_frames.append(df_user_page)
        if redirected:
            # A redirected first page is parsed once and then ends the loop.
            break
        page += 1
    # Build the one-row thread frame directly; DataFrame.append was removed
    # in pandas 2.0.
    df_thread_new = pd.DataFrame([{'thread_id': thread_id, 'title': title}],
                                 columns=['thread_id', 'title'])
    # Page 1 is always parsed, so both lists hold at least one frame.
    return df_thread_new, pd.concat(post_frames), pd.concat(user_frames)

In [6]:
def thread_single_page_content(thread_id, contents, page):
    """Parse one thread page into a post frame and a user frame.

    Parameters
    ----------
    thread_id
        Stored in the ``thread_id`` column of every post row.
    contents : str
        HTML of the page.
    page
        Not used by the parser; kept so existing callers keep working.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_posts, df_users)`` with one row per element whose id matches
        ``post_<digit>...``. ``cite1``..``cite4`` hold the first four quoted
        targets of a post (``''`` when there are fewer).

    Raises
    ------
    AttributeError, IndexError
        When a post lacks the message block, the user-name link or the user
        statistics the parser reads.
    """
    post_columns = ['thread_id', 'post_id', 'user_name', 'date', 'message',
                    'cite1', 'cite2', 'cite3', 'cite4']
    user_columns = ['user_name', 'register_date', 'message_count', 'signiture_text']
    username_class = re.compile('username .* popupctrl')

    soup = BeautifulSoup(contents, 'lxml')
    post_rows = []
    user_rows = []
    for post in soup.find_all(id=re.compile('^post_[0-9].*')):
        post_id = post.get('id').replace('_', '')
        message = post.find('blockquote', {'class': 'postcontent restore'}).getText().strip()

        # The name is read from a <span> inside the link, falling back to a
        # <strong> when no <span> is present.
        user_link = post.find('a', {'class': username_class})
        try:
            user_name = user_link.findChildren("span", recursive=True)[0].getText()
        except (AttributeError, IndexError):
            user_name = user_link.findChildren("strong", recursive=True)[0].getText()

        message_date = post.find('span', {'class': 'date'}).getText()

        # One entry per quote block, visited once (a nested loop would add
        # every citation as many times as there are quote blocks).
        cited_post = []
        for cite in post.find_all('div', {'class': 'bbcode_quote'}):
            anchor = cite.find('a')
            href = anchor.get('href') if anchor is not None else None
            if href is None:
                # Quote without a link back to a post.
                cited_post.append('custom cites')
            else:
                # Keep the fragment after '#'; with no '#' the whole link is kept.
                cited_post.append(href[href.find('#') + 1:])
        cite1, cite2, cite3, cite4 = (cited_post + [''] * 4)[:4]

        user_details = post.find('dl', {'class': 'userstats-new'}).find_all('dd')
        user_reg_date = user_details[0].getText()
        user_message_count = user_details[1].getText()

        signature = post.find('blockquote', {'class': 'signature restore'})
        signiture_text = signature.getText() if signature is not None else ''

        post_rows.append({'thread_id': thread_id, 'post_id': post_id, 'user_name': user_name,
                          'date': message_date, 'message': message,
                          'cite1': cite1, 'cite2': cite2, 'cite3': cite3, 'cite4': cite4})
        user_rows.append({'user_name': user_name, 'register_date': user_reg_date,
                          'message_count': user_message_count, 'signiture_text': signiture_text})

    # Build each frame once from the collected records; DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    df_post_new = pd.DataFrame(post_rows, columns=post_columns)
    df_user_new = pd.DataFrame(user_rows, columns=user_columns)
    return df_post_new, df_user_new

In [7]:
def thread_content(thread_ids, thread_file='thread.csv', post_file='post.csv', user_file='user.csv',verbose=True):
    """Scrape many threads concurrently and write three CSV files.

    Each thread id is submitted to ``page_content`` on a thread pool; the
    returned thread, post and user frames are combined in completion order
    and written with ``DataFrame.to_csv`` (UTF-8, index included).

    Parameters
    ----------
    thread_ids : iterable
        Thread ids to scrape. An empty iterable produces header-only files.
    thread_file, post_file, user_file : str
        Output paths for the thread, post and user tables.
    verbose : bool, default True
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    None

    Raises
    ------
    Exception
        Whatever ``page_content`` raised for a thread is re-raised by
        ``future.result()``; in that case no file is written.
    """
    print('Fetching threads...')
    # Each list starts with an empty frame that fixes the column layout, so
    # the output has a header even when nothing was scraped.
    thread_frames = [pd.DataFrame(columns=['thread_id', 'title','type'])]
    post_frames = [pd.DataFrame(columns=['thread_id', 'post_id','user_name','date','message',
                                         'cite1','cite2','cite3','cite4'])]
    user_frames = [pd.DataFrame(columns=['user_name', 'register_date','message_count','signiture_text'])]
    with concurrent.futures.ThreadPoolExecutor(max_workers=None) as executor:
        futures = [executor.submit(page_content, thread_id) for thread_id in thread_ids]
        print('finish to submit all jobs')
        for future in concurrent.futures.as_completed(futures):
            df_thread_part, df_post_part, df_user_part = future.result()
            thread_frames.append(df_thread_part)
            post_frames.append(df_post_part)
            user_frames.append(df_user_part)
    # Concatenate once per table instead of re-copying the accumulated frame
    # after every completed thread.
    pd.concat(thread_frames).to_csv(thread_file, encoding='utf-8')
    pd.concat(post_frames).to_csv(post_file, encoding='utf-8')
    pd.concat(user_frames).to_csv(user_file, encoding='utf-8')

In [8]:
# Scrape the threads listed on forum listing page 1 only (range(1, 2) yields
# just 1) and write the CSV files using thread_content's default file names;
# the %time magic reports how long the whole call took.
%time thread_content(thread_ids(range(1, 2)))

Fetching pages...
Fetching threads...
finish to submit all jobs
Wall time: 34.5 s
