In [1]:
from bs4 import BeautifulSoup as bs
import requests as rq
import pandas as pd
import glob
import re

### DataFrame Setup

In [24]:
# Column labels for the final table, in the order each row's values are assembled.
df_columns = [
    'FORUM TITLE',
    'THREAD TITLE',
    'THREAD AUTHOR',
    'REPLIES',
    'VIEWS',
    'URL',
    'METADATA',
    'DATE',
    'BODY',
    'RATINGS LOG',
]

# Module-level accumulator: one list per thread, filled in by the main loop.
all_thread_info = []


def create_dataframe():
    """Build a DataFrame from the rows collected so far in ``all_thread_info``.

    Returns:
        pandas.DataFrame whose columns are labelled by ``df_columns``.
    """
    frame = pd.DataFrame(all_thread_info, columns=df_columns)
    return frame

### Load/Parse HTML Files

In [32]:
# glob makes no promise about result order (it depends on the file system),
# so sort the paths to keep the processing order -- and therefore the row
# order of the resulting table -- the same on every run.
html_files = sorted(glob.glob('eforum/*.html'))
print (html_files)

def fetch_thread_html(thread_url):
    """Download ``thread_url`` and return the response text parsed by ``get_bs_object``.

    Args:
        thread_url: URL passed unchanged to ``rq.get``.

    Returns:
        The parsed page, or ``None`` when the request raised a
        ``RequestException`` (the error is printed, not re-raised).
    """
    try:
        return get_bs_object(rq.get(thread_url).text)
    # The requests package is imported under the alias ``rq``; the bare name
    # ``requests`` is not defined here, so using it in this clause would
    # raise NameError instead of handling the failure.
    except rq.exceptions.RequestException as e:
        print (e)
        return None
        
def get_bs_object(html):
    """Parse ``html`` with BeautifulSoup's 'lxml' parser and return the parsed object."""
    soup = bs(html, 'lxml')
    return soup

def open_html_file(file_name):
    """Open ``file_name`` with the built-in ``open`` defaults and return the file object.

    The handle is returned still open; closing it is left to the caller.
    """
    handle = open(file_name)
    return handle

['eforum/forumdisplay.php_fid=13_page=1.html', 'eforum/forumdisplay.php_fid=13_page=10.html', 'eforum/forumdisplay.php_fid=13_page=11.html', 'eforum/forumdisplay.php_fid=13_page=12.html', 'eforum/forumdisplay.php_fid=13_page=13.html', 'eforum/forumdisplay.php_fid=13_page=14.html', 'eforum/forumdisplay.php_fid=13_page=15.html', 'eforum/forumdisplay.php_fid=13_page=16.html', 'eforum/forumdisplay.php_fid=13_page=17.html', 'eforum/forumdisplay.php_fid=13_page=18.html', 'eforum/forumdisplay.php_fid=13_page=19.html', 'eforum/forumdisplay.php_fid=13_page=2.html', 'eforum/forumdisplay.php_fid=13_page=3.html', 'eforum/forumdisplay.php_fid=13_page=4.html', 'eforum/forumdisplay.php_fid=13_page=5.html', 'eforum/forumdisplay.php_fid=13_page=6.html', 'eforum/forumdisplay.php_fid=13_page=7.html', 'eforum/forumdisplay.php_fid=13_page=8.html', 'eforum/forumdisplay.php_fid=13_page=9.html', 'eforum/forumdisplay.php_fid=17_page=1.html', 'eforum/forumdisplay.php_fid=17_page=2.html', 'eforum/forumdisplay.ph

### Helper Functions

In [26]:
def strip_newlines(string):
    """Return ``string`` with every newline ('\\n') character removed."""
    pieces = string.split('\n')
    return ''.join(pieces)

### BeautifulSoup Parsing

In [27]:
def extract_thread_urls(thread):
    """Return the ``href`` value of the first ``<a>`` element found under ``thread``."""
    anchors = thread.findAll('a')
    first_anchor = anchors[0]
    return first_anchor['href']
def get_thread_tables(soup_html):
    """Return every ``<table>`` found inside the first ``<form>`` of ``soup_html``."""
    form = soup_html.find('form')
    return form.findAll('table')
def get_thread_row_columns(row):
    """Return all ``<td>`` elements found under ``row``."""
    cells = row.findAll('td')
    return cells
def find_thread_table_partition(thread_tables):
    """Locate the first table whose 'category' row contains the text 'Threads'.

    Args:
        thread_tables: sequence of table elements to scan in order.

    Returns:
        The index of the first matching table, or ``None`` when no table
        has a ``<tr class="category">`` holding that text.
    """
    for index, table in enumerate(thread_tables):
        category_row = table.find('tr', {'class': 'category'})
        if category_row is None:
            # No category row in this table; keep scanning.
            continue
        matches = category_row.findAll(text='Threads')
        if len(matches) > 0:
            return index
    return None
def extract_thread_table_info(thread_table):
    """Pull the listing fields for one thread out of ``thread_table``.

    Reads the ``td`` cells with classes ``f_title``, ``f_author`` and
    ``f_views``. Any piece that is absent is reported as ``''`` instead of
    raising, so one malformed table does not abort a whole run.

    Args:
        thread_table: element exposing ``find(name, attrs)``.

    Returns:
        ``[title, author, replies, views, thread_url]`` -- all strings.
    """
    title_cell = thread_table.find('td', {'class': 'f_title'})
    author_cell = thread_table.find('td', {'class': 'f_author'})
    views_cell = thread_table.find('td', {'class': 'f_views'})

    thread_url = ''
    title = ''
    if title_cell is not None:
        link = title_cell.a
        if link is not None:
            # .get() tolerates an anchor that carries no href attribute.
            thread_url = link.get('href') or ''
        title = strip_newlines(title_cell.text)

    author = ''
    if author_cell is not None:
        # The author is taken from the second line of the cell text.
        author_lines = author_cell.text.split('\n')
        if len(author_lines) > 1:
            author = author_lines[1]

    replies = ''
    views = ''
    if views_cell is not None:
        views_and_replies = strip_newlines(views_cell.text)
        # The cell text reads "<replies>/<views>"; partition() copes with an
        # empty string or a missing slash, where tuple-unpacking split('/')
        # would raise ValueError.
        replies, _, views = views_and_replies.partition('/')

    return [title, author, replies, views, thread_url]
def get_forum_title(html):
    """Return the text of the first cell of the first row nested in the 'header' row of ``html``."""
    header_row = html.find('tr', {'class': 'header'})
    title_cell = header_row.tr.td
    return title_cell.text
def extract_user_info(thread_html):
    """Return the text of the 'smalltxt' div inside the first 't_user' cell.

    Args:
        thread_html: parsed thread page, or ``None`` when the page could not
            be fetched.

    Returns:
        The div's text, or ``''`` when the page, the ``td.t_user`` cell or
        the ``div.smalltxt`` inside it is missing.
    """
    if thread_html is None:
        # fetch_thread_html hands back None for a failed request.
        return ''
    user_info = thread_html.find('td', {'class': 't_user'})
    if user_info is None:
        return ''
    details = user_info.find('div', {'class': 'smalltxt'})
    if details is None:
        return ''
    return details.text
def extract_post_date(thread_html):
    """Return the date string taken from the first 't_msg' table of a thread page.

    Args:
        thread_html: parsed thread page, or ``None`` when the page could not
            be fetched.

    Returns:
        The extracted text with newlines removed, or ``''`` when the page,
        the ``table.t_msg`` element or the expected second div is missing.
    """
    if thread_html is None:
        # fetch_thread_html hands back None for a failed request.
        return ''
    date_col = thread_html.find('table', {'class': 't_msg'})
    if date_col is None:
        return ''
    header_divs = date_col.tr.td.div.findAll('div')
    if len(header_divs) < 2:
        # The date is read from the node following the second div.
        return ''
    # [9:] drops the first nine characters of that node's string form --
    # presumably a fixed label in front of the date; TODO confirm against
    # a sample page.
    return strip_newlines(str(header_divs[1].next)[9:])
def extract_post_body(thread_html):
    """Return the text of the sixth ``<div>`` inside the first 't_msg' table.

    Args:
        thread_html: parsed thread page, or ``None`` when the page could not
            be fetched.

    Returns:
        That div's text, or ``''`` when the page or the ``table.t_msg``
        element is missing, or the table holds fewer than six divs.
    """
    if thread_html is None:
        # fetch_thread_html hands back None for a failed request.
        return ''
    body = thread_html.find('table', {'class': 't_msg'})
    if body is None:
        return ''
    divs = body.findAll('div')
    if len(divs) < 6:
        # Index 5 is the div the body text is read from; without it there is
        # nothing to return.
        return ''
    return divs[5].text
def extract_fieldset(thread_html):
    """Return the ``<fieldset>`` that encloses the "View Rating Log" link.

    Args:
        thread_html: parsed thread page, or ``None`` when the page could not
            be fetched.

    Returns:
        ``''`` when the page is missing or has no ``<a title="View Rating
        Log">``; otherwise whatever ``find_parent('fieldset')`` yields for
        that link.
    """
    if thread_html is None:
        # fetch_thread_html hands back None for a failed request.
        return ''
    rating_log = thread_html.find("a", title="View Rating Log")
    if rating_log is None:
        return ''
    return rating_log.find_parent('fieldset')

### Main

In [28]:
# Walk every saved forum listing page, collect one row per listed thread
# (listing fields plus fields read from the fetched thread page) and build
# the final DataFrame. Rows are appended to the module-level
# all_thread_info list, so running this cell again appends them again.
count = 0
for html_file in html_files:
    print (html_file)
    # Parse inside a `with` block so the file handle is closed as soon as the
    # page has been read, instead of staying open for the rest of the run.
    with open_html_file(html_file) as listing_file:
        html_page = get_bs_object(listing_file)
    forum_title = get_forum_title(html_page)
    thread_tables = get_thread_tables(html_page)
    partition = find_thread_table_partition(thread_tables)
    # A partition of None (no 'Threads' category found) slices the whole list.
    filtered_thread_tables = thread_tables[partition:]
    for thread_table in filtered_thread_tables:
        thread_info = extract_thread_table_info(thread_table)
        # The thread URL is the last element of the listing fields.
        thread_html = fetch_thread_html(thread_info[-1])
        if thread_html is None:
            # The fetch failed: keep the listing fields and leave the four
            # page-derived columns (metadata, date, body, ratings log) blank.
            thread_info.extend(['', '', '', ''])
        else:
            thread_info.append(extract_user_info(thread_html))
            thread_info.append(extract_post_date(thread_html))
            thread_info.append(extract_post_body(thread_html))
            thread_info.append(extract_fieldset(thread_html))
        all_thread_info.append([forum_title] + thread_info)
    count += 1
    print (count, '/', len(html_files), 'DONE')
df = create_dataframe()

eforum/forumdisplay.php_fid=13_page=1.html
1 / 10 DONE
eforum/forumdisplay.php_fid=13_page=10.html
2 / 10 DONE
eforum/forumdisplay.php_fid=13_page=11.html
3 / 10 DONE
eforum/forumdisplay.php_fid=13_page=12.html
4 / 10 DONE
eforum/forumdisplay.php_fid=13_page=13.html
5 / 10 DONE
eforum/forumdisplay.php_fid=13_page=14.html
6 / 10 DONE
eforum/forumdisplay.php_fid=13_page=15.html
7 / 10 DONE
eforum/forumdisplay.php_fid=13_page=16.html
8 / 10 DONE
eforum/forumdisplay.php_fid=13_page=17.html
9 / 10 DONE
eforum/forumdisplay.php_fid=13_page=18.html
10 / 10 DONE


In [443]:
# Save the assembled thread table to a CSV file; the path is relative, so the
# file is written to the current working directory.
df.to_csv('Sex141ThreadData.csv')