# Scrape Forum Data
This notebook scrapes the Youbemom special needs forum and inserts the results into a SQLite database

## Data Sources
- Youbemom forum: https://www.youbemom.com/forum/special-needs

## Changes
- 2020-08-11: Started project
- 2020-08-18: Updated forum crawl
- 2020-08-22: Updated database structure

## Database Structure
- threads
 - id: automatically assigned
 - url: url of top post
- posts
 - id: automatically assigned
 - family_id: thread->id
 - message_id: the unique id of the message from the html
 - parent_id: message_id of the post this post is responding to, empty string if top post
 - date_recorded: date the data is fetched
 - date_created: date the data was created
 - title: title of the post
 - body: body of the post

## Imports

In [1]:
from time import sleep
import re
import sqlite3
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from pathlib import Path
from dateutil.parser import parse

## Functions
For accessing the database

In [2]:
def create_connection(db_file):
    """ create a database connection to the SQLite database
        specified by the db_file
    :param db_file: database file
    :return: Connection object, or None if the connection failed
    """
    try:
        return sqlite3.connect(db_file)
    except sqlite3.Error as err:
        # a bare `Error` is not defined in this file; sqlite3.Error is the
        # base class of the exceptions sqlite3.connect raises
        print(err)
        return None

In [3]:
def set_up_db(conn):
    """ create the threads and posts tables in the SQLite
        database if they are not already there (existing
        tables and their rows are left untouched)
    :param conn: database connection
    :return: nothing
    """
    schema = '''
        CREATE TABLE IF NOT EXISTS threads (
            id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
            url TEXT
        );
        CREATE TABLE IF NOT EXISTS posts (
            id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
            family_id INTEGER,
            message_id TEXT,
            parent_id INTEGER,
            date_recorded TEXT,
            date_created TEXT,
            title TEXT,
            body TEXT
        );
    '''
    conn.cursor().executescript(schema)

In [4]:
def write_to_threads(url, conn):
    """ inserts a thread url into the threads table
        and commits the change
    :param url: url of the thread's top post
    :param conn: database connection
    :return: nothing
    """
    insert_sql = ''' INSERT INTO threads(url)
    VALUES(?) '''
    conn.cursor().execute(insert_sql, (url,))
    conn.commit()

In [5]:
def url_not_in_threads(url, conn):
    """ looks the url up in the threads table to find
        out whether it has been stored before
    :param url: thread url to look for
    :param conn: database connection
    :return: True if not scraped, False if already scraped
    """
    lookup = conn.cursor()
    lookup.execute(''' SELECT id FROM threads WHERE url=?''', (url,))
    # fetchone gives None when no row matches
    return not lookup.fetchone()

In [6]:
def write_to_posts(parsed, conn):
    """ inserts one parsed post into the posts table
        and commits the change
    :param parsed: a tuple of the parsed data, ordered as
        (family_id, message_id, parent_id, date_recorded,
        date_created, title, body)
    :param conn: database connection
    :return: nothing
    """
    insert_sql = ''' INSERT INTO posts(family_id,message_id,parent_id,date_recorded,date_created,title,body)
    VALUES(?,?,?,?,?,?,?) '''
    conn.cursor().execute(insert_sql, parsed)
    conn.commit()

In [7]:
def message_not_in_posts(message_id, conn):
    """ looks the message_id up in the posts table; used
        when re-running the scrape so that a stored post
        is not written again with its deleted message
    :param message_id: html id of the message to look for
    :param conn: database connection
    :return: True if not scraped, False if already scraped
    """
    lookup = conn.cursor()
    lookup.execute(''' SELECT id FROM posts WHERE message_id=?''', (message_id,))
    # fetchone gives None when no row matches
    return not lookup.fetchone()

For scraping the soup

In [8]:
def requests_retry_session(retries=5, backoff_factor=10, session=None):
    """ build a requests session that retries failed requests
    :param retries: retry limit used for total, read and connect
    :param backoff_factor: backoff factor passed to Retry
    :param session: existing session to configure, a new
        one is created when none is given
    :return: the session with the retry adapter mounted
    """
    if not session:
        session = requests.Session()
    policy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
    )
    # one adapter instance serves both schemes
    adapter = HTTPAdapter(max_retries=policy)
    for scheme in ('http://', 'https://'):
        session.mount(scheme, adapter)
    return session

In [9]:
def get_soup(next_url, timeout=30):
    """ get the soup from the url
    :param next_url: string of next url to query
    :param timeout: seconds to wait for the server before
        giving up on an attempt (requests waits indefinitely
        when no timeout is given)
    :return: soup of url html, or False if the request failed
    """
    try:
        res_next = requests_retry_session().get(next_url, timeout=timeout)
    except requests.exceptions.RequestException:
        # only network/HTTP failures are treated as "no soup"; a bare except
        # would also swallow KeyboardInterrupt and programming errors
        return False
    return BeautifulSoup(res_next.content, "html.parser")

In [10]:
def get_top_posts(soup):
    """ collect the top-level list items of the page's
        thread list (the <ol> with id "thread-list")
    :param soup: url's html
    :return lis: list of threads from page
    """
    thread_list = soup.find('ol', id="thread-list")
    # direct children only, so nested list items are not returned
    return thread_list.find_all('li', recursive=False)

For parsing post text

In [11]:
def fix_ago(date_created, date_recorded):
    """ turn a relative post time ("... ago") into an
        absolute one by subtracting it from the time the
        scraper recorded the page
    :param date_created: relative time text of the post; text
        containing "hr" and "min" is read as hours then
        minutes, "hr" alone as hours, anything else as minutes
    :param date_recorded: date recorded by the scraper,
        formatted "%m-%d-%Y %H:%M:%S"
    :return: date_created in datetime
    """
    amounts = re.findall("[0-9]+", date_created)
    if "hr" not in date_created:
        offset = timedelta(minutes=int(amounts[0]))
    elif "min" in date_created:
        offset = timedelta(hours=int(amounts[0]), minutes=int(amounts[1]))
    else:
        offset = timedelta(hours=int(amounts[0]))
    return datetime.strptime(date_recorded, "%m-%d-%Y %H:%M:%S") - offset

In [12]:
def fix_date(date_created):
    """ strip the 'posted ' and ' in Special Needs' text
        from the post date text
    :param date_created: date text taken from the post
    :return: the text with both pieces removed
    """
    return date_created.replace('posted ','').replace(' in Special Needs','')

In [13]:
def parse_thread(thread, conn, date_recorded=None):
    """ parse the thread url and date created, storing the
        url in the threads table if it is not there yet
    :param thread: input thread soup
    :param conn: connection to db
    :param date_recorded: scrape time as a string formatted
        "%m-%d-%Y %H:%M:%S", used to resolve relative ("ago")
        dates; defaults to the current time
    :return date_created: date the thread was created (datetime)
    """
    if date_recorded is None:
        # do not depend on a module-level date_recorded being set by the caller
        date_recorded = datetime.now().strftime("%m-%d-%Y %H:%M:%S")
    url = thread.find("a", text=re.compile("permalink"))["href"]
    # some urls are scraped with the site prefix; store them without it
    url = url.replace('https://www.youbemom.com','')
    date_created = thread.find('span', {'class' : 'meta date'}).get_text().strip()
    # relative times are resolved against date_recorded,
    # absolute times are parsed with strptime
    if "ago" in date_created:
        date_created = fix_ago(date_created, date_recorded)
    else:
        date_created = datetime.strptime(date_created, "%m-%d-%Y %I:%M%p")
    if url_not_in_threads(url, conn):
        write_to_threads(url, conn)
    return date_created

In [14]:
def clean_text(text):
    """ clean the text of extra spaces, new lines,
        ellipses, and (more) text
    :param text: input text
    :return text: cleaned text
    """
    # raw strings keep the regex escapes from being read as (invalid)
    # string escape sequences
    text = text.strip()
    text = re.sub(r"\(more\)", "", text)
    text = text.strip()
    # collapses every whitespace run, new lines included, to one space
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"\.{3}", "", text)
    return text

In [15]:
def parse_post_parent(soup, conn, family_id, date_recorded):
    """ parse the list items into a format that can be
        inserted into the database (top post in thread)
    :param soup: input soup of parent post
    :param conn: connection to db
    :param family_id: id of the family thread
    :param date_recorded: date scraping the data
    :return: message_id of the top post
    """
    title_html = soup.find("h1")
    title = title_html.get_text()
    title = clean_text(title)
    message_id = title_html["id"]
    # search the whole page: soup is the document root here, so limiting the
    # search to its direct children (recursive=False) cannot reach a div
    # nested inside the page; the id already pins the match to this post
    body_html = soup.find('div', {'class' : 'message', 'id' : "p" + message_id})
    if body_html:
        body = body_html.get_text()
        body = body.replace('log in or sign up to post a comment', '')
        body = clean_text(body)
    else:
        body = ""
    date_created = soup.find('div', {'class' : 'date'}).get_text()
    # relative ("ago") times are resolved against date_recorded,
    # anything else is cleaned up and handed to the date parser
    if "ago" in date_created:
        date_created = fix_ago(date_created, date_recorded)
    else:
        date_created = fix_date(date_created)
        date_created = parse(date_created)
    if message_not_in_posts(message_id, conn):
        # the top post has no parent, stored as an empty string
        parsed = (family_id,message_id,"",date_recorded,date_created,title,body)
        write_to_posts(parsed, conn)
    return message_id

In [16]:
def parse_post_child(soup, conn, family_id, parent_id, date_recorded):
    """ parse one reply list item and insert it into the
        posts table unless its message_id is already stored
    :param soup: input soup of child post
    :param conn: connection to db
    :param family_id: id of the family thread
    :param parent_id: id of the parent post to this child
    :param date_recorded: date scraping the data
    :return: message_id of this reply
    :NOTE: unlike top post, must re.compile class because
           it might be class='noskimwords reply removed'
    """
    heading = soup.find('span', {'class' : re.compile('noskimwords reply')})
    title = clean_text(heading.get_text())
    message_id = heading["id"]
    # direct children only, so the search stays on this reply's own item
    message_div = soup.find('div', {'class' : 'message', 'id' : "p" + message_id}, recursive=False)
    body = ""
    if message_div:
        raw_body = message_div.get_text()
        body = clean_text(raw_body.replace('log in or sign up to post a comment', ''))
    posted = soup.find('span', {'class' : 'meta date'}).get_text()
    # absolute dates go through the date parser, relative ("ago")
    # ones are resolved against date_recorded
    if "ago" not in posted:
        date_created = parse(fix_date(posted))
    else:
        date_created = fix_ago(posted, date_recorded)
    if message_not_in_posts(message_id, conn):
        write_to_posts(
            (family_id, message_id, parent_id, date_recorded, date_created, title, body),
            conn,
        )
    return message_id

In [17]:
def search_children(children, conn, family_id, parent_id, date_recorded):
    """ store every reply in children, then descend into
        each reply's own replies (depth first)
    :param children: list items of the replies to one post
    :param conn: connection to db
    :param family_id: id of the family thread
    :param parent_id: id of the post the children reply to
    :param date_recorded: date scraping the data
    :return: nothing
    """
    for reply in children:
        reply_id = parse_post_child(reply, conn, family_id, parent_id, date_recorded)
        nested = reply.find('ul')
        if not nested:
            continue
        # the reply just stored becomes the parent of its nested replies
        search_children(
            nested.find_all("li", recursive=False),
            conn, family_id, reply_id, date_recorded,
        )

## File Locations

In [18]:
# the database file sits in <parent of the working directory>/database
p = Path.cwd()
path_parent = p.parents[0]
# kept as a plain string
path_db = str(path_parent / "database" / "youbemomTables.db")

## Variables

In [19]:
forum_url = "https://www.youbemom.com/forum/special-needs"
next_url = forum_url  # the crawl starts at the forum's first page
earliest = datetime(2019, 1, 1)  # crawl stops once threads are older than this
scraped_earliest = False
page, batch = 1, 1  # page counts thread-list pages, batch counts post batches
batch_size = 100

## Scrape the Thread URLs
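Connect to the database and create the tables if they do not already exist. NOTE: while crawling, each thread url is checked against the threads table and is only inserted if it has not already been collected.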

In [20]:
conn = create_connection(path_db)
if conn is None:
    # create_connection is documented to return None on failure; stop with
    # a clear message rather than failing later on conn.cursor()
    raise RuntimeError("could not connect to database: " + path_db)
set_up_db(conn)

Run while loop until scraper reaches the earliest date I want to scrape

In [21]:
while not scraped_earliest:
    # time at which this page is being scraped
    date_recorded = datetime.now().strftime("%m-%d-%Y %H:%M:%S")
    if page % 10 == 0:
        print("page: " + str(page))
    soup = get_soup(next_url)
    if soup:
        top_posts =  get_top_posts(soup)
        if not top_posts:
            # a page without threads means there is nothing further to crawl;
            # without this the loop would request later pages forever
            break
        # parse each top thread post for url
        for thread in top_posts:
            date_created = parse_thread(thread, conn)
            # reassigned for every thread, so the last thread listed on the
            # page decides whether the crawl stops
            scraped_earliest = date_created < earliest
    else:
        # report the url that was actually requested
        print("error page " + str(page) + " with url " + next_url)
    page += 1
    next_url = forum_url + "?pg=" + str(page)

page: 10
page: 20
page: 30
page: 40
page: 50
page: 60
page: 70
page: 80
page: 90
page: 100


## Scrape Thread Posts
Select all from the threads table

In [22]:
# threads whose page could not be fetched are recorded in these lists
bad_urls, bad_ids = [], []
# cursor over every stored thread, read in batches further down
sql = ''' SELECT * FROM threads '''
cur = conn.cursor()
cur.execute(sql)

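Fetch the urls from the threads table in batches of 100 (`batch_size`) and, for each thread, scrape the top post and all of its replies. Threads whose page cannot be fetched are recorded in `bad_urls` and `bad_ids`.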

In [23]:
while True:
    print("batch: " + str(batch))
    rows = cur.fetchmany(batch_size)
    if not rows:
        # no threads left to read
        break
    for row in rows:
        date_recorded = datetime.now().strftime("%m-%d-%Y %H:%M:%S")
        family_id, url = row[0], row[1]
        # stored urls may lack the site prefix
        if 'https://www.youbemom.com' not in url:
            url = 'https://www.youbemom.com' + url
        soup = get_soup(url)
        if not soup:
            # remember the thread that could not be fetched and move on
            print("connection error")
            print("could not get url: " + url)
            print("with family id: " + str(family_id))
            bad_urls.append(url)
            bad_ids.append(family_id)
            continue
        # top post first, then its tree of replies
        message_id = parse_post_parent(soup, conn, family_id, date_recorded)
        replies = soup.find('ul', {'id' : 'reply-list'})
        if replies:
            children = replies.find_all('li', recursive=False)
            search_children(children, conn, family_id, message_id, date_recorded)
    batch += 1

batch: 1
batch: 2
batch: 3
batch: 4
batch: 5
batch: 6
batch: 7
batch: 8
batch: 9
batch: 10
batch: 11
batch: 12
batch: 13
batch: 14
batch: 15
batch: 16
batch: 17
batch: 18
batch: 19
batch: 20
batch: 21
batch: 22
batch: 23
batch: 24
batch: 25
batch: 26
