# Scrape Forum Data
This notebook scrapes the Youbemom special needs forum and inserts the results into a SQLite database

## Data Sources
- Youbemom forum: https://www.youbemom.com/forum/special-needs

## Changes
- 2020-08-11: Started project
- 2020-08-18: Updated forum crawl
- 2020-08-22: Updated database structure
- 2020-10-22: Added additional subforums
- 2020-10-24: Added deleted post column

## Database Structure
- threads
 - id: automatically assigned
 - url: url of top post
 - subforum: subforum name
- posts
 - id: automatically assigned
 - family_id: thread->id
 - message_id: the unique id of the message from the html
 - parent_id: message_id of the post this post is responding to, empty string if top post
 - date_recorded: date the data is fetched
 - date_created: date the data was created
 - title: title of the post
 - body: body of the post
 - subforum: subforum name
 - deleted: 1 if the post was marked as removed when scraped, 0 otherwise

## TODO
- 

## Imports

In [1]:
from time import sleep
import re
import sqlite3
from datetime import datetime, timedelta
from bs4 import BeautifulSoup
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from pathlib import Path
from dateutil.parser import parse

## Functions
For accessing the database

In [2]:
def create_connection(db_file):
    """ create a database connection to the SQLite database
        specified by the db_file
    :param db_file: path of the database file
    :return: Connection object, or None if the connection failed
    """
    conn = None
    try:
        conn = sqlite3.connect(db_file)
    except sqlite3.Error as err:
        # sqlite3.Error is the base class of the module's exceptions; the
        # bare name `Error` was never imported, so a failed connect used to
        # raise NameError instead of printing the problem
        print(err)
    return conn

In [3]:
def set_up_db(conn):
    """ create the threads and posts tables in the SQLite
        database if they do not already exist (existing
        tables and their rows are left untouched)
    :param conn: database connection
    :return: nothing
    """
    schema = '''
        CREATE TABLE IF NOT EXISTS threads (
            id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
            url TEXT,
            subforum TEXT
        );
        CREATE TABLE IF NOT EXISTS posts (
            id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
            family_id INTEGER,
            message_id TEXT,
            parent_id INTEGER,
            date_recorded TEXT,
            date_created TEXT,
            title TEXT,
            body TEXT,
            subforum TEXT,
            deleted INTEGER
        );
    '''
    # executescript runs both CREATE statements in one call
    conn.cursor().executescript(schema)

In [4]:
def write_to_threads(conn, url, subforum):
    """ inserts one thread into the threads table and commits
    :param conn: database connection
    :param url: url of the thread's top post
    :param subforum: subforum name
    :return: nothing
    """
    insert_sql = ''' INSERT INTO threads(url, subforum)
    VALUES(?,?) '''
    conn.cursor().execute(insert_sql, (url, subforum))
    conn.commit()

In [5]:
def url_not_in_threads(conn, url):
    """ checks whether a scraped url is missing from
        the threads table
    :param conn: database connection
    :param url: thread url to look up
    :return: True if not scraped, False if already scraped
    """
    lookup = conn.cursor()
    lookup.execute(''' SELECT id FROM threads WHERE url=?''', (url,))
    # fetchone gives None when no row matches the url
    return not lookup.fetchone()

In [6]:
def write_to_posts(parsed, conn):
    """ inserts one parsed post into the posts table and commits
    :param parsed: a tuple of the parsed data, ordered as
        (family_id, message_id, parent_id, date_recorded,
         date_created, title, body, subforum, deleted)
    :param conn: database connection
    :return: nothing
    """
    insert_sql = ''' INSERT INTO posts(family_id,message_id,parent_id,date_recorded,date_created,title,body,subforum,deleted)
    VALUES(?,?,?,?,?,?,?,?,?) '''
    conn.cursor().execute(insert_sql, parsed)
    conn.commit()

In [7]:
def message_not_in_posts(conn, message_id, deleted):
    """ checks whether the message_id is missing from the
        posts table, so that re-running the scrape does not
        overwrite a stored post with its deleted placeholder;
        when the stored deleted flag differs from the one just
        scraped, the stored row is flagged as deleted
    :param conn: database connection
    :param message_id: id of the message from the html
    :param deleted: deleted flag computed by the current scrape
    :return: True if not scraped, False if already scraped
    """
    lookup = conn.cursor()
    lookup.execute(''' SELECT id, deleted FROM posts WHERE message_id=? ''', (message_id, ))
    existing = lookup.fetchone()
    if not existing:
        return True
    if existing[1] != deleted: # deleted after first scrape
        conn.cursor().execute(''' UPDATE posts SET deleted=1 WHERE message_id=? ''', (message_id, ))
        conn.commit()
    return False

For scraping the soup

In [8]:
def requests_retry_session(retries=5, backoff_factor=.1, session=None):
    """ build a session whose http and https requests are retried
        with a growing pause between attempts
    :param retries: number of retries (used for the total, read and connect limits)
    :param backoff_factor: each retry is longer by {backoff factor} * (2 ** ({number of total retries} - 1))
    :param session: existing session to configure; a new one is created when omitted
    :return session: session with the retrying adapter mounted
    """
    if not session:
        session = requests.Session()
    retry_policy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)
    # the same adapter serves both schemes
    for scheme in ('http://', 'https://'):
        session.mount(scheme, retrying_adapter)
    return session

In [9]:
def get_soup(next_url, timeout=60):
    """ get the soup from the url
    :param next_url: string of next url to query
    :param timeout: seconds to wait for the server before giving up;
        without a timeout a stalled connection blocks the scrape forever
    :return soup: soup of url html, or False if the request failed
    :note: uses html5lib and not html because missing html returns errors
    """
    try:
        res_next = requests_retry_session().get(next_url, timeout=timeout)
    except Exception:
        # best effort: any request failure is reported to the caller as
        # False. `Exception` (instead of a bare except) lets
        # KeyboardInterrupt / SystemExit stop a long-running scrape.
        return False
    soup = BeautifulSoup(res_next.content, "html5lib")
    return soup

In [10]:
def get_top_posts(soup):
    """ collect the thread entries listed on a forum page
    :param soup: url's html
    :return lis: list of the <li> items that are direct children
        of the <ol id="thread-list"> element
    """
    return soup.find('ol', id="thread-list").find_all('li', recursive=False)

For parsing post text

In [11]:
def fix_ago(date_created, date_recorded):
    """ turn a relative post time ("... ago") into an absolute
        one by subtracting it from the time of the scrape
    :param date_created: relative creation text; the first number is
        read as hours when the text contains "hr" (with a second number
        as minutes when it also contains "min"), otherwise as minutes
    :param date_recorded: date recorded by the scraper, "%m-%d-%Y %H:%M:%S"
    :return dc: date_created in datetime
    """
    numbers = re.findall("[0-9]+", date_created)
    if "hr" not in date_created:
        elapsed = timedelta(minutes=int(numbers[0]))
    elif "min" in date_created:
        elapsed = timedelta(hours=int(numbers[0]), minutes=int(numbers[1]))
    else:
        elapsed = timedelta(hours=int(numbers[0]))
    recorded_at = datetime.strptime(date_recorded, "%m-%d-%Y %H:%M:%S")
    return recorded_at - elapsed

In [12]:
def fix_date(date_created):
    """ strips the "posted " prefix and the " in <subforum>"
        suffix from a date string
    :param date_created: string of date
    :return dc: stripped string of date
    """
    extra_text = (
        'posted ',
        ' in Tween/Teen',
        ' in Elementary',
        ' in Preschool',
        ' in Toddler',
        ' in Newborn',
        ' in Special Needs',
        ' in Expecting',
        ' in TTC',
        ' in Single Parents',
        ' in Weight Watchers',
        ' in YBM Feedback',
        ' in Boston',
        ' in Chicago',
        ' in Los Angeles',
        ' in New York City',
        ' in NYC Schools',
    )
    dc = date_created
    for fragment in extra_text:
        dc = dc.replace(fragment, '')
    return dc

In [13]:
def parse_thread(conn, thread, subforum, date_recorded):
    """ record the thread's url in the threads table (if it is
        not there yet) and return the date the thread was created
    :param conn: connection to db
    :param thread: input thread soup
    :param subforum: subforum name stored with the url
    :param date_recorded: scrape time as "%m-%d-%Y %H:%M:%S", used to
        resolve relative ("... ago") dates
    :return date_created: datetime the thread was created; the current
        time when the thread has no permalink
    """
    url_find = thread.find("a", text=re.compile("permalink"))
    if not url_find:
        print("didn't find url: " + str(thread))
        # must be a datetime, not a formatted string: the caller compares
        # the result with a datetime and str < datetime raises TypeError
        return datetime.now()
    url = url_find["href"]
    if 'https://www.youbemom.com' in url: # some urls scraped with site
        url = url.replace('https://www.youbemom.com','')
    date_created = thread.find('span', {'class' : 'meta date'}).get_text()
    # if doesn't contain "ago", change time with strptime
    if "ago" in date_created:
        date_created = fix_ago(date_created, date_recorded)
    else:
        date_created = datetime.strptime(date_created, "%m-%d-%Y %I:%M%p")
    if url_not_in_threads(conn, url):
        write_to_threads(conn, url, subforum)
    return date_created

In [14]:
def get_subforum(soup):
    """ read the subforum name from the page's search form
    :param soup: url's html
    :return subforum: the form's action with "/forum/" removed,
        or False when the page has no <form id="search">
    """
    search_form = soup.find('form', {'id' : 'search'})
    if not search_form:
        return False
    return search_form['action'].replace("/forum/","")

In [15]:
def clean_text(text):
    """ clean the text of extra spaces, new lines,
        ellipses, and (more) text
    :param text: input text
    :return text: cleaned text
    """
    # raw strings: "\(", "\s" and "\." are invalid escape sequences in a
    # normal string literal and trigger warnings on current Python versions
    text = text.strip()
    text = re.sub(r"\(more\)", "", text)
    text = text.strip()
    # \s+ also matches new lines, so no separate new-line pass is needed
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"\.{3}", "", text)
    return text

In [16]:
def parse_post_parent(soup, conn, family_id, date_recorded, subforum):
    """ parse the top post of a thread page and insert it into
        the posts table unless its message id is already stored
    :param soup: input soup of parent post (the whole thread page)
    :param conn: connection to db
    :param family_id: id of the family thread
    :param date_recorded: date scraping the data
    :param subforum: subforum name stored with the post
    :return message_id: id attribute of the post's <h1> title
    """
    title_html = soup.find("h1")
    title = title_html.get_text()
    title = clean_text(title)
    if title_html.has_attr('class') and "removed" in title_html['class']:
        deleted = 1
    else:
        deleted = 0
    message_id = title_html["id"]
    # search the whole document: `soup` is the page root here, so limiting
    # the search to its direct children (recursive=False) can never reach a
    # <div> nested inside <html>/<body> and the body was always left empty
    body_html = soup.find('div', {'class' : 'message', 'id' : "p" + message_id})
    if body_html:
        body = body_html.get_text()
        body = body.replace('log in or sign up to post a comment', '')
        body = clean_text(body)
    else:
        body = ""
    date_created = soup.find('div', {'class' : 'date'}).get_text()
    # if doesn't contain "ago", change time with strptime
    if "ago" in date_created:
        date_created = fix_ago(date_created, date_recorded)
    else:
        date_created = fix_date(date_created)
        date_created = parse(date_created)
    if message_not_in_posts(conn, message_id, deleted):
        # a top post has no parent, so parent_id is stored as an empty string
        parsed = (family_id,message_id,"",date_recorded,date_created,title,body,subforum,deleted)
        write_to_posts(parsed, conn)
    return message_id

In [17]:
def parse_post_child(soup, conn, family_id, parent_id, date_recorded, subforum):
    """ parse one reply list item and insert it into the posts
        table unless its message id is already stored
    :param soup: input soup of child post
    :param conn: connection to db
    :param family_id: id of the family thread
    :param parent_id: id of the parent post to this child
    :param date_recorded: date scraping the data
    :param subforum: subforum name stored with the post
    :return message_id: id attribute of the reply's title span
    :NOTE: unlike top post, must re.compile class because
           it might be class='noskimwords reply removed'
    """
    reply_span = soup.find('span', {'class' : re.compile('noskimwords reply')})
    title = clean_text(reply_span.get_text())
    was_removed = reply_span.has_attr('class') and "removed" in reply_span['class']
    deleted = 1 if was_removed else 0
    message_id = reply_span["id"]

    # the message div carries the title's id prefixed with "p"
    message_div = soup.find('div', {'class' : 'message', 'id' : "p" + message_id}, recursive=False)
    body = ""
    if message_div:
        raw_body = message_div.get_text().replace('log in or sign up to post a comment', '')
        body = clean_text(raw_body)

    posted = soup.find('span', {'class' : 'meta date'}).get_text()
    # relative dates are resolved against the scrape time, absolute ones are parsed
    if "ago" in posted:
        date_created = fix_ago(posted, date_recorded)
    else:
        date_created = parse(fix_date(posted))

    if message_not_in_posts(conn, message_id, deleted):
        write_to_posts(
            (family_id,message_id,parent_id,date_recorded,date_created,title,body,subforum,deleted),
            conn,
        )
    return message_id

In [18]:
def search_children(children, conn, family_id, parent_id, date_recorded, subforum):
    """ parse every reply in the list and, recursively, the
        replies nested under each of them
    :param children: list of reply list items
    :param conn: connection to db
    :param family_id: id of the family thread
    :param parent_id: id of the post these replies respond to
    :param date_recorded: date scraping the data
    :param subforum: subforum name stored with the posts
    :return: nothing
    """
    for reply in children:
        reply_id = parse_post_child(reply, conn, family_id, parent_id, date_recorded, subforum)
        nested_list = reply.find('ul')
        if not nested_list:
            continue
        # the reply just parsed becomes the parent of its own replies
        search_children(
            nested_list.find_all("li", recursive=False),
            conn, family_id, reply_id, date_recorded, subforum,
        )

For looping through the forum

In [19]:
def get_current_link():
    """ get the permalink number of the first thread listed
        on the toddler forum page
    :return current: string of digits from the thread's permalink,
        or None when the page could not be fetched
    """
    soup = get_soup("https://www.youbemom.com/forum/toddler")
    if not soup:
        return None
    first_thread = get_top_posts(soup)[0]
    href = first_thread.find("a", text=re.compile("permalink"))["href"]
    # the link may be absolute or site-relative; drop either prefix
    for prefix in ('https://www.youbemom.com/forum/permalink/', '/forum/permalink/'):
        href = href.replace(prefix, '')
    return re.findall('[0-9]+', href)[0]

In [20]:
def loop_threads(conn, path_db, subforum, earliest):
    """ walk the pages of a subforum and record every thread url,
        stopping after the page whose last thread is older than
        `earliest` or after 1000 pages
    :param conn: connection to db
    :param path_db: path of the database file (not used here)
    :param subforum: subforum name, appended to the forum url
    :param earliest: datetime; threads created before it end the crawl
    :return: nothing
    """
    forum_url = "https://www.youbemom.com/forum/" + subforum
    next_url = forum_url[:]
    scraped_earliest = False
    page = 1 # threads
    print("subforum: " + subforum)
    while not scraped_earliest:
        date_recorded = datetime.now().strftime("%m-%d-%Y %H:%M:%S")
        if page > 1000:
            break
        if page % 10 == 0:
            print("page: " + str(page))
        soup = get_soup(next_url)
        if soup:
            top_posts = get_top_posts(soup)
            # parse each top thread post for url
            for thread in top_posts:
                date_created = parse_thread(conn, thread, subforum, date_recorded)
                # overwritten on every thread, so the last thread of the
                # page decides whether the crawl stops
                scraped_earliest = date_created < earliest
        else:
            # `url` was never defined in this function (NameError); the
            # page that failed is `next_url`
            print("error page " + str(page) + " with url " + next_url)
        page += 1
        next_url = forum_url + "?pg=" + str(page)

In [21]:
def loop_link_threads(conn, path_db, earliest_link, last_link):
    """ walk the permalink numbers from earliest_link to last_link,
        record every page that has a subforum as a thread and scrape
        its posts
    :param conn: connection to db
    :param path_db: path of the database file (passed on to parse_row)
    :param earliest_link: first permalink number of the range
    :param last_link: last permalink number of the range (inclusive)
    :return: dict with "bad_urls" and "bad_ids", the urls and thread ids
        that could not be fetched
    :note: the start of the range is shifted by the highest thread id
        already stored, which is how a previous run is resumed
    """
    forum_url = "https://www.youbemom.com/forum/permalink/"
    cur = conn.cursor()
    cur.execute(""" SELECT MAX(id) FROM threads """)
    max_id = cur.fetchone()[0]
    if max_id:
        next_id = int(max_id) + 1
    else:
        next_id = 1
    print(next_id)
    print("Current Time: ", datetime.now().strftime("%H:%M:%S"))
    bad_urls = []
    bad_ids = []
    for post_num in range(earliest_link + next_id - 1, last_link + 1):
        if next_id % 1000 == 0:
            print("id: " + str(next_id))
            print("Current Time =", datetime.now().strftime("%H:%M:%S"))
        next_url = forum_url + str(post_num)
        soup = get_soup(next_url)
        if soup:
            url = "/forum/permalink/" + str(post_num)
            subforum = get_subforum(soup)
            if subforum:
                write_to_threads(conn, url, subforum)
                # read back the id the database assigned to this thread:
                # next_id advances for every permalink, including the ones
                # that insert nothing, so it drifts away from the
                # AUTOINCREMENT id and must not be used as family_id
                cur.execute(""" SELECT MAX(id) FROM threads WHERE url=? """, (url,))
                family_id = cur.fetchone()[0]
                row = [family_id, url]
                bad = parse_row(conn, path_db, subforum, row)
                bad_urls = bad_urls + bad['bad_url']
                bad_ids = bad_ids + bad['bad_id']
        next_id += 1
    return {"bad_urls":bad_urls, "bad_ids":bad_ids}

In [22]:
def loop_posts(conn, path_db, subforum):
    """ scrape the posts of every thread stored for a subforum,
        reading the threads table in batches of 100 rows
    :param conn: connection to db
    :param path_db: path of the database file (passed on to parse_row)
    :param subforum: subforum name used to select the threads
    :return: dict with "bad_urls" and "bad_ids", the urls and thread ids
        that could not be fetched
    """
    # bind the subforum as a parameter instead of formatting it into the
    # statement, so a quote in the value cannot break or alter the query
    sql = """ SELECT * FROM threads WHERE subforum=? """
    cur = conn.cursor()
    cur.execute(sql, (subforum,))
    batch = 1 # posts
    batch_size = 100
    bad_urls = []
    bad_ids = []
    while True:
        print("batch: " + str(batch))
        rows = cur.fetchmany(batch_size)
        if not rows:
            break
        for row in rows:
            bad = parse_row(conn, path_db, subforum, row)
            bad_urls = bad_urls + bad['bad_url']
            bad_ids = bad_ids + bad['bad_id']
        batch += 1
    return {"bad_urls":bad_urls, "bad_ids":bad_ids}

In [42]:
def parse_row(conn, path_db, subforum, row):
    """ fetch one thread and store its top post and all replies
    :param conn: connection to db
    :param path_db: path of the database file (not used here)
    :param subforum: subforum name stored with every post
    :param row: thread row; row[0] is the thread id and row[1] its url
    :return: dict with "bad_url" and "bad_id" lists, filled with this
        thread's url and id when the page could not be fetched
    """
    date_recorded = datetime.now().strftime("%m-%d-%Y %H:%M:%S")
    family_id, url = row[0], row[1]
    failed = {"bad_url": [], "bad_id": []}
    has_site = 'https://www.youbemom.com' in url or 'http://www.youbemom.com' in url
    if not has_site:
        url = 'https://www.youbemom.com' + url
    soup = get_soup(url)
    if not soup:
        print("connection error")
        print("could not get url: " + url)
        print("with family id: " + str(family_id))
        failed["bad_url"].append(url)
        failed["bad_id"].append(family_id)
        return failed
    top_message_id = parse_post_parent(soup, conn, family_id, date_recorded, subforum)
    reply_list = soup.find('ul', {'id' : 'reply-list'})
    if reply_list:
        # direct replies to the top post; deeper levels are handled recursively
        first_level = reply_list.find_all('li', recursive=False)
        search_children(first_level, conn, family_id, top_message_id, date_recorded, subforum)
    return failed

## File Locations

In [24]:
# the database lives in a "database" folder next to the notebook's folder
p = Path.cwd()
path_parent = p.parents[0]
path_db = str(path_parent / "database" / "youbemomTables.db")

## Variables
The toddler forum is much larger than the other forums, so it is scraped separately (see below)

In [25]:
# subforums scraped page by page
forum_list = [
    "special-needs",
    "newborn",
    "preschool",
    "elementary",
    "tween-teen",
]

## Scrape the Thread URLs
Connect to the database and create the tables.
NOTE: Scrape from Permalink scrapes all posts 2018-01-01 to present
NOTE: Scrape from Subforum scrapes listed subforums up to earliest date

In [26]:
# open the database at path_db and create the threads and posts tables
# if they do not exist yet
conn = create_connection(path_db)
set_up_db(conn)

### Scrape from Permalink
This would take ~27-55 days to run. I split the links into 10 parts and ran them in parallel.

In [27]:
# Disabled because of its run time; helper names match the functions defined above.
# earliest_toddler = 9454477 # first toddler post in 2018
# current_toddler = int(get_current_link())
# bad = loop_link_threads(conn, path_db, earliest_toddler, current_toddler)
# print("bad urls: " + str(bad['bad_urls']))
# print("bad ids: " + str(bad['bad_ids']))

### Scrape from Subforum
Run while loop until scraper reaches the earliest post in the listed subforums

In [28]:
# the crawl of each subforum stops once it reaches threads older than this
earliest = datetime(year=2014, month=1, day=1)
for subforum in forum_list:
    loop_threads(conn, path_db, subforum, earliest)

subforum: special-needs
page: 10
page: 20
page: 30
page: 40
page: 50
page: 60
page: 70
page: 80
page: 90
page: 100
page: 110
page: 120
page: 130
page: 140
page: 150
page: 160
page: 170
page: 180
page: 190
page: 200
page: 210
page: 220
page: 230
page: 240
page: 250
page: 260
page: 270
page: 280
page: 290
page: 300
page: 310
page: 320
page: 330
page: 340
page: 350
page: 360
page: 370
page: 380
page: 390
page: 400
page: 410
page: 420
page: 430
page: 440
page: 450
page: 460
page: 470
page: 480
page: 490
page: 500
page: 510
page: 520
page: 530
page: 540
page: 550
page: 560
page: 570
page: 580
page: 590
page: 600
page: 610
page: 620
page: 630
page: 640
page: 650
page: 660
page: 670
page: 680
page: 690
page: 700
page: 710
page: 720
page: 730
page: 740
page: 750
page: 760
page: 770
page: 780
page: 790
page: 800
page: 810
page: 820
page: 830
page: 840
page: 850
page: 860
page: 870
page: 880
page: 890
page: 900
page: 910
page: 920
page: 930
page: 940
page: 950
page: 960
page: 970
page: 980
page:

## Scrape Thread Posts
For each subforum in the forum list, pulls the urls stored for that subforum in the threads table in batches of 100 and scrapes every post in each thread

In [29]:
# collect the threads that could not be fetched so they can be retried
bad_urls, bad_ids = [], []
for subforum in forum_list:
    bad = loop_posts(conn, path_db, subforum)
    bad_urls += bad["bad_urls"]
    bad_ids += bad["bad_ids"]

batch: 1
batch: 2
batch: 3
batch: 4
batch: 5
batch: 6
batch: 7
batch: 8
batch: 9
batch: 10
batch: 11
batch: 12
batch: 13
batch: 14
batch: 15
batch: 16
batch: 17
batch: 18
batch: 19
batch: 20
batch: 21
batch: 22
batch: 23
batch: 24
batch: 25
batch: 26
batch: 27
batch: 28
batch: 29
batch: 30
batch: 31
batch: 32
batch: 33
batch: 34
batch: 35
batch: 36
batch: 37
batch: 38
batch: 39
batch: 40
batch: 41
batch: 42
batch: 43
batch: 44
batch: 45
batch: 46
batch: 47
batch: 48
batch: 49
batch: 50
batch: 51
batch: 52
batch: 53
batch: 54
batch: 55
batch: 56
batch: 57
batch: 58
batch: 59
batch: 60
batch: 61
batch: 62
batch: 63
batch: 64
batch: 65
batch: 66
batch: 67
batch: 68
batch: 69
batch: 70
batch: 71
batch: 72
batch: 73
batch: 74
batch: 75
batch: 76
batch: 77
batch: 78
batch: 79
connection error
could not get url: https://www.youbemom.com/forum/permalink/9079199/moms-of-kids-with-asd-who-are-older-than-say-5-7-what-do-you
with family id: 7818
connection error
could not get url: https://www.youb

Fix bad urls. Three worked the second time and twelve were http not https.

In [43]:
# retry every thread that could not be fetched during the main scrape
cur = conn.cursor()
for bad_id in bad_ids:
    sql = ''' SELECT * FROM threads WHERE id=?'''
    parsed = (bad_id,)
    cur.execute(sql, parsed)
    row = cur.fetchone()
    if row:
        # threads rows are (id, url, subforum): use the thread's own
        # subforum, not the `subforum` variable left over from the last
        # iteration of the scraping loop
        bad = parse_row(conn, path_db, row[2], row)
        print(bad)
    else:
        print("did not get row {} from db".format(bad_id))

{'bad_url': [], 'bad_id': []}
{'bad_url': [], 'bad_id': []}
{'bad_url': [], 'bad_id': []}
{'bad_url': [], 'bad_id': []}
{'bad_url': [], 'bad_id': []}
{'bad_url': [], 'bad_id': []}
{'bad_url': [], 'bad_id': []}
{'bad_url': [], 'bad_id': []}
{'bad_url': [], 'bad_id': []}
{'bad_url': [], 'bad_id': []}
{'bad_url': [], 'bad_id': []}
{'bad_url': [], 'bad_id': []}
