# Scrape Forum Data
This notebook scrapes the Netmums and inserts the results into a SQLite database

## Data Sources
- Netmums forum: https://www.netmums.com/coffeehouse/

## Changes
- 2020-01-30: Created
- 2020-02-02: Added thread scraping
- 2020-02-05: Moved functions to netmums.py

## TODO
- 

## Imports

In [23]:
import sqlite3
from pathlib import Path
from netmums import *
from scraping import *
from datetime import datetime
import os
import csv
import re
from tqdm import tqdm

## File Locations

In [24]:
p = Path.cwd()
path_parent = p.parents[0]

In [25]:
path_db = str(path_parent / "database" / "netmums-merged.db")
path_groups = str(path_parent / "scripts" / "netmums-group_{}.py")

## Get Forum List

Create list of Netmum forum pages in the db. Skip if already created. 

In [26]:
conn = create_connection(path_db)
cur = conn.cursor()

In [27]:
cur.execute(''' SELECT count(name) FROM sqlite_master WHERE type='table' AND name='forums' ''')
if cur.fetchone()[0] == 0:
    set_up_merged_db(conn)

In [28]:
cur.execute(''' SELECT count(id) FROM forums ''')
if cur.fetchone()[0] == 0:
    url = "https://www.netmums.com/coffeehouse/"
    soup = get_soup(url)
    forums = soup.find_all("div", {"class": "left"})
    for forum in forums:
        if forum_link := forum.find('a', href=True):
            parsed = (forum_link['href'].strip(), forum_link.get_text().strip())
            sql = '''
                INSERT INTO forums(url,name)
                VALUES(?,?)
            '''
            cur.execute(sql, parsed)
conn.commit()

## Subforums

In [29]:
cur.execute(''' SELECT count(id) FROM subforums ''')
if cur.fetchone()[0] == 0:
    cur.execute(''' SELECT id, url FROM forums ''')
    rows = cur.fetchall()
    for row in rows:
        (forum_id, url) = row
        soup = get_soup(url)
        for subforum in soup.find_all("a", {"class": "cCatTopic"}):
            parsed = (subforum['href'].strip(), subforum.get_text().strip(),forum_id)
            sql = '''
                INSERT INTO subforums(url,name,forum_id)
                VALUES(?,?,?)
            '''
            cur.execute(sql, parsed)
conn.commit()

## Threads

In [30]:
cur.execute(''' SELECT count(id) FROM threads ''')
if cur.fetchone()[0] == 0:
    cur.execute(''' SELECT id, url FROM subforums ''')
    rows = cur.fetchall()
    for row in tqdm(rows):
        scrape_threads(cur, row)
        conn.commit()

## Posts
Create individual scraper files for each chunk of thread urls. Run with: ```nohup python3 netmums-group_xx.py > output_xx.txt &```

Posts can have different formatting in different situations, which leads to problems with parsing:
1. Must parse posts with citations to other posts differently than those without citations
2. Long urls are shortened when posted directly
3. WYSIWYG emojis must be translated from image to text
4. Skip the first post on each page after the first, it is duplicated from the last

This can be restart automatically. It deletes the last record collected in threads and starts rescraping that thread_id (if one exists).

In [31]:
cur.execute(''' SELECT max(id) FROM threads ''')
max_id = cur.fetchone()[0]

In [32]:
n_groups = 10
size = round(max_id / n_groups)

In [33]:
text = """#!/usr/bin/env python3
# coding: utf-8

## Imports

from netmums import *
from scraping import *
from pathlib import Path

## File Locations

p = Path.cwd()
path_parent = p.parents[0]
path_db_parent = str(path_parent / "database" / "netmums-merged.db")
path_db_child = str(path_parent / "database" / "netmums{0}.db")

## Connect to the database and create the tables.

### find max id in child db posts
conn = create_connection(path_db_child)
set_up_posts_db(conn)
cur = conn.cursor()
cur.execute(''' SELECT MAX(thread_id) FROM posts ''')
max_thread = cur.fetchone()[0]
if max_thread == None:
    first = {1}
else: # restart scraping from last complete thread id
    cur.execute(''' DELETE FROM posts WHERE thread_id=?''', (max_thread,))
    conn.commit()
    first = max_thread
conn.close()

### select rows from parent db threads
conn = create_connection(path_db_parent)
cur = conn.cursor()
cur.execute(''' SELECT id, url FROM threads WHERE id>=? AND id<={2} ''', (first,))
rows = cur.fetchall()
conn.close()

### connect back to child db
conn = create_connection(path_db_child)
set_up_posts_db(conn)

## Scrape threads

for row in rows:
    scrape_posts(conn, row)

conn.close()
"""


In [34]:
for g in range(n_groups):
    first = 1 + (size * g)
    if g == (n_groups - 1):
        last = max_id
    else:
        last = (size * (g + 1))
    if g + 1 < 10:
        num = "0{}".format(g + 1)
    else:
        num = "{}".format(g + 1)
    with open(path_groups.format(num), 'w') as w:
        w.write(text.format(num, first, last))

In [35]:
conn.close()

In [36]:
soup = get_soup("https://www.netmums.com/coffeehouse/becoming-mum-pregnancy-996/netmums-52/1547230-28-weeks-blood-advice.html")

In [37]:
soup

<!DOCTYPE html>
<html lang="en"><head><link href="https://fonts.googleapis.com" rel="preconnect"/><link href="https://fonts.gstatic.com" rel="preconnect"/><link href="https://fonts.googleapis.com/css?family=Amatic+SC:700%7CRoboto:400,500,700&amp;display=swap" media="all" rel="stylesheet"/><meta charset="utf-8"/><meta content="width=device-width,initial-scale=1" name="viewport"/><meta content="563844106964124" property="fb:app_id"/><meta content="Netmums" property="og:site_name"/><meta content="https://www.netmums.com/site/assets/images/meta_chat.png" property="og:image"/><meta content="1200" property="og:image:width"/><meta content="630" property="og:image:height"/><meta content="@netmums" name="twitter:site"/><meta content="summary_large_image" name="twitter:card"/><meta content="https://www.netmums.com/site/assets/images/meta_chat.png" name="twitter:image"/><link href="https://www.netmums.com/apple-touch-icon.png" rel="apple-touch-icon" sizes="180x180"/><link href="https://www.netmum

In [15]:
next_button = soup.find("div", {"class":"pagepresuiv_next"})

In [17]:
if link := next_button.find("a"):
    next_url = link.get('href')
    print(next_url)
else:
    print("no next")

AttributeError: 'NoneType' object has no attribute 'find'