# Merge Youbemom Databases
This notebook first finds and scrapes missing posts, then merges the 30 + 1 youbemomTables databases into one SQLite database.

## Data Sources
- database/youbemom-all/youbemomTables-[01 to 30].db (scraped with 1-Scrape_Forum.ipynb)
- database/youbemomTables.db (scraped with 1-Scrape_Forum.ipynb)

## Changes
- 2020-12-08: Created
- 2020-12-09: Added merge databases

## Database Structure
- threads
 - id: automatically assigned
 - url: url of top post
- posts
 - id: automatically assigned
 - family_id: thread->id
 - message_id: the unique id of the message from the html
 - parent_id: id of post this post is responding to, 0 if top post
 - date_recorded: date the data is fetched
 - date_created: date the data was created
 - title: title of the post
 - body: body of the post

## TODO
-

## Imports

In [1]:
import sqlite3
from pathlib import Path
from youbemom import loop_list_links, create_connection, set_up_db

## Functions

For formatting strings

In [2]:
def pad(n):
    """ left-pad an int with a single zero when it is below 10
    :param n: int to format
    :return: str(n), prefixed with "0" if n is less than 10
    """
    text = str(n)
    return "0" + text if n < 10 else text

In [3]:
def permalink_n(url):
    """ pull the permalink number out of a permalink url
    :param url: url string of permalink; everything from index 17
                onward must parse as an int
    :return: int of permalink number
    """
    # presumably the length of a fixed "/forum/permalink/" style prefix
    # -- confirm against the urls stored in the threads table
    prefix_length = 17
    number_text = url[prefix_length:]
    return int(number_text)

For checking a database for missing ids

In [4]:
def check_ids_for_missing(cur):
    """ looks for gaps in the stored family ids and scrapes them
    Reads the notebook-level names ids_sql, min_permalink_sql, n, conn
    and path_db, so those must be set by the calling cell.
    :param cur: database cursor
    """
    cur.execute(ids_sql)
    family_ids = [row[0] for row in cur.fetchall()]
    if not family_ids:
        return
    print("got ids for db-{}".format(n))
    gaps = find_missing(family_ids)
    if not gaps:
        return
    print(gaps)
    # the url of the first thread row gives the starting permalink number
    cur.execute(min_permalink_sql)
    first_row = cur.fetchone()
    if not first_row:
        return
    start_permalink = permalink_n(first_row[0])
    loop_list_links(conn, path_db, gaps, start_permalink)

In [5]:
def find_missing(number_list):
    """ extracts all missing ints in the range from the first value
        on the number list to the last number
    :param number_list: ideally sorted, a list of ints
    :return: list of numbers between first and last not in list,
             empty list if number_list is empty
    """
    if not number_list:
        return []
    # set membership is O(1); testing against the list itself made the
    # whole scan quadratic in the number of ids
    present = set(number_list)
    first, last = number_list[0], number_list[-1]
    return [x for x in range(first, last + 1) if x not in present]

## File Locations

In [6]:
# working directory of the process running this notebook
p = Path.cwd()
# one level up; the database paths in later cells are built from here
path_parent = p.parents[0]

## Loop Databases
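Find the permalinks missing from each database (those skipped due to errors), then shift each database's `family_id` values so they do not collide once the databases are merged.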

In [7]:
# count_sql = ''' SELECT Count(*) FROM threads '''
# highest family_id currently stored in a database's threads table
maxid_sql = ''' SELECT MAX(family_id) FROM threads '''
# url of the first threads row by ROWID; passed to permalink_n in
# check_ids_for_missing
min_permalink_sql = ''' SELECT url FROM threads ORDER BY ROWID ASC LIMIT 1 '''
# every family_id in ascending order, the input for find_missing
ids_sql = ''' SELECT family_id from threads ORDER BY family_id ASC '''
# templates: {} is filled with str.format using the integer offset that is
# added to every family_id in the table
update_threads_sql = ''' UPDATE threads SET family_id = family_id + {} '''
update_posts_sql = ''' UPDATE posts SET family_id = family_id + {} '''

In [8]:
# Shift family_id in each of the 30 source databases by the running total of
# the previous databases' max ids, so ids stay unique after merging.
# NOTE: this cell is not idempotent -- every run adds the offsets again.
last_max_value = 0
for i in range(1, 31):
    n = pad(i)
    # n = str(i)
    fn = "youbemomTables-{}.db".format(n)
    path_db = str(path_parent / "database" / "youbemom-all" / fn)
    conn = create_connection(path_db)
    try:
        cur = conn.cursor()
        # gap check / re-scrape: per the original note it is safe to repeat
        # but takes a lot of time, so it is left disabled
        # check_ids_for_missing(cur)
        cur.execute(maxid_sql)
        maxid = cur.fetchone()  # must do this before updating values
        print("adding: {}".format(last_max_value))
        cur.execute(update_threads_sql.format(last_max_value))
        cur.execute(update_posts_sql.format(last_max_value))
        # an aggregate query always returns one row; on an empty threads
        # table that row holds NULL, so the value itself must be checked
        if maxid and maxid[0] is not None:
            last_max_value += int(maxid[0])
        else:
            print("no max id, adding 50k")
            last_max_value += 50000
        conn.commit()
    finally:
        # release the file handle even if a statement above raised;
        # uncommitted updates are discarded in that case
        conn.close()

adding: 0
adding: 48819
adding: 97638
adding: 146457
adding: 195276
adding: 244095
adding: 292914
adding: 341733
adding: 390552
adding: 439371
adding: 488190
adding: 537009
adding: 585828
adding: 634647
adding: 683466
adding: 732285
adding: 781104
adding: 829923
adding: 878742
adding: 927561
adding: 976380
adding: 1025199
adding: 1074018
adding: 1122837
adding: 1171656
adding: 1220475
adding: 1269294
adding: 1318113
adding: 1366932
adding: 1415751


## Merge Databases
Merge all the databases into one SQLite DB

### Create new database

In [22]:
# per-source template: {0} is the path of the source database file and {1}
# its two-digit suffix, used to build the db<NN> schema alias
script = '''
    ATTACH DATABASE "{0}" AS db{1};
    INSERT INTO main.threads (family_id, url, subforum, dne) SELECT family_id, url, subforum, dne FROM db{1}.threads;
    INSERT INTO main.posts (family_id, message_id, parent_id, date_recorded, date_created, title, body, subforum, deleted) SELECT family_id, message_id, parent_id, date_recorded, date_created, title, body, subforum, deleted FROM db{1}.posts;
    DETACH DATABASE db{1};
'''
# earlier statement-by-statement variant, kept for reference
# attach_sql = ''' ATTACH DATABASE "{0}" AS db{1} '''
# insert_threads_sql = ''' INSERT INTO main.threads SELECT * FROM db{}.threads '''
# insert_posts_sql = ''' INSERT INTO main.posts SELECT * FROM db{}.posts '''

# open the merged output database and set up its tables
db = "youbemom-merged.db"
path_db = str(path_parent / "database" / db)
conn = create_connection(path_db)
set_up_db(conn)
cur = conn.cursor()

### Loop through databases

In [23]:
# attach each of the 30 source databases in turn and copy its threads and
# posts rows into the merged database
source_dir = path_parent / "database" / "youbemom-all"
for i in range(1, 31):
    n = pad(i)
    fn = "youbemomTables-{}.db".format(n)
    path_db = str(source_dir / fn)
    merge_sql = script.format(path_db, n)
    cur.executescript(merge_sql)

In [24]:
# commit and close conn (the merged-database connection when the cells are
# run in order)
conn.commit()
conn.close()