# Merge Forum Data
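This notebook removes duplicate rows from the individual netmums databases (`netmums01.db`–`netmums05.db`) and then merges their `users`, `posts`, and `links` tables into `netmums-merged.db`.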

## TODO


## Imports

In [1]:
import sqlite3
from pathlib import Path
from scraping import create_connection
from netmums import set_up_merged_db
import pandas as pd

## File Locations

In [2]:
# Resolve the parent of the current working directory. Later cells look for
# the SQLite files under `path_parent / "database"`.
p = Path.cwd()
path_parent = p.parents[0]

## Clean Data

### Remove duplicate posts and renumber post counts

In [None]:
# Returns one row per group of `posts` rows that share the same
# (thread_id, post_id, user_url, date_created) and occur more than once.
# An empty result means the database has no duplicate posts.
has_duplicate_posts_sql = """
    SELECT thread_id, post_id, user_url, date_created, COUNT(*)
    FROM posts
    GROUP BY thread_id, post_id, user_url, date_created
    HAVING COUNT(*)>1;
"""

In [None]:
# Deletes duplicate posts: within each
# (thread_id, post_id, user_url, date_created) group only the row with the
# smallest `id` is kept.
drop_duplicate_posts_sql = """
    DELETE FROM posts
    WHERE id NOT IN (
        SELECT min(id)
        FROM posts
        GROUP BY thread_id, post_id, user_url, date_created
    );
"""

In [None]:
# Template selecting the row id and current post_count of every post in one
# thread. `{}` is filled with the thread id via str.format, i.e. the value is
# interpolated directly into the SQL text rather than bound as a parameter.
select_post_threads_sql = """
    SELECT id, post_count
    FROM posts
    WHERE thread_id={}
"""

In [None]:
# Template that sets the post_count of a single post.
# `{0}` is the new post_count and `{1}` is the row id of the post; both are
# interpolated into the SQL text via str.format.
update_post_count_sql = """
    UPDATE posts
    SET post_count={0}
    WHERE id={1} 
"""

In [None]:
# Loop through the databases, drop duplicate posts, and renumber post counts.
# Only threads that actually contained duplicates are renumbered.
for i in range(1, 6):
    db = "netmums0{}.db".format(i)
    path_db = str(path_parent / "database" / db)
    conn = create_connection(path_db)
    try:
        cur = conn.cursor()
        has_duplicate_posts = pd.read_sql(has_duplicate_posts_sql, conn)
        if not has_duplicate_posts.empty:
            # Capture the affected threads before deleting: once the
            # duplicates are gone the detection query no longer returns them.
            unique_threads = list(has_duplicate_posts.thread_id.unique())
            cur.execute(drop_duplicate_posts_sql)
            for thread_id in unique_threads:
                id_df = pd.read_sql(select_post_threads_sql.format(thread_id), conn)
                id_df['post_count'] = id_df['post_count'].astype(int)
                # Stable sort so posts with equal post_count keep query order.
                id_df = id_df.sort_values(by=['post_count'], kind='mergesort')
                # Number posts by their position in the sorted order.
                # sort_values keeps the original index labels, so the labels
                # yielded by iterrows() must not be used as the new count.
                for new_post_count, row_id in enumerate(id_df['id'], start=1):
                    cur.execute(update_post_count_sql.format(new_post_count, row_id))
            conn.commit()
    finally:
        # Release the database file even if a query above fails.
        conn.close()

### Remove duplicate users, quotes, and links

In [None]:
# Multi-statement script that de-duplicates three tables. In each table, rows
# are grouped by the listed columns and only the row with the smallest `id`
# in each group is kept:
#   users  - grouped by (name, user_url)
#   quotes - grouped by (thread_id, quoting_id, quoted_id, quoted_user,
#            quoted_text, citation_n)
#   links  - grouped by (thread_id, post_id, link_count, link_text, link_url)
# It contains several statements, so it must be run with executescript().
duplicates_sql = '''
    DELETE FROM users
    WHERE id NOT IN (
        SELECT MIN(id)
        FROM users
        GROUP BY name, user_url
    );
    DELETE FROM quotes
    WHERE id NOT IN (
        SELECT MIN(id)
        FROM quotes
        GROUP BY thread_id, quoting_id, quoted_id, quoted_user, quoted_text, citation_n
    );
    DELETE FROM links
    WHERE id NOT IN (
        SELECT MIN(id)
        FROM links
        GROUP BY thread_id, post_id, link_count, link_text, link_url
    );
'''

In [None]:
# Run the users/quotes/links de-duplication script against each source
# database. This relies on `duplicates_sql` as defined in the cell directly
# above; a later cell rebinds that name to a users-only script, so run the
# notebook top to bottom.
for i in range(1, 6):
    db = "netmums0{}.db".format(i)
    path_db = str(path_parent / "database" / db)
    conn = create_connection(path_db)
    try:
        cur = conn.cursor()
        cur.executescript(duplicates_sql)
        conn.commit()
    finally:
        # Always release the database file, even if the script fails.
        conn.close()

## Merge Databases

### Set up SQL
Insert the tables from the individual databases into the merged database.

In [8]:
# Fix merged database tables to have correct columns.
# The script drops users, posts, quotes, and links if they exist and then
# recreates users, posts, and links with an auto-incrementing integer `id`.
# users.user_url is declared TEXT so it matches posts.user_url.
# NOTE(review): `quotes` is dropped but not recreated here, and the merge
# script in this notebook does not copy quotes - confirm that is intentional.
# NOTE(review): links.thread_id is TEXT while posts.thread_id is INTEGER -
# confirm the difference is intended before relying on joins between them.
fix_tables_sql = """
DROP TABLE IF EXISTS users; 
DROP TABLE IF EXISTS posts; 
DROP TABLE IF EXISTS quotes; 
DROP TABLE IF EXISTS links; 
CREATE TABLE IF NOT EXISTS users(
	id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
	name TEXT,
	user_url TEXT
);
CREATE TABLE IF NOT EXISTS posts(
	id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
	thread_id INTEGER,
	post_count INTEGER,
	post_id TEXT,
	user_url TEXT,
	date_created TEXT,
	date_recorded TEXT,
	body TEXT,
	version INTEGER
);
CREATE TABLE IF NOT EXISTS links(
	id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
	thread_id TEXT,
	post_count INTEGER,
	post_id TEXT,
	link_count INTEGER,
	link_text TEXT,
	link_url TEXT
);
"""

In [9]:
# Open the merged database. `conn` and `cur` are reused by the cells below
# and are closed in the final cell.
db = "netmums-merged.db"
path_db = str(path_parent / "database" / db)
conn = create_connection(path_db)
cur = conn.cursor()
# Per-source merge script: `{0}` is the path of the source database file and
# `{1}` is a number used to build the attach alias (db1, db2, ...).
# The `id` columns are not copied, so the merged tables assign fresh ids.
# The path is a single-quoted SQL string literal; double-quoted literals only
# work through SQLite's legacy fallback, which can be disabled at build time.
# A path containing a single quote would need that quote doubled.
loop_sql = '''
    ATTACH DATABASE '{0}' AS db{1};
    INSERT INTO main.users (name, user_url) SELECT name, user_url FROM db{1}.users;
    INSERT INTO main.links (thread_id, post_count, post_id, link_count, link_text, link_url) SELECT thread_id, post_count, post_id, link_count, link_text, link_url FROM db{1}.links;
    INSERT INTO main.posts (thread_id, post_count, post_id, user_url, date_created, date_recorded, body, version) SELECT thread_id, post_count, post_id, user_url, date_created, date_recorded, body, version FROM db{1}.posts;
    DETACH DATABASE db{1};
'''
# NOTE: this rebinds `duplicates_sql`, replacing the users/quotes/links
# script defined earlier in the notebook with a users-only version for the
# merged database. Cells that use the earlier script must run before this one.
duplicates_sql = '''
    DELETE FROM users
    WHERE id NOT IN (
        SELECT MIN(id)
        FROM users
        GROUP BY name, user_url
    );
'''

In [10]:
# Reset the merged database: drop users, posts, quotes, and links if present,
# then recreate users, posts, and links so the merge starts from empty tables.
cur.executescript(fix_tables_sql)

<sqlite3.Cursor at 0x7eff90a26880>

### Loop Databases

In [11]:
# Attach each source database in turn and copy its users, links, and posts
# rows into the merged database using the `loop_sql` template.
database_dir = path_parent / "database"
for i in range(1, 6):
    fn = "netmums0{}.db".format(i)
    path_add = str(database_dir / fn)
    # The same index fills both the file name and the attach alias.
    cur.executescript(loop_sql.format(path_add, i))

### Remove duplicate users

In [12]:
# Runs whichever script `duplicates_sql` is currently bound to against the
# merged database. When the notebook is run top to bottom this is the
# users-only script, which keeps the lowest id per (name, user_url).
cur.executescript(duplicates_sql)

<sqlite3.Cursor at 0x7eff90a26880>

### Close Conn

In [13]:
# Commit any pending changes on the merged database and release the connection.
conn.commit()
conn.close()