# Quotes to Network
This notebook takes the quotes table from the individual netmums databases and creates a network edge table that connects quoting and quoted posts.

## TODO
- To connect quotes (network graphs) between posts, I still need to identify which post each quote refers to. The remaining issues are:
  - The quoted user can have duplicate posts, so the quoted text cannot always be matched to a unique post
  - Links rendered as `::link_1::` are often the only text in a quote
  - Anonymous posts are currently excluded
- Finish the `for` loop over quote chunks.

## Imports

In [4]:
import sqlite3
from pathlib import Path
from scraping import create_connection
from netmums import set_up_merged_db
import pandas as pd
import math

## File Locations

In [5]:
# Locate the "database" folder that sits next to the current working directory.
p = Path.cwd()
path_parent = p.parents[0]
# Path template: the "{}" placeholder stands for the database number
# (the files are named netmums01.db, netmums02.db, ... in the loop further down).
path_db = str(path_parent.joinpath("database", "netmums0{}.db"))

## Functions

In [14]:
def chunker(n_row, chunk_size):
    """ create chunks of the correct size to portion the
        dataframe

        Returns a list of (start, stop) tuples that together cover the
        rows 0 .. n_row - 1. The bounds are half-open (start inclusive,
        stop exclusive) so they can be used directly as
        ``df.iloc[start:stop]``. Every chunk holds ``chunk_size`` rows
        except possibly the last one. An empty list is returned when
        ``n_row`` is 0.

        Raises ValueError if ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    n_chunks = math.ceil(n_row / chunk_size)
    return [
        # cap the last chunk at n_row so it never runs past the data
        (i * chunk_size, min((i + 1) * chunk_size, n_row))
        for i in range(n_chunks)
    ]

## Database connection and SQL

In [None]:
# path_db is a template that still contains the "{}" placeholder, so it has
# to be formatted before it is handed to create_connection.
# NOTE(review): database 1 is assumed here -- confirm which database this
# standalone connection is meant for. The loop further down opens its own
# connection for every database.
conn = create_connection(path_db.format(1))

In [None]:
# temp_table_sql = ''' 
#     CREATE TEMPORARY TABLE
#         temp(
#             id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE,
#             thread_id TEXT,
#             post_count INTEGER,
#             quoting_id TEXT,
#             quoted_id TEXT,
#             quoted_user TEXT,
#             quoted_text TEXT,
#             citation_n INTEGER
#         );
# '''
# cur = conn.cursor()
# cur.execute(temp_table_sql)

In [None]:
# Run before each chunk is loaded so the scratch table `temp` can be recreated.
drop_table_sql = ''' DROP TABLE IF EXISTS temp; '''
# Quotes that do not have a quoted post id yet.
# The empty string is written with single quotes: SQL reserves double quotes
# for identifiers, and SQLite only treats them as string literals as a
# legacy fallback that can be switched off.
quotes_sql = """
    SELECT *
    FROM quotes
    WHERE quoted_id='';
"""

In [None]:
# Match each quote in the scratch table `temp` to candidate quoted posts.
# A candidate is a post in the same thread, written by the quoted user,
# whose body equals the quoted text and which appears earlier in the thread
# (smaller post_count). Because of the LEFT JOIN, a quote without any
# candidate still yields one row, with NULLs in the quoted_* / body columns.
# Quotes that already have a quoted_id, quote "Anonymous", or have empty
# text are skipped.
# String literals use single quotes; double quotes denote identifiers in SQL.
chunk_matches_sql = """
SELECT
    q.thread_id AS quoting_thread_id,
    q.post_count AS quoting_post_count,
    q.quoted_text AS quoted_text,
    q.citation_n AS citation_n,
    s.thread_id AS quoted_thread_id,
    s.post_count AS quoted_post_count,
    s.body
FROM (
    SELECT
        thread_id,
        post_count,
        quoted_user,
        quoted_text,
        citation_n
    FROM temp
    WHERE quoted_id=''
        AND quoted_user<>'Anonymous'
        AND quoted_text<>''
) as q
LEFT JOIN (
    SELECT
        p.thread_id,
        p.post_count,
        p.body,
        u.name
    FROM posts AS p
    LEFT JOIN users AS u
        ON p.user_url = u.user_url
) as s
    ON s.thread_id=q.thread_id
        AND s.name=q.quoted_user
        AND q.quoted_text=s.body
        AND s.post_count<q.post_count
"""

## Create Network Links
Loop through the netmums individual databases and match quotes to quoted posts

In [None]:
# number of quote rows per chunk; passed to chunker() in the loop below
chunk_size = 100000

In [None]:
# Build the quote network one database at a time. Quotes without a
# quoted_id are loaded in chunks into the scratch table `temp`, matched
# against earlier posts with chunk_matches_sql, and the unambiguous matches
# are appended to the quote_network table.
# NOTE: results are appended, so re-running this cell on the same databases
# writes the same edges again.
for i in range(1, 6):
    db = "netmums0{}.db".format(i)
    # separate name so the path_db template defined earlier is not overwritten
    db_path = str(path_parent / "database" / db)
    # assumes create_connection returns a sqlite3 connection (or None when
    # it fails) -- TODO confirm against the scraping module
    conn = create_connection(db_path)
    if conn is None:
        raise RuntimeError("could not connect to {}".format(db_path))
    try:
        cur = conn.cursor()
        quotes = pd.read_sql(quotes_sql, conn)
        chunks = chunker(quotes.shape[0], chunk_size)
        for chunk in chunks:
            # slice quotes data frame (the iloc end bound is exclusive)
            quotes_chunk = quotes.iloc[chunk[0]:chunk[1]].copy()

            # insert chunk into temporary table in database
            cur.execute(drop_table_sql)
            quotes_chunk.to_sql('temp', conn, index=False)

            # get matches
            chunk_matches = pd.read_sql(chunk_matches_sql, conn)

            # add group sizes: number of candidate posts per quote.
            # 'count' ignores the NULL rows the LEFT JOIN produces for
            # quotes without any candidate, so those get a size of 0.
            chunk_matches['group_size'] = (
                chunk_matches
                .groupby([
                    'quoting_thread_id',
                    'quoting_post_count',
                    'citation_n'
                ])['quoted_thread_id']
                .transform('count')
            )

            # filter matches: keep quotes with exactly one candidate post
            chunk_matches = chunk_matches.loc[chunk_matches['group_size'] == 1]

            # write to table
            chunk_matches = chunk_matches[[
                'quoting_thread_id',
                'quoting_post_count',
                'quoted_thread_id',
                'quoted_post_count'
            ]]
            chunk_matches.to_sql(
                'quote_network', conn, if_exists='append', index=False
            )

        # remove the scratch table once all chunks are processed
        cur.execute(drop_table_sql)
        conn.commit()
    finally:
        # always release the database file, even if a chunk fails
        conn.close()

## Non-exact matches

In [None]:
def get_jaccard_sim(str1, str2):
    """ Jaccard similarity of two strings based on their sets of
        whitespace-separated tokens: |A & B| / |A | B|.

        Returns a float between 0.0 (no shared tokens) and 1.0 (same
        token sets). When neither string contains a token (both empty
        or whitespace only) 0.0 is returned instead of dividing by zero.
    """
    tokens_1 = set(str1.split())
    tokens_2 = set(str2.split())
    n_shared = len(tokens_1 & tokens_2)
    n_union = len(tokens_1) + len(tokens_2) - n_shared
    if n_union == 0:
        # no tokens on either side: nothing to compare
        return 0.0
    return float(n_shared) / n_union

In [None]:
%%time
# Score every row by the token overlap between the quoted text and the
# candidate post body.
# NOTE(review): nonunique_matches is not created anywhere in the visible
# part of the notebook -- confirm where it comes from.
nonunique_matches['similarity'] = nonunique_matches.apply(
    lambda row: get_jaccard_sim(row.quoted_text, row.body),
    axis=1
)

## Anonymous Matches

In [None]:
# Query template for the posts a named user wrote in one thread before a
# given post. Placeholders (presumably filled with str.format -- no caller
# is visible here):
#   {0} thread id, {1} user name, {2} post_count upper bound (exclusive)
# NOTE(review): the values are interpolated straight into the SQL text, so a
# user name containing a double quote breaks the statement; bound parameters
# would avoid that.
get_posts = '''
    SELECT
        p.thread_id AS thread_id,
        p.post_count AS post_count,
        p.body AS body
    FROM posts AS p
    LEFT JOIN users AS u
        ON p.user_url = u.user_url
    WHERE
        p.thread_id={0}
        AND u.name="{1}"
        AND p.post_count<{2};
'''

In [None]:
# Query template for the posts in one thread whose user_url is 'Anonymous'
# and that appear before a given post. Placeholders:
#   {0} thread id, {1} post_count upper bound (exclusive)
# The string literal uses single quotes; double quotes denote identifiers
# in SQL and are accepted as strings by SQLite only as a legacy fallback.
get_posts_anon = """
    SELECT
        thread_id,
        post_count,
        body
    FROM posts
    WHERE
        thread_id={0}
        AND user_url='Anonymous'
        AND post_count<{1};
"""