# Visualize Sentiment
This notebook loads the sentiment data and explores the results.

## Data Sources
- youbemom-merged.db (with sentiment scores from 2.0-Sentiment-Create_Scores.ipynb)

## Changes
- 2020-01-09: Created

## Database Structure
- threads
 - id: automatically assigned
 - url: url of top post
 - subforum: subforum of post
 - dne: post does not exist
- posts
 - id: automatically assigned
 - family_id: thread->id
 - message_id: the unique id of the message from the html
 - parent_id: id of post this post is responding to, 0 if top post
 - date_recorded: date the data is fetched
 - date_created: date the data was created
 - title: title of the post
 - body: body of the post
 - subforum: subforum of post
 - deleted: has post been deleted
- sentiment
 - message_id: message id connecting to posts
 - text: title + body
 - text_no_url: text without urls
 - neg_sen_all
 - neu_sen_all
 - pos_sen_all
 - com_sen_all
 - neg_sen_no_url
 - neu_sen_no_url
 - pos_sen_no_url
 - com_sen_no_url


## Imports

In [1]:
import sqlite3
import pandas as pd
import numpy as np
from pathlib import Path
from scraping import create_connection

## Functions

In [2]:
def create_dates(df):
    """ add period flags and calendar columns derived from date_created

    The data frame is modified in place and is also returned.

    :param df: data frame with a 'date_created' column that
        pd.to_datetime can parse
    :return: the same data frame with these columns added:
        before (created earlier than 2020-03-01),
        march (2020-03-01 up to, but excluding, 2020-04-01),
        during (2020-04-01 or later),
        period ('before' / 'march' / 'during', missing when the date is NaT),
        weekday (day name), week_n (ISO week number),
        weekday_n (Monday=0 ... Sunday=6),
        month (month name), month_n (month number)
    """
    march_start = pd.Timestamp(2020, 3, 1)
    during_start = pd.Timestamp(2020, 4, 1)

    df['date_created'] = pd.to_datetime(df['date_created'])
    created = df['date_created']

    # Strict upper bound at March 1 so that every timestamp on Feb 28 and on
    # Feb 29 (2020 is a leap year) is counted as 'before'. A '<= Feb 28'
    # comparison only matches up to midnight at the start of Feb 28.
    df['before'] = created < march_start
    df['during'] = created >= during_start
    # Explicit bounds rather than "not before and not during", so rows with a
    # missing date (NaT) are not silently labelled as March.
    df['march'] = (created >= march_start) & (created < during_start)

    for label in ('before', 'march', 'during'):
        df.loc[df[label], 'period'] = label

    df['weekday'] = created.dt.day_name()
    df['week_n'] = created.dt.isocalendar().week
    # Numeric counterpart of 'weekday' (the previous .dt.day was day-of-month).
    df['weekday_n'] = created.dt.dayofweek
    df['month'] = created.dt.month_name()
    df['month_n'] = created.dt.month
    return df

In [3]:
def filter_dates(df, start, end):
    """ select the rows whose date_created lies in [start, end)
    :param df: data frame with a date_created column
    :param start: datetime lower bound (rows at this instant are kept)
    :param end: datetime upper bound (rows at this instant are dropped)
    :return: independent copy of the matching rows
    """
    created = df['date_created']
    in_window = created.ge(start) & created.lt(end)
    return df.loc[in_window].copy()

## Path

In [4]:
# p is the current working directory; path_parent is the directory directly
# above it, used as the base for the data paths below.
# NOTE(review): this presumably assumes the notebook is run from a
# subdirectory of the project root — confirm before running elsewhere.
p = Path.cwd()
path_parent = p.parents[0]

In [5]:
# Build each location under path_parent and convert it straight to a plain
# string.
path_db = str(path_parent / "database" / "youbemom-merged.db")
path_counts = str(path_parent / "clean_data" / "subforum-counts.csv")

## Load Data

In [11]:
# Open the merged database via the project's scraping helper.
# NOTE(review): presumably returns a sqlite3 connection (it is later used with
# .cursor(), pd.read_sql_query and .close()) — confirm in scraping.py.
conn = create_connection(path_db)

In [13]:
# Sanity check: fetch the most recently created post in the "toddler" subforum
# whose parent_id is the empty string, together with its thread url.
# NOTE(review): the schema notes say top posts have parent_id 0, while this
# filter matches the empty string — confirm which one the merged db stores.
#
# String literals are single-quoted: SQLite treats double-quoted text as an
# identifier first and only falls back to a string literal as a legacy quirk.
cur = conn.cursor()
cur.execute("""
    SELECT threads.url, posts.date_created, posts.date_recorded, posts.title
    FROM threads
    JOIN posts
        ON threads.family_id = posts.family_id
    WHERE posts.parent_id = ''
        AND posts.subforum = 'toddler'
    ORDER BY posts.date_created DESC
    LIMIT 1
    """)
# One row as a tuple (url, date_created, date_recorded, title), or None when
# nothing matches.
url = cur.fetchone()
print(url)

('/forum/permalink/10991516', '2020-12-02 12:31:00', '12-08-2020 13:13:48', 'Last night I asked dh to sit down with me and plan Xmas shopping for his family. He refused because he doesn’t get paid for another week. I told him I’m finished and this was last')


In [None]:
# Query for the special-needs subforum: every post joined to its sentiment row.
# posts: message_id (merge var), date_created, parent_id
# sentiment: message_id (merge var), *_sen_* (sentiment variables)
# The subforum name is a single-quoted string literal: SQLite only accepts
# double-quoted strings as a legacy fallback after trying them as identifiers.
sn_sql = """
    SELECT
        p.message_id AS message_id,
        p.date_created AS date_created,
        p.parent_id AS parent_id,
        s.neg_sen_all AS neg_sen_all,
        s.neu_sen_all AS neu_sen_all,
        s.pos_sen_all AS pos_sen_all,
        s.com_sen_all AS com_sen_all,
        s.neg_sen_no_url AS neg_sen_no_url,
        s.neu_sen_no_url AS neu_sen_no_url,
        s.pos_sen_no_url AS pos_sen_no_url,
        s.com_sen_no_url AS com_sen_no_url
    FROM posts AS p
    JOIN sentiment AS s
    ON p.message_id = s.message_id
    WHERE p.subforum = 'special-needs'
"""

In [None]:
# Run the special-needs query into a data frame, then add the derived
# date and period columns.
sn = pd.read_sql_query(sql=sn_sql, con=conn)
sn = sn.pipe(create_dates)

In [None]:
# Query joining each post to its sentiment row on message_id.
# posts: message_id (merge var), date_created, parent_id
# sentiment: message_id (merge var), *_sen_* (sentiment variables)
# Rows are restricted to posts whose family_id appears in the `temp` table.
# NOTE(review): `temp` is not created in the cells shown here — presumably it
# is built earlier on the same connection; confirm it exists before running.
td_sql = '''
    SELECT
        p.message_id AS message_id,
        p.date_created AS date_created,
        p.parent_id AS parent_id,
        s.neg_sen_all as neg_sen_all,
        s.neu_sen_all as neu_sen_all,
        s.pos_sen_all as pos_sen_all,
        s.com_sen_all as com_sen_all,
        s.neg_sen_no_url as neg_sen_no_url,
        s.neu_sen_no_url as neu_sen_no_url,
        s.pos_sen_no_url as pos_sen_no_url,
        s.com_sen_no_url as com_sen_no_url
    FROM posts AS p
    JOIN sentiment AS s
    ON p.message_id = s.message_id
    WHERE p.family_id IN (SELECT family_id FROM temp)
'''

In [None]:
# Run the td query into a data frame, then add the derived date and period
# columns.
td = pd.read_sql_query(sql=td_sql, con=conn)
td = td.pipe(create_dates)

In [None]:
# Both data frames are loaded, so the database connection is no longer needed.
conn.close()

## Filter Data

In [None]:
# Keep posts created from 2014-01-01 (inclusive) up to 2020-12-01 (exclusive).
td_sub = filter_dates(td, start=pd.Timestamp(2014, 1, 1), end=pd.Timestamp(2020, 12, 1))
sn_sub = filter_dates(sn, start=pd.Timestamp(2014, 1, 1), end=pd.Timestamp(2020, 12, 1))

## Post Frequency