# Run Sentiment Analysis on Forum Data
This notebook loads the Youbemom forum data and calculates VADER sentiment scores for it.

## Data Sources
- youbemom-merged.db (created with 1.1-Merge_Databases.ipynb)

## Changes
- 2020-08-13: Set up data cleaning
- 2020-08-20: Added t-tests
- 2020-08-26: Added plots
- 2020-09-14: Added more plots
- 2020-09-15: Compared parent and child sentiment
- 2020-12-10: Changed data set
- 2020-12-13: Moved data analysis to new file
- 2020-12-15: Created new sentiment table, removed urls from strings
- 2021-01-07: Chunked data analysis into loop
- 2021-01-25: Moved text creation and cleaning to 1.2-Create_Data-Filter_Spam

## Database Structure
- threads
 - id: automatically assigned
 - url: url of top post
 - subforum: subforum of post
 - dne: post does not exist
- posts
 - id: automatically assigned
 - family_id: thread->id
 - message_id: the unique id of the message from the html
 - parent_id: id of post this post is responding to, 0 if top post
 - date_recorded: date the data is fetched
 - date_created: date the data was created
 - title: title of the post
 - body: body of the post
 - subforum: subforum of post
 - deleted: has post been deleted
- text
 - message_id: message id connecting to posts
 - text: title + body
 - text_clean: text without urls and extra spaces
 - probable_spam: marked as probable spam in 1.2
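- sentiment (written by `process_data`)
 - message_id: message id connecting to posts
 - neg_sen_all
 - neu_sen_all
 - pos_sen_all
 - com_sen_all
 - neg_sen_clean
 - neu_sen_clean
 - pos_sen_clean
 - com_sen_clean
- sentiment_family (written by `process_data_family`)
 - family_id: thread->id
 - neg_sen_clean
 - neu_sen_clean
 - pos_sen_clean
 - com_sen_clean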

## TODO
- 

## Imports

In [1]:
import sqlite3
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from datetime import datetime
from pathlib import Path
from scraping import create_connection
import re
from math import floor
from tqdm.notebook import tqdm

## Functions
For fetching the data

In [2]:
def get_size(conn):
    """ count the rows of the posts table
    :param conn: connection to the db
    :return size: number of posts rows with a non-null message_id
    :raises SystemExit: if the count query returns no row
    """
    cursor = conn.cursor()
    cursor.execute(''' SELECT COUNT(message_id) FROM posts ''')
    row = cursor.fetchone()
    # guard clause: bail out first, then return the single count column
    if not row:
        raise SystemExit("No size found")
    return int(row[0])

In [3]:
def process_data(chunksize):
    """ read the text table in chunks, score the raw and the cleaned
        text of every row with the sentiment analyzer, and write the
        scores to the sentiment table
    :param chunksize: number of rows to read per chunk
    """
    query = ''' SELECT * FROM text '''
    chunks = pd.read_sql_query(query, conn, chunksize=chunksize)
    for index, frame in enumerate(tqdm(chunks)):
        # score the raw text first, then the cleaned text
        for column, suffix in (('text', 'all'), ('text_clean', 'clean')):
            frame = gen_sentiment(frame, column, suffix)
        # only the ids and the scores are stored
        frame = frame.drop(columns=['text', 'text_clean', 'probable_spam'])
        # the first chunk recreates the table, later chunks extend it
        mode = 'replace' if index == 0 else 'append'
        frame.to_sql('sentiment', conn, if_exists=mode, index=False)

In [4]:
def sentiment_family(df, i):
    """ join the cleaned text of every family into one string, score
        it, and write one row per family to the sentiment_family table
    :param df: data frame with family_id and text_clean columns
    :param i: write counter; 0 replaces the table, anything else appends
    """
    # one space-joined string per family_id
    joined = df.groupby(['family_id'])['text_clean'].apply(' '.join)
    families = joined.reset_index()
    # score the combined text, then discard the text itself
    families = gen_sentiment(families, 'text_clean', 'clean')
    families = families.drop(columns='text_clean')
    # write to database
    mode = 'replace' if i == 0 else 'append'
    families.to_sql('sentiment_family', conn, if_exists=mode, index=False)
    conn.commit()

In [5]:
def process_data_family(chunksize):
    """ read the non-spam posts in chunks, ordered by family_id, apply
        the sentiment analyzer to the combined text of each family, and
        write the result to the sentiment_family table
    :param chunksize: number of rows to read per chunk
    """
    # The carry-over logic below only works when all rows of a family are
    # contiguous, so the ordering has to be explicit: without ORDER BY the
    # row order of a SELECT is not guaranteed. message_id is a tie-breaker
    # that makes the order of the joined text deterministic.
    # The original "LEFT JOIN ... WHERE t.message_id = p.message_id" was
    # effectively an inner join; it is written as one here.
    sql = '''
        SELECT p.family_id, t.message_id, t.text_clean
        FROM text AS t
        INNER JOIN posts AS p
        ON t.message_id = p.message_id
        WHERE t.probable_spam = 0
        ORDER BY p.family_id, t.message_id
    '''
    reader = pd.read_sql_query(sql,
                               conn,
                               chunksize=chunksize)
    # rows of the family that was still open at the end of the last chunk
    orphans = None
    # number of writes so far; sentiment_family replaces the table when it
    # receives 0 and appends otherwise, so this must count actual writes
    # rather than chunks
    writes = 0
    for chunk in tqdm(reader):
        # an empty chunk has no last family to inspect
        if chunk.empty:
            continue
        # concat orphans from last chunk
        if orphans is None:
            df = chunk
        else:
            df = pd.concat((orphans, chunk), ignore_index=True)
        # identify new orphans: the last family may continue in the next chunk
        last_val = df['family_id'].iloc[-1]
        is_orphan = df['family_id'] == last_val
        df, orphans = df[~is_orphan], df[is_orphan]
        # sentiment for every family known to be complete
        if not df.empty:
            sentiment_family(df, writes)
            writes += 1
    # process last orphan; passing the write counter keeps this from
    # replacing the table when an earlier write already created it
    if orphans is not None and not orphans.empty:
        sentiment_family(orphans, writes)

For creating the sentiment values

In [6]:
def gen_sentiment(df, var, name):
    """ score one text column and add the four scores as new columns
    :param df: data frame holding the text column (modified in place)
    :param var: string name of column getting sentiment for
    :param name: string variable suffix for the new columns
    :return df: the same data frame with neg_sen_<name>, neu_sen_<name>,
        pos_sen_<name> and com_sen_<name> added
    """
    # one dictionary of scores per row
    scores = df[var].apply(lambda text: sentiment_scores(text, analyzer))
    # (column prefix, key in the score dictionary), in output column order
    parts = (("neg", "neg"), ("neu", "neu"), ("pos", "pos"), ("com", "compound"))
    for prefix, key in parts:
        column = "{}_sen_{}".format(prefix, name)
        # a missing key is stored as 0
        df[column] = scores.apply(lambda row: row.get(key, 0))
    return df

In [7]:
def sentiment_scores(sentence, analyzer):
    """ score a sentence with the given analyzer
    :param sentence: sentence to create scores for
    :param analyzer: object with a polarity_scores method
        (the notebook passes a VADER SentimentIntensityAnalyzer)
    :return score: whatever analyzer.polarity_scores returns for sentence
    """
    return analyzer.polarity_scores(sentence)

## File Locations

In [8]:
# current working directory of the notebook
p = Path.cwd()
# parents[0] is the directory directly above the working directory
path_parent = p.parents[0]

In [9]:
# location of the merged database, one level above the working directory
path_db = str(path_parent.joinpath("database", "youbemom-merged.db"))

## Process Data
Note: the data cannot be processed all at once, so it is split into chunks and each chunk is processed in turn.

In [10]:
# module-level connection; process_data, sentiment_family and
# process_data_family read this global rather than taking it as a parameter
conn = create_connection(path_db)

In [11]:
# aim for roughly `nchunks` chunks over the posts table
size = get_size(conn)
nchunks = 200
# with fewer than `nchunks` posts the division floors to 0, which is not a
# usable chunk size, so read at least one row per chunk
chunksize = max(1, floor(size / nchunks))

In [12]:
# single shared analyzer; gen_sentiment reads this global for every row
analyzer = SentimentIntensityAnalyzer()

In [13]:
# post-level scoring is disabled here; uncomment to rebuild the `sentiment` table
# process_data(chunksize)

In [14]:
# build the per-family scores in the sentiment_family table
process_data_family(chunksize)

|          | 0/? [00:00<?, ?it/s]

In [15]:
# commit anything still pending, then release the database connection
conn.commit()
conn.close()