# Analyze Sentiment for Forum Data
This notebook loads the sentiment data and explores the results.

## Data Sources
- youbemom-merged.db (with sentiment scores from 2.0-Sentiment-Create_Scores.ipynb)

## Changes
- 2020-12-13: Created
- 2021-01-07: Seasonality

## Database Structure
- threads
 - id: automatically assigned
 - url: url of top post
 - subforum: subforum of post
 - dne: post does not exist
- posts
 - id: automatically assigned
 - family_id: thread->id
 - message_id: the unique id of the message from the html
 - parent_id: id of post this post is responding to, 0 if top post
 - date_recorded: date the data is fetched
 - date_created: date the data was created
 - title: title of the post
 - body: body of the post
 - subforum: subforum of post
 - deleted: has post been deleted
- sentiment
 - message_id: message id connecting to posts
 - text: title + body
 - text_no_url: text without urls
 - neg_sen_all
 - neu_sen_all
 - pos_sen_all
 - com_sen_all
 - neg_sen_no_url
 - neu_sen_no_url
 - pos_sen_no_url
 - com_sen_no_url

## TODO
- Does python have a regression function for continuous dependent variables between 0 and 1 or will I have to use R/Stata for a fractional regression model?
- Set up functions for the plots so I can plug in the variable I want to see
- Does sentiment differ between parents and children in threads?
 - Is sentiment more negative after negative posts? More positive after positive posts?
- Is there a time-dependent function here?
 - Seasonality?
 - Does tweaking the time period cutoffs affect the significance?
- Frequency distribution of words
- Robustness checks of different samples
- Scrape the rest of December so frequency counts are right

## Imports

In [1]:
import sqlite3
import pandas as pd
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from scipy.stats import ttest_ind, kde
from datetime import datetime
# from nltk.probability import FreqDist
from nltk.tokenize import RegexpTokenizer #, word_tokenize
from nltk.corpus import stopwords
import string
import re
from pathlib import Path
import matplotlib.pyplot as plt
import math
from statsmodels.formula.api  import ols
from youbemom import create_connection

## Functions

In [None]:
def ttest_sentiment(df, v):
    """ compare one sentiment score before vs. during the
        pandemic with a two-sample t-test (unequal variances)
    :param df: data frame with boolean 'before' and 'during' columns
    :param v: name of the score column to test
    :return: nothing, prints a small table holding both group
        means, the t statistic and the p-value
    """
    # one sample per period: mask the other rows, then drop the blanks
    samples = {flag: df.where(df[flag])[v].dropna() for flag in ('before', 'during')}
    welch = ttest_ind(samples['before'], samples['during'], equal_var=False, nan_policy="omit")
    header = 'variable    before    during statistic    pvalue'
    print('\n')
    # right-align the header so 'variable' ends where the printed name ends
    # ('variable' is 8 characters long; shorter names add no padding)
    print(header.rjust(len(header) + len(v) - 8))
    print(v + '  {:1.6f}  {:1.6f} {:+1.6f}  {:1.6f}'.format(
        samples['before'].mean(), samples['during'].mean(), welch.statistic, welch.pvalue))

In [2]:
def create_dates(df):
    """ create date variables
    Adds, in place, the columns 'before', 'march', 'during' (booleans),
    'period' (the matching label), 'weekday', 'week_n', 'weekday_n',
    'month' and 'month_n', all derived from 'date_created'.
    :param df: data frame with a 'date_created' column that
        pandas can parse as a datetime
    :return: formatted data frame (the same object that was passed in)
    """
    created = pd.to_datetime(df['date_created'])
    df['date_created'] = created
    # "before" runs to the end of February: compare against the first
    # instant of March so that posts made during the day on Feb 28 and
    # on the leap day 2020-02-29 are not pushed into the 'march' bucket
    df['before'] = created < pd.Timestamp(2020, 3, 1)
    df['during'] = created >= pd.Timestamp(2020, 4, 1)
    # a missing timestamp compares False to both cutoffs, so exclude it
    # explicitly instead of letting it fall through to 'march'
    df['march'] = created.notna() & ~df['before'] & ~df['during']
    # start from an empty object column so the string labels never have
    # to be written into a float (all-NaN) column
    df['period'] = pd.Series(index=df.index, dtype=object)
    for label in ('before', 'march', 'during'):
        df.loc[df[label], 'period'] = label
    df['weekday'] = created.dt.day_name()
    df['week_n'] = created.dt.isocalendar().week
    # numeric counterpart of 'weekday' (Monday=0 ... Sunday=6);
    # .dt.day would give the day of the month instead
    df['weekday_n'] = created.dt.dayofweek
    df['month'] = created.dt.month_name()
    df['month_n'] = created.dt.month
    return df

In [None]:
def compare_parent_child(df):
    """ pair each parent post with the average scores of the
        non-parent posts sharing its family_id
    :param df: data frame with a boolean 'is_parent' column plus
        'family_id', 'neg_sentiment', 'pos_sentiment' and
        'compound_sentiment'
    :return: data frame with one row per matched parent; parent
        scores carry the suffix '_p', child averages '_c', and
        'pos_diff', 'neg_diff', 'compound_diff' hold parent minus child
    """
    keep = ["family_id", "neg_sentiment", "pos_sentiment", "compound_sentiment"]
    is_parent = df["is_parent"]
    parent_scores = df.loc[is_parent, keep]
    child_means = df.loc[~is_parent, keep].groupby("family_id", as_index=False).mean()
    # inner join: families lacking either a parent or any child drop out
    compare = parent_scores.merge(child_means, on="family_id", how="inner", suffixes=['_p', '_c'])
    for measure in ("pos", "neg", "compound"):
        parent_col = "{}_sentiment_p".format(measure)
        child_col = "{}_sentiment_c".format(measure)
        compare["{}_diff".format(measure)] = compare[parent_col] - compare[child_col]
    return compare

In [None]:
def season_sentiment(df, name, var):
    """ average 'compound_sentiment' for each value of a grouping
        variable, written to clean_data and printed
    :param df: data frame holding var and 'compound_sentiment'
    :param name: label used in the output file name
    :param var: name of the column to group by
    :return: nothing, writes sentiment_<name>_<var>.csv under
        path_parent/clean_data and prints the averages
    """
    mean_by_group = df.groupby([var])['compound_sentiment'].mean()
    file_name = "sentiment_{0}_{1}.csv".format(name, var)
    mean_by_group.to_csv(str(path_parent / "clean_data" / file_name))
    print(mean_by_group)

In [None]:
def season_freq(df, name, var):
    """ number of non-missing rows for each value of a grouping
        variable, written to clean_data and printed
    :param df: data frame holding var
    :param name: label used in the output file name
    :param var: name of the column to group by and count
    :return: nothing, writes frequency_<name>_<var>.csv under
        path_parent/clean_data and prints the counts
    """
    rows_per_group = df.groupby(var)[var].count()
    file_name = "frequency_{0}_{1}.csv".format(name, var)
    rows_per_group.to_csv(str(path_parent / "clean_data" / file_name))
    print(rows_per_group)

In [None]:
def ave_sent_day(df, var='compound_sentiment'):
    """ average a sentiment score per calendar day
    :param df: data frame with a datetime 'date_created' column
        and the score column named by var
    :param var: name of the score column to average
        (defaults to 'compound_sentiment')
    :return: data frame indexed by day with the daily mean of var
        and a 'period' label ("Before", "March" or "During");
        days without any score are left out
    """
    daily_ave = df[['date_created', var]].set_index('date_created')
    # days with no posts come back as NaN from the resample; drop them
    daily_ave = daily_ave.resample('D').mean().dropna(how='all')
    daily_ave['period'] = "March"
    # everything up to the end of February counts as "Before"; comparing
    # against March 1 keeps the leap day 2020-02-29 out of "March"
    daily_ave.loc[daily_ave.index < pd.Timestamp(2020, 3, 1), 'period'] = "Before"
    daily_ave.loc[daily_ave.index >= pd.Timestamp(2020, 4, 1), 'period'] = "During"
    return daily_ave

In [None]:
def daily_freq(df):
    """ count the posts made on each calendar day
    :param df: data frame with a datetime 'date_created' column
        (no other column is needed)
    :return: data frame indexed by day with 'daily_count' and a
        'period' label ("Before", "March" or "During"); days
        without posts inside the covered range are kept with a
        count of 0
    """
    # size() counts rows per day, so only the timestamps matter here
    per_day = df.set_index('date_created').resample('D').size()
    freq = per_day.to_frame('daily_count')
    freq['period'] = "March"
    # everything up to the end of February counts as "Before"; comparing
    # against March 1 keeps the leap day 2020-02-29 out of "March"
    freq.loc[freq.index < pd.Timestamp(2020, 3, 1), 'period'] = "Before"
    freq.loc[freq.index >= pd.Timestamp(2020, 4, 1), 'period'] = "During"
    return freq

In [None]:
def filter_dates(df, start, end):
    """ keep the rows created inside a half-open date window
    :param df: data frame with a 'date_created' column
    :param start: first timestamp to keep (inclusive)
    :param end: timestamp at which to stop (exclusive)
    :return: the rows of df with start <= date_created < end
    """
    created = df['date_created']
    in_window = created.ge(start) & created.lt(end)
    return df.loc[in_window]

## File Locations

In [3]:
# current working directory of the running notebook
p = Path.cwd()
# its parent directory; the database and clean_data paths are built from this
path_parent = p.parents[0]

In [4]:
# database and subforum-count locations, stored as plain strings
path_db = str(path_parent / "database" / "youbemom-merged.db")
path_counts = str(path_parent / "clean_data" / "subforum-counts.csv")
# per-subforum sentiment exports, left as Path objects
path_sn = path_parent.joinpath("clean_data", "sn_sentiment.csv")
path_td = path_parent.joinpath("clean_data", "td_sentiment.csv")

## Load Data

In [5]:
# open the merged database through the create_connection helper imported from youbemom
conn = create_connection(path_db)

Counts of posts in each subforum

In [None]:
# one row per subforum: number of posts plus the earliest and latest date_created
counts_sql = '''
    SELECT subforum, COUNT(family_id), MIN(date_created), MAX(date_created)
    FROM posts
    GROUP BY subforum
'''
counts = pd.read_sql_query(counts_sql, conn)
# save the summary next to the other clean_data outputs (no index column)
counts.to_csv(path_counts, index=False)

Special needs subforum

In [6]:
# posts: message_id (merge var), date_created, parent_id
# sentiment: message_id (merge var), *_sen_* (sentiment variables)
# The subforum name is written as a single-quoted SQL string literal:
# SQLite reads double-quoted text as an identifier first and only falls
# back to treating it as a string when no such column exists.
sn_sql = '''
    SELECT
        p.message_id AS message_id,
        p.date_created AS date_created,
        p.parent_id AS parent_id,
        s.neg_sen_all as neg_sen_all,
        s.neu_sen_all as neu_sen_all,
        s.pos_sen_all as pos_sen_all,
        s.com_sen_all as com_sen_all,
        s.neg_sen_no_url as neg_sen_no_url,
        s.neu_sen_no_url as neu_sen_no_url,
        s.pos_sen_no_url as pos_sen_no_url,
        s.com_sen_no_url as com_sen_no_url
    FROM posts AS p
    JOIN sentiment AS s
    ON p.message_id = s.message_id
    WHERE p.subforum = 'special-needs'
'''

In [7]:
# run the special-needs query and load the result into a data frame
sn = pd.read_sql_query(sn_sql, conn)

In [8]:
# parse date_created and add the period / weekday / month variables
sn = create_dates(sn)

In [9]:
# save the special-needs frame (including the new date variables) without the index
sn.to_csv(path_sn, index=False)

Toddler subforum: generate a 10% sample of family_ids to make processing easier

In [None]:
# family ids of every toddler thread; the subforum name is a single-quoted
# SQL string literal (SQLite reads double-quoted text as an identifier first)
# NOTE(review): the schema notes at the top of this notebook list threads.id,
# not threads.family_id — confirm the column name against the database
td_ids_sql = ''' SELECT family_id FROM threads WHERE subforum='toddler' '''

In [None]:
# open a fresh connection for the toddler sample
# NOTE(review): the connection opened in the Load Data section is replaced
# here without being closed — confirm whether that is intended
conn = create_connection(path_db)

In [None]:
# load the toddler family ids into a data frame
td_ids = pd.read_sql_query(td_ids_sql, conn)

In [None]:
# draw 10% of the ids; the fixed random_state makes the draw repeatable
id_10per_sample = td_ids.sample(frac = 0.1, random_state = 281)

In [None]:
# quick look at the size and dtypes of the sample
id_10per_sample.info()

In [None]:
# temporary table that will hold the sampled family ids; 'id' is an
# auto-incrementing primary key, family_id is the value to match on
temp_table_sql = ''' 
    CREATE TEMPORARY TABLE
        temp(id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT UNIQUE, family_id INTEGER);
    '''

In [None]:
# create the temporary table on the current connection
cur = conn.cursor()
cur.execute(temp_table_sql)

In [None]:
# append the sampled ids to the temp table; the frame's index is not
# written (index=False), so only family_id is supplied
id_10per_sample.to_sql('temp', conn, if_exists='append', index=False)

In [10]:
# posts: message_id (merge var), date_created, parent_id
# sentiment: message_id (merge var), *_sen_* (sentiment variables)
# only posts whose family_id appears in the temp table (the sampled ids) are returned
td_sql = '''
    SELECT
        p.message_id AS message_id,
        p.date_created AS date_created,
        p.parent_id AS parent_id,
        s.neg_sen_all as neg_sen_all,
        s.neu_sen_all as neu_sen_all,
        s.pos_sen_all as pos_sen_all,
        s.com_sen_all as com_sen_all,
        s.neg_sen_no_url as neg_sen_no_url,
        s.neu_sen_no_url as neu_sen_no_url,
        s.pos_sen_no_url as pos_sen_no_url,
        s.com_sen_no_url as com_sen_no_url
    FROM posts AS p
    JOIN sentiment AS s
    ON p.message_id = s.message_id
    WHERE p.family_id IN (SELECT family_id FROM temp)
'''

In [11]:
# run the sampled-toddler query and load the result into a data frame
td = pd.read_sql_query(td_sql, conn)

In [None]:
# close the connection; the TEMPORARY table is not dropped explicitly,
# SQLite discards it when the connection that created it is closed
conn.close()

In [12]:
# parse date_created and add the period / weekday / month variables
td = create_dates(td)

In [13]:
# save the toddler sample (including the new date variables) without the index
td.to_csv(path_td, index=False)

## Compare Sentiment

### 1. Subsets in correct date range: 2018-01-01 to 2020-11-30

In [None]:
# shared analysis window: start is inclusive, end is exclusive
# NOTE(review): the section heading says the range starts 2018-01-01,
# while the window used here starts 2014-01-01 — confirm which is intended
window_start = pd.Timestamp(2014, 1, 1, 0, 0, 0)
window_end = pd.Timestamp(2020, 12, 1, 0, 0, 0)
td_sub = filter_dates(td, window_start, window_end)
sn_sub = filter_dates(sn, window_start, window_end)

### 2. Comparing sentiment before and during pandemic with a t-test

In [None]:
# t-test every no-url sentiment score, one subforum at a time
subforum_frames = (("Special Needs", sn_sub), ("Toddler", td_sub))
for name, df in subforum_frames:
    print(name)
    for score_col in ('neg_sen_no_url', 'neu_sen_no_url', 'pos_sen_no_url', 'com_sen_no_url'):
        ttest_sentiment(df, score_col)
    print("\n")

In [None]:
# Grouped bar chart per subforum: mean negative and positive scores before
# vs. during the pandemic, with standard-error bars.
# The scores are read from the *_sen_no_url columns selected by the SQL
# queries (the same columns the t-tests use); the frames loaded in this
# notebook have no 'neg_sentiment' / 'pos_sentiment' columns.
score_cols = ['neg_sen_no_url', 'pos_sen_no_url']
sentiments = ['Negative', 'Positive']
width = 0.35
for name, df in {"Special Needs":sn_sub, "Toddler":td_sub}.items():
    fig, ax = plt.subplots()
    x_pos = np.arange(len(sentiments))

    before = df.loc[df['before'], score_cols]
    during = df.loc[df['during'], score_cols]
    # sem() divides the standard deviation by the square root of the number
    # of non-missing scores; dividing by len() would also count NaN rows
    ax.bar(x_pos - width/2, before.mean().to_numpy(), width, yerr=before.sem().to_numpy(),
           label='Before', capsize=5, color="cornflowerblue")
    ax.bar(x_pos + width/2, during.mean().to_numpy(), width, yerr=during.sem().to_numpy(),
           label='During', capsize=5, color="indianred")

    # Add some text for labels, title and custom x-axis tick labels, etc.
    ax.set_ylabel('Average Sentiment Score\n(normalized 0 to 1)')
    ax.set_title('Comparison of Sentiment Scores: {}'.format(name))
    ax.set_xticks(x_pos)
    ax.set_xticklabels(sentiments)
    ax.legend()
    fig.tight_layout()
#     plt.savefig('../plots/sentiment_neg_pos.png')
    plt.show()

### 3. Comparing sentiment of parent and child posts

In [None]:
# For each subforum, regress the average child sentiment of a thread on the
# sentiment of its parent post, once per score, and print each OLS summary.
# NOTE(review): compare_parent_child reads 'is_parent', 'family_id' and
# '*_sentiment' columns, none of which are selected by the SQL queries in
# this notebook — confirm how these frames are meant to be prepared.
for df in [sn_sub, td_sub]:
    print("\nGROUP\n")
    compare = compare_parent_child(df)
    # positive score: children (_c) explained by parent (_p)
    est_pos = ols(formula = 'pos_sentiment_c ~ pos_sentiment_p', data = compare).fit()
    print(est_pos.summary())
    # negative score
    est_neg = ols(formula = 'neg_sentiment_c ~ neg_sentiment_p', data = compare).fit()
    print(est_neg.summary())
    # compound score
    est_compound = ols(formula = 'compound_sentiment_c ~ compound_sentiment_p', data = compare).fit()
    print(est_compound.summary())

So this suggests if a parent comment's negative sentiment increases, the children's negative sentiment will increase as well. Perhaps redo this analysis on each observation rather than the average children's sentiment.

### 4. Compare density of compound sentiment

In [None]:
# Kernel density of the compound score for each period, per subforum.
# gaussian_kde is imported from scipy.stats directly: the scipy.stats.kde
# namespace is deprecated.
from scipy.stats import gaussian_kde

# grid of compound scores at which every density is evaluated
x = np.arange(-1.0, 1.0, 0.01)
for name, df in {"Special Needs":sn_sub, "Toddler":td_sub}.items():
    # the compound score is loaded as 'com_sen_no_url' by the SQL queries
    scores = df['com_sen_no_url']
    for label, flag in (("Before", 'before'), ("March", 'march'), ("During", 'during')):
        # gaussian_kde cannot be fitted on missing values, so drop them first
        density = gaussian_kde(scores[df[flag]].dropna())
        plt.scatter(x, density(x), alpha=0.5, label=label)
    plt.legend(loc="upper right")
    plt.title('Density of Compound Sentiment Scores: {}'.format(name))
    plt.show()

In [None]:
# Daily average compound score per subforum, coloured by period.
# Both subforums use the date-filtered frames so they cover the same window
# (the special-needs plot previously used the unfiltered frame).
size = 3
for name, df in {"Special Needs":sn_sub, "Toddler":td_sub}.items():
    # ave_sent_day reads a 'compound_sentiment' column, while the frames
    # loaded from the database carry that score as 'com_sen_no_url'
    daily_ave = ave_sent_day(df.rename(columns={'com_sen_no_url': 'compound_sentiment'}))
    groups = daily_ave.groupby('period')
    for period, group in groups:
        plt.scatter(group.index, group.compound_sentiment, label=period, s=size)
    plt.legend(loc="lower left")
    plt.xticks(rotation=90)
    plt.ylabel('Compound Sentiment\n(normalized -1 to 1)')
    plt.title('Daily Average Compound Sentiment Scores: {}'.format(name))
    # reference line at a neutral score
    plt.axhline(y=0, color='black', linestyle='-', alpha=0.5)
    plt.tight_layout()
    plt.show()

### 5. Compare post frequency over same period for special needs and toddler subforums

Frequency

In [None]:
# Daily number of posts per subforum, coloured by period.
# Both subforums use the date-filtered frames so the comparison really is
# over the same period (the special-needs plot previously used the
# unfiltered frame).
size = 3
for name, df in {"Special Needs":sn_sub, "Toddler":td_sub}.items():
    freq = daily_freq(df)
    groups = freq.groupby('period')
    for period, group in groups:
        plt.scatter(group.index, group.daily_count, label=period, s=size)
    plt.legend(loc="lower left")
    plt.xticks(rotation=90)
    plt.ylabel('Frequency')
    plt.title('Daily Frequency of Posts: {}'.format(name))
    # baseline at zero posts
    plt.axhline(y=0, color='black', linestyle='-', alpha=0.5)
    plt.tight_layout()
    plt.show()