# Create Sentiment for Netmums

## Imports

In [1]:
import sqlite3
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from pathlib import Path
from scraping import create_connection
from tqdm.notebook import tqdm
from math import floor

## File Locations

In [2]:
# The database lives in <two levels above the working directory>/database/.
p = Path.cwd()
path_parent = p.parents[1]
path_db = str(path_parent.joinpath("database", "netmums-merged.db"))

## Functions

In [3]:
def get_size(conn):
    """ count the rows of the posts table
    :param conn: open connection to the db
    :return: number of rows in the posts table, as an int
    :raises SystemExit: if the count query yields no row
    """
    cursor = conn.cursor()
    cursor.execute(''' SELECT COUNT(id) FROM posts ''')
    row = cursor.fetchone()
    if not row:
        raise SystemExit("No size found")
    return int(row[0])

In [4]:
def process_data(chunksize):
    """ stream the text table in chunks, score the text_clean column
        of each chunk with the sentiment analyzer, and store the scored
        chunks (without the text itself) in the sentiment table
    Reads and writes through the module-level ``conn``.
    :param chunksize: number of rows per chunk
    """
    chunks = pd.read_sql_query(''' SELECT * FROM text ''',
                               conn,
                               chunksize=chunksize)
    for chunk_number, chunk in enumerate(tqdm(chunks)):
        scored = gen_sentiment(chunk, 'text_clean', 'clean')
        scored = scored.drop('text_clean', axis=1)
        # The first chunk recreates the table; later chunks extend it.
        mode = 'replace' if chunk_number == 0 else 'append'
        scored.to_sql('sentiment', conn, if_exists=mode, index=False)

In [5]:
def process_data_month():
    """ read the posts one calendar month at a time, join each user's
        posts of that month into a single text, apply the sentiment
        analyzer to it, and write the scores to the month_sentiment table

    Uses the module-level ``conn`` and ``analyzer``. Every month from
    January 2014 through December 2020 is processed; the first month
    replaces the month_sentiment table and later months are appended.
    Posts by "Anonymous" and posts with empty or missing text are ignored.
    """
    # Bound parameters instead of string formatting: the driver does the
    # quoting, and the statement text is the same for every month.
    sql = '''
        SELECT t.post_id, t.text_clean, p.user_url, p.date_created
        FROM posts AS p
        LEFT JOIN text AS t
        ON t.post_id = p.id
        WHERE p.date_created >= ? AND p.date_created < ?
    '''
    date_format = "{0}-{1}-01 00:00AM"
    name = "month"
    # (key in the analyzer's result, column written to the table)
    score_columns = (
        ('neg', "neg_sen_{}".format(name)),
        ('neu', "neu_sen_{}".format(name)),
        ('pos', "pos_sen_{}".format(name)),
        ('compound', "com_sen_{}".format(name)),
    )
    months = range(1, 13)
    years = range(2014, 2021)
    first_write = True
    for year in tqdm(years):
        for month in tqdm(months, leave=False):
            # The upper bound is the first day of the *next* month; December
            # rolls over to January of the following year rather than
            # producing a non-existent month 13.
            if month == 12:
                end_year, end_month = year + 1, 1
            else:
                end_year, end_month = year, month + 1
            begin_date = date_format.format(year, str(month).zfill(2))
            end_date = date_format.format(end_year, str(end_month).zfill(2))

            df = pd.read_sql_query(sql, conn, params=(begin_date, end_date))

            # Copy after filtering so the column assignment below works on an
            # independent frame, not on a view of the query result.
            keep = (
                (df['user_url'] != "Anonymous")
                & (df['text_clean'] != "")
                & df['text_clean'].notnull()
            )
            df = df.loc[keep].copy()

            # One text per user: that user's posts in chronological order.
            df['text_month'] = (
                df.sort_values(['user_url', 'date_created'])
                .groupby('user_url')['text_clean']
                .transform(lambda x: ' \n'.join(x))
            )
            df = df[['user_url', 'text_month']].drop_duplicates().reset_index(drop=True)

            # Score each text once, then fan the result out into float columns.
            scores = [sentiment_scores(text, analyzer) for text in df['text_month']]
            for key, column in score_columns:
                df[column] = pd.Series(
                    [score[key] for score in scores],
                    index=df.index,
                    dtype=float,
                )

            df['year'] = year
            df['month'] = month

            mode = 'replace' if first_write else 'append'
            df.to_sql('month_sentiment', conn, if_exists=mode, index=False)
            first_write = False

In [6]:
def gen_sentiment(df, var, name):
    """ score one text column and add the four score columns to df
    :param df: data frame holding the text column; modified in place
    :param var: string name of column getting sentiment for
    :param name: string variable suffix for the new columns
    :return df: the same data frame with neg_sen_<name>, neu_sen_<name>,
        pos_sen_<name> and com_sen_<name> added
    """
    scores = df[var].apply(lambda text: sentiment_scores(text, analyzer))
    # (key in the score dictionary, prefix of the column it is stored in)
    targets = (('neg', 'neg'), ('neu', 'neu'), ('pos', 'pos'), ('compound', 'com'))
    for key, prefix in targets:
        column = "{}_sen_{}".format(prefix, name)
        df[column] = scores.apply(lambda score, key=key: score.get(key, 0))
    return df

In [7]:
def gen_sentiment_monthly(df, var, name):
    """ score one text column row by row and add the four score columns
    :param df: data frame holding the text column; modified in place
    :param var: string name of column getting sentiment for
    :param name: string variable suffix for the new columns
    :return df: the same data frame with neg_sen_<name>, neu_sen_<name>,
        pos_sen_<name> and com_sen_<name> added
    """
    # (key in the score dictionary, column it is stored in)
    columns = (
        ('neg', "neg_sen_{}".format(name)),
        ('neu', "neu_sen_{}".format(name)),
        ('pos', "pos_sen_{}".format(name)),
        ('compound', "com_sen_{}".format(name)),
    )
    # Start from float columns so the per-row assignments below never have
    # to change the column dtype.
    for _, column in columns:
        df[column] = 0.0
    for index, row in df.iterrows():
        # Read the column the caller asked for instead of a fixed name.
        sentiment = sentiment_scores(row[var], analyzer)
        for key, column in columns:
            df.loc[index, column] = sentiment[key]
        # Progress output every 100 rows.
        if index % 100 == 0:
            print(index)
    return df

In [8]:
def sentiment_scores(sentence, analyzer):
    """ score a piece of text with the given analyzer
    :param sentence: text to create scores for
    :param analyzer: VADER sentiment analyzer (anything with a
        polarity_scores method)
    :return: whatever analyzer.polarity_scores returns for the text;
        for VADER a dictionary of scores (neg, neu, pos, compound)
    """
    return analyzer.polarity_scores(sentence)

## Loop through chunks

In [9]:
# Connection shared by the functions above, which read the module-level
# `conn` (create_connection comes from the project's scraping module).
conn = create_connection(path_db)
# Number of rows in the posts table.
size = get_size(conn)
# Rows per chunk when that row count is divided into nchunks pieces,
# rounded down.
nchunks = 200
chunksize = floor(size / nchunks)

In [10]:
# Single analyzer instance; the scoring functions read this module-level name.
analyzer = SentimentIntensityAnalyzer()

In [11]:
# process_data(chunksize)

In [12]:
# Build the month_sentiment table (scores per user and calendar month).
process_data_month()

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

In [13]:
# Commit any pending transaction, then close the connection.
conn.commit()
conn.close()