# Generate sentiment and emotion by day

## Imports

In [1]:
import sqlite3
import pandas as pd
import numpy as np
# from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from pathlib import Path
# from scraping import create_connection
# from tqdm.notebook import tqdm
# from math import floor
# import lemmatize as l
import dask.dataframe as dd
from dask.multiprocessing import get
import time
from torch import tensor
from scipy.special import softmax


## File Locations

In [2]:
# The notebook is expected to run two directory levels below the project root;
# the database lives under <root>/database.
p = Path.cwd()
path_parent = p.parents[1]
path_db = str(path_parent.joinpath("database", "netmums-merged.db"))

In [3]:
# Pickle locations under <root>/clean_data/netmums.
# The two "emote" paths are templates: "{}" is filled with a chunk number via str.format.
path_clean_data = path_parent.joinpath("clean_data", "netmums")
path_text_pkl = str(path_clean_data.joinpath("daily_clean_text.pkl"))
path_emote_text_pkl = str(path_clean_data.joinpath("daily_emote_clean_text_{}.pkl"))
path_emote_processed_pkl = str(path_clean_data.joinpath("daily_emote_processed_{}.pkl"))

## Get Daily Text
Run this section in the `forum` environment (`env=forum`).

In [None]:
# Select every non-empty cleaned post written by a non-anonymous user, together
# with its sub-forum name, author and creation timestamp.
# String literals are single-quoted: in SQL, double quotes denote identifiers,
# and SQLite only treats "Anonymous" as a string through a compatibility
# fallback that can be disabled at build time.
sql = """
    SELECT
        text.text_clean AS text_clean,
        s.name AS subforum_name,
        p.user_url AS user_url,
        p.date_created AS date_created
    FROM text
    LEFT JOIN posts AS p
    ON text.post_id = p.id
    LEFT JOIN threads AS t
    ON t.id=p.thread_id
    LEFT JOIN subforums AS s
    ON s.id=t.subforum_id
    LEFT JOIN forums AS f
    ON f.id=s.forum_id
    WHERE text.text_clean<>''
    AND p.user_url<>'Anonymous'
"""

In [None]:
# `create_connection` is not imported in this notebook (its import from
# `scraping` is commented out), so this cell used to fail with NameError on a
# fresh kernel. Open the database with the already-imported sqlite3 instead.
# NOTE(review): assumes create_connection was a plain wrapper around
# sqlite3.connect — confirm against scraping.py.
conn = sqlite3.connect(path_db)
try:
    df = pd.read_sql_query(sql, conn)
finally:
    # Release the database handle even if the query raises.
    conn.close()

In [None]:
# Concatenate each user's posts per calendar day into one space-separated string.
# The result is a Series named 'text' indexed by (user_url, day).
ddf = dd.from_pandas(df, npartitions=200)
ddf = ddf.assign(date_created=dd.to_datetime(ddf['date_created']))
ddf = ddf.assign(day=ddf['date_created'].dt.date)
daily_groups = ddf[['user_url', 'day', 'text_clean']].groupby(["user_url", "day"])
ddf = daily_groups["text_clean"].apply(' '.join, meta=('text', 'object'))
df = ddf.compute(scheduler='processes')

In [None]:
# Move the (user_url, day) index levels back into ordinary columns
# (reset_index keeps the old index as columns by default).
df = df.reset_index()

In [None]:
# Cache the daily text so later sections can start from the file.
pd.to_pickle(df, path_text_pkl)

## Sentiment

In [None]:
# Reload the daily text that an earlier cell pickled to path_text_pkl, so this
# section can run without repeating the database query.
df = pd.read_pickle(path_text_pkl)

In [None]:
# The import of SentimentIntensityAnalyzer is commented out in the notebook's
# import cell, so this cell raised NameError on a fresh kernel. Import it here,
# next to its only use.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

In [None]:
# Score each day's text; every cell of 'sentiment' holds whatever
# analyzer.polarity_scores returns for that text.
df['sentiment'] = df['text'].apply(analyzer.polarity_scores)

In [None]:
# Overwrite the daily-text pickle so it now also carries the 'sentiment' column.
pd.to_pickle(df, path_text_pkl)

## Chunk Text
Run this section in the `hf` environment (`env=hf`).

In [4]:
import os

from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Directory holding the emotion model and its tokenizer. The default is the
# original machine-specific location; set the EMOTION_MODEL_PATH environment
# variable to load the model from somewhere else without editing the notebook.
path_model = os.environ.get('EMOTION_MODEL_PATH', '/home/mwh/cardiffnlp/twitter-roberta-base-emotion/')
model = AutoModelForSequenceClassification.from_pretrained(path_model)
tokenizer = AutoTokenizer.from_pretrained(path_model)

In [5]:
def split_text(text, max_size):
    """Tokenize `text` and cut the token ids into chunks of at most `max_size` ids.

    Words longer than 25 characters (splitting on single spaces) are dropped
    before tokenizing. The first and last ids returned by the module-level
    `tokenizer.encode` are discarded, and every returned chunk is wrapped as
    ``[0] + ids + [2]`` (presumably the tokenizer's start/end special tokens —
    TODO confirm).

    When the text has `max_size` ids or more, it is divided into the smallest
    number of chunks that fit, with the ids spread as evenly as possible.

    Parameters
    ----------
    text : str
        Text to tokenize.
    max_size : int
        Maximum number of ids per chunk, not counting the two wrapper ids.

    Returns
    -------
    list of list of int
        One wrapped id list per chunk; always at least one for a positive
        `max_size`.
    """
    kept_words = [w for w in text.split(" ") if not len(w) > 25]
    token_ids = tokenizer.encode(" ".join(kept_words))[1:-1]
    total = len(token_ids)

    # Short text: a single chunk holds everything.
    if total < max_size:
        return [[0] + token_ids + [2]]

    # Ceiling division: how many chunks are needed at most `max_size` each ...
    whole, rest = divmod(total, max_size)
    n_chunks = whole + (rest > 0)
    # ... and how many ids each chunk gets when they are spread evenly.
    whole, rest = divmod(total, n_chunks)
    chunk_size = whole + (rest > 0)

    chunks = []
    for offset in range(0, total, chunk_size):
        chunks.append([0] + token_ids[offset:offset + chunk_size] + [2])
    return chunks

In [6]:
def make_tensor(encoded_list):
    """Wrap one list of token ids as a batch-of-one model input.

    Parameters
    ----------
    encoded_list : list of int
        Token ids for a single chunk.

    Returns
    -------
    dict
        ``'input_ids'``: tensor holding `encoded_list` as its only row;
        ``'attention_mask'``: tensor of ones with the same single-row layout.
    """
    ids = tensor([encoded_list])
    # Every position is a real token, so the mask is all ones.
    mask = tensor([[1] * len(encoded_list)])
    return {'input_ids': ids, 'attention_mask': mask}

In [7]:
def get_emotion_scores(encoded_tensor):
    """Run the module-level `model` on one encoded chunk and return class probabilities.

    Parameters
    ----------
    encoded_tensor : dict
        Keyword arguments for `model`, e.g. the ``input_ids`` /
        ``attention_mask`` dict built by `make_tensor`.

    Returns
    -------
    list
        Softmax of ``output[0][0]`` (the first row of the model's first
        output), one value per class; the values sum to 1.
    """
    # Imported locally because the notebook only imports `tensor` from torch.
    from torch import no_grad

    # Pure inference: disable autograd so no graph is built for the forward
    # pass, which saves memory and time on every call.
    with no_grad():
        output = model(**encoded_tensor)
    logits = output[0][0].detach().numpy()
    return list(softmax(logits))

In [8]:
# Keep only users that appear on more than one day in the daily-text table.
text_df = pd.read_pickle(path_text_pkl)
days_per_user = text_df.groupby("user_url")['day'].count()
more_than_one = days_per_user.loc[days_per_user > 1].reset_index()
keep_rows = text_df['user_url'].isin(more_than_one['user_url'])
text_df = text_df.loc[keep_rows].reset_index(drop=True)

In [9]:
# Split the rows from position 1,000,000 onward into two halves.
# Each entry is (chunk number, first row position, end row position).
start = 1_000_000
remaining = len(text_df) - start
half = remaining // 2
first_end = start + half
ranges = [(1, start, first_end), (2, first_end, start + remaining)]

In [11]:
# Column names for the four values returned by get_emotion_scores.
# NOTE(review): the order must match the model's output label order — confirm
# against the model's config.
emotion_labels = ['anger', 'joy', 'optimism', 'sadness']

# For each (chunk number, first row, end row) in `ranges`: tokenize and chunk the
# text, put one chunk per row, build model inputs, score them, and pickle the
# result. Intermediate frames are pickled after each stage to the same
# per-chunk path, each save overwriting the previous one.
# The loop variables are named row_start/row_end so the loop does not overwrite
# the `start` variable defined by the cell that builds `ranges`.
for i, row_start, row_end in ranges:
    start_time = time.time()
    df = text_df.iloc[row_start:row_end].copy()

    print("splitting text")
    ddf = dd.from_pandas(df, npartitions=200)
    # At most 500 token ids per chunk (split_text adds two wrapper ids on top).
    ddf['encoded'] = ddf.apply(lambda x: split_text(x['text'], 500), axis=1, meta=list)
    df = ddf.compute(scheduler='processes')
    df.to_pickle(path_emote_text_pkl.format(i))
    split_time = time.time()
    print("time: {}".format((split_time - start_time) / 60))

    print("stacking encoded words")
    # Expand each row's list of chunks into one row per chunk.
    df = df.set_index(['user_url', 'day'])['encoded'].apply(pd.Series).stack().reset_index()
    df.columns = ['user_url', 'day', 'list_number', 'encoded']
    df = df[['user_url','day','encoded']]
    df.to_pickle(path_emote_text_pkl.format(i))
    stack_time = time.time()
    print("time: {}".format((stack_time - split_time) / 60))

    print("making tensors")
    ddf = dd.from_pandas(df, npartitions=200)
    ddf['encoded'] = ddf.apply(lambda x: make_tensor(x['encoded']), axis=1, meta=dict)
    df = ddf.compute(scheduler='processes')
    df.to_pickle(path_emote_text_pkl.format(i))
    tensor_time = time.time()
    print("time: {}".format((tensor_time - stack_time) / 60))

    print("get emotion scores")
    ddf = dd.from_pandas(df, npartitions=10)
    ddf['scores'] = ddf.apply(lambda x: get_emotion_scores(x['encoded']), axis=1, meta=list)
    df = ddf.compute(scheduler='processes')
    # Take an explicit copy: adding columns to a column-subset slice triggers
    # pandas' chained-assignment (SettingWithCopy) warning.
    df = df[['user_url', 'day', 'scores']].copy()
    df[emotion_labels] = pd.DataFrame(df["scores"].to_list(), index=df.index)
    df.to_pickle(path_emote_processed_pkl.format(i))
    emotion_time = time.time()
    print("time: {}".format((emotion_time - tensor_time) / 60))
    print("total loop time: {}".format((emotion_time - start_time) / 60))

splitting text
time: 0.7976126670837402
stacking encoded words
time: 1.1489242553710937
making tensors
time: 1.839649740854899
get emotion scores
time: 800.1931878527006
total loop time: 803.9793745160102
splitting text
time: 0.8340093731880188
stacking encoded words
time: 1.207130761941274
making tensors
time: 1.8554017265637717
get emotion scores
time: 800.7881178657213
total loop time: 804.6846597274144


In [12]:
# Preview the first five rows of the most recently processed chunk.
df.head(5)

Unnamed: 0,user_url,day,scores,anger,joy,optimism,sadness
0,lisa-w-2207,2016-10-12,"[0.9061063, 0.0063915206, 0.027199049, 0.06030...",0.906106,0.006392,0.027199,0.060303
1,lisa-w-2207,2016-10-12,"[0.7534732, 0.0097232545, 0.06107918, 0.17572428]",0.753473,0.009723,0.061079,0.175724
2,lisa1st-b,2018-01-26,"[0.0438436, 0.031566557, 0.054847788, 0.8697421]",0.043844,0.031567,0.054848,0.869742
3,lisa1st-b,2018-11-15,"[0.5460365, 0.037408844, 0.010406182, 0.4061484]",0.546036,0.037409,0.010406,0.406148
4,lisa1st-b,2018-11-15,"[0.036367387, 0.014276224, 0.010516538, 0.9388...",0.036367,0.014276,0.010517,0.93884
