# sentiment analysis
- requires a normalized dataset in which emojis have been converted to text
- outputs AFINN sentiment values for English and Swedish text

In [1]:
import numpy as np
import pandas as pd
import csv
# import matplotlib.pyplot as plt
# import seaborn as sns
import nltk
from datetime import datetime
import string
import sys
# afinn sentiment analysis
sys.path.append("/home/sol-nhl/dev/r-cran/scom-org/get/afinn/")
from afinn import Afinn


# function test

In [2]:
# use %cpaste and -- in ipython console
def small_test(dfn=None):
    """Return a small ``id``/``text`` frame for trying out the pipeline.

    Parameters
    ----------
    dfn : pandas.DataFrame, optional
        Source data holding ``post_id`` and ``post_message`` columns. When
        omitted, a three-row synthetic sample is built instead.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``id`` and ``text``. With ``dfn`` given this is an
        independent copy of its first 100 rows; ``dfn`` itself is untouched.
    """
    # `not dfn` would raise on a DataFrame (ambiguous truth value), so test
    # explicitly for the "nothing passed" sentinel.
    if dfn is None:
        # great things have small beginnings
        sent = "an amazing and excellent foo bar, but terrible and confusing."
        cols = ['id', 'text']
        # mixing ints and strings makes numpy store every cell, ids included, as text
        data = np.array([[1, 2, 3], [sent, '', sent]]).T
        return pd.DataFrame(data, columns=cols)
    # or, with some actual data; copy so the rename never touches a view of dfn
    dfs = dfn[['post_id', 'post_message']].head(100).copy()
    dfs.columns = ['id', 'text']
    return dfs


# function tokenize text

In [3]:
# use %cpaste and -- in ipython console
def unnest_tokens(dfs, lang='english'):
    """Tokenize each text and return one row per remaining token.

    Texts are lower-cased and split on whitespace; tokens that are NLTK stop
    words for ``lang`` or single punctuation characters are dropped.

    Parameters
    ----------
    dfs : pandas.DataFrame
        Exactly two columns, interpreted positionally as ``id`` and ``text``.
        The caller's frame is not modified.
    lang : str, default 'english'
        Language name passed to ``nltk.corpus.stopwords.words``.

    Returns
    -------
    pandas.DataFrame
        A single ``tokens`` column indexed by a two-level MultiIndex: ``id``
        and the (unnamed) position of the token within its filtered list.
        Rows whose text is missing or yields no tokens contribute nothing.
    """
    print("ut: lang "+lang)
    # copy first so the positional rename below never reaches the caller's frame
    dfs = dfs.copy()
    dfs.columns = ['id', 'text']
    # stop words and punctuation marks; a set keeps each membership test O(1)
    stop = set(nltk.corpus.stopwords.words(lang)) | set(string.punctuation)
    # ################## tokenize text
    raw_tokens = dfs['text'].str.lower().str.split()
    token_lists = [
        # a missing text comes back from .str.split() as NaN, not a list
        [tok for tok in toks if tok not in stop] if isinstance(toks, list) else []
        for toks in raw_tokens
    ]
    # ################## unnest tokens
    # repeat every id once per surviving token and number the tokens per row
    counts = [len(toks) for toks in token_lists]
    ids = np.repeat(dfs['id'].to_numpy(), counts)
    positions = np.array(
        [pos for n_tok in counts for pos in range(n_tok)], dtype='int64'
    )
    flat_tokens = [tok for toks in token_lists for tok in toks]
    index = pd.MultiIndex.from_arrays([ids, positions], names=['id', None])
    dft = pd.DataFrame({'tokens': pd.Series(flat_tokens, index=index, dtype=object)})
    #
    return dft


# function sentiment analysis

In [4]:
def sentiment_analysis(dft, slex='afinn', lang='en', emot=True):
    """Score every token and attach the valence columns to ``dft`` in place.

    Parameters
    ----------
    dft : pandas.DataFrame
        Frame with a ``tokens`` column. It is modified in place: the columns
        below are added and its index is moved into regular columns.
    slex : str, default 'afinn'
        Not used by this function.
    lang : str, default 'en'
        Forwarded to ``Afinn`` as ``language``.
    emot : bool, default True
        Forwarded to ``Afinn`` as ``emoticons``.

    Returns
    -------
    pandas.DataFrame
        The same ``dft`` object, now carrying ``sa_val`` (token score),
        ``sa_pos`` (score where > 0, else NaN), ``sa_neg`` (score where < 0,
        else NaN) and ``sa_frq`` (1.0 where the score is non-zero, else NaN).
    """
    print("sa: lang " + lang + ", aggregate")
    # ################## sentiment analysis, valence
    scorer = Afinn(language=lang, emoticons=emot)
    valence = dft['tokens'].apply(scorer.score)
    dft['sa_val'] = valence
    # keep only the positive / only the negative scores, NaN elsewhere
    dft['sa_pos'] = valence.where(valence > 0)
    dft['sa_neg'] = valence.where(valence < 0)
    # turn the index levels into ordinary columns before adding the flag
    dft.reset_index(inplace=True)
    scored = dft['sa_val'] != 0
    dft['sa_frq'] = np.where(scored, 1.0, np.nan)
    print(dft['sa_frq'].shape)
    #
    return dft


# function aggregate sentiment values over posts

In [5]:
def aggregate_tokens(dft):
    """Sum the per-token sentiment columns for every ``id``.

    Parameters
    ----------
    dft : pandas.DataFrame
        Token-level frame with an ``id`` column and the numeric columns
        ``sa_val``, ``sa_pos``, ``sa_neg`` and ``sa_frq``.

    Returns
    -------
    pandas.DataFrame
        One row per ``id`` with columns ``id``, ``sa_val``, ``sa_pos``,
        ``sa_neg``, ``sa_frq``. NaN entries are skipped by the sum, so a
        group with no values in a column gets 0 there.
    """
    value_cols = ['sa_val', 'sa_pos', 'sa_neg', 'sa_frq']
    # select with a list: tuple-style groupby[...] selection was deprecated in
    # pandas 1.0 and raises in pandas 2.0
    dfa = dft.groupby('id')[value_cols].sum().reset_index()
    #
    return dfa


# function merge emojis

In [6]:
def merge_emojis(dfa, dfa_em):
    """Add emoji sentiment totals onto the text sentiment totals per ``id``.

    Parameters
    ----------
    dfa : pandas.DataFrame
        Text aggregates with columns ``id``, ``sa_val``, ``sa_pos``,
        ``sa_neg``, ``sa_frq``. Every row of this frame is kept.
    dfa_em : pandas.DataFrame
        Emoji aggregates with the same columns; ids may be missing.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``sa_val``, ``sa_pos``, ``sa_neg``, ``sa_frq`` holding
        text + emoji values. An id without emoji data keeps its text values.

    Side effects
    ------------
    Writes the full merged frame (including the ``*_x`` / ``*_y`` inputs) to
    ``tmp/sa-text-emoji.csv``, creating the ``tmp`` directory when needed.
    """
    import os  # local import: only needed for the check-output dump below

    dfe = pd.merge(dfa, dfa_em, how='left', on=['id'])
    for col in ('sa_val', 'sa_pos', 'sa_neg', 'sa_frq'):
        # ids absent from dfa_em have NaN in the *_y column after the left
        # join; treat that as 0 so the text value is not turned into NaN
        dfe[col] = dfe[col + '_x'].add(dfe[col + '_y'], fill_value=0)
    # check output
    os.makedirs('tmp', exist_ok=True)
    dfe.to_csv('tmp/sa-text-emoji.csv', sep='\t', quoting=csv.QUOTE_NONNUMERIC, header=True, index=False)
    #
    return dfe[['id', 'sa_val', 'sa_pos', 'sa_neg', 'sa_frq']]


# main script

In [9]:
if __name__ == '__main__':
    # ################## read, clean dataset
    # dfn = pd.read_csv('csv/civil-society-190415.csv', sep='\t')
    # dfn = pd.read_csv('csv/civil-society-191118.csv', sep='\t')
    # new ~100K facebook dataset 210304
    dfn = pd.read_csv('../csv/fb-emoji-text.csv', sep='\t')
    dfn.post_message.replace(np.nan, 'string', inplace=True)
    # added emoji analysis 191118
    dfn.emoji.replace(np.nan, 'string', inplace=True)
    dfn.post_lang.replace('se', 'sv', inplace=True)
