# NLP Analysis

1. Define a function that computes what percentage of the dataset is positive, negative, or neutral.
1. Loop that function over all chunks.

According to [geeksforgeeks](https://www.geeksforgeeks.org/python-sentiment-analysis-using-vader/):

> The Compound score is a metric that calculates the sum of all the lexicon ratings which have been normalized between -1(most extreme negative) and +1 (most extreme positive).
> * positive sentiment : (compound score >= 0.05) 
> * neutral sentiment : (compound score > -0.05) and (compound score < 0.05) 
> * negative sentiment : (compound score <= -0.05)

In [1]:
import pandas as pd
from tqdm.notebook import tqdm as prog_bar
import json
from nltk.sentiment import SentimentIntensityAnalyzer
from pillaralgos_dev import dev_helpers as dev
from pillaralgos.helpers import data_handler as dh

In [2]:
# Load the exported chat log. The context manager closes the file handle, and the
# explicit UTF-8 encoding keeps emoji/accented text readable regardless of the
# platform's default encoding.
with open('data/1028232346.json', encoding='utf-8') as chat_file:
    data = json.load(chat_file)

In [3]:
class sentimentRanker():
    """
    Ranks fixed-length chunks of a Twitch chat log by their mean sentiment scores.

    Every message in a chunk is scored with NLTK's `SentimentIntensityAnalyzer`,
    the scores are averaged per chunk, and the chunks are handed to
    `dh.results_jsonified` for sorting and serialisation.
    """

    # Analyzer instance, created on first use by `sent_analysis` and then reused
    # for every message instead of being rebuilt per call.
    _analyzer = None

    def __init__(self, data, sort_by='abs_overall', limit=10, chunk_length=2, save_json=False):
        """
        Gets data ready for sentiment analysis: organizes the chat and splits it into chunks.

        input
        -----
        data: list
            List of dictionaries, a json file opened with json.load(open(file)).
            `data[0]["content_id"]` is used as the video id.
        sort_by: str
            Options:
                "positive" - strength of positive sentiment
                "negative" - strength of negative sentiment
                "neutral" - strength of neutral sentiment
                "compound" - overall sentiment where >= 0.05 is positive, <= -0.05 is negative, in between is neutral
                "abs_overall" - sort by the absolute value of "compound", resulting in a mixture of positive and negative (but not neutral) chat timestamps
            Return timestamps with the highest `sort_by` value.
            NOTE(review): the score columns produced by this class are named
            neg / neu / pos / compound / abs_overall - confirm how
            `dh.results_jsonified` maps the option names above onto them.
        limit: int, None
            int: Return only the top X timestamps
            None: Return all timestamps
        chunk_length: int
            How long timestamps returned should be, in minutes.
        save_json: bool
            If True, `run` also passes the results to `dh.save_json`.
        """
        self.big_df = dh.organize_twitch_chat(data)  # organize
        self.first_stamp, self.chunks_list = dh.get_chunks(
            self.big_df, min_=chunk_length
        )  # first timestamp + list of X min chunks
        self.vid_id = data[0]["content_id"]

        self.sort_by = sort_by
        self.limit = limit
        self.save_json = save_json

    def run(self):
        """
        Scores every chunk and returns the (optionally truncated) jsonified results.

        The per-chunk DataFrame is also stored on `self.results`.

        output
        ------
        The value returned by `dh.results_jsonified`, sliced to `limit` entries
        when `limit` is an int. If the organized chat is not a DataFrame, an
        empty numpy array is returned instead.
        """
        import numpy as np

        if not isinstance(self.big_df, pd.DataFrame):
            return np.array([])  # nothing to analyse

        results = self.thalamus()
        self.results = results
        # results_jsonified sorts by top calc
        json_results = dh.results_jsonified(results, self.first_stamp, self.sort_by)
        if isinstance(self.limit, int):
            # grab only the top X
            json_results = json_results[: self.limit]

        if self.save_json:
            dh.save_json(json_results, f"algo3.6_{self.sort_by}")

        return json_results

    def thalamus(self):
        """
        Builds one summary row per chunk and stacks them into a single DataFrame.

        Empty chunks are skipped because they have no timestamps or messages to
        summarise. Rows are collected in a list and concatenated once, which
        avoids `DataFrame.append` (removed in pandas 2.0).
        """
        chunk_rows = []
        for chunk in prog_bar(self.chunks_list):
            if chunk.empty:
                continue
            chunk_rows.append(self._summarize_chunk(chunk))

        # pd.concat raises on an empty list, so fall back to an empty frame.
        chunk_data = pd.concat(chunk_rows) if chunk_rows else pd.DataFrame()
        return self.finalizer(chunk_data)

    def _summarize_chunk(self, chunk: pd.DataFrame) -> pd.DataFrame:
        """
        Returns a one-row DataFrame describing a single non-empty chunk.

        Columns: the mean of each sentiment score over the chunk's messages,
        followed by `start`, `end`, `messages`, `overall` and `abs_overall`.
        """
        ordered = chunk.sort_values('created_at')
        # First column of the earliest / latest row.
        # NOTE(review): assumes column 0 of each chunk is the timestamp - confirm.
        start = ordered.iloc[0, 0]
        end = ordered.iloc[-1, 0]

        bodies = list(chunk['body'])
        scores = pd.DataFrame(
            [self.sent_analysis(body, expand=False) for body in bodies]
        )
        # numeric_only guards against non-numeric columns, which `mean` no
        # longer drops silently on current pandas.
        means_dict = scores.mean(numeric_only=True).to_dict()

        # Each summary row keeps index label 0.
        chunk_means = pd.DataFrame(means_dict, index=[0])
        chunk_means['start'] = start
        chunk_means['end'] = end
        # Messages are joined in the chunk's own row order, each preceded by a space.
        chunk_means['messages'] = "".join(f" {body}" for body in bodies)
        chunk_means['overall'] = chunk_means['compound'].apply(self.cat_compound)
        chunk_means['abs_overall'] = chunk_means['compound'].abs()
        return chunk_means

    def finalizer(self, dataframe: pd.DataFrame) -> pd.DataFrame:
        """
        Adds the `vid_id` column to `dataframe` (in place) and returns it.
        """
        dataframe['vid_id'] = self.vid_id
        return dataframe

    def sent_analysis(self, cell: str, expand=True) -> "pd.DataFrame | dict":
        '''
        Gives sentiment scores to strings

        input
        -----
        cell: str
            Text to score.
        expand: bool
            True: return a one-row DataFrame of the scores plus a `body` column holding `cell`
            False: return the raw dict from `polarity_scores`
        '''
        if self._analyzer is None:
            # Build the analyzer once and reuse it for all later messages.
            self._analyzer = SentimentIntensityAnalyzer()
        result = self._analyzer.polarity_scores(cell)

        if expand:
            result = pd.DataFrame.from_dict(result, orient='index').T
            result['body'] = cell

        return result

    def cat_compound(self, score: float) -> str:
        '''
        Helper function that categorizes each compound score as neg,pos,neutral

        Compound score represents the overall sentiment of a sentence.
        Scores >= 0.05 are "Positive", scores <= -0.05 are "Negative",
        everything in between is "Neutral".
        '''
        if score >= 0.05:
            return "Positive"
        if score <= -0.05:
            return "Negative"
        return "Neutral"

In [4]:
# Build the ranker with its default settings and run the analysis. `run` also
# stores the per-chunk DataFrame on `sr.results`, which the following cells inspect.
sr = sentimentRanker(data)
res = sr.run()

  0%|          | 0/104 [00:00<?, ?it/s]

In [5]:
# Show the per-chunk scores ordered from the highest to the lowest mean compound score.
sr.results.sort_values('compound', ascending=False)

Unnamed: 0,neg,neu,pos,compound,start,end,messages,overall,abs_overall,vid_id,first_sec
0,0.000000,0.881750,0.118250,0.194633,2021-05-19 21:57:39.487,2021-05-19 21:59:27.574,😂😂😂😂 Deso deso 😭 allez un pti bonus y a une n...,Positive,0.194633,1028232346,2021-05-19 18:46:39.584
0,0.016059,0.884471,0.099471,0.166888,2021-05-19 22:12:43.196,2021-05-19 22:14:39.035,Main sur le coeur et bonne chance pour ton pr...,Positive,0.166888,1028232346,2021-05-19 18:46:39.584
0,0.041625,0.815125,0.143250,0.150612,2021-05-19 22:16:52.469,2021-05-19 22:18:50.707,C’est bon je suis dedans merci @jennyie_tw Sa...,Positive,0.150612,1028232346,2021-05-19 18:46:39.584
0,0.009947,0.855211,0.134842,0.131742,2021-05-19 21:16:03.871,2021-05-19 21:18:03.290,@biiiibiiii77 je t'explique !wager Le wager e...,Positive,0.131742,1028232346,2021-05-19 18:46:39.584
0,0.043000,0.728267,0.228733,0.129147,2021-05-19 20:39:06.667,2021-05-19 20:40:58.857,en tous cas trop hate de voir les games BONUS...,Positive,0.129147,1028232346,2021-05-19 18:46:39.584
...,...,...,...,...,...,...,...,...,...,...,...
0,0.078250,0.921750,0.000000,-0.053706,2021-05-19 18:46:39.584,2021-05-19 18:48:35.995,j'ai eu la notif moi mercii @TW_Aariel du pre...,Negative,0.053706,1028232346,2021-05-19 18:46:39.584
0,0.055900,0.898500,0.012267,-0.055997,2021-05-19 21:09:43.967,2021-05-19 21:11:41.631,@TW_Aariel LUL @bouskat jsuis pas méchante ma...,Negative,0.055997,1028232346,2021-05-19 18:46:39.584
0,0.132231,0.844615,0.023154,-0.092315,2021-05-19 21:20:09.958,2021-05-19 21:22:02.268,"🚀 Level UP pour Biiiibiiii77 (lvl 27), Iibonj...",Negative,0.092315,1028232346,2021-05-19 18:46:39.584
0,0.145143,0.854857,0.000000,-0.105986,2021-05-19 21:51:17.800,2021-05-19 21:52:41.498,Rejoignez mon Discord en cliquant ici : https...,Negative,0.105986,1028232346,2021-05-19 18:46:39.584


In [6]:
# Count how many chunks were labelled Positive, Negative, or Neutral.
sr.results['overall'].value_counts()

Neutral     79
Positive    19
Negative     6
Name: overall, dtype: int64