### This notebook contains a lexical exploration

In [1]:
!pip install emoji
!pip install emosent-py

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


#### Loading Libs

In [18]:
import sys
sys.path.append("../../libs/leia/")
from leia import SentimentIntensityAnalyzer
import pandas as pd
import numpy as np
import emoji
from emosent import get_emoji_sentiment_rank
from datetime import datetime

#### Loading Classifiers

In [4]:
# Single analyzer instance shared by the notebook; check_text_sentiment()
# calls its polarity_scores() method for every tweet.
leia_classifier = SentimentIntensityAnalyzer()

#### Custom functions

In [5]:
def remove_duplicate_emoji(orig_str):
    """Drop repeated occurrences of the same emoji from a string.

    The string is scanned character by character. A character that is a key
    of ``emoji.EMOJI_DATA`` is skipped when it equals the most recently kept
    emoji; every other character is kept. Non-emoji characters do not reset
    the "last emoji" marker, so ``<e>abc<e>`` also loses its second ``<e>``.

    Parameters
    ----------
    orig_str : str
        Text to clean. Values that cannot be processed as a string of
        characters (e.g. ``None`` or a float such as NaN) are returned
        unchanged.

    Returns
    -------
    str
        The cleaned text, or ``orig_str`` itself when it is not a string-like
        iterable.
    """
    kept_chars = []
    last_emoji = None
    try:
        for char in orig_str:
            if char in emoji.EMOJI_DATA:
                if char == last_emoji:
                    # Same emoji as the last one kept: drop this repetition.
                    continue
                last_emoji = char
            kept_chars.append(char)
        return "".join(kept_chars)
    except TypeError:
        # Non-iterable input (missing values read as float/None) or items that
        # are not characters: best-effort, hand the value back untouched.
        # Any other error is a real problem and is allowed to propagate.
        return orig_str

def check_emoji_sentiment(orig_str):
    """Aggregate the sentiment of the distinct emoji found in a string.

    For every distinct emoji returned by ``emoji.distinct_emoji_list`` the
    ``'negative'``, ``'neutral'``, ``'positive'`` and ``'sentiment_score'``
    entries of ``get_emoji_sentiment_rank`` are summed. The sums are then
    divided by ``negative + neutral + positive`` and rounded to 3 decimals.

    Parameters
    ----------
    orig_str : str
        Text to inspect.

    Returns
    -------
    dict
        ``{'neg': ..., 'neu': ..., 'pos': ..., 'sco': ...}``. All values are
        zero when the text has no emoji, when none of its emoji can be ranked,
        or when the emoji lookup itself fails.

    Notes
    -----
    NOTE(review): ``'sco'`` is the summed ``sentiment_score`` divided by the
    same total as the three shares. That looks questionable if
    ``sentiment_score`` is already a normalised value -- TODO confirm against
    the emosent data before relying on ``'sco'``.
    """
    neg = 0
    neu = 0
    pos = 0
    sco = 0

    try:
        emoji_list = emoji.distinct_emoji_list(orig_str)
    except Exception:
        # Best-effort: input the emoji library cannot handle counts as
        # "no emoji" instead of aborting the whole column computation.
        emoji_list = []

    for item in emoji_list:
        try:
            sentiment_rank = get_emoji_sentiment_rank(item)
            neg += sentiment_rank['negative']
            neu += sentiment_rank['neutral']
            pos += sentiment_rank['positive']
            sco += sentiment_rank['sentiment_score']
        except Exception:
            # Emoji without a usable sentiment ranking are skipped.
            continue

    tot = neg + neu + pos

    if tot == 0:
        # Nothing was ranked: report the raw (zero) sums rather than dividing
        # by zero.
        return {
            'neg': round(neg, 3),
            'neu': round(neu, 3),
            'pos': round(pos, 3),
            'sco': round(sco, 3)
        }

    return {
        'neg': round(neg / tot, 3),
        'neu': round(neu / tot, 3),
        'pos': round(pos / tot, 3),
        'sco': round(sco / tot, 3)
    }

def check_text_sentiment(orig_str):
    """Score the non-emoji part of a text with the shared ``leia_classifier``.

    The text is first passed through ``remove_duplicate_emoji``, then every
    remaining emoji is stripped with ``emoji.replace_emoji`` before the
    classifier's ``polarity_scores`` is called.

    Returns
    -------
    dict
        The ``'neg'``, ``'neu'``, ``'pos'`` and ``'compound'`` entries of the
        classifier result, in that order.
    """
    deduplicated = remove_duplicate_emoji(orig_str)
    text_only = emoji.replace_emoji(deduplicated, replace='')
    scores = leia_classifier.polarity_scores(text_only)

    return {key: scores[key] for key in ('neg', 'neu', 'pos', 'compound')}

def compound_sentiment(text_sentiment, emoji_sentiment):
    """Merge text and emoji sentiment into one normalised distribution.

    The ``'neg'``, ``'neu'`` and ``'pos'`` entries of both inputs are added
    pairwise and each sum is divided by the grand total, so the three results
    add up to (approximately) 1. Other keys in the inputs are ignored.

    Parameters
    ----------
    text_sentiment, emoji_sentiment : dict
        Mappings that contain at least the keys ``'neg'``, ``'neu'``, ``'pos'``.

    Returns
    -------
    dict
        ``{'neg': ..., 'neu': ..., 'pos': ...}`` rounded to 3 decimals. When
        every input value is zero (nothing to normalise) all three entries are
        ``0.0`` instead of raising ``ZeroDivisionError``.

    Raises
    ------
    KeyError
        If either input lacks one of the three keys.
    """
    combined = {
        kind: text_sentiment[kind] + emoji_sentiment[kind]
        for kind in ('neg', 'neu', 'pos')
    }
    tot = sum(combined.values())

    if tot == 0:
        # E.g. a tweet with no scorable text and no ranked emoji.
        return {kind: 0.0 for kind in combined}

    return {kind: round(value / tot, 3) for kind, value in combined.items()}

In [38]:
# Column dtypes for the raw tweet export.
# `str` replaces the removed `np.str` alias (it was the builtin `str` anyway).
# NOTE(review): the id columns are read as float64; values of the magnitude
# shown in the previews (~1e18) exceed 2**53, so they may not round-trip
# exactly. Consider reading ids as strings if exact ids are needed.
d_type = {
    'text': str,
    'author_id': np.float64,
    'conversation_id': np.float64,
    'created_at': str,
    'source': str,
    'tweet_id': np.float64,
    'geo_id': str,
    'retweet_count': np.float64,
    'reply_count': np.float64,
    'like_count': np.float64,
    'quote_count': np.float64,
    'in_reply_to_user_id': np.float64,
    'referenced_type': str,
    'referenced_id': np.float64
}

# With `chunksize`, read_csv yields DataFrames of up to 10000 rows instead of
# returning one DataFrame, so the chunks are concatenated: the cells below
# index `df_tweets` by column name and need a real DataFrame.
# `on_bad_lines='warn'` skips malformed rows with a warning, which is what the
# removed `error_bad_lines=False` did.
with pd.read_csv(
    '../../src/application/data/tweets_brazil.csv',
    sep=',',
    dtype=d_type,
    on_bad_lines='warn',
    chunksize=10000,
) as tweet_chunks:
    df_tweets = pd.concat(tweet_chunks)

  interactivity=interactivity, compiler=compiler, result=result)


#### Pre-processing

Removing excessive `emojis`

In [64]:
# Clean every tweet with remove_duplicate_emoji; the result overwrites the
# 'text' column.
df_tweets['text'] = df_tweets['text'].apply(remove_duplicate_emoji)

Splitting sentiment between `emojis` and `text`

`emoji`

In [65]:
# One dict per tweet, as returned by check_emoji_sentiment. Each value is cast
# with str() first, so non-string cells are scored as their text form.
df_tweets['emojis_sentiment'] = df_tweets['text'].apply(lambda tweet: check_emoji_sentiment(str(tweet)))

`text`

In [66]:
# One dict per tweet, as returned by check_text_sentiment. Each value is cast
# with str() first, so non-string cells are scored as their text form.
df_tweets['text_sentiment'] = df_tweets['text'].apply(lambda tweet: check_text_sentiment(str(tweet)))

In [68]:
# Expose the 'compound' entry of each text_sentiment dict as its own numeric
# column. Only that one column is needed, so it is mapped directly instead of
# applying a function to every full row with axis=1.
df_tweets['sentiment_value'] = df_tweets['text_sentiment'].apply(
    lambda scores: scores['compound']
)

In [72]:
# Preview the first rows, including the derived emojis_sentiment,
# text_sentiment and sentiment_value columns.
df_tweets.head()

Unnamed: 0,text,author_id,conversation_id,created_at,source,tweet_id,geo_id,retweet_count,reply_count,like_count,quote_count,in_reply_to_user_id,referenced_type,referenced_id,emojis_sentiment,text_sentiment,sentiment_value
0,no gente cês parece retardado de querer saber ...,2800738000.0,1.168307e+18,2019-09-01 23:38:37,Twitter for Android,1.168307e+18,d9d978b087a92583,0.0,1.0,3.0,0.0,,,,"{'neg': 0, 'neu': 0, 'pos': 0}","{'neg': 0.247, 'neu': 0.753, 'pos': 0.0, 'comp...",-0.7845
1,q vírus é esse vei 🤬😡,1.010376e+18,1.168306e+18,2019-09-01 23:35:29,Twitter for Android,1.168306e+18,d9d978b087a92583,2.0,0.0,1.0,0.0,,,,"{'neg': 0.533, 'neu': 0.107, 'pos': 0.36}","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0
2,n posso toma duas corona q já fico feliz credo,2969021000.0,1.168304e+18,2019-09-01 23:26:26,Twitter for Android,1.168304e+18,5722ff20ba67083b,0.0,2.0,1.0,0.0,,,,"{'neg': 0, 'neu': 0, 'pos': 0}","{'neg': 0.0, 'neu': 0.631, 'pos': 0.369, 'comp...",0.6249
3,era só uma corona geladinha agora poxa vida,63291940.0,1.1683e+18,2019-09-01 23:08:58,Twitter for Android,1.1683e+18,3b5c5c9c62f7c538,0.0,1.0,1.0,0.0,,,,"{'neg': 0, 'neu': 0, 'pos': 0}","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0
4,@SuavinBrito To bebendo corona de leve brota,161037600.0,1.168298e+18,2019-09-01 23:04:25,Twitter for Android,1.168299e+18,97bcdfca1a2dca59,1.0,1.0,1.0,0.0,3431821000.0,replied_to,1.168298e+18,"{'neg': 0, 'neu': 0, 'pos': 0}","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0


In [73]:
# Persist the enriched frame; index=False leaves the DataFrame index out of
# the file.
# NOTE(review): emojis_sentiment / text_sentiment hold dicts; in a CSV they are
# presumably stored as their text form and come back as plain strings --
# verify before consuming them downstream.
df_tweets.to_csv('pre_proc_data.csv', index=False, header=True)

In [75]:
# Read the exported file back and preview it as a sanity check of the export.
pd.read_csv('pre_proc_data.csv', sep=',').head()

Unnamed: 0,text,author_id,conversation_id,created_at,source,tweet_id,geo_id,retweet_count,reply_count,like_count,quote_count,in_reply_to_user_id,referenced_type,referenced_id,emojis_sentiment,text_sentiment,sentiment_value
0,no gente cês parece retardado de querer saber ...,2800738000.0,1.168307e+18,2019-09-01 23:38:37,Twitter for Android,1.168307e+18,d9d978b087a92583,0.0,1.0,3.0,0.0,,,,"{'neg': 0, 'neu': 0, 'pos': 0}","{'neg': 0.247, 'neu': 0.753, 'pos': 0.0, 'comp...",-0.7845
1,q vírus é esse vei 🤬😡,1.010376e+18,1.168306e+18,2019-09-01 23:35:29,Twitter for Android,1.168306e+18,d9d978b087a92583,2.0,0.0,1.0,0.0,,,,"{'neg': 0.533, 'neu': 0.107, 'pos': 0.36}","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0
2,n posso toma duas corona q já fico feliz credo,2969021000.0,1.168304e+18,2019-09-01 23:26:26,Twitter for Android,1.168304e+18,5722ff20ba67083b,0.0,2.0,1.0,0.0,,,,"{'neg': 0, 'neu': 0, 'pos': 0}","{'neg': 0.0, 'neu': 0.631, 'pos': 0.369, 'comp...",0.6249
3,era só uma corona geladinha agora poxa vida,63291940.0,1.1683e+18,2019-09-01 23:08:58,Twitter for Android,1.1683e+18,3b5c5c9c62f7c538,0.0,1.0,1.0,0.0,,,,"{'neg': 0, 'neu': 0, 'pos': 0}","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0
4,@SuavinBrito To bebendo corona de leve brota,161037600.0,1.168298e+18,2019-09-01 23:04:25,Twitter for Android,1.168299e+18,97bcdfca1a2dca59,1.0,1.0,1.0,0.0,3431821000.0,replied_to,1.168298e+18,"{'neg': 0, 'neu': 0, 'pos': 0}","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0
