# **Sentiment Analysis**

In [1170]:
import pandas as pd
import numpy as np
from datetime import datetime

import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.express as px

from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
import scipy.stats as stats

## **Preprocessing**

In [1171]:
import re
from textblob import TextBlob
from textblob import WordList

import nltk 
nltk.download('brown')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger') 
nltk.download('vader_lexicon') 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

stopwords = set(stopwords.words('english'))
additional_stopwords = {
    "huh", 
    "oh",
    "ayo",
    "rt", # retweet
    "lrt", # last retweet
    "im",
    "gon",
    "na",
    'ca',
    'nt',
    'wan',
    'na',
    'lol',
    'lmao',
    'rofl',
    'lmfao',
    'hi',
    'hello',
    'haha',
    'hahaha',
    'eh',
    'dah',
    'la',
    'lah',
    "ka",
    "ke",
    "kah",
    "aku",
    "kau",
    "guys"
    }

short_forms = {
    "irl": "in real life",
    "u": "you",
    "tpm": "dpm",
    "malaysians": "malaysian",
    "ds": "dato seri",
    "pm": "Prime Minister",
    "PM": "Prime Minister",
    "pm10": "Prime Minister 10",
    "PM10": "Prime Minister 10",
    "PMX": "Prime Minister 10",
    "pmx": "Prime Minister 10",
    "congrats": "congratulations",
    "congratulation": "congratulations",
    "tahniah": "congratulations",
    "btw": "by the way",
    "omg": "oh my god",
    "ni": "this",
    "nt": "not",
    "msia": "malaysia",
    "gov": "government",
    "govt": "government",
    "pls": "please",
    "pru": "General Election",
    "pru15": "General Election 15",
    "ge": "General Election",
    "ge15": "General Election 15",
    "kl": "kuala lumpur",
    "ngos": "ngo",
    "eksyen": "action",
    "wtf": "what the fuck",
    "tf": "the fuck",
    "stfu": "shut the fuck up",
    "idk": "i don't know",
    "dont": "do not",
    "don't": "do not",
    "can't": "cannot",
    "won't": "will not",
    "it's": "it is",
    "isn't": "is not",
    "we're": "we are",
    "you're": "you are",
    "they're": "they are",
    "he's": "he is",
    "she's": "she is",
    "I'm": "I am",
    "that's": "that is",
    "there's": "there is",
    "let's": "let us",
    "who's": "who is",
    "what's": "what is",
    "where's": "where is",
    "when's": "when is",
    "why's": "why is",
    "how's": "how is",
    "didn't": "did not",
    "wasn't": "was not",
    "weren't": "were not",
    "hasn't": "has not",
    "haven't": "have not",
    "hadn't": "had not",
    "can't": "can not",
    "couldn't": "could not",
    "shouldn't": "should not",
    "wouldn't": "would not",
    "mightn't": "might not",
    "mustn't": "must not",
    "i've": "I have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "he'd": "he would",
    "she'd": "she would",
    "i'd": "I would",
    "you'd": "you would",
    "we'd": "we would",
    "they'd": "they would",
    "he'll": "he will",
    "she'll": "she will",
    "i'll": "I will",
    "you'll": "you will",
    "we'll": "we will",
    "they'll": "they will",
    "i'd": "I had",
    "you'd": "you had",
    "we'd": "we had",
    "they'd": "they had",
    "should've": "should have",
    "could've": "could have",
    "would've": "would have",
    "might've": "might have",
    "must've": "must have",
    "ought to": "should",
    "need to": "should",
    "gotta": "got to",
    "wanna": "want to",
    "kinda": "kind of",
    "sorta": "sort of",
    "outta": "out of",
    "aren't": "are not",
    "isn't": "is not",
    "wasn't": "was not",
    "weren't": "were not",
    "hasn't": "has not",
    "haven't": "have not",
    "hadn't": "had not",
    "doesn't": "does not",
    "don't": "do not",
    "didn't": "did not",
    "won't": "will not",
    "wouldn't": "would not",
    "shan't": "shall not",
    "shouldn't": "should not",
    "can't": "cannot",
    "couldn't": "could not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "mightn't": "might not",
    "mustn't": "must not",
    "daren't": "dare not",
    "ain't": "is not",
    "it's": "it is",
    "let's": "let us",
    "that's": "that is",
    "what's": "what is",
    "where's": "where is",
    "who's": "who is",
    "how's": "how is",
    "there's": "there is",
    "here's": "here is",
    "smh": "shake my head",
    "fyi": "for your information",
    "imo": "in my opinion",
    "brb": "be right back",
    # Add more short forms/contractions and their expansions as needed
}

stopwords.update(additional_stopwords)


[nltk_data] Downloading package brown to /Users/waizwafiq/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/waizwafiq/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/waizwafiq/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/waizwafiq/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/waizwafiq/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/waizwafiq/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [1172]:
# event1 = pd.read_csv('./data/events/KerajaanGagal_until2022-10-12_event.csv')
# event1 = pd.read_csv('./data/events/KerajaanGagal_event.csv')
# event1 = pd.read_csv('./data/political_figs/IsmailSabri60_posts.csv')
event1 = pd.read_csv('./data/political_figs/anwaribrahim_posts.csv')
# event1 = pd.read_csv('./data/political_figs/DrZahidHamidi_posts.csv')

In [1173]:
def remove_URL(text):
    return re.sub(r"http\S+", "", text) 

def remove_hashtags(sample):
    return re.sub(r"#\S+", "", sample) 

def remove_breaklines(text):
    return re.sub(r"\n", " ", text)

# remove stopwords from a string
def remove_stopwords(text):
    tokens = word_tokenize(text)
    filtered_text = [word for word in tokens if word.lower() not in stopwords]
    return ' '.join(filtered_text)

def remove_symbols(text):
    return re.sub(r"[^\w\s]", "", text)

def remove_whitespace(text):
    return re.sub(r"\s+", " ", text.strip())

def expand_shortforms(text):
    words = text.split()
    expanded_words = [short_forms.get(word.lower(), word) for word in words]
    expanded_text = ' '.join(expanded_words)
    return expanded_text

def lemmatize_word(word, pos):
    lemmatizer = WordNetLemmatizer()
    if pos.startswith('J'):
        pos = wordnet.ADJ
    elif pos.startswith('V'):
        pos = wordnet.VERB
    elif pos.startswith('N'):
        pos = wordnet.NOUN
    elif pos.startswith('R'):
        pos = wordnet.ADV
    else:
        pos = wordnet.NOUN  # Default to noun if the part of speech is not recognized
    
    return lemmatizer.lemmatize(word, pos=pos)

def convert_date(date_str):
    """
    Converts the given date string into the desired format.

    Parameters:
    - date_str (str): The input date string to be converted.

    Returns:
    - str: The converted date string in the format "DD-MM-YYYY".

    Steps:
    1. Get the current datetime.
    2. If the length of the date string is greater than 3 and it contains a comma:
        a. Try to parse the date string with the format "%b %d, %Y".
        b. If successful, format the parsed date as "DD-MM-YYYY" and return it.
        c. If parsing fails, return the current date formatted as "DD-MM-YYYY".
    3. If the date string ends with 'h', 'm', or 's':
        a. Return the current date formatted as "DD-MM-YYYY".
    4. If the length of the date string is 10 and it has '-' at positions 2 and 5:
        a. Return the date string as it is without any changes.
    5. Otherwise, try to parse the date string with the format "%b %d".
        a. If successful, format the parsed date as "DD-MM-YYYY" and return it.
        b. If parsing fails, return the current date formatted as "DD-MM-YYYY".
    """
    now = datetime.now()
    if len(date_str) > 3 and ',' in date_str:
        try:
            date = datetime.strptime(date_str, "%b %d, %Y")
            return date.strftime("%d-%m-%Y")
        except ValueError:
            return now.strftime("%d-%m-%Y")
    elif date_str.endswith('h') or date_str.endswith('m') or date_str.endswith('s'):
        return now.strftime("%d-%m-%Y")
    elif len(date_str) == 10 and date_str[2] == '-' and date_str[5] == '-':
        return date_str
    else:
        try:
            date = datetime.strptime(date_str, "%b %d")
            return date.strftime("%d-%m-2023")
        except ValueError:
            return now.strftime("%d-%m-%Y")
    

def processText(text):
    # out = lowercase(text)
    out = remove_URL(text)
    out = remove_hashtags(out)
    out = expand_shortforms(out)
    out = remove_stopwords(out)
    out = remove_breaklines(out)
    out = remove_symbols(out)
    out = remove_whitespace(out)
    return out
    

# Apply the function to the DataFrame column
event1['Post'] = event1['Post'].apply(processText)
# event1['Post'] = event1['Post'].apply(lambda x: ' '.join([lemmatize_word(word, pos) for word, pos in nltk.pos_tag(nltk.word_tokenize(x))]))

try:
    # Apply the conversion function to the "Date Posted" column
    event1['Date Posted'] = event1['Date Posted'].apply(convert_date)
    event1['Date Posted'] = pd.to_datetime(event1['Date Posted'], format='%d-%m-%Y')
except TypeError:
    pass

event1[:5]

Unnamed: 0,Twitter Username,Post,Date Posted
0,@1205file,congratulations ANWAR IBRAHIM MALAYSIA S 10TH ...,2022-11-24
1,@13Suria_,MALAYSIA dato seri Anwar Ibrahim Prime Minister,2022-11-24
2,@1407_chauhan,Congratulations Dato Seri anwaribrahim electio...,2022-11-25
3,@1412_tharsika,time change everything king back,2022-11-24
4,@1711Sjagi,think minority Indians Chinese wants better Ma...,2022-11-22


In [1174]:
def lemmatize_word(word, pos):
    lemmatizer = WordNetLemmatizer()
    if pos.startswith('J'):
        pos = wordnet.ADJ
    elif pos.startswith('V'):
        pos = wordnet.VERB
    elif pos.startswith('N'):
        pos = wordnet.NOUN
    elif pos.startswith('R'):
        pos = wordnet.ADV
    else:
        pos = wordnet.NOUN  # Default to noun if the part of speech is not recognized
    
    return lemmatizer.lemmatize(word, pos=pos)

def stem_word(word):
    stemmer = PorterStemmer()
    stemmed_word = stemmer.stem(word)
    
    # Preserve original case
    if word[0].isupper():
        stemmed_word = stemmed_word.capitalize()
    elif word.isupper():
        stemmed_word = stemmed_word.upper()
        
    return stemmed_word

# Lemmatize
event1['Post'] = event1['Post'].apply(lambda x: ' '.join([lemmatize_word(word, pos) for word, pos in nltk.pos_tag(nltk.word_tokenize(x))]))

# Stemming
# event1['Post'] = event1['Post'].apply(lambda x: ' '.join([stem_word(word) for word in nltk.word_tokenize(x)]))

In [1175]:
del_max_token = 5
event1['post_split'] = event1['Post'].apply(lambda x: x.split(" ") if isinstance(x, str) else x)
event1['tokens_num'] = event1['post_split'].apply(lambda x: len(x))
event1[event1['tokens_num'] <= del_max_token]

Unnamed: 0,Twitter Username,Post,Date Posted,post_split,tokens_num
3,@1412_tharsika,time change everything king back,2022-11-24,"[time, change, everything, king, back]",5
20,@808_MUKHLEEZ,SEE General Election 16,2022-11-20,"[SEE, General, Election, 16]",4
38,@ARajasaikaran,take 25 year yall anwaribrahim,2022-11-24,"[take, 25, year, yall, anwaribrahim]",5
40,@ARgannapathy,congratulation malaysia anwaribrahim,2022-11-24,"[congratulation, malaysia, anwaribrahim]",3
47,@AbdFauzi,Keeper malaysia bodyguard anwaribrahim,2023-01-03,"[Keeper, malaysia, bodyguard, anwaribrahim]",4
...,...,...,...,...,...
2944,@yarraharajuku,Unity Racism Lost,2022-11-24,"[Unity, Racism, Lost]",3
2946,@yash_rajesh,Tonight sleep smile anwaribrahim,2022-11-24,"[Tonight, sleep, smile, anwaribrahim]",4
2956,@yogurtnyou,take lead,2022-11-19,"[take, lead]",2
2959,@youarenot_GaGa,People sing Negaraku together istana,2022-11-24,"[People, sing, Negaraku, together, istana]",5


In [1176]:
event1.drop(event1[event1['tokens_num'] <= del_max_token].index, inplace=True)
event1.reset_index(drop=True, inplace=True)
len(event1)

2607

## **Noun Frequency Analysis**

In [1177]:
nouns=[]
for i in event1["Post"]:
  blob = TextBlob(i).noun_phrases
  nouns.extend(blob)
nouns = np.array(nouns)
nouns_values, nouns_counts = np.unique(nouns, return_counts=True)
np.unique(nouns_counts, return_counts=True)

(array([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,   11,
          12,   13,   14,   15,   16,   17,   18,   19,   20,   21,   23,
          25,   26,   27,   29,   31,   32,   33,   39,   43,   45,   50,
          64,   65,   75,   82,   83,   87,  113,  133,  427,  978, 1328]),
 array([6058,  418,  140,   61,   40,   30,   16,    7,   11,    8,    9,
           6,    2,    8,    4,    2,    3,    2,    1,    2,    3,    1,
           2,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1]))

In [1178]:
for i in range(1, 20):
    index_list=np.where(nouns_counts == i)
    nouns_counts=np.delete(nouns_counts, index_list)
    nouns_values=np.delete(nouns_values, index_list)
np.unique(nouns_counts, return_counts=True)

(array([  20,   21,   23,   25,   26,   27,   29,   31,   32,   33,   39,
          43,   45,   50,   64,   65,   75,   82,   83,   87,  113,  133,
         427,  978, 1328]),
 array([2, 3, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1]))

In [1179]:
print(len(nouns_values))
print(len(nouns_counts))
count_sort_ind = np.argsort(nouns_counts)

29
29


In [1180]:
x = list(nouns_values[count_sort_ind])
y = list(nouns_counts[count_sort_ind])
fig = go.Figure(go.Bar(x=x, y=y, name='Nouns', marker=dict(cmax=1236, cmin=0, color=y, colorbar=dict(title="Scale"), colorscale="Viridis")))
fig.update_layout(title_text='Nouns Distribution: Total words = '+str(len(nouns_values))+' : Zoom In to See All Words' )
fig.write_html("nouns_distribution.html")
fig

## **Verb Frequency Analysis**

In [1203]:
custom_verbs = WordList(['umno', 's', 'anwaribrahim', 'u'])

verbs = []
for i in event1["Post"]:
    blob = TextBlob(i)
    blob_tags = blob.tags  # Get the POS tags for the words
    verb_phrases = [word for word, pos in blob_tags if pos.startswith('VB') and word.lower() not in custom_verbs]  # Extract verbs (POS tag starting with 'VB')
    verbs.extend(verb_phrases)

verbs = np.array(verbs)
verbs_values, verbs_counts = np.unique(verbs, return_counts=True)
np.unique(verbs_counts, return_counts=True)

(array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  27,
         28,  29,  30,  31,  32,  36,  38,  52,  53,  56,  62,  63,  66,
         68,  73,  82,  88, 105, 110, 127, 135, 151, 156, 169]),
 array([690, 135,  53,  41,  20,  13,   9,   6,   9,  10,   6,   3,   2,
          6,   6,   2,   1,   2,   3,   2,   1,   1,   2,   1,   2,   1,
          1,   1,   1,   1,   2,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1]))

In [1204]:
for i in range(1, 20):
    index_list = np.where(verbs_counts == i)
    verbs_counts = np.delete(verbs_counts, index_list)
    verbs_values = np.delete(verbs_values, index_list)
np.unique(verbs_counts, return_counts=True)

(array([ 20,  21,  22,  23,  24,  25,  27,  28,  29,  30,  31,  32,  36,
         38,  52,  53,  56,  62,  63,  66,  68,  73,  82,  88, 105, 110,
        127, 135, 151, 156, 169]),
 array([2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1]))

In [1205]:
print(len(verbs_values))
print(len(verbs_counts))
count_sort_ind = np.argsort(verbs_counts)

35
35


In [1206]:
x = list(verbs_values[count_sort_ind])
y = list(verbs_counts[count_sort_ind])
fig = go.Figure(go.Bar(x=x, y=y, name='Verbs', marker=dict(cmax=1236, cmin=0, color=y, colorbar=dict(title="Scale"), colorscale="Viridis")))
fig.update_layout(title_text='Verbs Distribution: Total words = '+str(len(verbs_values))+' : Zoom In to See All Words' )
fig.write_html("verbs_distribution.html")
fig

## **Adjectives Frequency Analysis**

In [1207]:
adjectives = []

custom_adjectives = WordList(['umno', 's', 'anwaribrahim', 'u', 'prime', 'seri', 'dato'])

for post in event1["Post"]:
    blob = TextBlob(post)
    for word, pos in blob.tags:
        if pos.startswith('JJ') and word.lower() not in custom_adjectives:  # Check if the word is an adjective
            adjectives.append(word.lower())

adjectives = np.array(adjectives)
adjectives_values, adjectives_counts = np.unique(adjectives, return_counts=True)
np.unique(adjectives_counts, return_counts=True)

(array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  24,  25,  26,  27,  28,
         30,  32,  33,  34,  35,  36,  37,  38,  39,  40,  42,  45,  50,
         53,  58,  61,  63,  71,  90, 100, 118, 231, 249]),
 array([910, 201,  96,  42,  38,  23,  14,  20,   7,  10,   9,   9,   5,
          5,   4,   3,   4,   3,   4,   1,   2,   3,   1,   2,   1,   2,
          1,   1,   2,   1,   1,   1,   1,   2,   2,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1]))

In [1208]:
for i in range(1, 20):
    index_list = np.where(adjectives_counts == i)
    adjectives_counts = np.delete(adjectives_counts, index_list)
    adjectives_values = np.delete(adjectives_values, index_list)

np.unique(adjectives_counts, return_counts=True)

(array([ 20,  21,  24,  25,  26,  27,  28,  30,  32,  33,  34,  35,  36,
         37,  38,  39,  40,  42,  45,  50,  53,  58,  61,  63,  71,  90,
        100, 118, 231, 249]),
 array([1, 2, 3, 1, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1]))

In [1209]:
count_sort_ind = np.argsort(adjectives_counts)
x = list(adjectives_values[count_sort_ind])
y = list(adjectives_counts[count_sort_ind])

In [1210]:
fig = go.Figure(go.Bar(x=x, y=y, name='Adjectives', marker=dict(cmax=max(y), cmin=min(y), color=y, colorbar=dict(title="Scale"), colorscale="Viridis")))
fig.update_layout(title_text='Adjectives Distribution: Total words = ' + str(len(adjectives_values)) + ' : Zoom In to See All Words')
fig.write_html("adjectives_distribution.html")
fig.show()

## **Tweet Similarities**

In [1189]:
event1_posts = event1["Post"]

tfidf_vectorizer = TfidfVectorizer() 
tfidf_matrix = tfidf_vectorizer.fit_transform(event1_posts)
tfidf_matrix.shape

(2607, 6762)

In [1190]:
c_sim = cosine_similarity(tfidf_matrix,tfidf_matrix)

In [1191]:
# Show Top N similar tweets (ascending order)
N = 20

c_sim_sum=c_sim.sum(axis=1)
top_indices = np.argsort(c_sim_sum)[-N:]

print("c_sim_sum =", c_sim_sum[top_indices])
event1_posts[top_indices]

c_sim_sum = [266.84894624 272.11638321 272.11638321 272.495722   274.83265719
 274.83265719 274.83265719 274.83265719 279.07837997 289.24088674
 292.20965837 292.20965837 297.98394394 297.98394394 297.98394394
 297.98394394 297.98394394 297.98394394 297.98394394 304.52058728]


2503    congratulation dato seri Anwar Ibrahim Malaysi...
1244    DATO SERI ANWAR IBRAHIM 10TH PRIME MINISTER MA...
2135    10th Prime Minister Malaysia dato seri Anwar I...
1421    Congratulations Dato Seri Anwar Ibrahim 10th P...
649     congratulation Anwar Ibrahim 10th Prime Minist...
2482    congratulation Anwar Ibrahim 10th Malaysia Pri...
1922    congratulation Anwar Ibrahim 10th Prime Minist...
0       congratulation ANWAR IBRAHIM MALAYSIA S 10TH P...
1783    Congratulations Datuk Seri Anwar Ibrahim 10th ...
65      Congratulations Anwar Ibrahim anwaribrahim Pri...
1946    Congratulations Dato Seri Anwar Ibrahim 10th P...
1528    Congratulations Malaysia 10th Prime Minister D...
1699           Anwar Ibrahim 10th prime minister Malaysia
397            Anwar Ibrahim 10th Prime Minister Malaysia
1881           Anwar Ibrahim prime minister 10th Malaysia
218            Anwar Ibrahim 10th Prime Minister Malaysia
2556           Anwar Ibrahim Malaysia 10th Prime Minister
1666          

In [1192]:
# Display similarity heatmap of Top N tweets
N = 20

indices = np.arange(len(event1_posts))
selected_indices = indices[top_indices]

fig = go.Figure(data=go.Heatmap(
    z=[list(item) for item in c_sim[np.ix_(selected_indices, selected_indices)]],
    x=[str(i) for i in selected_indices],
    y=[str(i) for i in selected_indices]
))

fig.write_html("sentences_scatter.html")
print('Plot Saved as sentences_scatter.html')

Plot Saved as sentences_scatter.html


## **Sentiment Analysis**

In [1193]:
lexicons = {
    'rid': -0.5,
    'fuck': -0.9,
    'shit': -0.75,
    'culprit': -0.55,
    'corrupt': -0.95,
    'corruption': -0.3
}

def custom_SentimentScore(text):
    words = text.split()
    sentiment_score = 0.0
    for word in words:
        if word.lower() in lexicons:
            sentiment_score += lexicons[word.lower()]
    
    return sentiment_score

In [1194]:
polarities_textblob = [TextBlob(post).sentiment.polarity for post in event1_posts]
polarities_custom = [custom_SentimentScore(post) for post in event1_posts]

max_custom_score = sum(abs(score) for score in lexicons.values())
combined_polarities = [(tb + cs) for tb, cs in zip(polarities_textblob, polarities_custom)]

# Adjust scores outside the range of -1 to 1
combined_polarities = [max(min(score, 1), -1) for score in combined_polarities]

polarities_df = pd.DataFrame(combined_polarities, columns=["pol"])
polarities_df.describe()

Unnamed: 0,pol
count,2607.0
mean,0.149058
std,0.293589
min,-1.0
25%,0.0
50%,0.068182
75%,0.312311
max,1.0


In [1195]:
fig = px.histogram(polarities_df, x="pol")
fig.update_layout(title_text='Sentiment Analysis Histogram: Number of Tweets = '+str(len(event1_posts)) )
fig.write_html("sentiment_histogram.html")
fig.show()

In [1196]:
# Remove 'neutral' polarity by the range of [-p, p]
p = 0.01
polarities_df_p = polarities_df[(polarities_df['pol'] <= -p) | (polarities_df['pol'] >= p)]
fig = px.histogram(polarities_df_p, x="pol")
fig.update_layout(title_text='Sentiment Analysis Histogram: Number of Tweets = ' + str(len(polarities_df_p)) )
fig.write_html(f"sentiment_histogram_p.html")
fig.show()
polarities_df_p.describe()

Unnamed: 0,pol
count,1757.0
mean,0.221174
std,0.334599
min,-1.0
25%,0.065
50%,0.204545
75%,0.418182
max,1.0


In [1197]:
# Create the histogram
fig = px.histogram(polarities_df, x="pol")

# Compute the mean and standard deviation
mean = polarities_df["pol"].mean()
std = polarities_df["pol"].std()

# Generate the x-values for the bell curve
x = np.linspace(-1, 1, 1000)

# Fit a skewed normal distribution to the polarity data
params = stats.skewnorm.fit(polarities_df["pol"])
pdf = stats.skewnorm.pdf(x, *params)

# Add the bell curve as a line plot
fig.add_trace(go.Scatter(x=x, y=pdf*100, mode='lines', name='Bell Curve'))
# Set the layout and save the plot
fig.update_layout(title_text='Sentiment Analysis Histogram: Number of Tweets = ' + str(len(polarities_df)))
fig.write_html("sentiment_histogram.html")

# Display the plot
fig.show()

# Describe the statistics of the polarity values
polarities_df.describe()

Unnamed: 0,pol
count,2607.0
mean,0.149058
std,0.293589
min,-1.0
25%,0.0
50%,0.068182
75%,0.312311
max,1.0


In [1198]:
p = 0.1  # Remove 'neutral' polarity by the range of [-p, p]

polarities_df_p = polarities_df[(polarities_df['pol'] <= -p) | (polarities_df['pol'] >= p)]

# Create the histogram
fig = px.histogram(polarities_df_p, x="pol")

# Compute the mean and standard deviation
mean = polarities_df_p["pol"].mean()
std = polarities_df_p["pol"].std()

# Generate the x-values for the bell curve
x = np.linspace(-1, 1, 1000)

# Fit a skewed normal distribution to the polarity data
params = stats.skewnorm.fit(polarities_df_p["pol"])
pdf = stats.skewnorm.pdf(x, *params)

# Add the bell curve as a line plot
fig.add_trace(go.Scatter(x=x, y=pdf*100, mode='lines', name='Bell Curve'))

# Set the layout and save the plot
fig.update_layout(title_text='Sentiment Analysis Histogram: Number of Tweets = ' + str(len(polarities_df_p)))
fig.write_html("sentiment_histogram_p.html")

# Display the plot
fig.show()

# Describe the statistics of the polarity values
polarities_df_p.describe()

Unnamed: 0,pol
count,1485.0
mean,0.259009
std,0.350127
min,-1.0
25%,0.136364
50%,0.259091
75%,0.468182
max,1.0
