Copyright (c) <2022>, <Regina Nockerts>
All rights reserved.

This source code is licensed under the BSD-style license found in the
LICENSE file in the root directory of this source tree. 

__NOTE__ to the user: In first use, this notebook cannot be run top to bottom. It assumes that you have a bunch of csv files that are created at different points in the notebook.

In [1]:
import pandas as pd
import numpy as np
import re
import pickle 
from nlpUtils import aardvark as aa 
from sklearn.feature_extraction.text import CountVectorizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('fivethirtyeight')
import seaborn as sns
# sns.set(font_scale=1.5)

In [None]:
import importlib
importlib.reload(aa)

# Setup
Assumes that you have completed dataSplitBalance

In [17]:
sid = SentimentIntensityAnalyzer()

In [None]:
a_file = open("data/change_lex.pkl", "rb")
change_lex = pickle.load(a_file)
a_file.close()
print(change_lex['red_heart_e'])

sid.lexicon.update(change_lex)

In [4]:
# Import the files that result from dataSplitBalance

unbal_x_train = pd.read_csv("dataBalancedSets/unbal_x_train.csv", header=0, index_col=0)
unbal_x_val = pd.read_csv("dataBalancedSets/unbal_x_val.csv", header=0, index_col=0)
unbal_y_train = pd.read_csv("dataBalancedSets/unbal_y_train.csv", header=0, index_col=0)
unbal_y_val = pd.read_csv("dataBalancedSets/unbal_y_val.csv", header=0, index_col=0)

under_x_train = pd.read_csv("dataBalancedSets/under_x_train.csv", header=0, index_col=0)
under_x_val = pd.read_csv("dataBalancedSets/under_x_val.csv", header=0, index_col=0)
under_y_train = pd.read_csv("dataBalancedSets/under_y_train.csv", header=0, index_col=0)
under_y_val = pd.read_csv("dataBalancedSets/under_y_val.csv", header=0, index_col=0)

underOver_y_train = pd.read_csv("dataBalancedSets/underOver_y_train.csv", header=0, index_col=0)
underOver_x_train = pd.read_csv("dataBalancedSets/underOver_x_train.csv", header=0, index_col=0)
underOver_y_val = pd.read_csv("dataBalancedSets/underOver_y_val.csv", header=0, index_col=0)
underOver_x_val = pd.read_csv("dataBalancedSets/underOver_x_val.csv", header=0, index_col=0)

# And the test dataset
x_test = pd.read_csv("dataBalancedSets/x_test.csv", header=0, index_col=0)
y_test = pd.read_csv("dataBalancedSets/y_test_sent.csv", header=0, index_col=0)

# And some odds and ends
tweets_clean  = pd.read_csv("archiveData/cleanB_tweets_clean.csv", header=0, index_col=0) 
emoji_df_full = pd.read_csv("data/emoji_full.csv", header=0, index_col=0)
all_unlabeled_tweets = pd.read_csv("data/all_unlabeled_tweets.csv", header=0, index_col=0)

print("Unbalanced:")
print("x-train:", unbal_x_train.shape, "x-val:", unbal_x_val.shape, "y-train:", unbal_y_train.shape, "y-val:", unbal_y_val.shape)
print("Undersampled")
print("x-train:", under_x_train.shape, "x-val:", under_x_val.shape, "y-train:", under_y_train.shape, "y-val:", under_y_val.shape)
print("Under-Oversampled")
print("x-train:", underOver_x_train.shape, "x-val:", underOver_x_val.shape, "y-train:", underOver_y_train.shape, "y-val:", underOver_y_val.shape)
print("TEST DATA")
print("x-TEST:", x_test.shape, "y-TEST:", y_test.shape)
emoji_df_full.head()


Unbalanced:
x-train: (823, 3) x-val: (206, 3) y-train: (823, 5) y-val: (206, 5)
Undersampled
x-train: (574, 3) x-val: (144, 3) y-train: (574, 5) y-val: (144, 5)
Under-Oversampled
x-train: (982, 3) x-val: (247, 3) y-train: (982, 5) y-val: (247, 5)
TEST DATA
x-TEST: (182, 3) y-TEST: (182, 5)


Unnamed: 0,emoji,demoji,VaderEmojiScore,emosentScore,emojiScore,analog
0,🚨,:police_car_light:,0.0,0.673,0.673,TEST
1,🙏,:folded_hands:,0.0,0.418,0.418,
2,🤷,:person_shrugging:,0.0,,-0.3875,don't care
3,🙄,:face_with_rolling_eyes:,0.0,,-0.3875,don't care
4,😂,:face_with_tears_of_joy:,0.4404,0.221,0.221,


In [5]:
drop_cols = ['Date', 'Content', 'Labels', 'label_sent', 'label_stance', 'y_stance', 'Flag']
tweets_clean.drop(drop_cols, inplace=True, axis=1 )
print(tweets_clean.shape)
tweets_clean.head()


(1211, 3)


Unnamed: 0,id_stable,ContentClean,y_sent
0,170314,Per a White House official: Biden and Harris m...,1
1,192623,Afghan Refugee kid educated in Iran wins this ...,2
2,106982,Not only did Trump stop processing asylum & re...,0
3,31609,An Afghan refugee demands the US not forget he...,0
4,152666,One moment you hate refugees and the next you ...,2


# NOTES
* the tweet_clean is the full, unsplit set - NOT for model development, only for finding emojis.
* the unbalanced, and testing sets can be used for VADER model development

_____________ FUNCTIONS ____________

In [None]:
# create the sentiment intensity dictionary object
# sid = SentimentIntensityAnalyzer()  #NOTE: this NEEDS to stay outside of the functions. I will be modifying it.

# FROM aardvark
# creates the sentiment intensity dictionary: aa.vader_sid(tweet)
# gets the compound score: aa.vader_sent_compound(tweet)
# gets the classification of the compund score using the authors' suggested cutoff points: aa.vader_pred(tweet, pos_cut, neg_cut)


# Data Prep
VADER should do better if we get the input into better shape.

### What if we use the Content v. ContentClean column that we used for labeling?
Remember that VADER has its own way of dealing with punctuation, capitalization, modifiers, negations, stopwords, tokenization and lemmatization. Earlier cleaning was done to try not to mess with that. I tested to make sure that was done correctly. The scores are the same, either set. This code has been moved to the graveyard.

(A nice tutorial explaining this: https://towardsdatascience.com/are-you-scared-vader-understanding-how-nlp-pre-processing-impacts-vader-scoring-4f4edadbc91d)


### What abou the demoji?
For VADER, I will have to create a dictionary of these codes as "words" that can be added to the lexicon. We started this by finding all the emoji and saving them to a dataframe: emoji_df_full
* keep the scores from the emosent library as the prioirity
* Use the VADER score as a backup
* Manually check the results to make sure they are reasonable and identify ones to customize.

# Emoji
ref: vaderEmoji.ipynb

In [6]:
emoji_df_full

Unnamed: 0,emoji,demoji,VaderEmojiScore,emosentScore,emojiScore,analog
0,🚨,:police_car_light:,0.0000,0.673,0.6730,TEST
1,🙏,:folded_hands:,0.0000,0.418,0.4180,
2,🤷,:person_shrugging:,0.0000,,-0.3875,don't care
3,🙄,:face_with_rolling_eyes:,0.0000,,-0.3875,don't care
4,😂,:face_with_tears_of_joy:,0.4404,0.221,0.2210,
...,...,...,...,...,...,...
1101,🦾,:mechanical_arm:,0.0000,,0.5560,💪
1102,🏃🏾‍♂️,:man_running_medium-dark_skin_tone:,0.0000,,0.0000,
1103,🚑,:ambulance:,0.0000,0.091,0.0910,
1104,🎃,:jack-o-lantern:,0.0000,0.617,0.6170,


### Emosent
Will the emosent package work for me?

In [7]:
print(emoji_df_full["emosentScore"].value_counts())
print(emoji_df_full["emosentScore"].isnull().value_counts())

 0.000    21
 1.000    18
 0.333    16
 0.500     9
 0.400     7
          ..
 0.063     1
 0.179     1
 0.581     1
-0.314     1
 0.617     1
Name: emosentScore, Length: 282, dtype: int64
True     638
False    468
Name: emosentScore, dtype: int64


Kinda. It has about half (missing 638) . But it seems to miss some of the important ones that I need. 
* 🤷, 🤮, etc.

And for the symbols where they overlap, the VADER and emosent scores do necessarilly agree and are sometimes very far off:
* 💔 (broken_heart): 0.2732 v. -0.122
* 😭 (loudly_crying_face): -0.4767 v. -0.093

In these cases, the emosent score (2nd) seems more appropriate

And some of the values are just off for __this__ dataset. For example, the stack of dollars (💵) has a emosent score of 0.423 - very high. Which makes sense normally: money is good. But in this dataset, it shows up when people are stressing the overly high cost of refugee or ilitary operations, or are talking about corruption. 

As this tool has been validated, I'll consider the values they have. But I'll still have to assign my own values to the remaining half. So: first emosent; if not, then VADER; if not, then my ranking; and my own ranking for emojis that are used differently than normal in my dataset.

NOTE: I will have to add the emosent and my emojis to the dictionary. 
* For more insight on ranking: http://kt.ijs.si/data/Emoji_sentiment_ranking/

### Get final sentiment value for each emoji

In [8]:
# Add column for the final value
emoji_df_full['emojiScore'] = np.NaN
emoji_df_full.head()

Unnamed: 0,emoji,demoji,VaderEmojiScore,emosentScore,emojiScore,analog
0,🚨,:police_car_light:,0.0,0.673,,TEST
1,🙏,:folded_hands:,0.0,0.418,,
2,🤷,:person_shrugging:,0.0,,,don't care
3,🙄,:face_with_rolling_eyes:,0.0,,,don't care
4,😂,:face_with_tears_of_joy:,0.4404,0.221,,


In [9]:
for i, v, e, s in zip(emoji_df_full.index, emoji_df_full["VaderEmojiScore"], emoji_df_full['emosentScore'], emoji_df_full["emojiScore"]):
    if pd.isnull(e) == True:
        if pd.isnull(v) == False:
            emoji_df_full['emojiScore'].iloc[i] = v
    elif e != 0:
        emoji_df_full['emojiScore'].iloc[i] = e
print("NANs after filling: \n", emoji_df_full["emojiScore"].isnull().value_counts())
emoji_df_full['emojiScore'] = emoji_df_full['emojiScore'].copy()
emoji_df_full.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  emoji_df_full['emojiScore'].iloc[i] = e
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  emoji_df_full['emojiScore'].iloc[i] = v


NANs after filling: 
 False    1085
True       21
Name: emojiScore, dtype: int64


Unnamed: 0,emoji,demoji,VaderEmojiScore,emosentScore,emojiScore,analog
0,🚨,:police_car_light:,0.0,0.673,0.673,TEST
1,🙏,:folded_hands:,0.0,0.418,0.418,
2,🤷,:person_shrugging:,0.0,,0.0,don't care
3,🙄,:face_with_rolling_eyes:,0.0,,0.0,don't care
4,😂,:face_with_tears_of_joy:,0.4404,0.221,0.221,


In [10]:
# NOTE: don't run if you don't need to:
# emoji_df_full.to_csv("archiveData/emoji_full_temp.csv")

### --> open the csv and edit
Only look at the entire database, not the labeled tweets, when deciding what to do with these.

In [11]:
# How often does the emoji appear?
print(aa.term_check("🔵", all_unlabeled_tweets))

('🔵', 10)


In [None]:
# How is the emoji generally used?
for i in all_unlabeled_tweets["ContentClean"]:
    if "🙋" in i:
        print(i)

In [13]:
# What is the score of clearly analagous emoji or text?
term = "😭"
print("emonsent:", aa.emosent_score(term))  # works for emoji
print("VADER", aa.vader_sent_compound(term))  # works for text

emonsent: -0.093
VADER -0.4767


From the list of emojis gathered in the entire dataset. Scoring based on expert knowledge and without reference to the data or labels.

I changed values for emojis that:
* have clear analogs in other emojis - eg. different color or skin tones.
  * When this was done, I made a note of the analog in a new column, "analog"
* the most direct text translation is an emotion (eg. heart, thumbs-up) or action (eg. facepalming, dancing), and not a noun
* the emoji is a generally know sign or symbol, eg. biohazard sign, peace symbol
* reference money - will have a different meaning when talking about government actions/programs
  
I changed the following categoreis to 0.0:
* means of communication (eg. microphone, television, telephone) - tend to be associated with news media or CTAs
* simple geometric forms, other than hearts - tend to be used as special bullet points
* government bodies - will have different meaning when talking about legalistic situations
* pointers and arrows - used to spatially indicate a reference or emphasize
  
I did not atempt to find substitutes for all emojis. 
* occupations
* objects

### NOTES
There are clearly better ways to deal with skin tones and emoji: searching for one "hands raised" person returns all. This method works for this project, but is not ideal. Future work, I guess.

Some interesting emojis to look at with the training data:  
* 🦍	:gorilla:
* ❄️	:snowflake:
* 🛃	:customs:
* 🛂	:passport_control:
* 🏳️‍⚧️	:transgender_flag:
* 🦠	:microbe:
* ⚖	:balance_scale:
* 🗳️	:ballot_box_with_ballot:
* ⌛	:hourglass_done:
* 👪	:family:	0.0	-0.018
  
### --> reload the new, modified emoji_df_full

In [14]:
emoji_df_full = pd.read_csv("data/emoji_full_mod1.csv", header=0, index_col=0)  ## there is a copy in archiveData: emoji_full_Copy)
emoji_df_full.head()

Unnamed: 0,emoji,demoji,VaderEmojiScore,emosentScore,emojiScore,analog
0,🚨,:police_car_light:,0.0,0.673,0.673,TEST
1,🙏,:folded_hands:,0.0,0.418,0.418,
2,🤷,:person_shrugging:,0.0,,-0.3875,don't care
3,🙄,:face_with_rolling_eyes:,0.0,,-0.3875,don't care
4,😂,:face_with_tears_of_joy:,0.4404,0.221,0.221,


# Update the VADER dictionary
Now that we have the new wordcodes and associated values, we need to put them in the VADER dictionary.

In [15]:
# Make the sid change lexicon
print(list(emoji_df_full.columns))
change_lex = emoji_df_full[['demoji', 'emojiScore']].copy()

# Change the demoji format so that VADER sid can use it
for i, d in enumerate(change_lex['demoji']):
    change_lex['demoji'].iloc[i] = re.sub(":", "", d) + "_e"

# Change the change_lex to a dictionary
change_lex = change_lex.set_index('demoji').T.to_dict('list')
for key, value in change_lex.items():
    change_lex.update({key: float(value[0])})

change_lex['person_shrugging_e']

['emoji', 'demoji', 'VaderEmojiScore', 'emosentScore', 'emojiScore', 'analog']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  change_lex['demoji'].iloc[i] = re.sub(":", "", d) + "_e"
  change_lex = change_lex.set_index('demoji').T.to_dict('list')


-0.3875

In [None]:
# Save change_lex as a dictionary that can be loaded in other files
a_file = open("data/change_lex.pkl", "wb")
pickle.dump(change_lex, a_file)
a_file.close()

In [18]:
#update the lexicon and make sure that it works
sid.lexicon.update(change_lex)
print(sid.polarity_scores("red_heart_e")["compound"]) # Note that the aa. functions don't use the updated lexicon.
print(change_lex['red_heart_e'])

0.1891
0.746


# Change the demoji codes
Way back in dataEmoji, we demojized with the standare :word_word: format. So we are going to have to go through all of the datasets that we created and change that now to the word_word_e format. This sucks. But I *think* it's more reliable than going back and changing the earlier code...

In [24]:
# Check that BERT can also handle the new format
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
tokenizer.encode('red_heart_e')

[101, 1894, 168, 1762, 168, 174, 102]

In [97]:
# Make a dev df
test_df = unbal_x_train.copy()

# Check what the starting number of ":" is
counter = 0
i_nums = []
for i, text in enumerate(test_df["ContentClean"]):
    if ":" in text:
        counter += 1
        i_nums.append(i)
print(counter)
print(test_df["ContentClean"].iloc[i_nums[0]])

166
خشونت علیه زنان، ممنوع! Two Afghan immigrant women ask a Swedish journalist to photograph their bodies from undercover history to show what it means when violence against women speaks out. The deformed bodies of these two women, continued in comment:backhand_index_pointing_down::backhand_index_pointing_down:


In [98]:
# build a regex function to recognize ":word_word:" and sub for "word_word_e"

tweet = "I:heart_emoji: my :blue::lady_bike:! love: it."
#tweet = "I heart_emoji my lady_bike! love: it."

def change_e_codes(text):
    new_text = text
    b_list = []
    c_list = []

    # find the desired strings
    a_list = re.findall(r":\b\w+\b:", new_text)   #("I", "CODE", text)
    
    # change the strings to desired format
    for i in a_list:
        b_list.append(re.sub(r":\b", " ", i))
    for i in b_list:
        c_list.append(re.sub(r"\b:", "_e ", i))
    
    # replace them in the text
    for i, old_code in enumerate(a_list):
        new_text = new_text.replace(old_code, c_list[i])

    new_text = re.sub(r"  ", " ", new_text)

    if len(c_list) != 0: 
        return new_text
    else: 
        return text

change_e_codes(tweet)        

'I heart_emoji_e my blue_e lady_bike_e ! love: it.'

In [99]:
test_df["ContentClean"] = test_df["ContentClean"].apply(change_e_codes)


In [100]:
counter = 0
for i in test_df["ContentClean"]:
    if ":" in i:
        counter += 1
print(counter)

print(test_df["ContentClean"].iloc[i_nums[0]])
print(test_df["ContentClean"].iloc[0])


109
خشونت علیه زنان، ممنوع! Two Afghan immigrant women ask a Swedish journalist to photograph their bodies from undercover history to show what it means when violence against women speaks out. The deformed bodies of these two women, continued in comment backhand_index_pointing_down_e backhand_index_pointing_down_e 
At the moment, that would be Mexican, Colombian, Venezuelan, Afghan, Russian, and Ukraine seeking relocation. should be the last place they will call because who wants to pay taxes to a state that hated their presence from the get-go?


In [101]:
# Now apply it to all of the dataframes
unbal_x_train["ContentClean"] = unbal_x_train["ContentClean"].apply(change_e_codes)
unbal_x_val["ContentClean"] = unbal_x_val["ContentClean"].apply(change_e_codes)
under_x_train["ContentClean"] = under_x_train["ContentClean"].apply(change_e_codes)
under_x_val["ContentClean"] = under_x_val["ContentClean"].apply(change_e_codes)
underOver_x_train["ContentClean"] = underOver_x_train["ContentClean"].apply(change_e_codes)
underOver_x_val["ContentClean"] = underOver_x_val["ContentClean"].apply(change_e_codes)
x_test["ContentClean"] = x_test["ContentClean"].apply(change_e_codes)

# And some odds and ends
tweets_clean["ContentClean"] = tweets_clean["ContentClean"].apply(change_e_codes)
all_unlabeled_tweets["ContentClean"] = all_unlabeled_tweets["ContentClean"].apply(change_e_codes)

In [104]:
# Save them somewhere new
unbal_x_train.to_csv('dataBalSetsEcodes/unbal_x_train.csv')
unbal_x_val.to_csv('dataBalSetsEcodes/unbal_x_val.csv')
under_x_train.to_csv('dataBalSetsEcodes/under_x_train.csv')
under_x_val.to_csv('dataBalSetsEcodes/under_x_val.csv')
underOver_x_train.to_csv('dataBalSetsEcodes/underOver_x_train.csv')
underOver_x_val.to_csv('dataBalSetsEcodes/underOver_x_val.csv')
x_test.to_csv('dataBalSetsEcodes/x_test.csv')
tweets_clean.to_csv('dataBalSetsEcodes/tweets_clean.csv')
all_unlabeled_tweets.to_csv('archiveData/all_unlabeled_tweets_ecodes.csv')

# Term Check

In [None]:
print(sid.polarity_scores("red_heart_e")["compound"]) # Note that the aa. functions don't use the updated lexicon.