Installing the necessary libraries

In [None]:
# Pin the release that this cell's recorded output shows was installed, so a re-run
# resolves the same version; %pip installs into the environment of the running kernel.
%pip install transformers==4.24.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 4.8 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 84.1 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 77.0 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.0 tokenizers-0.13.2 transformers-4.24.0


Downloading & extracting data

In [None]:
# Fetch the two 7z archives (comments and posts) into data/facebooknews/.
!wget -P data/facebooknews/  https://github.com/jbencina/facebook-news/blob/master/fb_news_comments_1000K_hashed.7z?raw=true
!wget -P data/facebooknews/ https://github.com/jbencina/facebook-news/blob/master/fb_news_posts_20K.7z?raw=true

# The downloaded files keep the "?raw=true" query string in their names; rename them to plain .7z.
!mv data/facebooknews/fb_news_comments_1000K_hashed.7z?raw=true data/facebooknews/fb_news_comments_1000K_hashed.7z
!mv data/facebooknews/fb_news_posts_20K.7z?raw=true data/facebooknews/fb_news_posts_20K.7z

# Extract each archive in place, inside the data directory.
!cd data/facebooknews && 7z e fb_news_comments_1000K_hashed.7z
!cd data/facebooknews && 7z e fb_news_posts_20K.7z

--2022-11-17 23:44:13--  https://github.com/jbencina/facebook-news/blob/master/fb_news_comments_1000K_hashed.7z?raw=true
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/jbencina/facebook-news/raw/master/fb_news_comments_1000K_hashed.7z [following]
--2022-11-17 23:44:13--  https://github.com/jbencina/facebook-news/raw/master/fb_news_comments_1000K_hashed.7z
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/jbencina/facebook-news/master/fb_news_comments_1000K_hashed.7z [following]
--2022-11-17 23:44:13--  https://raw.githubusercontent.com/jbencina/facebook-news/master/fb_news_comments_1000K_hashed.7z
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubuserco

Model definition: `BertForMultiLabelClassification`, using code from https://github.com/monologg/GoEmotions-pytorch

In [None]:
import torch.nn as nn
from transformers import BertPreTrainedModel, BertModel


class BertForMultiLabelClassification(BertPreTrainedModel):
    """BERT encoder followed by dropout and a linear multi-label classification head.

    The head emits one logit per label. When ``labels`` are passed to ``forward``, a
    ``BCEWithLogitsLoss`` over those logits is computed and returned in front of them.
    """

    def __init__(self, config):
        """Create the encoder, dropout, classifier and loss modules from ``config``.

        Args:
            config: model configuration; ``num_labels``, ``hidden_dropout_prob`` and
                ``hidden_size`` are read from it.
        """
        super().__init__(config)
        self.num_labels = config.num_labels

        # Submodules are created in this fixed order and under these attribute names.
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
        self.loss_fct = nn.BCEWithLogitsLoss()

        self.init_weights()

    def forward(
            self,
            input_ids=None,
            attention_mask=None,
            token_type_ids=None,
            position_ids=None,
            head_mask=None,
            inputs_embeds=None,
            labels=None,
    ):
        """Run the encoder and the classification head.

        All arguments except ``labels`` are forwarded unchanged to the BERT encoder.

        Args:
            labels: optional targets for ``BCEWithLogitsLoss``; when ``None`` no loss is
                computed.

        Returns:
            A tuple ``(loss), logits, (hidden_states), (attentions)``: the loss is present
            only when ``labels`` is given, and the trailing entries are whatever the
            encoder returned after its first two outputs.
        """
        encoder_outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        # Entry 1 of the encoder output is used as the pooled sequence representation.
        logits = self.classifier(self.dropout(encoder_outputs[1]))

        # Pass through everything after the pooled output (hidden states / attentions, if any).
        result = (logits,) + encoder_outputs[2:]
        if labels is None:
            return result

        return (self.loss_fct(logits, labels),) + result

Testing the model for prediction

In [None]:
from transformers import AutoTokenizer

# Emotion label names. The list position is the class id: the notebook turns an integer
# prediction into its name by indexing this list (`emotions[int(x)]`), so the order is
# significant. Presumably it mirrors the label order of the GoEmotions checkpoint —
# verify against the model before reordering or editing entries.
emotions = [
    'admiration',
    'amusement',
    'anger',
    'annoyance',
    'approval',
    'caring',
    'confusion',
    'curiosity',
    'desire',
    'disappointment',
    'disapproval',
    'disgust',
    'embarrassment',
    'excitement',
    'fear',
    'gratitude',
    'grief',
    'joy',
    'love',
    'nervousness',
    'optimism',
    'pride',
    'realization',
    'relief',
    'remorse',
    'sadness',
    'surprise',
    'neutral'
]

# tokenizer = AutoTokenizer.from_pretrained("monologg/bert-base-cased-goemotions-original")

# model = BertForMultiLabelClassification.from_pretrained("monologg/bert-base-cased-goemotions-original")

# tk = tokenizer(['not really sure', 'you moron!'], padding='longest', max_length=200, truncation=True, return_tensors='pt')
# logits = model(**tk)[0]
# indices = logits.argmax(dim=1)
# emotions[indices[0]], emotions[indices[1]]

Reading and preparing the data

In [None]:
import pandas as pd

df = pd.read_csv('./data/facebooknews/fb_news_comments_1000K_hashed.csv')

In [None]:
df.head(15)

Unnamed: 0,created_time,from_id,from_name,message,post_name
0,2017-07-14T14:43:54+0000,33661642d99eeceeb086,4ca212f16b9f954d5e0a,We are speaking to NRA supporters as well as W...,33661642d99eeceeb086_10154890879532217
1,2017-07-14T14:41:59+0000,33661642d99eeceeb086,4ca212f16b9f954d5e0a,If you are just joining us we are outside of t...,33661642d99eeceeb086_10154890879532217
2,2017-07-14T14:41:58+0000,142b054b9d7f119260fa,210f51cc65568bf2d528,Do you know how backward America are in allowi...,142b054b9d7f119260fa_10154890879532217
3,2017-07-14T14:42:25+0000,52228acb5d5ca468be8c,46377b3cde64b2bf93dc,People who legally own guns often seem all too...,52228acb5d5ca468be8c_10154890879532217
4,2017-07-14T14:40:46+0000,af7fe02906a110370810,087accbb0dc7c975d194,Have you snowflakes watched the news and seen ...,af7fe02906a110370810_10154890879532217
5,2017-07-14T14:41:06+0000,9a90817f183ebc174742,343de400829f2349a781,I don't understand why America wants to carry ...,9a90817f183ebc174742_10154890879532217
6,2017-07-14T14:41:02+0000,5969e4a86e559f513d79,47a72b309aaa49128213,It is my right to legally protect my life as b...,5969e4a86e559f513d79_10154890879532217
7,2017-07-14T14:34:05+0000,858bf285623eeccb82f2,0d861bf581d55a4aa9fd,"I'm a gun owner, but the NRA are just terroris...",858bf285623eeccb82f2_10154890879532217
8,2017-07-14T14:42:47+0000,858bf285623eeccb82f2,0d861bf581d55a4aa9fd,2nd amendment makes you think you are free fro...,858bf285623eeccb82f2_10154890879532217
9,2017-07-14T14:47:11+0000,334ea0160501ea43df96,627bcf1efdcd30981075,It should be a legal requirement that everyone...,334ea0160501ea43df96_10154890879532217


In [None]:
print(len(df[~df.message.isna()]))
print(len(df[df.message.isna()]))

1011597
26722


In [None]:
df = df[~df.message.isna()]
print(len(df))

1011597


In [None]:
from tqdm.notebook import tqdm

def classify_data(data, batch_size, filename, tokenizer, model, device='cuda:0'):
    """Predict one emotion class id per message and write the annotated frame to CSV.

    The messages are tokenized and classified in consecutive batches; the arg-max over
    the model's logits is stored in a new ``emotion`` column. ``data`` itself is left
    unmodified.

    Args:
        data: DataFrame with a ``message`` column of strings.
        batch_size: number of rows sent through the model at a time.
        filename: path of the CSV file to write.
        tokenizer: callable that accepts a list of strings plus the keyword arguments
            ``padding``, ``max_length``, ``truncation`` and ``return_tensors`` and returns
            a mapping of tensors.
        model: callable taking that mapping as keyword arguments and returning a sequence
            whose first element holds the logits; must also provide ``to`` and ``eval``.
        device: device the model and the inputs are moved to. Defaults to ``'cuda:0'``.

    Returns:
        None. The number of annotated rows is printed and the result is saved to
        ``filename``.
    """
    import torch  # local import: the notebook only imports ``torch.nn`` at module level

    model.to(device)
    model.eval()  # inference only: make sure dropout is disabled

    predictions = []
    # No gradients are needed for prediction; skipping them avoids building autograd graphs.
    with torch.no_grad():
        for start in tqdm(range(0, len(data), batch_size)):
            batch = data.iloc[start:start + batch_size]
            params = tokenizer(batch.message.to_list(), padding='longest', max_length=244, truncation=True, return_tensors='pt')
            params = {k: v.to(device) for k, v in params.items()}
            logits = model(**params)[0]
            predictions.extend(logits.argmax(dim=-1).cpu().tolist())

    # Attach all predictions at once instead of appending batch frames inside the loop
    # (quadratic, and DataFrame.append no longer exists in pandas 2.x).
    result_df = data.assign(emotion=predictions)

    print(len(result_df))
    result_df.to_csv(filename)

# classify_data(df, batch_size=32, filename='./drive/MyDrive/fbnews_comments_annotated.csv', tokenizer=tokenizer, model=model)


In [None]:
df2 = pd.read_csv('./drive/MyDrive/fbnews_comments_annotated.csv')
print(len(df2), len(df))
df2.head(15)

1011597 1011597


Unnamed: 0.1,Unnamed: 0,created_time,from_id,from_name,message,post_name,emotion
0,0,2017-07-14T14:43:54+0000,33661642d99eeceeb086,4ca212f16b9f954d5e0a,We are speaking to NRA supporters as well as W...,33661642d99eeceeb086_10154890879532217,27
1,1,2017-07-14T14:41:59+0000,33661642d99eeceeb086,4ca212f16b9f954d5e0a,If you are just joining us we are outside of t...,33661642d99eeceeb086_10154890879532217,27
2,2,2017-07-14T14:41:58+0000,142b054b9d7f119260fa,210f51cc65568bf2d528,Do you know how backward America are in allowi...,142b054b9d7f119260fa_10154890879532217,3
3,3,2017-07-14T14:42:25+0000,52228acb5d5ca468be8c,46377b3cde64b2bf93dc,People who legally own guns often seem all too...,52228acb5d5ca468be8c_10154890879532217,27
4,4,2017-07-14T14:40:46+0000,af7fe02906a110370810,087accbb0dc7c975d194,Have you snowflakes watched the news and seen ...,af7fe02906a110370810_10154890879532217,7
5,5,2017-07-14T14:41:06+0000,9a90817f183ebc174742,343de400829f2349a781,I don't understand why America wants to carry ...,9a90817f183ebc174742_10154890879532217,6
6,6,2017-07-14T14:41:02+0000,5969e4a86e559f513d79,47a72b309aaa49128213,It is my right to legally protect my life as b...,5969e4a86e559f513d79_10154890879532217,4
7,7,2017-07-14T14:34:05+0000,858bf285623eeccb82f2,0d861bf581d55a4aa9fd,"I'm a gun owner, but the NRA are just terroris...",858bf285623eeccb82f2_10154890879532217,27
8,8,2017-07-14T14:42:47+0000,858bf285623eeccb82f2,0d861bf581d55a4aa9fd,2nd amendment makes you think you are free fro...,858bf285623eeccb82f2_10154890879532217,3
9,9,2017-07-14T14:47:11+0000,334ea0160501ea43df96,627bcf1efdcd30981075,It should be a legal requirement that everyone...,334ea0160501ea43df96_10154890879532217,4


In [None]:
df2.emotion.value_counts()

27    462705
3      75816
7      64921
0      55725
4      37935
1      32858
2      32691
10     32164
20     25066
6      23786
22     21543
25     19317
26     17742
9      16372
15     15847
5      13612
18     13332
11     10227
14      9147
12      7735
17      6027
13      5024
8       4588
24      4580
21      1237
23       965
19       607
16        28
Name: emotion, dtype: int64

In [None]:
df2['emotion_txt'] = df2.emotion.apply(lambda x: emotions[int(x)])

In [None]:
df2.emotion_txt.value_counts()

neutral           462705
annoyance          75816
curiosity          64921
admiration         55725
approval           37935
amusement          32858
anger              32691
disapproval        32164
optimism           25066
confusion          23786
realization        21543
sadness            19317
surprise           17742
disappointment     16372
gratitude          15847
caring             13612
love               13332
disgust            10227
fear                9147
embarrassment       7735
joy                 6027
excitement          5024
desire              4588
remorse             4580
pride               1237
relief               965
nervousness          607
grief                 28
Name: emotion_txt, dtype: int64

In [None]:
df2['post_id'] = df2.post_name.apply(lambda x: x.split('_')[1])
df2.head()

Unnamed: 0.1,Unnamed: 0,created_time,from_id,from_name,message,post_name,emotion,emotion_txt,post_id
0,0,2017-07-14T14:43:54+0000,33661642d99eeceeb086,4ca212f16b9f954d5e0a,We are speaking to NRA supporters as well as W...,33661642d99eeceeb086_10154890879532217,27,neutral,10154890879532217
1,1,2017-07-14T14:41:59+0000,33661642d99eeceeb086,4ca212f16b9f954d5e0a,If you are just joining us we are outside of t...,33661642d99eeceeb086_10154890879532217,27,neutral,10154890879532217
2,2,2017-07-14T14:41:58+0000,142b054b9d7f119260fa,210f51cc65568bf2d528,Do you know how backward America are in allowi...,142b054b9d7f119260fa_10154890879532217,3,annoyance,10154890879532217
3,3,2017-07-14T14:42:25+0000,52228acb5d5ca468be8c,46377b3cde64b2bf93dc,People who legally own guns often seem all too...,52228acb5d5ca468be8c_10154890879532217,27,neutral,10154890879532217
4,4,2017-07-14T14:40:46+0000,af7fe02906a110370810,087accbb0dc7c975d194,Have you snowflakes watched the news and seen ...,af7fe02906a110370810_10154890879532217,7,curiosity,10154890879532217


In [None]:
df2.emotion.value_counts()

27    462705
3      75816
7      64921
0      55725
4      37935
1      32858
2      32691
10     32164
20     25066
6      23786
22     21543
25     19317
26     17742
9      16372
15     15847
5      13612
18     13332
11     10227
14      9147
12      7735
17      6027
13      5024
8       4588
24      4580
21      1237
23       965
19       607
16        28
Name: emotion, dtype: int64

In [None]:
df_posts = pd.read_csv('./data/facebooknews/fb_news_posts_20K.csv')
df_posts = df_posts[~df_posts.message.isna()]
df_posts['post_id_split'] = df_posts.post_id.apply(lambda x: x.split('_')[1])

In [None]:
def count_comments_of_type(df_posts, df_comments, comment_type='anger'):
    """Attach per-post comment counts to a copy of ``df_posts``.

    Comments are matched to posts by comparing ``df_comments.post_id`` with
    ``df_posts.post_id_split``. Posts without any matching comment get a count of 0.

    Args:
        df_posts: DataFrame of posts with a ``post_id_split`` column.
        df_comments: DataFrame of comments with ``post_id`` and ``emotion_txt`` columns.
        comment_type: value of ``emotion_txt`` to count separately. Defaults to ``'anger'``.

    Returns:
        A new DataFrame with the rows, index and columns of ``df_posts`` plus two integer
        columns: ``n_<comment_type>`` (comments of that type) and ``n_comments`` (all
        comments). ``df_posts`` is not modified.
    """
    # Count once over the whole comments frame instead of re-filtering it for every post.
    total_per_post = df_comments.post_id.value_counts()
    typed_per_post = df_comments.loc[df_comments.emotion_txt == comment_type, 'post_id'].value_counts()

    post_keys = df_posts.post_id_split
    result_df = df_posts.copy()
    # Posts missing from a count series map to NaN; those are posts with zero comments.
    result_df[f'n_{comment_type}'] = post_keys.map(typed_per_post).fillna(0).astype(int).to_numpy()
    result_df['n_comments'] = post_keys.map(total_per_post).fillna(0).astype(int).to_numpy()

    return result_df

df_posts_c = count_comments_of_type(df_posts, df2)

  0%|          | 0/19121 [00:00<?, ?it/s]

In [None]:
df_posts_c.n_anger.value_counts()

0     8664
1     3450
2     2250
3     1581
4     1084
5      721
6      489
7      306
8      195
9      110
10      81
11      63
12      45
14      27
13      24
15      10
16       7
18       5
17       4
19       2
20       1
22       1
24       1
Name: n_anger, dtype: int64

In [None]:
df_posts_c.to_csv('./drive/MyDrive/fb_posts_emotion.csv')

In [None]:
import pandas as pd

df_emo = pd.read_csv('./drive/MyDrive/fb_posts_emotion.csv')
print(len(df_emo))
df_emo.head()

19121


Unnamed: 0.1,Unnamed: 0,created_time,description,link,message,page_id,post_id,react_angry,react_haha,react_like,react_love,react_sad,react_wow,scrape_time,shares,post_id_split,n_anger,n_comments
0,0,2017-07-14T14:30:59+0000,,https://www.facebook.com/bbcnews/videos/101548...,We are #LIVE outside the National Rifle Associ...,228735667216,228735667216_10154890879532217,54,24,993,144,12,24,2017-07-14 11:01:24.379857,139,10154890879532217,2,100
1,1,2017-07-14T14:20:59+0000,,http://bbc.in/2talMsx,UPDATE: \r\n-2 Ukrainian tourists killed in st...,228735667216,228735667216_10154890968202217,172,8,994,11,783,264,2017-07-14 11:01:24.379857,680,10154890968202217,2,99
2,2,2017-07-14T13:40:38+0000,,https://www.facebook.com/bbcnews/videos/101548...,Proms: Come with us on a tour of the Royal Alb...,228735667216,228735667216_10154890852247217,5,12,2034,369,6,45,2017-07-14 11:01:24.379857,395,10154890852247217,0,95
3,3,2017-07-14T12:55:45+0000,,https://www.facebook.com/bbcnews/videos/142678...,Thousands say their final goodbyes to Bradley ...,228735667216,228735667216_1426789250735491,6,0,2262,754,1989,11,2017-07-14 11:01:24.379857,542,1426789250735491,1,95
4,4,2017-07-14T12:45:00+0000,,https://www.facebook.com/bbcnews/videos/101548...,"Despite safety warnings, this beach near an ai...",228735667216,228735667216_10154890645702217,65,513,4336,54,128,815,2017-07-14 11:01:24.379857,1956,10154890645702217,9,100
