In [2]:
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import community
import re

# Show every column when a DataFrame is rendered (None removes the column limit).
pd.options.display.max_columns = None

## Loading Giant Component Tweet Reply Dataset

In [4]:
# Read the giant-component reply tweets. The listed columns are forced to
# object dtype so pandas keeps their raw text instead of inferring a numeric
# or boolean type for them.
df_tweets = pd.read_csv('data/tweets_gc.csv',
                        dtype={'id_str': object,
                               'in_reply_to_status_id_str': object,
                               'in_reply_to_user_id': object,
                               'user.id_str': object,
                               'user.friends_count': object,
                               'user.lang': object,
                               'user.statuses_count': object,
                               'user.verified': object})

# Drop the 'Unnamed: 0' column BEFORE de-duplicating. It is presumably the row
# index written by an earlier to_csv call (TODO confirm); while it is present,
# two otherwise identical tweets differ in that column and drop_duplicates()
# would keep both of them.
df_tweets = df_tweets.drop(columns=['Unnamed: 0']).drop_duplicates()

display(df_tweets.head())
df_tweets.shape

Unnamed: 0,id_str,created_at,full_text,lang,in_reply_to_status_id_str,in_reply_to_user_id,entities.hashtags,entities.media,entities.symbols,entities.urls,entities.user_mentions,source,quoted_status.id_str,quoted_status.created_at,quoted_status.full_text,quoted_status.lang,quoted_status.in_reply_to_status_id_str,quoted_status.in_reply_to_user_id,quoted_status.entities.hashtags,quoted_status.entities.media,quoted_status.entities.symbols,quoted_status.entities.urls,quoted_status.entities.user_mentions,quoted_status.user.id_str,quoted_status.source,user.id_str,user.created_at,user.description,user.followers_count,user.friends_count,user.lang,user.location,user.name,user.screen_name,user.profile_banner_url,user.profile_image_url,user.statuses_count,user.url,user.verified
0,1484418031581233154,2022-01-21 06:50:12+00:00,@DiscussingFilm Watching Only the One without ...,en,1483652185757929473,7.804607549107323e+17,[],,[],[],"[{'indices': [0, 15], 'screen_name': 'Discussi...","<a href=""https://mobile.twitter.com"" rel=""nofo...",,,,,,,,,,,,,,881231140287373312,2017-07-01 19:20:51+00:00,#justiceforJohnnyDepp,1302.0,2804,,,LunaJD❤❤❤,rosaoch89814746,https://pbs.twimg.com/profile_banners/88123114...,http://pbs.twimg.com/profile_images/1389741493...,18074,,False
1,1484485396167905284,2022-01-21 11:17:53+00:00,@DiscussingFilm Only watching #avatar. Love #A...,en,1483652185757929473,7.804607549107323e+17,"[{'indices': [30, 37], 'text': 'avatar'}, {'in...",,[],[],"[{'indices': [0, 15], 'screen_name': 'Discussi...","<a href=""http://twitter.com/download/android"" ...",,,,,,,,,,,,,,2902080532,2014-12-02 08:23:46+00:00,My superpower comes from serving other people....,102.0,385,,"Ontario, Canada",EWA STANSKA,EwaStanska,https://pbs.twimg.com/profile_banners/29020805...,http://pbs.twimg.com/profile_images/1456945814...,17691,https://t.co/YqRnwqIjIT,False
2,1484613837533835264,2022-01-21 19:48:16+00:00,@DiscussingFilm No. My family and friends will...,en,1483652185757929473,7.804607549107323e+17,[],,[],[],"[{'indices': [0, 15], 'screen_name': 'Discussi...","<a href=""https://mobile.twitter.com"" rel=""nofo...",,,,,,,,,,,,,,883684219758583810,2017-07-08 13:48:30+00:00,,2.0,8,,,Marccox70@gmail.com,marccox70,,http://abs.twimg.com/sticky/default_profile_im...,389,,False
3,1483753192668151812,2022-01-19 10:48:22+00:00,"@DiscussingFilm Nope, won't be watching either...",en,1483652185757929473,7.804607549107323e+17,[],,[],[],"[{'indices': [0, 15], 'screen_name': 'Discussi...","<a href=""https://mobile.twitter.com"" rel=""nofo...",,,,,,,,,,,,,,428656040,2011-12-05 01:16:04+00:00,"Liberal in a red state, lover of films, games,...",249.0,464,,"West Virginia, USA",Jerry Cline,JCshow71,,http://pbs.twimg.com/profile_images/9846791388...,9098,,False
4,1483750400880693250,2022-01-19 10:37:16+00:00,@DiscussingFilm Chocolate. Pirates of the Cari...,en,1483652185757929473,7.804607549107323e+17,"[{'indices': [91, 112], 'text': 'AmberHeardIsA...",,[],[],"[{'indices': [0, 15], 'screen_name': 'Discussi...","<a href=""http://twitter.com/download/android"" ...",,,,,,,,,,,,,,222640627,2010-12-04 00:40:09+00:00,Apanthropy,922.0,1259,,South Wales,Nanny Ogg's Hedgehog,hedgehogogg,https://pbs.twimg.com/profile_banners/22264062...,http://pbs.twimg.com/profile_images/1423786674...,70318,,False


(260, 39)

In [5]:
# Keep only the tweet id, its timestamp and its text for the sentiment analysis
df = df_tweets.loc[:, ['id_str', 'created_at', 'full_text']]
df.head()

Unnamed: 0,id_str,created_at,full_text
0,1484418031581233154,2022-01-21 06:50:12+00:00,@DiscussingFilm Watching Only the One without ...
1,1484485396167905284,2022-01-21 11:17:53+00:00,@DiscussingFilm Only watching #avatar. Love #A...
2,1484613837533835264,2022-01-21 19:48:16+00:00,@DiscussingFilm No. My family and friends will...
3,1483753192668151812,2022-01-19 10:48:22+00:00,"@DiscussingFilm Nope, won't be watching either..."
4,1483750400880693250,2022-01-19 10:37:16+00:00,@DiscussingFilm Chocolate. Pirates of the Cari...


In [6]:
# Normalise the tweet text before it is passed to the sentiment model:
#   1. lower-case everything,
#   2. strip URLs of the form <scheme>://host[.domain...][/path...],
#   3. strip @mentions.
#
# The URL pattern is a raw string, so every regex escape needs a SINGLE
# backslash. Written with doubled backslashes (r'\\w+...') the pattern searches
# for literal backslash characters, never matches a link, and leaves the URLs
# in the text.
url_pattern = r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*'
mention_pattern = r'@\w+'

# The .str accessor applies each step element-wise and passes missing values
# through unchanged instead of raising inside re.sub.
df = df.assign(
    full_text=lambda x: (
        x['full_text']
        .str.lower()
        .str.replace(url_pattern, '', regex=True)
        .str.replace(mention_pattern, '', regex=True)
    )
)

df.head()

Unnamed: 0,id_str,created_at,full_text
0,1484418031581233154,2022-01-21 06:50:12+00:00,watching only the one without amber heard.....
1,1484485396167905284,2022-01-21 11:17:53+00:00,"only watching #avatar. love #aquaman2, but ha..."
2,1484613837533835264,2022-01-21 19:48:16+00:00,no. my family and friends will not contribute...
3,1483753192668151812,2022-01-19 10:48:22+00:00,"nope, won't be watching either. avatar was vi..."
4,1483750400880693250,2022-01-19 10:37:16+00:00,chocolate. pirates of the caribbean. benny a...


## Sentiment Analysis using Pre-Trained Model

In [8]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.0-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 21.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 57.1 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.1 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 37.0 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyY

In [9]:
from transformers import pipeline

from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer

In [10]:
# Build the sentiment pipeline from a pre-trained Hugging Face checkpoint.
# The same checkpoint name supplies both the model and its tokenizer; the
# pipeline is created with truncation switched on and max_length set to 512.
model = "cardiffnlp/twitter-roberta-base-sentiment-latest"
sentiment_pipeline = pipeline(
    model=model,
    tokenizer=model,
    max_length=512,
    truncation=True,
)

Downloading:   0%|          | 0.00/1.11k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/929 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [11]:
# Score every cleaned tweet with the pre-trained pipeline, one call per tweet.
# Each call's return value is stored untouched in the new `sentiment` column.
df = df.assign(sentiment=[sentiment_pipeline(text) for text in df['full_text']])
df.head()

Unnamed: 0,id_str,created_at,full_text,sentiment
0,1484418031581233154,2022-01-21 06:50:12+00:00,watching only the one without amber heard.....,"[{'label': 'Neutral', 'score': 0.8662841320037..."
1,1484485396167905284,2022-01-21 11:17:53+00:00,"only watching #avatar. love #aquaman2, but ha...","[{'label': 'Negative', 'score': 0.928562104701..."
2,1484613837533835264,2022-01-21 19:48:16+00:00,no. my family and friends will not contribute...,"[{'label': 'Negative', 'score': 0.693401634693..."
3,1483753192668151812,2022-01-19 10:48:22+00:00,"nope, won't be watching either. avatar was vi...","[{'label': 'Negative', 'score': 0.895826995372..."
4,1483750400880693250,2022-01-19 10:37:16+00:00,chocolate. pirates of the caribbean. benny a...,"[{'label': 'Neutral', 'score': 0.8372517824172..."


In [12]:
# Flatten the first prediction of each tweet into two plain columns:
# its label and the score attached to that label.
df = df.assign(
    sentiment_label=lambda frame: frame['sentiment'].map(lambda preds: preds[0]['label']),
    sentiment_score=lambda frame: frame['sentiment'].map(lambda preds: preds[0]['score']),
)

df.head()

Unnamed: 0,id_str,created_at,full_text,sentiment,sentiment_label,sentiment_score
0,1484418031581233154,2022-01-21 06:50:12+00:00,watching only the one without amber heard.....,"[{'label': 'Neutral', 'score': 0.8662841320037...",Neutral,0.866284
1,1484485396167905284,2022-01-21 11:17:53+00:00,"only watching #avatar. love #aquaman2, but ha...","[{'label': 'Negative', 'score': 0.928562104701...",Negative,0.928562
2,1484613837533835264,2022-01-21 19:48:16+00:00,no. my family and friends will not contribute...,"[{'label': 'Negative', 'score': 0.693401634693...",Negative,0.693402
3,1483753192668151812,2022-01-19 10:48:22+00:00,"nope, won't be watching either. avatar was vi...","[{'label': 'Negative', 'score': 0.895826995372...",Negative,0.895827
4,1483750400880693250,2022-01-19 10:37:16+00:00,chocolate. pirates of the caribbean. benny a...,"[{'label': 'Neutral', 'score': 0.8372517824172...",Neutral,0.837252


In [13]:
# Number of tweets per predicted sentiment label, most frequent first
df.loc[:, 'sentiment_label'].value_counts()

Negative    166
Neutral      70
Positive     24
Name: sentiment_label, dtype: int64

In [7]:
# df.to_csv('data/tweets_gc_sentiment.csv')

## References

cardiffnlp/twitter-roberta-base-sentiment-latest. (2021). Hugging Face. Retrieved June 17, 2022, from https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest

Yang, S. (2022, May 21). Johnny Depp v Amber Heard Twitter Sentiment Analysis. Medium. Retrieved June 17, 2022, from https://sophiamyang.medium.com/johnny-depp-v-amber-heard-twitter-sentiment-analysis-baa42c2e3cdd