In [2]:
import twarc
import os
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
import requests
from transformers import pipeline
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')
stop_words.append('uh')
puncs = string.punctuation

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/korfoo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Process the downloaded tweet json to csv files

In [5]:
all_columns=['conversation_id', 'id','author_id','text','in_reply_to_user_id']
for file in os.listdir("Banks_conv"):
    if file.split(".")[1]=="json":
        print("--------------Processing file {0}--------------".format(file))
        df=pd.read_json(os.path.join("Banks_conv",file),lines=True)
        all_tweets=[]
        i=100
        for line in df['data']:
            for tweet in line:
                tweet_info=[]

                for column in all_columns:
                    tweet_info.append(tweet[column])

                if tweet.get('referenced_tweets','')!='':
                    if tweet['referenced_tweets'][0]['type']=='replied_to':
                        tweet_info.append(tweet['referenced_tweets'][0]['id'])
                    else:
                        tweet_info.append('')
                else:
                    tweet_info.append('')
                
                if tweet.get('attachments','')!='':
                    tweet_info.append('1')
                else:
                    tweet_info.append('0')

                all_tweets.append(tweet_info)

            if len(all_tweets)>i:
                print("--------------Processed more than {0} tweets--------------".format(i))
                i=i*10

        for line in df['includes']:
            for tweet in line['tweets']:
                tweet_info=[]

                for column in all_columns:
                    tweet_info.append(tweet.get(column,''))

                if tweet.get('referenced_tweets','')!='':
                    if tweet['referenced_tweets'][0]['type']=='replied_to':
                        tweet_info.append(tweet['referenced_tweets'][0]['id'])
                    else:
                        tweet_info.append('')
                else:
                    tweet_info.append('')
                
                if tweet.get('attachments','')!='':
                    tweet_info.append('1')
                else:
                    tweet_info.append('0')
                
                all_tweets.append(tweet_info)

            if len(all_tweets)>i:
                print("--------------Processed more than {0} tweets--------------".format(i))
                i=i*10

        df_out = pd.DataFrame(all_tweets,columns=all_columns+['replied_to', 'attachment'])
        df_out = df_out.drop_duplicates().copy()
        df_out.to_csv("Banks_conv/"+file.split(".")[0]+".csv", index=False)

--------------Processing file WellsFargo.json--------------
--------------Processed more than 100 tweets--------------
--------------Processed more than 1000 tweets--------------
--------------Processed more than 10000 tweets--------------
--------------Processed more than 100000 tweets--------------
--------------Processing file DeutscheBank.json--------------
--------------Processed more than 100 tweets--------------
--------------Processed more than 1000 tweets--------------
--------------Processed more than 10000 tweets--------------
--------------Processing file BofA_Help.json--------------
--------------Processed more than 100 tweets--------------
--------------Processed more than 1000 tweets--------------
--------------Processed more than 10000 tweets--------------
--------------Processing file jpmorgan.json--------------
--------------Processed more than 100 tweets--------------
--------------Processed more than 1000 tweets--------------
--------------Processed more than 10000 

## Perform labeling for answered/unanswered tweets

In [9]:
#Dictionary to store Banks ids
Banks_dict = {
        "WellsFargo":"1178011",
        "jpmorgan":"1155522630",
        "BofA_Help":"18735040",
        "Citi":"79320096",
        "DeutscheBank":"41330603",
}

In [8]:
def do_labeling(org_ids):

    #DataFrame to store all labeled replies related to our task
    all_replies = pd.DataFrame()

    for file in os.listdir('Banks_conv'):
        if file.split('.')[1]=='csv':
            NGO_name = file.split('.')[0]
            NGO_id = org_ids[NGO_name]

            #added terminator to fix tokenizer error for some files
            df = pd.read_csv("Banks_conv/{0}".format(file), lineterminator='\n', dtype='str')

            #get the tweets on which users got replies
            ngo_replied_to_ids = df[(df["author_id"] == NGO_id) & (df["in_reply_to_user_id"] != NGO_id)]["replied_to"].dropna()
            ngo_replied_to = df[df['id'].isin(ngo_replied_to_ids)].dropna()
            ngo_replied_to["label"] = 1

            #get the tweets on which ngo didn't reply
            replies_to_ngo = df[(df["in_reply_to_user_id"]==NGO_id) & (df["author_id"]!=NGO_id)]
            replies_to_ngo = replies_to_ngo[~replies_to_ngo['id'].isin(ngo_replied_to_ids)].dropna()
            replies_to_ngo["label"] = 0

            #concatenate these tweets together
            all_ngo_replies = pd.concat([replies_to_ngo, ngo_replied_to]).reset_index(drop=True)
            all_ngo_replies["relatedOrg"] = NGO_name
            
            #add current ngo data to global df
            all_replies = pd.concat([all_replies, all_ngo_replies]).reset_index(drop=True)

            print("processed file: {0}".format(file))
            
    all_replies.to_csv("Banks_conv/labeled_data.csv")


In [10]:
#label banks
do_labeling(Banks_dict)

processed file: WellsFargo.csv
processed file: DeutscheBank.csv
processed file: BofA_Help.csv
processed file: Citi.csv
processed file: jpmorgan.csv


In [11]:
df = pd.read_csv("Banks_conv/labeled_data.csv", index_col=0, dtype='str')
df.head(5)

Unnamed: 0,conversation_id,id,author_id,text,in_reply_to_user_id,replied_to,attachment,label,relatedNGO
0,1256647208256102400,1257392415062863877,1254487703334219776,@WellsFargo @jordanyebe Thats a bullshit phone...,1178011,1257349230248820741,0,0,WellsFargo
1,1256647208256102400,1257350048414924801,342060815,@WellsFargo Well this weekend I tried to make ...,1178011,1257349230248820741,0,0,WellsFargo
2,953819605587415041,954374335141003269,951483180346892288,@WellsFargo Wells Fargo deleted my other comme...,1178011,953819605587415041,0,0,WellsFargo
3,953819605587415041,954374200994627584,951483180346892288,@WellsFargo Boycott Wells Fargo #buybitcoin,1178011,953819605587415041,0,0,WellsFargo
4,953819605587415041,954351324648402944,2400707515,@WellsFargo #fuckwellsfargo,1178011,953819605587415041,0,0,WellsFargo


In [24]:
#check statistics
df.groupby([df.label, df.relatedOrg]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,conversation_id,id,author_id,text,in_reply_to_user_id,replied_to,attachment
label,relatedNGO,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,BofA_Help,330,330,330,330,330,330,330
0,Citi,8819,8819,8819,8819,8819,8819,8819
0,DeutscheBank,5715,5715,5715,5715,5715,5715,5715
0,WellsFargo,3393,3393,3393,3393,3393,3393,3393
0,jpmorgan,10292,10292,10292,10292,10292,10292,10292
1,BofA_Help,594,594,594,594,594,594,594
1,Citi,1,1,1,1,1,1,1
1,DeutscheBank,584,584,584,584,584,584,584
1,WellsFargo,925,925,925,925,925,925,925
1,jpmorgan,61,61,61,61,61,61,61


In [32]:
#delete citi bank, no use in it
df = df[df.relatedOrg != "Citi"]
df.to_csv("Banks_conv/labeled_no_city.csv")

## Process downloaded authors info

In [107]:
#save user ids to download through twarc
users = pd.DataFrame(df["author_id"].unique())
users.to_csv("Banks_conv/users_ids.txt", index = False, header=False)

In [108]:
#Process the needed info to csv
df=pd.read_json("Banks_conv/users.json",lines=True)
all_users=[]
public_columns=['followers_count', 'following_count','tweet_count']
additional_columns=['id','name', 'username', 'description','created_at']
for line in df['data']:
    for user in line:

        user_info=[]        
        for column in public_columns:
            try:
                user_info.append(user['public_metrics'][column])
            except:
                print(user)
        for column in additional_columns:
            user_info.append(user[column])

        all_users.append(user_info)

users = pd.DataFrame(all_users,columns=public_columns+additional_columns)
users = users.drop_duplicates().copy()
users = users.drop_duplicates(["id"]).copy()
users.to_csv("Banks_conv/users.csv", index=False)

In [109]:
#read df
df = pd.read_csv("Banks_conv/labeled_no_city.csv", index_col=0, dtype='str')
df

Unnamed: 0,conversation_id,id,author_id,text,in_reply_to_user_id,replied_to,attachment,label,relatedNGO
0,1256647208256102400,1257392415062863877,1254487703334219776,@WellsFargo @jordanyebe Thats a bullshit phone...,1178011,1257349230248820741,0,0,WellsFargo
1,1256647208256102400,1257350048414924801,342060815,@WellsFargo Well this weekend I tried to make ...,1178011,1257349230248820741,0,0,WellsFargo
2,953819605587415041,954374335141003269,951483180346892288,@WellsFargo Wells Fargo deleted my other comme...,1178011,953819605587415041,0,0,WellsFargo
3,953819605587415041,954374200994627584,951483180346892288,@WellsFargo Boycott Wells Fargo #buybitcoin,1178011,953819605587415041,0,0,WellsFargo
4,953819605587415041,954351324648402944,2400707515,@WellsFargo #fuckwellsfargo,1178011,953819605587415041,0,0,WellsFargo
...,...,...,...,...,...,...,...,...,...
30709,658707029549293568,661588595866787840,57460933,@jpmorgan @BarbicanCentre @TheRSC I've got tic...,1155522630,658707029549293568,0,1,jpmorgan
30710,1074984368634966016,1075021118711435265,259232385,@jpmorgan #stop,1155522630,1074984368634966016,0,1,jpmorgan
30711,688242664693579776,688404765034856456,68163076,@DeeeeBlike @jpmorgan this needs to be address...,70063328,688242664693579776,0,1,jpmorgan
30712,919972865847545856,919981362718937091,257400981,@TracieFain @jpmorgan He should be fired ! #pig,355722698,919972865847545856,0,1,jpmorgan


In [111]:
#merge with existing tweet df
df_merged = df.merge(users, left_on='author_id', right_on='id', how='left')
df_merged = df_merged.drop(['name', 'description', 'created_at', 'id_y', 'username'], axis = 1)
df_merged = df_merged.rename(columns={"id_x":"id", "followers_count":"author_followers", "following_count":"author_following", "tweet_count":"author_tweets"})
df_merged

Unnamed: 0,conversation_id,id,author_id,text,in_reply_to_user_id,replied_to,attachment,label,relatedNGO,author_followers,author_following,author_tweets,username
0,1256647208256102400,1257392415062863877,1254487703334219776,@WellsFargo @jordanyebe Thats a bullshit phone...,1178011,1257349230248820741,0,0,WellsFargo,0.0,8.0,159.0,Ki41703898
1,1256647208256102400,1257350048414924801,342060815,@WellsFargo Well this weekend I tried to make ...,1178011,1257349230248820741,0,0,WellsFargo,1374.0,719.0,128221.0,JordanYebe
2,953819605587415041,954374335141003269,951483180346892288,@WellsFargo Wells Fargo deleted my other comme...,1178011,953819605587415041,0,0,WellsFargo,751.0,2203.0,7727.0,coinblockhead
3,953819605587415041,954374200994627584,951483180346892288,@WellsFargo Boycott Wells Fargo #buybitcoin,1178011,953819605587415041,0,0,WellsFargo,751.0,2203.0,7727.0,coinblockhead
4,953819605587415041,954351324648402944,2400707515,@WellsFargo #fuckwellsfargo,1178011,953819605587415041,0,0,WellsFargo,0.0,2.0,4.0,bas33dgood
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21889,658707029549293568,661588595866787840,57460933,@jpmorgan @BarbicanCentre @TheRSC I've got tic...,1155522630,658707029549293568,0,1,jpmorgan,12849.0,8997.0,31189.0,ZoeParamour
21890,1074984368634966016,1075021118711435265,259232385,@jpmorgan #stop,1155522630,1074984368634966016,0,1,jpmorgan,658.0,719.0,17369.0,arixcalorna
21891,688242664693579776,688404765034856456,68163076,@DeeeeBlike @jpmorgan this needs to be address...,70063328,688242664693579776,0,1,jpmorgan,486.0,540.0,62667.0,Mista_Ace
21892,919972865847545856,919981362718937091,257400981,@TracieFain @jpmorgan He should be fired ! #pig,355722698,919972865847545856,0,1,jpmorgan,19517.0,788.0,794.0,SanaH_Sport


## Process downloaded info of users in reply to whom the tweet was written

In [113]:
#get the user id whom the tweet was addressed to download data from twarc
replied_users = pd.DataFrame(df["in_reply_to_user_id"].unique())
replied_users.to_csv("Banks_conv/replied_users_ids.txt", index = False, header=False)

In [119]:
#Process the needed info to csv
df=pd.read_json("Banks_conv/replied_users.json",lines=True)
all_users=[]
public_columns=['followers_count', 'following_count','tweet_count']
additional_columns=['id','name', 'username', 'description','created_at']
for line in df['data']:
    for user in line:

        user_info=[]        
        for column in public_columns:
            try:
                user_info.append(user['public_metrics'][column])
            except:
                print(user)
        for column in additional_columns:
            user_info.append(user[column])

        all_users.append(user_info)

replied_users = pd.DataFrame(all_users,columns=public_columns+additional_columns)
replied_users = replied_users.drop_duplicates().copy()
replied_users = replied_users.drop_duplicates(["id"]).copy()
replied_users.to_csv("Banks_conv/replied_users.csv", index=False)

In [122]:
#merge with existing tweet df
df_merged = df_merged.merge(replied_users, left_on='in_reply_to_user_id', right_on='id', how='left')
df_merged = df_merged.drop(['name', 'description', 'created_at', 'id_y'], axis = 1)
df_merged = df_merged.rename(columns={"id_x":"id", "followers_count":"replied_followers", "following_count":"replied_following", "tweet_count":"replied_tweets", "username":"replied_name"})
df_merged

Unnamed: 0,conversation_id,id,author_id,text,in_reply_to_user_id,replied_to,attachment,label,relatedNGO,author_followers,author_following,author_tweets,username_x,replied_followers,replied_following,replied_tweets,username_y
0,1256647208256102400,1257392415062863877,1254487703334219776,@WellsFargo @jordanyebe Thats a bullshit phone...,1178011,1257349230248820741,0,0,WellsFargo,0.0,8.0,159.0,Ki41703898,347533.0,149.0,59749.0,WellsFargo
1,1256647208256102400,1257350048414924801,342060815,@WellsFargo Well this weekend I tried to make ...,1178011,1257349230248820741,0,0,WellsFargo,1374.0,719.0,128221.0,JordanYebe,347533.0,149.0,59749.0,WellsFargo
2,953819605587415041,954374335141003269,951483180346892288,@WellsFargo Wells Fargo deleted my other comme...,1178011,953819605587415041,0,0,WellsFargo,751.0,2203.0,7727.0,coinblockhead,347533.0,149.0,59749.0,WellsFargo
3,953819605587415041,954374200994627584,951483180346892288,@WellsFargo Boycott Wells Fargo #buybitcoin,1178011,953819605587415041,0,0,WellsFargo,751.0,2203.0,7727.0,coinblockhead,347533.0,149.0,59749.0,WellsFargo
4,953819605587415041,954351324648402944,2400707515,@WellsFargo #fuckwellsfargo,1178011,953819605587415041,0,0,WellsFargo,0.0,2.0,4.0,bas33dgood,347533.0,149.0,59749.0,WellsFargo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21889,658707029549293568,661588595866787840,57460933,@jpmorgan @BarbicanCentre @TheRSC I've got tic...,1155522630,658707029549293568,0,1,jpmorgan,12849.0,8997.0,31189.0,ZoeParamour,726641.0,49.0,8117.0,jpmorgan
21890,1074984368634966016,1075021118711435265,259232385,@jpmorgan #stop,1155522630,1074984368634966016,0,1,jpmorgan,658.0,719.0,17369.0,arixcalorna,726641.0,49.0,8117.0,jpmorgan
21891,688242664693579776,688404765034856456,68163076,@DeeeeBlike @jpmorgan this needs to be address...,70063328,688242664693579776,0,1,jpmorgan,486.0,540.0,62667.0,Mista_Ace,3684.0,3016.0,169325.0,DeeeeBlike
21892,919972865847545856,919981362718937091,257400981,@TracieFain @jpmorgan He should be fired ! #pig,355722698,919972865847545856,0,1,jpmorgan,19517.0,788.0,794.0,SanaH_Sport,31.0,370.0,588.0,TracieFain


In [124]:
#save the data
df_merged.to_csv("Banks_conv/labeled_with_users.csv")

## Add additional features

In [125]:
df = pd.read_csv("Banks_conv/labeled_with_users.csv", dtype='str', index_col= 0)
df.head(5)

Unnamed: 0,conversation_id,id,author_id,text,in_reply_to_user_id,replied_to,attachment,label,relatedNGO,author_followers,author_following,author_tweets,replied_followers,replied_following,replied_tweets,replied_name
0,1256647208256102400,1257392415062863877,1254487703334219776,@WellsFargo @jordanyebe Thats a bullshit phone...,1178011,1257349230248820741,0,0,WellsFargo,0.0,8.0,159.0,347533.0,149.0,59749.0,WellsFargo
1,1256647208256102400,1257350048414924801,342060815,@WellsFargo Well this weekend I tried to make ...,1178011,1257349230248820741,0,0,WellsFargo,1374.0,719.0,128221.0,347533.0,149.0,59749.0,WellsFargo
2,953819605587415041,954374335141003269,951483180346892288,@WellsFargo Wells Fargo deleted my other comme...,1178011,953819605587415041,0,0,WellsFargo,751.0,2203.0,7727.0,347533.0,149.0,59749.0,WellsFargo
3,953819605587415041,954374200994627584,951483180346892288,@WellsFargo Boycott Wells Fargo #buybitcoin,1178011,953819605587415041,0,0,WellsFargo,751.0,2203.0,7727.0,347533.0,149.0,59749.0,WellsFargo
4,953819605587415041,954351324648402944,2400707515,@WellsFargo #fuckwellsfargo,1178011,953819605587415041,0,0,WellsFargo,0.0,2.0,4.0,347533.0,149.0,59749.0,WellsFargo


In [None]:
#drop rows with uncorrectly processed label if such exist
df = df[df['label'].notna()]
df = df.reset_index(drop=True)

In [129]:
#tokenize the text
df["text_new"]=df["text"].apply(lambda x: [word for word in [re.sub(f"[{puncs}]","",token).lower() for token in x.split() if token[0]!='@'] if word not in stop_words+['rt','']])
#number of user mentions "@" in a text
df["num_mentions"]=df["text"].apply(lambda x: len([i for i in x.split() if i[0]=='@']))
#number of links in a text
df["num_links"]=df["text"].apply(lambda x: len([i for i in x.split() if 'http'in i]))
#number of words in original text
df["num_full_words"] = df.text.apply(lambda x: len(x.split(' ')))
#number of words in tokenized text
df["num_tokenized_words"] = df.text_new.apply(lambda x: len(x))
#number of hashtags in text
df["num_hashtags"]=df.text.apply(lambda x: len([i for i in x.split() if i[0]=='#']))
#number of related_NGO mentions in a tweet
df["num_NGO_mentions"]=df.apply(lambda x: x.text.split(" ").count(f"@{x.relatedNGO}"), axis=1)
#number of exclamation marks in text
df["num_exclamation"]=df.apply(lambda x: x.text.count("!"), axis=1)
#number of question marks in a tweet
df["num_question"]=df.apply(lambda x: x.text.count("?"), axis=1)
#if the tweet was a retweeted one
df["retweet"] = df.text.apply(lambda x: 1 if x[:2]=="RT" else 0)
#number of characters in original text
df["num_characters"] = df.text.apply(lambda x: len(x))

df

Unnamed: 0,conversation_id,id,author_id,text,in_reply_to_user_id,replied_to,attachment,label,relatedNGO,author_followers,...,replied_name,text_new,num_mentions,num_links,num_full_words,num_tokenized_words,num_hashtags,num_NGO_mentions,num_exclamation,num_question
0,1256647208256102400,1257392415062863877,1254487703334219776,@WellsFargo @jordanyebe Thats a bullshit phone...,1178011,1257349230248820741,0,0,WellsFargo,0.0,...,WellsFargo,"[thats, bullshit, phone, number, hung, 30, min...",2,0,21,12,1,1,0,0
1,1256647208256102400,1257350048414924801,342060815,@WellsFargo Well this weekend I tried to make ...,1178011,1257349230248820741,0,0,WellsFargo,1374.0,...,WellsFargo,"[well, weekend, tried, make, 550, purchase, de...",1,0,39,19,0,1,0,1
2,953819605587415041,954374335141003269,951483180346892288,@WellsFargo Wells Fargo deleted my other comme...,1178011,953819605587415041,0,0,WellsFargo,751.0,...,WellsFargo,"[wells, fargo, deleted, comments, anyways, sca...",1,0,20,9,0,1,0,0
3,953819605587415041,954374200994627584,951483180346892288,@WellsFargo Boycott Wells Fargo #buybitcoin,1178011,953819605587415041,0,0,WellsFargo,751.0,...,WellsFargo,"[boycott, wells, fargo, buybitcoin]",1,0,5,4,1,1,0,0
4,953819605587415041,954351324648402944,2400707515,@WellsFargo #fuckwellsfargo,1178011,953819605587415041,0,0,WellsFargo,0.0,...,WellsFargo,[fuckwellsfargo],1,0,2,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21889,658707029549293568,661588595866787840,57460933,@jpmorgan @BarbicanCentre @TheRSC I've got tic...,1155522630,658707029549293568,0,1,jpmorgan,12849.0,...,jpmorgan,"[ive, got, tickets, next, tuesday]",3,0,9,5,0,1,1,0
21890,1074984368634966016,1075021118711435265,259232385,@jpmorgan #stop,1155522630,1074984368634966016,0,1,jpmorgan,658.0,...,jpmorgan,[stop],1,0,2,1,1,1,0,0
21891,688242664693579776,688404765034856456,68163076,@DeeeeBlike @jpmorgan this needs to be address...,70063328,688242664693579776,0,1,jpmorgan,486.0,...,DeeeeBlike,"[needs, addressed, asap]",2,0,8,3,0,1,0,0
21892,919972865847545856,919981362718937091,257400981,@TracieFain @jpmorgan He should be fired ! #pig,355722698,919972865847545856,0,1,jpmorgan,19517.0,...,TracieFain,"[fired, pig]",2,0,8,2,1,1,1,0


In [None]:
#save the data
df.to_csv("Banks_conv/Banks_extended_features.csv")

## Sentiment analysis of tweets

In [134]:
#set hugging face model
model = "cardiffnlp/twitter-roberta-base-sentiment-latest"

#set the pipeline
sentiment_pipeline = pipeline("sentiment-analysis", model = model, tokenizer = model)

#check output for one of the tweets
sentiment_pipeline(df["text"][0])[0]["label"]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


'negative'

In [135]:
#get sentiment while treating errors
def set_sentimet(text):
    try:
        sentiment = sentiment_pipeline(text)[0]["label"]
    except:
        sentiment = "Error"
    return sentiment

In [136]:
#add sentiment feature to the dataset
df["sentiment"] = df.text.apply(lambda x: set_sentimet(x))
df = df[df.sentiment!="Error"]
df

Unnamed: 0,conversation_id,id,author_id,text,in_reply_to_user_id,replied_to,attachment,label,relatedNGO,author_followers,...,text_new,num_mentions,num_links,num_full_words,num_tokenized_words,num_hashtags,num_NGO_mentions,num_exclamation,num_question,sentiment
0,1256647208256102400,1257392415062863877,1254487703334219776,@WellsFargo @jordanyebe Thats a bullshit phone...,1178011,1257349230248820741,0,0,WellsFargo,0.0,...,"[thats, bullshit, phone, number, hung, 30, min...",2,0,21,12,1,1,0,0,negative
1,1256647208256102400,1257350048414924801,342060815,@WellsFargo Well this weekend I tried to make ...,1178011,1257349230248820741,0,0,WellsFargo,1374.0,...,"[well, weekend, tried, make, 550, purchase, de...",1,0,39,19,0,1,0,1,negative
2,953819605587415041,954374335141003269,951483180346892288,@WellsFargo Wells Fargo deleted my other comme...,1178011,953819605587415041,0,0,WellsFargo,751.0,...,"[wells, fargo, deleted, comments, anyways, sca...",1,0,20,9,0,1,0,0,negative
3,953819605587415041,954374200994627584,951483180346892288,@WellsFargo Boycott Wells Fargo #buybitcoin,1178011,953819605587415041,0,0,WellsFargo,751.0,...,"[boycott, wells, fargo, buybitcoin]",1,0,5,4,1,1,0,0,negative
4,953819605587415041,954351324648402944,2400707515,@WellsFargo #fuckwellsfargo,1178011,953819605587415041,0,0,WellsFargo,0.0,...,[fuckwellsfargo],1,0,2,1,1,1,0,0,negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21889,658707029549293568,661588595866787840,57460933,@jpmorgan @BarbicanCentre @TheRSC I've got tic...,1155522630,658707029549293568,0,1,jpmorgan,12849.0,...,"[ive, got, tickets, next, tuesday]",3,0,9,5,0,1,1,0,positive
21890,1074984368634966016,1075021118711435265,259232385,@jpmorgan #stop,1155522630,1074984368634966016,0,1,jpmorgan,658.0,...,[stop],1,0,2,1,1,1,0,0,negative
21891,688242664693579776,688404765034856456,68163076,@DeeeeBlike @jpmorgan this needs to be address...,70063328,688242664693579776,0,1,jpmorgan,486.0,...,"[needs, addressed, asap]",2,0,8,3,0,1,0,0,negative
21892,919972865847545856,919981362718937091,257400981,@TracieFain @jpmorgan He should be fired ! #pig,355722698,919972865847545856,0,1,jpmorgan,19517.0,...,"[fired, pig]",2,0,8,2,1,1,1,0,negative


In [137]:
#save the data
df.to_csv("Banks_conv/Banks_extended_features.csv")

## Topic modelling for tweets

In [7]:
#read the data
df = pd.read_csv("Banks_conv/Banks_extended_features.csv", dtype="str", index_col=0)
df.head(5)

Unnamed: 0,conversation_id,id,author_id,text,in_reply_to_user_id,replied_to,attachment,label,relatedNGO,author_followers,...,num_links,num_full_words,num_tokenized_words,num_hashtags,num_NGO_mentions,num_exclamation,num_question,sentiment,retweet,num_characters
0,1256647208256102400,1257392415062863877,1254487703334219776,@WellsFargo @jordanyebe Thats a bullshit phone...,1178011,1257349230248820741,0,0,WellsFargo,0.0,...,0,21,12,1,1,0,0,negative,0,138
1,1256647208256102400,1257350048414924801,342060815,@WellsFargo Well this weekend I tried to make ...,1178011,1257349230248820741,0,0,WellsFargo,1374.0,...,0,39,19,0,1,0,1,negative,0,188
2,953819605587415041,954374335141003269,951483180346892288,@WellsFargo Wells Fargo deleted my other comme...,1178011,953819605587415041,0,0,WellsFargo,751.0,...,0,20,9,0,1,0,0,negative,0,129
3,953819605587415041,954374200994627584,951483180346892288,@WellsFargo Boycott Wells Fargo #buybitcoin,1178011,953819605587415041,0,0,WellsFargo,751.0,...,0,5,4,1,1,0,0,negative,0,43
4,953819605587415041,954351324648402944,2400707515,@WellsFargo #fuckwellsfargo,1178011,953819605587415041,0,0,WellsFargo,0.0,...,0,2,1,1,1,0,0,negative,0,27


In [8]:
my_stopwords = nltk.corpus.stopwords.words('english')
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'

#process the tweet text
def remove_links(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'bit.ly/\S+', '', text)
    text = text.strip('[link]')
    return text

def remove_users(text):
    text = re.sub('(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', '', text)
    text = re.sub('(@[A-Za-z]+[A-Za-z0-9-_]+)', '', text)
    return text

# further cleaning
def process_tweet(tweet, bigrams=False):
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = tweet.lower()
    #strip punctuation
    tweet = re.sub('['+my_punctuation + ']+', ' ', tweet)
    #remove double spacing
    tweet = re.sub('\s+', ' ', tweet)
    #remove numbers
    tweet = re.sub('([0-9]+)', '', tweet)
    #remove stopwords
    tweet_token_list = [word for word in tweet.split(' ')
                            if word not in my_stopwords]

    #apply word rooter
    tweet_token_list = [word_rooter(word) if '#' not in word else word
                        for word in tweet_token_list]

    tweet = ' '.join(tweet_token_list)
    return tweet

#do the processing for topic modeling prep
df['processed_tweet'] = df.text.apply(process_tweet)
df.head(5)

Unnamed: 0,conversation_id,id,author_id,text,in_reply_to_user_id,replied_to,attachment,label,relatedNGO,author_followers,...,num_full_words,num_tokenized_words,num_hashtags,num_NGO_mentions,num_exclamation,num_question,sentiment,retweet,num_characters,processed_tweet
0,1256647208256102400,1257392415062863877,1254487703334219776,@WellsFargo @jordanyebe Thats a bullshit phone...,1178011,1257349230248820741,0,0,WellsFargo,0.0,...,21,12,1,1,0,0,negative,0,138,that bullshit phone number hung min wait cli...
1,1256647208256102400,1257350048414924801,342060815,@WellsFargo Well this weekend I tried to make ...,1178011,1257349230248820741,0,0,WellsFargo,1374.0,...,39,19,0,1,0,1,negative,0,188,well weekend tri make purchas debit card kep...
2,953819605587415041,954374335141003269,951483180346892288,@WellsFargo Wells Fargo deleted my other comme...,1178011,953819605587415041,0,0,WellsFargo,751.0,...,20,9,0,1,0,0,negative,0,129,well fargo delet comment anyway scammer save ...
3,953819605587415041,954374200994627584,951483180346892288,@WellsFargo Boycott Wells Fargo #buybitcoin,1178011,953819605587415041,0,0,WellsFargo,751.0,...,5,4,1,1,0,0,negative,0,43,boycott well fargo #buybitco
4,953819605587415041,954351324648402944,2400707515,@WellsFargo #fuckwellsfargo,1178011,953819605587415041,0,0,WellsFargo,0.0,...,2,1,1,1,0,0,negative,0,27,#fuckwellsfargo


In [9]:
#assign vectorizer, which get rids of words that are more than in 90% of the tweets and those that are in less than 25.
vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern='\w+|\$[\d\.]+|\S+')

#apply transformation
tf = vectorizer.fit_transform(df['processed_tweet']).toarray()

#get feature names
tf_feature_names = vectorizer.get_feature_names_out()

#assign model
model = LatentDirichletAllocation(n_components=6, random_state=0)

In [10]:
#check that tf was done correctly in terms of tweet dimensions
tf.shape

(21892, 1301)

In [11]:
#fit the model
model.fit(tf)

In [12]:
#Display top n words from topics
def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)

display_topics(model, tf_feature_names, no_top_words=10)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights
0,work,583.3,account,1150.2,thank,959.9,bank,1414.2,stop,1090.4,line,432.2
1,compani,531.5,well,926.2,morgan,489.1,amp,608.8,cooper,1011.2,would,370.0
2,alreadi,424.4,bank,841.2,jp,408.2,busi,488.3,russia,972.1,die,278.4
3,continu,411.2,’t,794.2,good,404.0,money,392.4,russian,822.2,fund,258.3
4,left,271.2,money,793.6,one,298.1,invest,371.2,aggressor,534.1,women,256.0
5,de,262.7,get,759.0,u,276.6,like,353.0,ukrain,532.2,much,233.9
6,sponsor,221.8,’,677.8,pleas,253.2,trump,327.2,financ,447.3,caus,228.1
7,hundr,207.2,custom,666.5,million,240.3,go,309.7,#stopwar,373.2,power,224.2
8,great,197.6,fargo,654.2,help,225.4,deutsch,285.2,tax,366.9,love,220.3
9,armi,180.3,time,489.9,dollar,218.3,market,253.3,money,364.1,fire,215.6


In [13]:
#assign topics to tweets
topic_probabilities = model.transform(tf)
topic_list = []
for i in topic_probabilities:
   topic_list.append(str(i.argmax()))

topic_df = pd.DataFrame(topic_list, columns=['topic'])

df["topic"] = topic_df["topic"]
df

Unnamed: 0,conversation_id,id,author_id,text,in_reply_to_user_id,replied_to,attachment,label,relatedNGO,author_followers,...,num_tokenized_words,num_hashtags,num_NGO_mentions,num_exclamation,num_question,sentiment,retweet,num_characters,processed_tweet,topic
0,1256647208256102400,1257392415062863877,1254487703334219776,@WellsFargo @jordanyebe Thats a bullshit phone...,1178011,1257349230248820741,0,0,WellsFargo,0.0,...,12,1,1,0,0,negative,0,138,that bullshit phone number hung min wait cli...,1
1,1256647208256102400,1257350048414924801,342060815,@WellsFargo Well this weekend I tried to make ...,1178011,1257349230248820741,0,0,WellsFargo,1374.0,...,19,0,1,0,1,negative,0,188,well weekend tri make purchas debit card kep...,1
2,953819605587415041,954374335141003269,951483180346892288,@WellsFargo Wells Fargo deleted my other comme...,1178011,953819605587415041,0,0,WellsFargo,751.0,...,9,0,1,0,0,negative,0,129,well fargo delet comment anyway scammer save ...,1
3,953819605587415041,954374200994627584,951483180346892288,@WellsFargo Boycott Wells Fargo #buybitcoin,1178011,953819605587415041,0,0,WellsFargo,751.0,...,4,1,1,0,0,negative,0,43,boycott well fargo #buybitco,1
4,953819605587415041,954351324648402944,2400707515,@WellsFargo #fuckwellsfargo,1178011,953819605587415041,0,0,WellsFargo,0.0,...,1,1,1,0,0,negative,0,27,#fuckwellsfargo,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21889,658707029549293568,661588595866787840,57460933,@jpmorgan @BarbicanCentre @TheRSC I've got tic...,1155522630,658707029549293568,0,1,jpmorgan,12849.0,...,5,0,1,1,0,positive,0,68,got ticket next tuesday,1
21890,1074984368634966016,1075021118711435265,259232385,@jpmorgan #stop,1155522630,1074984368634966016,0,1,jpmorgan,658.0,...,1,1,1,0,0,negative,0,15,#stop,5
21891,688242664693579776,688404765034856456,68163076,@DeeeeBlike @jpmorgan this needs to be address...,70063328,688242664693579776,0,1,jpmorgan,486.0,...,3,0,1,0,0,negative,0,54,need address asap,0
21892,919972865847545856,919981362718937091,257400981,@TracieFain @jpmorgan He should be fired ! #pig,355722698,919972865847545856,0,1,jpmorgan,19517.0,...,2,1,1,1,0,negative,0,47,fire #pig,


In [14]:
#Drop processed tweet column
df = df.drop(["processed_tweet"], axis=1)

In [15]:
#save the data
df.to_csv("Banks_conv/Banks_extended_features.csv")