In [1]:
import twarc
import os
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
import requests
from transformers import pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

## Topic modelling for tweets

In [178]:
#read the data
df = pd.read_csv("fix_labeled/extended_features.csv", dtype="str", index_col=0)
df.head(5)

Unnamed: 0,conversation_id,id,author_id,text,in_reply_to_user_id,replied_to,attachment,label,relatedNGO,author_followers,...,num_tokenized_words,num_hashtags,num_NGO_mentions,num_exclamation,num_question,has_question,sentiment,retweet,topic,num_characters
0,128892368731643905,129039727990026241,19279287,RT @Habitat_org For anyone in the Joplin area ...,33898911,128892368731643905,0,0,Habitat_org,1505,...,11,0,1,0,0,0,neutral,1,4,137
1,1513883133417979904,1514283971437383683,1461435318574493712,@Habitat_org @MrDrewScott @JonathanScott 👏👏👏👏👏,33898911,1513883133417979904,0,0,Habitat_org,1316,...,1,0,1,0,0,0,positive,0,0,46
2,1150423207129886720,1150806881885130752,889333376,@Habitat_org @HabitatPL 👍🙂,33898911,1150423207129886720,0,0,Habitat_org,69,...,1,0,1,0,0,0,positive,0,3,26
3,1150423207129886720,1150469798335266817,3293184712,@Habitat_org @HabitatPL So thankful for Habita...,33898911,1150423207129886720,0,0,Habitat_org,290,...,16,0,1,0,0,0,positive,0,4,202
4,1370041829408784384,1370457258673401860,3821162059,@Habitat_org Wow! This would be awesome for th...,33898911,1370041829408784384,0,0,Habitat_org,1142,...,9,0,1,1,0,0,positive,0,5,114


In [179]:
my_stopwords = nltk.corpus.stopwords.words('english')
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'

#process the tweet text
def remove_links(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'bit.ly/\S+', '', text)
    text = text.strip('[link]')
    return text

def remove_users(text):
    text = re.sub('(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', '', text)
    text = re.sub('(@[A-Za-z]+[A-Za-z0-9-_]+)', '', text)
    return text

# further cleaning
def process_tweet(tweet, bigrams=False):
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = tweet.lower()
    #strip punctuation
    tweet = re.sub('['+my_punctuation + ']+', ' ', tweet)
    #remove double spacing
    tweet = re.sub('\s+', ' ', tweet)
    #remove numbers
    tweet = re.sub('([0-9]+)', '', tweet)
    #remove stopwords
    tweet_token_list = [word for word in tweet.split(' ')
                            if word not in my_stopwords]

    #apply word rooter
    tweet_token_list = [word_rooter(word) if '#' not in word else word
                        for word in tweet_token_list]

    tweet = ' '.join(tweet_token_list)
    return tweet

#do the processing for topic modeling prep
df['processed_tweet'] = df.text.apply(process_tweet)
df

  text = re.sub('(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', '', text)
  tweet = re.sub('\s+', ' ', tweet)


Unnamed: 0,conversation_id,id,author_id,text,in_reply_to_user_id,replied_to,attachment,label,relatedNGO,author_followers,...,num_hashtags,num_NGO_mentions,num_exclamation,num_question,has_question,sentiment,retweet,topic,num_characters,processed_tweet
0,128892368731643905,129039727990026241,19279287,RT @Habitat_org For anyone in the Joplin area ...,33898911,128892368731643905,0,0,Habitat_org,1505,...,0,1,0,0,0,neutral,1,4,137,anyon joplin area habitat human need volunt h...
1,1513883133417979904,1514283971437383683,1461435318574493712,@Habitat_org @MrDrewScott @JonathanScott 👏👏👏👏👏,33898911,1513883133417979904,0,0,Habitat_org,1316,...,0,1,0,0,0,positive,0,0,46,👏👏👏👏👏
2,1150423207129886720,1150806881885130752,889333376,@Habitat_org @HabitatPL 👍🙂,33898911,1150423207129886720,0,0,Habitat_org,69,...,0,1,0,0,0,positive,0,3,26,👍🙂
3,1150423207129886720,1150469798335266817,3293184712,@Habitat_org @HabitatPL So thankful for Habita...,33898911,1150423207129886720,0,0,Habitat_org,290,...,0,1,0,0,0,positive,0,4,202,thank habitat partner work elimin pipelin fos...
4,1370041829408784384,1370457258673401860,3821162059,@Habitat_org Wow! This would be awesome for th...,33898911,1370041829408784384,0,0,Habitat_org,1142,...,0,1,1,0,0,positive,0,5,114,wow would awesom us live multipl chemic sensi...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116404,1093570142888435713,1093570501375676416,474681580,"@WorldVision You go girl💃💃💃,there's alot of pe...",14086764,1093570142888435713,0,1,WorldVision,725,...,0,1,2,0,0,positive,0,2,68,go girl💃💃💃 alot peopl root
116405,1062866805545492481,1063045758188969984,958755194212405248,@ericmetaxas @obianuju @WorldVision @PatriciaH...,105901883,1062866805545492481,0,1,WorldVision,406,...,0,1,2,0,0,negative,0,0,105,wow lost total respect world vision aw
116406,931869631702331392,931881262201102336,928471317535514624,@WorldVision Arthur's Song Charity's is always...,14086764,931869631702331392,0,1,WorldVision,177,...,0,1,0,0,0,positive,0,4,114,arthur song chariti alway world peopl great p...
116407,1067115815001096193,1067207873061666821,196878226,@WorldVision 1 Peter 5:7 Cast all your anxiet...,14086764,1067115815001096193,0,1,WorldVision,12,...,0,1,0,0,0,positive,0,4,79,peter cast anxieti care


In [180]:
#assign vectorizer, which get rids of words that are more than in 90% of the tweets and those that are in less than 25.
vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern='\w+|\$[\d\.]+|\S+')

#apply transformation
tf = vectorizer.fit_transform(df['processed_tweet']).toarray()

#get feature names
tf_feature_names = vectorizer.get_feature_names_out()

#assign model
model = LatentDirichletAllocation(n_components=6, random_state=0)

  vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern='\w+|\$[\d\.]+|\S+')


In [181]:
#check that tf was done correctly in terms of tweet dimensions
tf.shape

(116409, 3743)

In [185]:
#fit the model
model.fit(tf)

In [183]:
#Display top n words from topics
def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)

display_topics(model, tf_feature_names, no_top_words=10)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights
0,scout,4307.1,red,2608.0,donat,8166.0,polic,18329.2,thank,4230.2,pleas,3655.8
1,boy,2574.1,cross,2470.2,money,3621.1,iranian,18236.2,god,3191.9,thank,3524.2
2,girl,2067.6,help,1996.3,blood,3382.0,use,11021.9,love,2985.7,amp,2777.7
3,church,2021.0,peopl,1411.9,red,3074.4,ambul,10438.2,work,2816.8,help,2043.0
4,’t,1770.2,po,1216.2,cross,2966.1,voic,9892.2,great,2813.7,refuge,1485.2
5,i,1532.0,war,948.2,peopl,2960.7,peopl,9831.2,help,2431.8,share,1386.1
6,presid,1509.2,kill,892.4,help,2811.5,protest,9674.2,good,2150.7,children,1321.1
7,’,1469.2,need,827.8,give,2468.8,transport,9542.2,peopl,1851.0,need,1226.2
8,trump,1409.2,الله,798.2,get,2034.0,station,8976.2,bless,1681.2,wait,1145.4
9,amp,1395.2,sa,718.2,go,1845.5,#مهسا,8972.2,need,1518.4,year,1007.5


In [184]:
#assign topics to tweets
topic_probabilities = model.transform(tf)
topic_list = []
for i in topic_probabilities:
   topic_list.append(str(i.argmax()))

topic_df = pd.DataFrame(topic_list, columns=['topic'])

df["topic"] = topic_df["topic"]
df


Unnamed: 0,conversation_id,id,author_id,text,in_reply_to_user_id,replied_to,attachment,label,relatedNGO,author_followers,...,num_hashtags,num_NGO_mentions,num_exclamation,num_question,has_question,sentiment,retweet,topic,num_characters,processed_tweet
0,128892368731643905,129039727990026241,19279287,RT @Habitat_org For anyone in the Joplin area ...,33898911,128892368731643905,0,0,Habitat_org,1505,...,0,1,0,0,0,neutral,1,4,137,anyon joplin area habitat human need volunt h...
1,1513883133417979904,1514283971437383683,1461435318574493712,@Habitat_org @MrDrewScott @JonathanScott 👏👏👏👏👏,33898911,1513883133417979904,0,0,Habitat_org,1316,...,0,1,0,0,0,positive,0,0,46,👏👏👏👏👏
2,1150423207129886720,1150806881885130752,889333376,@Habitat_org @HabitatPL 👍🙂,33898911,1150423207129886720,0,0,Habitat_org,69,...,0,1,0,0,0,positive,0,3,26,👍🙂
3,1150423207129886720,1150469798335266817,3293184712,@Habitat_org @HabitatPL So thankful for Habita...,33898911,1150423207129886720,0,0,Habitat_org,290,...,0,1,0,0,0,positive,0,4,202,thank habitat partner work elimin pipelin fos...
4,1370041829408784384,1370457258673401860,3821162059,@Habitat_org Wow! This would be awesome for th...,33898911,1370041829408784384,0,0,Habitat_org,1142,...,0,1,1,0,0,positive,0,5,114,wow would awesom us live multipl chemic sensi...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116404,1093570142888435713,1093570501375676416,474681580,"@WorldVision You go girl💃💃💃,there's alot of pe...",14086764,1093570142888435713,0,1,WorldVision,725,...,0,1,2,0,0,positive,0,2,68,go girl💃💃💃 alot peopl root
116405,1062866805545492481,1063045758188969984,958755194212405248,@ericmetaxas @obianuju @WorldVision @PatriciaH...,105901883,1062866805545492481,0,1,WorldVision,406,...,0,1,2,0,0,negative,0,0,105,wow lost total respect world vision aw
116406,931869631702331392,931881262201102336,928471317535514624,@WorldVision Arthur's Song Charity's is always...,14086764,931869631702331392,0,1,WorldVision,177,...,0,1,0,0,0,positive,0,4,114,arthur song chariti alway world peopl great p...
116407,1067115815001096193,1067207873061666821,196878226,@WorldVision 1 Peter 5:7 Cast all your anxiet...,14086764,1067115815001096193,0,1,WorldVision,12,...,0,1,0,0,0,positive,0,5,79,peter cast anxieti care


In [148]:
df.to_csv("fix_labeled/extended_features.csv")