In [2]:
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import html
import preprocessor as pp
import string
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud

from nltk.probability import FreqDist

from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/manasip/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/manasip/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [45]:
##Import file
df_gen = pd.read_csv("df_gender.csv")

In [46]:
df_gen.head(3)

Unnamed: 0,Tweet Id,Text,Username,Noisy,Location,Name,User_Id,Year,sentiment,sentimentclass,sentimentnum,Gender
0,9.472559e+17,I just read ur twitter comments on ur acct u h...,kathy_tkat,abortion,"Massachusetts, USA",Kathy,7.26653e+17,2017,"{'neg': 0.046, 'neu': 0.86, 'pos': 0.093, 'com...",positive,2,female
1,9.472487e+17,I live in a country where abortion is illegal ...,belindacree,abortion,Northern Ireland,Belinda,139375400.0,2017,"{'neg': 0.27, 'neu': 0.669, 'pos': 0.061, 'com...",negative,1,female
2,9.472429e+17,If the administration manages to outlaw aborti...,Bouteloua_spp,abortion,None of your god damn business,Michelle,205493100.0,2017,"{'neg': 0.152, 'neu': 0.848, 'pos': 0.0, 'comp...",negative,1,female


In [47]:
df_gen.rename(columns={"Tweet Id": "id", "Text": "tweet", "Username": "user", \
                       "Gender":"gender", "Noisy":"noisy"}, \
                   inplace=True)

In [48]:
##Sanity check
print(df_gen.id.nunique(), df_gen.shape[0])

8584 8584


In [49]:
df_gen.drop_duplicates(subset=['id'], inplace=True)

In [50]:
#Sanity check
df_gen.shape

(8584, 12)

------------Pre-processing tweets------------

In [51]:
df_gen["clean_tweet"] = df_gen.tweet

In [52]:
#Step 1: Decode html escape characters: &amp &quote etc 
df_gen.clean_tweet = df_gen.clean_tweet.apply(lambda x: re.sub(r'&[a-z]+;', '', x))

In [53]:
#Sanity check
df_gen['check'] = df_gen.clean_tweet.str.contains("&", case=False, na=False).astype(int) 
# Checked for &gt, &amp &#62 &quote
print(sum(df_gen.check))

0


In [54]:
#Step 2: Remove @mentions
df_gen.clean_tweet = df_gen.clean_tweet.apply(lambda x: re.sub('@[^\s]+','',x))
df_gen.clean_tweet = df_gen.clean_tweet.apply(lambda x: re.sub('@[^\w]+','',x))
df_gen.clean_tweet = df_gen.clean_tweet.apply(lambda x: re.sub('@','',x))

In [55]:
#Sanity check
df_gen.check = df_gen.clean_tweet.str.contains("@", case=False, na=False).astype(int) 
print(sum(df_gen.check))

0


In [56]:
#Step 3 Remove \n \t \r
df_gen.clean_tweet = df_gen.clean_tweet.apply(lambda x: x.replace('\n', '').replace('\t', '').replace('\r', ''))

In [57]:
#Sanity check
df_gen.check = df_gen.clean_tweet.str.contains("\n", case=False, na=False).astype(int) 
print('n', sum(df_gen.check))

df_gen.check = df_gen.clean_tweet.str.contains("\t", case=False, na=False).astype(int) 
print('t', sum(df_gen.check))

df_gen.check = df_gen.clean_tweet.str.contains("\r", case=False, na=False).astype(int) 
print('r', sum(df_gen.check))

n 0
t 0
r 0


In [58]:
#Step 4: Tweet-preprocessor to remove, urls, emojis, hashtags, reserved words, mentions (if any), smileys
df_gen.clean_tweet = df_gen.clean_tweet.apply(lambda x: pp.clean(x))

In [59]:
#Sanity check
df_gen.check = df_gen.clean_tweet.str.contains("#", case=False, na=False).astype(int) 
print("check #",sum(df_gen.check))
df_gen.check = df_gen.clean_tweet.str.contains("http", case=False, na=False).astype(int) 
print("check http",sum(df_gen.check))
df_gen.check = df_gen.clean_tweet.str.contains("http", case=False, na=False).astype(int) 
print("check www",sum(df_gen.check))

check # 0
check http 0
check www 0


In [60]:
#Step 5: Remove Placeholders
df_gen.clean_tweet = df_gen.clean_tweet.apply(lambda x: re.sub(r'{link}', '', x))
df_gen.clean_tweet = df_gen.clean_tweet.apply(lambda x: re.sub(r"\[video\]", '', x))

In [61]:
#Sanity check
df_gen.check = df_gen.clean_tweet.str.contains("{link}", case=False, na=False).astype(int) 
print("check link",sum(df_gen.check))
df_gen.check = df_gen.clean_tweet.str.contains("\[video\]", case=False, na=False).astype(int) 
print("check video",sum(df_gen.check))

check link 0
check video 0


In [62]:
#Step 6: Remove punctuations, digits
#Create helper function
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

def remove_digits(text):
    for digit in string.digits:
        text = text.replace(digit, '')
    return text

df_gen.clean_tweet = df_gen.clean_tweet.apply(remove_punctuations)

df_gen.clean_tweet = df_gen.clean_tweet.apply(remove_digits)

In [63]:
#Sanity check
df_gen.check = df_gen.clean_tweet.str.contains("\!", case=False, na=False).astype(int)
print("check !",sum(df_gen.check))
df_gen.check = df_gen.clean_tweet.str.contains("\?", case=False, na=False).astype(int)
print("check ?",sum(df_gen.check))
df_gen.check = df_gen.clean_tweet.str.contains("\.", case=False, na=False).astype(int)
print("check .",sum(df_gen.check))

df_gen.check = df_gen.clean_tweet.str.contains("1", case=False, na=False).astype(int)
print("check 1",sum(df_gen.check))

check ! 0
check ? 0
check . 0
check 1 0


In [64]:
#Step 7 Lowercase before tokenization
df_gen.clean_tweet = df_gen.clean_tweet.str.lower()

In [65]:
#Step 8 Tokenization
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)

In [66]:
df_gen["tok_tweet"] = df_gen.clean_tweet.apply(lambda x: tknzr.tokenize(x))

In [67]:
#Step 8 Remove stopwords 
#Create helper function

stopword = nltk.corpus.stopwords.words('english')

def remove_stopwords(text):
    text = [word for word in text if word not in stopword]
    return text

df_gen.tok_tweet = df_gen.tok_tweet.apply(lambda x: remove_stopwords(x))

In [68]:
#Step 9 Lemmatization
#Create helper function

lem = nltk.WordNetLemmatizer()

def lemmat(text):
    text = [lem.lemmatize(word) for word in text]
    return text

df_gen.tok_tweet = df_gen.tok_tweet.apply(lambda x: lemmat(x))

In [69]:
#Drop words less than 3 words
tokens3 = df_gen[df_gen.tok_tweet.apply(lambda x: len(x) <= 3)].index

In [70]:
df_gen.drop(tokens3, inplace = True)
df_gen.reset_index(inplace=True)

In [71]:
all_words = [word for tokens in df_gen.tok_tweet for word in tokens]
tweet_lengths = [len(tokens) for tokens in df_gen.tok_tweet]
vocab = sorted(list(set(all_words)))

print('{} tokens total, with a vocabulary size of {}'.format(len(all_words), len(vocab)))
print('Max tweet length is {}'.format(max(tweet_lengths)))

108246 tokens total, with a vocabulary size of 10996
Max tweet length is 39


In [72]:
df_gen.head()

Unnamed: 0,index,id,tweet,user,noisy,Location,Name,User_Id,Year,sentiment,sentimentclass,sentimentnum,gender,clean_tweet,check,tok_tweet
0,0,9.472559e+17,I just read ur twitter comments on ur acct u h...,kathy_tkat,abortion,"Massachusetts, USA",Kathy,7.26653e+17,2017,"{'neg': 0.046, 'neu': 0.86, 'pos': 0.093, 'com...",positive,2,female,i just read ur twitter comments on ur acct u h...,0,"[read, ur, twitter, comment, ur, acct, u, huge..."
1,1,9.472487e+17,I live in a country where abortion is illegal ...,belindacree,abortion,Northern Ireland,Belinda,139375400.0,2017,"{'neg': 0.27, 'neu': 0.669, 'pos': 0.061, 'com...",negative,1,female,i live in a country where abortion is illegal ...,0,"[live, country, abortion, illegal, woman, trag..."
2,2,9.472429e+17,If the administration manages to outlaw aborti...,Bouteloua_spp,abortion,None of your god damn business,Michelle,205493100.0,2017,"{'neg': 0.152, 'neu': 0.848, 'pos': 0.0, 'comp...",negative,1,female,if the administration manages to outlaw aborti...,0,"[administration, manages, outlaw, abortion, go..."
3,3,9.472395e+17,I dont put my hat in the abortion discussion m...,DerekTheBard,abortion,"Elmira, NY",Derek,1539154000.0,2017,"{'neg': 0.238, 'neu': 0.71, 'pos': 0.051, 'com...",negative,1,male,i dont put my hat in the abortion discussion m...,0,"[dont, put, hat, abortion, discussion, much, t..."
4,4,9.4723e+17,To those saying Life begins at conception I sa...,Deborahtaxi,abortion,"New York, USA",Debbie,143478500.0,2017,"{'neg': 0.044, 'neu': 0.785, 'pos': 0.17, 'com...",positive,2,female,to those saying life begins at conception i sa...,0,"[saying, life, begin, conception, say, take, c..."


-------------LDA Modelling--------------------

In [78]:
#Create 4 dataframes
#Profile spilt by male and female
df_lf = df_gen[(df_gen.noisy=="prolife") & (df_gen.gender=="female")]
df_lm = df_gen[(df_gen.noisy=="prolife") & (df_gen.gender=="male")]

#Prochoice spilt by male and female
df_cf = df_gen[(df_gen.noisy=="prochoice") & (df_gen.gender=="female")]
df_cm = df_gen[(df_gen.noisy=="prochoice") & (df_gen.gender=="male")]

In [79]:
##Sanity check
print("prolife female: ", "gender:", df_lf.gender.unique(), "label:", df_lf.noisy.unique())
print("prolife male: ", "gender:", df_lm.gender.unique(), "label:", df_lm.noisy.unique())
print("prochoice female: ", "gender:", df_cf.gender.unique(), "label:", df_cf.noisy.unique())
print("prochoice male: ", "gender:", df_cf.gender.unique(), "label:", df_cf.noisy.unique())

prolife female:  gender: ['female'] label: ['prolife']
prolife male:  gender: ['male'] label: ['prolife']
prochoice female:  gender: ['female'] label: ['prochoice']
prochoice male:  gender: ['female'] label: ['prochoice']


In [80]:
#Create dictionary
txt_dict_lf = Dictionary(df_lf.tok_tweet)
txt_dict_lm = Dictionary(df_lm.tok_tweet)
txt_dict_cf = Dictionary(df_cf.tok_tweet)
txt_dict_cm = Dictionary(df_cm.tok_tweet)

In [81]:
txt_out_lf = txt_dict_lf.token2id
txt_out_lm = txt_dict_lm.token2id
txt_out_cf = txt_dict_cf.token2id
txt_out_cm = txt_dict_cm.token2id

In [82]:
#Create BOW
tweets_bow_lf = [txt_dict_lf.doc2bow(tweet) for tweet in df_lf.tok_tweet]
tweets_bow_lm = [txt_dict_lm.doc2bow(tweet) for tweet in df_lm.tok_tweet]
tweets_bow_cf = [txt_dict_cf.doc2bow(tweet) for tweet in df_cf.tok_tweet]
tweets_bow_cm = [txt_dict_cm.doc2bow(tweet) for tweet in df_cm.tok_tweet]

In [86]:
##Model
#Female Prolife
k = 5
tweets_lda_lf = LdaModel(tweets_bow_lf,
                      num_topics = k,
                      id2word = txt_dict_lf,
                      random_state = 10,
                      passes=100)

In [87]:
tweets_lda_lf.show_topics()

[(0,
  '0.099*"pro" + 0.085*"life" + 0.012*"child" + 0.011*"prolife" + 0.011*"people" + 0.010*"woman" + 0.009*"birth" + 0.009*"choice" + 0.008*"abortion" + 0.008*"baby"'),
 (1,
  '0.023*"prolife" + 0.014*"life" + 0.014*"pro" + 0.012*"birth" + 0.010*"woman" + 0.009*"child" + 0.009*"healthcare" + 0.009*"cut" + 0.008*"abortion" + 0.006*"food"'),
 (2,
  '0.047*"prolife" + 0.011*"abortion" + 0.008*"people" + 0.008*"amp" + 0.008*"vote" + 0.007*"baby" + 0.007*"would" + 0.006*"life" + 0.006*"see" + 0.005*"trump"'),
 (3,
  '0.062*"prolife" + 0.019*"abortion" + 0.014*"amp" + 0.012*"right" + 0.011*"life" + 0.011*"woman" + 0.007*"human" + 0.006*"adoption" + 0.006*"issue" + 0.005*"defundpp"'),
 (4,
  '0.044*"prolife" + 0.024*"life" + 0.022*"pro" + 0.012*"amp" + 0.010*"vote" + 0.007*"child" + 0.007*"would" + 0.007*"u" + 0.005*"trump" + 0.005*"good"')]

In [88]:
from gensim.models.coherencemodel import CoherenceModel

In [92]:
cm = CoherenceModel(model=tweets_lda_lf, texts=df_lf.tok_tweet.to_list(), coherence='c_v')
cm.get_coherence()

0.3159526015867854

In [93]:
import pyLDAvis.gensim_models as gensimvis
import pickle 
import pyLDAvis

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps


In [94]:
pyLDAvis.enable_notebook()
vis=gensimvis.prepare(tweets_lda_lf, tweets_bow_lf, dictionary=txt_dict_lf)
vis

In [95]:
##Model
#Male Prolife
k = 5
tweets_lda_lm = LdaModel(tweets_bow_lm,
                      num_topics = k,
                      id2word = txt_dict_lm,
                      random_state = 10,
                      passes=100)

In [96]:
tweets_lda_lm.show_topics()

[(0,
  '0.059*"pro" + 0.050*"life" + 0.024*"prolife" + 0.014*"conservative" + 0.014*"abortion" + 0.012*"trump" + 0.011*"justice" + 0.010*"marriage" + 0.010*"late" + 0.009*"pray"'),
 (1,
  '0.065*"pro" + 0.058*"life" + 0.024*"prolife" + 0.011*"people" + 0.011*"choice" + 0.011*"praytoendabortion" + 0.009*"birth" + 0.009*"day" + 0.008*"born" + 0.007*"tcot"'),
 (2,
  '0.049*"prolife" + 0.011*"u" + 0.010*"holylove" + 0.009*"unitedhearts" + 0.009*"ourlady" + 0.009*"guadalupe" + 0.009*"apparition" + 0.009*"feast" + 0.009*"praytoendabortion" + 0.008*"maranathasns"'),
 (3,
  '0.047*"prolife" + 0.023*"abortion" + 0.020*"life" + 0.013*"pro" + 0.011*"alert" + 0.010*"woman" + 0.008*"ohio" + 0.005*"bill" + 0.005*"sign" + 0.005*"via"'),
 (4,
  '0.040*"life" + 0.039*"prolife" + 0.038*"pro" + 0.014*"human" + 0.012*"child" + 0.011*"right" + 0.009*"unitedhearts" + 0.009*"baby" + 0.009*"vote" + 0.008*"jesus"')]

In [97]:
cm = CoherenceModel(model=tweets_lda_lm, texts=df_lm.tok_tweet.to_list(), coherence='c_v')
cm.get_coherence()

0.3776528707315047

In [98]:
pyLDAvis.enable_notebook()
vis=gensimvis.prepare(tweets_lda_lm, tweets_bow_lm, dictionary=txt_dict_lm)
vis

In [99]:
##Model
#Female Prochoice
k = 5
tweets_lda_cf = LdaModel(tweets_bow_cf,
                      num_topics = k,
                      id2word = txt_dict_cf,
                      random_state = 10,
                      passes=100)

In [100]:
tweets_lda_cf.show_topics()

[(0,
  '0.062*"prochoice" + 0.017*"abortion" + 0.015*"child" + 0.013*"life" + 0.012*"amp" + 0.012*"prolife" + 0.011*"pro" + 0.010*"want" + 0.008*"people" + 0.007*"woman"'),
 (1,
  '0.067*"prochoice" + 0.025*"abortion" + 0.017*"woman" + 0.006*"right" + 0.006*"amp" + 0.006*"u" + 0.006*"life" + 0.006*"control" + 0.005*"prolife" + 0.004*"support"'),
 (2,
  '0.082*"prochoice" + 0.029*"woman" + 0.018*"abortion" + 0.018*"choice" + 0.015*"body" + 0.013*"right" + 0.010*"pro" + 0.010*"prolife" + 0.008*"make" + 0.007*"amp"'),
 (3,
  '0.058*"prochoice" + 0.017*"one" + 0.015*"abortion" + 0.010*"prolife" + 0.008*"like" + 0.007*"nation" + 0.007*"voteprochoice" + 0.006*"vote" + 0.006*"let" + 0.005*"must"'),
 (4,
  '0.068*"prochoice" + 0.017*"abortion" + 0.013*"woman" + 0.010*"right" + 0.008*"people" + 0.007*"need" + 0.007*"get" + 0.006*"womensrights" + 0.006*"amp" + 0.006*"v"')]

In [101]:
cm = CoherenceModel(model=tweets_lda_cf, texts=df_cf.tok_tweet.to_list(), coherence='c_v')
cm.get_coherence()

0.30972257339213

In [102]:
pyLDAvis.enable_notebook()
vis=gensimvis.prepare(tweets_lda_cf, tweets_bow_cf, dictionary=txt_dict_cf)
vis

In [103]:
##Model
#Male Prochoice
k = 5
tweets_lda_cm = LdaModel(tweets_bow_cm,
                      num_topics = k,
                      id2word = txt_dict_cm,
                      random_state = 10,
                      passes=100)

In [104]:
tweets_lda_cm.show_topics()

[(0,
  '0.066*"prochoice" + 0.020*"prolife" + 0.018*"life" + 0.013*"woman" + 0.010*"abortion" + 0.009*"dont" + 0.009*"choice" + 0.008*"right" + 0.008*"like" + 0.008*"people"'),
 (1,
  '0.056*"prochoice" + 0.015*"choice" + 0.015*"abortion" + 0.011*"pro" + 0.009*"prolife" + 0.007*"life" + 0.007*"people" + 0.006*"amp" + 0.005*"th" + 0.005*"woman"'),
 (2,
  '0.047*"prochoice" + 0.013*"woman" + 0.011*"help" + 0.011*"abortion" + 0.009*"choice" + 0.009*"adoption" + 0.008*"many" + 0.008*"group" + 0.007*"amp" + 0.007*"would"'),
 (3,
  '0.070*"prochoice" + 0.021*"abortion" + 0.019*"woman" + 0.015*"prolife" + 0.011*"right" + 0.010*"u" + 0.007*"say" + 0.007*"one" + 0.006*"body" + 0.006*"like"'),
 (4,
  '0.055*"prochoice" + 0.019*"advocate" + 0.014*"human" + 0.013*"abortion" + 0.012*"woman" + 0.008*"science" + 0.008*"make" + 0.008*"unborn" + 0.008*"dont" + 0.008*"prominent"')]

In [105]:
cm = CoherenceModel(model=tweets_lda_cm, texts=df_cm.tok_tweet.to_list(), coherence='c_v')
cm.get_coherence()

0.3131768363188442

In [106]:
pyLDAvis.enable_notebook()
vis=gensimvis.prepare(tweets_lda_cm, tweets_bow_cm, dictionary=txt_dict_cm)
vis