In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('bat_tweets_final.csv')

In [3]:
df.shape

(19443, 37)

In [4]:
df.date.min()

'2018-12-31'

In [5]:
df.date.max()

'2022-06-09'

In [6]:
df['created_at'] = pd.to_datetime(df.created_at)

In [7]:
#create variable to indicate whether tweet came before or after 2020
df['covid'] = np.where(df.created_at < "2019-12-31", "pre", "post")

In [12]:
#initial cleaning
import re
import string
def clean_tweet(tweet):
    if type(tweet) == float:
        return ""
    temp = tweet.lower()
    temp = re.sub("'", "", temp) # to avoid removing contractions
    temp = re.sub("@[A-Za-z0-9_]+","", temp) # remove mentions
    temp = re.sub("#[A-Za-z0-9_]+","", temp) # remove hashtags
    temp = re.sub(r'http\S+', '', temp) # remove urls
    temp = re.sub('[%s]' % re.escape(string.punctuation), '', temp) #remove puncuation
    temp = re.sub("[^a-z0-9]"," ", temp) #filter non-alphanumeric characters
    return temp
results = [clean_tweet(tw) for tw in df.tweet]
results

['vortex wind power is wildlifefriendly and safe for birds bats bees sugar amp squirrel gliders but bladed wind turbines kill millions of native wildlife species annually  110   330 birds amp 200   670 bats per wind turbine are mutilated and die lt ',
 '       but i think you have ignored my point about the damage intensive agriculture does in the first place the wildlife that could be there isn t autumn hacking of hedges so no food or cover spraying removing insects so no bats trees gone lincs farms are not full of biodiversity',
 ' not sure why  characterises protected species such as bats as pests  ditto bees etc  reinforcing negative stereotypes of our valuable wildlife someone needs to rewrite this page  ',
 '  nah the native stuff is all pretty friendly the only native mammal is a tiny ickle bat the only venomous spider is so rare its an endangered species australian wildlife now that is terrifying',
 'read this about bats         ',
 'any ideas what this might be  looks like som

### Lemmatize words

In [None]:
#lemmatize words
import spacy
load_model = spacy.load('en_core_web_sm', disable = ['parser','ner'])
def lemmatize_tweet(tweet):
    doc = load_model(tweet)
    temp = " ".join([token.lemma_ for token in doc])
    return temp
lem_results = [lemmatize_tweet(tweet) for tweet in results]
lem_results

### Filter out non-English words

In [None]:
#get rid of non-english words
import nltk
nltk.download('words')
words = set(nltk.corpus.words.words())
clean_tweet = []
for tweet in lem_results:
    clean_tweet.append(" ".join(w for w in nltk.wordpunct_tokenize(tweet) if w in words and w.isalpha()))
df['clean_tweet'] = clean_tweet

### Scattertext

In [None]:
#create scattertext html of pre and post covid terms
import scattertext as st

df['metadata'] = df.created_at.map(str) + " | " + df.clean_tweet

nlp = spacy.load('en_core_web_sm')
nlp.Defaults.stop_words |= {"bat","wildlife"}
corpus = st.CorpusFromPandas(df, category_col='covid', text_col='clean_tweet', nlp=nlp).build()

html = st.produce_scattertext_explorer(corpus,
                                       category='pre',
                                       category_name='Pre-Covid',
                                       not_category_name='Post-Covid',
                                       width_in_pixels=1000,
                                       metadata = corpus.get_df()['metadata'])
open("pre_post_covid.html", 'wb').write(html.encode('utf-8'))

### Some additional EDA adjustments

In [None]:
#get rid of some tweets that have little meaning, get rid of tweets about baseball
df = df[df.clean_tweet != 'read this about bat'] 
df = df[df.clean_tweet != 'what be bat']
df = df[df.clean_tweet != 'bat']
df = df[df.clean_tweet != 'the bat']
df = df[df.clean_tweet.str.contains("baseball bat")==False]
df.reset_index(drop = True, inplace=True)

### Create wordclouds for before and after 2020

In [None]:
from wordcloud import WordCloud, STOPWORDS
from matplotlib import pyplot as plt

stopwords = ['bat','wildlife', 's'] + list(STOPWORDS)
pre_tweets = " ".join(tweet for tweet in df.clean_tweet[df.covid=='pre'])
pre_wordcloud = WordCloud(width = 3000,
                      height = 2000,
                      background_color = 'black',
                      stopwords = stopwords).generate(pre_tweets)
fig = plt.figure(figsize = (40, 30),
                 facecolor = 'k',
                 edgecolor = 'k')
plt.imshow(pre_wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()
plt.tight_layout()

In [None]:
post_tweets = " ".join(tweet for tweet in df.clean_tweet[df.covid=='post'])
post_wordcloud = WordCloud(width = 3000,
                      height = 2000,
                      background_color = 'black',
                      stopwords = stopwords).generate(post_tweets)
fig = plt.figure(figsize = (40, 30),
                 facecolor = 'k',
                 edgecolor = 'k')
plt.imshow(post_wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()
plt.tight_layout()

### Create document-term matrix using CountVectorizer

Note that I tried both TfidfVectorizer and CountVectorizer and found topics to be more coherent using CountVectorizer. This feels like an acceptable choice given that tweets are relatively short.

In [None]:
#create document term matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import text

#specify stop words to remove
my_stop_words = text.ENGLISH_STOP_WORDS.union(["aa","bat","wildlife","just","like",
                                               "great","make","work","new","know","s"]) 
cv = CountVectorizer(strip_accents = "ascii",
                          stop_words = my_stop_words)
X = cv.fit_transform(df.clean_tweet).toarray()
df_cv = pd.DataFrame(X, index = df.clean_tweet,columns = cv.get_feature_names())

In [None]:
df_cv.shape

In [None]:
#look at most common words to get a gut check on if things are making sense
df_cv.sum().sort_values(ascending = False).head(20)

### Perform topic modeling using non-negative matrix factorization

In [None]:
#topic modeling
from sklearn.decomposition import NMF
nmf = NMF(n_components=3, init = "nndsvda", max_iter=500)
doc_topic = nmf.fit_transform(df_cv)

In [None]:
nmf.components_.shape

In [None]:
def get_top_terms(topic, n_terms, nmf=nmf, terms=df_cv.columns):
    # get the topic components (i.e., term weights)
    components = nmf.components_[topic, :]

    # get term indices, sorted (descending) by topic weights
    top_term_indices = components.argsort()[-n_terms:]
    
    # use the `terms` array to get the actual top terms
    top_terms = np.array(terms)[top_term_indices]
    
    return top_terms.tolist()

In [None]:
#show the top 10 terms in each topic
topic_terms_df = pd.DataFrame()
for i in range(0,3):
    topic_terms = pd.Series(get_top_terms(i,10), name = i)
    topic_terms_df = pd.concat([topic_terms_df, topic_terms], axis = 1)
    print(f'Topic {i}:')
    print(get_top_terms(i, 10))
    print("\n")
topic_terms_df.rename(mapper = {0:'Wind',1:'Disease',2:'Habitat'}, axis = 1, inplace = True)

In [None]:
topic_terms_df

In [None]:
#name topics and create a dataframe of topic focus for each tweet
topic_names = ["wind","disease","habitat"]
doc_topic_df = pd.DataFrame(doc_topic.round(5), index = df.index, columns = topic_names)

In [None]:
#label each document with its primary topic
doc_topic_df['primary_topic'] = doc_topic_df.idxmax(axis=1)

In [None]:
#add the primary topic of each document to the original dataframe
df = pd.merge(df, doc_topic_df.primary_topic, 
                  left_index=True, right_index=True)
df.head()

In [None]:
#look at overall frequency of different topics
import seaborn as sns

fig, ax = plt.subplots()
ax.hist(df.primary_topic)
fig.suptitle('Topic Frequency')
fig.tight_layout();

### Sentiment analysis using Vader

In [None]:
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()
df['score'] = df.clean_tweet.map(analyzer.polarity_scores).map(lambda x: x.get('compound'))

In [None]:
#print the most positive tweet
df.clean_tweet.iloc[df.score.idxmax()]

In [None]:
#print the most negative tweet
df.clean_tweet.iloc[df.score.idxmin()]

In [None]:
#print the overall sentiment score
df.score.mean()

In [None]:
#assign positive or negative overall score to each tweet and then calculate percentage of positive tweets
df['sent'] = df.score.apply(lambda x: 'pos' if x>=0 else 'neg')
print("Percentage of negative tweets about bats pre-Covid:")
print(f"{(((df[df.covid=='pre'].sent[df.sent=='neg'].count())/len(df[df.covid=='pre']))*100).round(2)}%\n")
print(f"Percentage of negative tweets about bats post-Covid:")
print(f"{(((df[df.covid=='post'].sent[df.sent=='neg'].count())/len(df[df.covid=='post']))*100).round(2)}%\n")

### Vizualize change in topics over time

In [None]:
df['month_year'] = df['created_at'].dt.to_period('M')

In [None]:
topic_count_df = df.groupby(['month_year','primary_topic'])['primary_topic'].count().to_frame(name='topic_count').reset_index()

In [None]:
topic_count_df.head()

In [None]:
plt.rcParams.update({'font.size': 14})
fig, ax = plt.subplots()
fig.set_size_inches(11, 8)
plt.plot_date(topic_count_df['month_year'][topic_count_df.primary_topic == "habitat"], 
              topic_count_df['topic_count'][topic_count_df.primary_topic == "habitat"], 
             xdate=True,
             ydate=False,
             linestyle = 'dotted')
plt.plot_date(topic_count_df['month_year'][topic_count_df.primary_topic == "wind"], 
              topic_count_df['topic_count'][topic_count_df.primary_topic == "wind"], 
             xdate=True,
             ydate=False,
             linestyle = 'dotted')
plt.plot_date(topic_count_df['month_year'][topic_count_df.primary_topic == "disease"], 
              topic_count_df['topic_count'][topic_count_df.primary_topic == "disease"], 
             xdate=True,
             ydate=False,
             linestyle = 'dotted')
ax.legend(labels=['Habitat','Wind','Disease'])
ax.set_ylabel('Count of Tweets')
fig.suptitle('Topics Over Time')
ax.tick_params(labelrotation=45)
plt.tight_layout()
fig.savefig('topics_over_time.png', dpi=100)
plt.show()