In [None]:
import numpy as np
import pandas as pd
import os
import sqlite3
from nltk.corpus import stopwords
sw = stopwords.words('english')
sw2 = stopwords.words("english")
from string import punctuation
punctuation = set(punctuation)
punctuation.add("’")
import datetime
import time
from random import sample
import random
from IPython.display import Image

import tweepy
from tweepy import OAuthHandler
# I've put my API keys in a .py file called API_keys.py
from my_api_keys import api_key, api_key_secret, access_token, access_token_secret

import json
import csv
import re
import string
from collections import Counter, defaultdict
from pprint import pprint
from operator import itemgetter
import matplotlib.pyplot as plt

import pyLDAvis
import pyLDAvis.gensim_models
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel,LdaMulticore, Phrases 
from gensim.models.phrases import Phraser 
from gensim.corpora import Dictionary

#Lemmatizer = nlp.get_pipe("lemmatizer")

import nltk
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from matplotlib.pyplot import text

In [None]:
# Extend the punctuation and stop-word collections used for token cleaning.

# Treat the typographic apostrophe as punctuation alongside the ASCII set.
punctuation = set(punctuation) | {"’"}

# Extra separator/markup tokens filtered out in addition to the NLTK English list.
addl = ("|", "-", "/", "•", "&", "&amp;")
sw2 = set(sw).union(addl)

####  Read daily tweets CSVs into pandas dataframe

In [None]:
# Columns every daily export is expected to provide. Declared up front so that
# `db` has a stable schema (and column order) even when no files are found.
expected_columns = ['user_id','screen_name','description','location','friends_count',
           'followers_count','totaltweets','date_created', 'tweet_id', 'retweetcount','full_text']

# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
file_location = "/Users/natebender/Desktop/Repo/footprints/footprints_audience/data/"

# os.listdir also returns hidden entries (e.g. macOS ".DS_Store") and
# sub-directories, which pd.read_csv cannot parse, so keep regular,
# non-hidden files only. Sorting keeps the load order deterministic.
files = sorted(
    name for name in os.listdir(file_location)
    if not name.startswith(".") and os.path.isfile(os.path.join(file_location, name))
)

# Read every file once and concatenate in a single step. DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop re-copies all
# previously loaded rows on every iteration.
daily_frames = [pd.read_csv(os.path.join(file_location, name)) for name in files]

if daily_frames:
    db = pd.concat(daily_frames, ignore_index=True)
    # Expected columns first (created as NaN if a file lacks them), then any
    # additional columns the files happen to carry.
    extra_columns = [col for col in db.columns if col not in expected_columns]
    db = db.reindex(columns=expected_columns + extra_columns)
else:
    db = pd.DataFrame(columns=expected_columns)

In [None]:
# Check desc stats on overall descriptions before splitting into groups
def get_patterns(all_tweets, stop_words=None, num_top=20):
    """Compute simple lexical statistics for a collection of texts.

    Texts are split on whitespace; a token is discarded when its lower-cased
    form is in ``stop_words``. Kept tokens retain their original casing, so
    "Climate" and "climate" are counted separately.

    Parameters
    ----------
    all_tweets : pandas.Series
        Series of strings; missing values are dropped before tokenising.
    stop_words : collection of str, optional
        Lower-cased tokens to exclude. Defaults to the notebook-level ``sw2``.
    num_top : int, optional
        How many of the most frequent tokens to report (default 20).

    Returns
    -------
    dict
        Keys ``tokens``, ``unique_tokens``, ``avg_token_length``,
        ``lexical_diversity`` (unique / total) and ``Top_n`` (list of
        ``(token, count)`` pairs). For input with no remaining tokens the
        counts are 0, the two ratios are 0.0 and ``Top_n`` is empty.
    """
    if stop_words is None:
        stop_words = sw2

    texts = all_tweets.dropna()
    tokens = [w for text in texts for w in text.split() if w.lower() not in stop_words]

    total_tokens = len(tokens)
    unique_tokens = len(set(tokens))

    if total_tokens:
        avg_token_len = np.mean([len(w) for w in tokens])
        lex_diversity = unique_tokens / total_tokens
    else:
        # Nothing left after filtering: avoid dividing by zero / averaging an
        # empty list and report neutral values instead.
        avg_token_len = 0.0
        lex_diversity = 0.0

    results = {'tokens': total_tokens,
               'unique_tokens': unique_tokens,
               'avg_token_length': round(avg_token_len, 2),
               'lexical_diversity': round(lex_diversity, 2),
               'Top_n': Counter(tokens).most_common(num_top)}

    return results

In [None]:
# Summarise the whole corpus: tweet count first, then token-level statistics
# (displayed as the cell's output value).
all_tweets = db["full_text"].dropna()
print(f"Database: {len(db['tweet_id']):,} tweets")
print('Descriptive stats are:')
get_patterns(all_tweets)

### Topic Modeling

Now we want to dig in deeper and see if we can find groups of tweets that cluster together by distinct theme. 

We'll accomplish this using LDA (Latent Dirichlet Allocation) modeling. LDA is an unsupervised topic-modeling algorithm: it groups documents by theme without needing labeled examples.

In [None]:
# Texts to model: every non-missing tweet, with each run of whitespace
# (spaces, tabs, newlines) collapsed to a single space. No stop-word
# filtering happens here (the former `cop_sw` filter is disabled).
for_modeling_unclean = db["full_text"].dropna()
for_modeling = [" ".join(tweet.split()) for tweet in for_modeling_unclean]

In [None]:
# Number of tweets available for topic modeling.
len(for_modeling)

In [None]:
# Cap the modeling corpus at 20,000 tweets to keep spaCy/LDA run times manageable.
# A dedicated, seeded generator makes the sample reproducible across kernel
# restarts without touching the global `random` state, and min() avoids
# random.sample's ValueError when fewer than 20,000 tweets are available.
sample_size = min(20000, len(for_modeling))
for_modeling = random.Random(1234).sample(for_modeling, sample_size)

In [None]:
# database is small enough right now (1/13/22) that taking a sample is not necessary
# random.seed(1234)
# for_modeling = random.sample(for_modeling, 50000)  

In [None]:
# Load the spaCy pipeline used below for POS tagging and lemmatisation.
# NOTE(review): `nlp` was referenced throughout the notebook but never created,
# so a fresh-kernel run failed with NameError. "en_core_web_sm" is assumed
# here — confirm which model the original analysis used.
nlp = spacy.load("en_core_web_sm")

# Updates spaCy's default stop words list with my additional words. 
# stop_list = ['`',"Mr.","Mrs.","Ms."]
# nlp.Defaults.stop_words.update(stop_list)

# Iterates over the words in the stop words list and resets the "is_stop" flag.
for word in STOP_WORDS:
    lexeme = nlp.vocab[word]
    lexeme.is_stop = True

In [None]:
program_start = time.time()

# Keep only content-bearing parts of speech when building the token lists.
allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']

# One list of lemmas per tweet, in the same order as `for_modeling`.
# nlp.pipe streams the texts through the pipeline in batches, which yields the
# same Doc objects as calling nlp() on each text but with less overhead.
doc_list = [
    [token.lemma_ for token in pr if token.pos_ in allowed_postags]
    for pr in nlp.pipe(for_modeling)
]

program_end = time.time()
# Convert to minutes first, then round to two decimals. (Previously the
# closing parenthesis was misplaced, so the seconds were rounded to an integer
# and the "2" was passed to format() and ignored.)
elapsed_minutes = round((program_end - program_start) / 60, 2)
print('Total time taken to run is {} minutes.'.format(elapsed_minutes))

In [None]:
# Build the token <-> integer-id vocabulary from the lemmatised tweets.
id2word = Dictionary(doc_list)

# Prune the vocabulary: no_below=10 and no_above=0.4 drop the rarest tokens
# and those present in more than 40% of the corpus.
id2word.filter_extremes(no_below=10, no_above=0.4)

# Re-number the surviving tokens so the ids are contiguous.
id2word.compactify()

# Bag-of-words representation of every tweet, in `doc_list` order.
corpus = list(map(id2word.doc2bow, doc_list))


In [None]:
num_topics = 5

program_start = time.time()

# Fit the topic model on the bag-of-words corpus. random_state is fixed so
# repeated runs on the same corpus give the same topics.
# NOTE(review): eval_every=1 together with chunksize=30 requests a perplexity
# estimate very frequently, which likely slows training — left unchanged
# because altering it may change the fitted model.
lda_model = LdaMulticore(corpus=corpus, 
                             id2word=id2word, 
                             num_topics=num_topics, 
                             random_state=1,
                             chunksize=30,
                             passes=20,
                             alpha=0.31,  # sets our priors
                             eta=0.91,
                             eval_every=1,
                             per_word_topics=True,
                             workers=1)

program_end = time.time()
# Convert to minutes first, then round to two decimals. (Previously the
# closing parenthesis was misplaced, so the seconds were rounded to an integer
# and the "2" was passed to format() and ignored.)
elapsed_minutes = round((program_end - program_start) / 60, 2)
print('Total time taken to run is {} minutes.'.format(elapsed_minutes))

### LDA Modeling

In [None]:
# Show the ten highest-weighted words of each fitted topic.
pprint(lda_model.print_topics(num_words=10))

In [None]:
# Let pyLDAvis render its interactive output inline in the notebook.
pyLDAvis.enable_notebook()

In [None]:
#pyLDAvis.gensim.prepare(lda_model, corpus, words)
#pyLDAvis.gensim_models.prepare(lda_model, corpus,id2word)

In [None]:
# Temporary lists used to assemble the per-tweet results frame.
tweetlist = []
categorylist = []
probabilitylist = []
topic_assignments = []

# `doc_list` already holds the filtered lemmas for every entry of
# `for_modeling` (same order), so reuse it instead of running each tweet
# through spaCy a second time.
assert len(doc_list) == len(for_modeling), (
    "doc_list is out of sync with for_modeling; re-run the lemmatisation cell"
)

# Calculate topic probabilities for each tweet & assign tweets to categories
for tweet, lemmas in zip(for_modeling, doc_list):
    doc_new = id2word.doc2bow(lemmas)

    # The model was built with per_word_topics=True, so indexing it returns a
    # tuple; element 0 is the list of (topic_id, probability) pairs.
    topic_probs = lda_model[doc_new][0]

    # The tweet's category is its single most probable topic.
    cat, prob = max(topic_probs, key=lambda pair: pair[1])

    topic_assignments.append(cat)
    tweetlist.append(tweet)
    categorylist.append(cat)
    probabilitylist.append(prob)

tweets_df = pd.DataFrame({
    "Tweet": tweetlist,
    "Category": categorylist,
    "Probability": probabilitylist,
})

In [None]:
# Inspect the per-tweet topic assignments.
tweets_df

In [None]:
# `Tweet` was whitespace-normalised before modeling (" ".join(text.split())),
# whereas db.full_text is raw. Joining on the raw text silently drops every
# tweet that contains a newline, tab or repeated space, so join on a
# normalised copy of full_text instead.
db_keyed = db.assign(_tweet_key=db["full_text"].str.split().str.join(" "))

# Identical texts (e.g. retweets) would otherwise multiply rows in a
# many-to-many join. One labeled row per distinct text is sufficient because
# the result is de-duplicated on tweet_id (keeping the first match) anyway.
tweets_labeled = tweets_df.drop_duplicates(subset=["Tweet"], keep="first")

newdf = (
    tweets_labeled
    .merge(db_keyed, left_on="Tweet", right_on="_tweet_key", how="inner")
    .drop(columns="_tweet_key")
    .drop_duplicates(subset=["tweet_id"], keep="first")
)

# Parse the creation timestamp and derive a calendar-day column from it.
newdf["date_created"] = pd.to_datetime(newdf["date_created"], format="%Y-%m-%d")
newdf["days"] = newdf["date_created"].dt.date

In [None]:
# # rename columns from numbers to actual names for final display

# newdf['Category'] = newdf['Category'].replace([0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
#                                                             ['Random','Activism','Event-focused random','Collaboration',
#                                                              'Fossil Fuels', 'Education','CTAs & tracking',
#                                                              'End Bad Policy','Aspirational Policy','National FF Policy'])
                                                             


In [None]:
# Group the labeled tweets by assigned topic for the per-category counts below.
newdf_groupby_category = newdf.groupby('Category')

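### Why are we getting so few tweets in the categories if every tweet is being assigned to one? Summed across all categories, shouldn't the counts equal the number of tweets in the database?

Likely causes (TODO confirm): only a random sample of up to 20,000 tweets is modeled, and the labeled tweets are joined back to the database with an inner merge on tweet text and then de-duplicated on `tweet_id`, so tweets can drop out at both steps.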

In [None]:
# Report how many labeled tweets landed in each topic.
category_counts = newdf_groupby_category.size()
print("Number of tweets per category")
print(category_counts)

### Topic modeling the Environmental group

In [None]:
# Keep only the tweets assigned to topic 1 (the group examined in this section).
newdf_env = newdf.loc[newdf["Category"] == 1]

In [None]:
# One row per account: keep each user's first tweet within this topic.
newdf_env_unique = newdf_env.drop_duplicates(subset="user_id", keep="first")
print(newdf_env_unique.shape[0])

In [None]:
# Rebuild the modeling input from the one-tweet-per-user subset, collapsing
# each run of whitespace to a single space. This overwrites the earlier
# `for_modeling` list. No stop-word filtering happens here (the former
# `cop_sw` filter is disabled).
for_modeling_unclean = newdf_env_unique["full_text"].dropna()
for_modeling = [" ".join(tweet.split()) for tweet in for_modeling_unclean]

In [None]:
program_start = time.time()

# Keep only content-bearing parts of speech when building the token lists.
allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']

# One list of lemmas per tweet, in the same order as `for_modeling`.
# nlp.pipe streams the texts through the pipeline in batches, which yields the
# same Doc objects as calling nlp() on each text but with less overhead.
doc_list = [
    [token.lemma_ for token in pr if token.pos_ in allowed_postags]
    for pr in nlp.pipe(for_modeling)
]

program_end = time.time()
# Convert to minutes first, then round to two decimals. (Previously the
# closing parenthesis was misplaced, so the seconds were rounded to an integer
# and the "2" was passed to format() and ignored.)
elapsed_minutes = round((program_end - program_start) / 60, 2)
print('Total time taken to run is {} minutes.'.format(elapsed_minutes))

In [None]:
# Build the token <-> integer-id vocabulary from the lemmatised subset.
id2word = Dictionary(doc_list)

# Prune the vocabulary: no_below=10 and no_above=0.4 drop the rarest tokens
# and those present in more than 40% of the corpus.
id2word.filter_extremes(no_below=10, no_above=0.4)

# Re-number the surviving tokens so the ids are contiguous.
id2word.compactify()

# Bag-of-words representation of every tweet, in `doc_list` order.
corpus = list(map(id2word.doc2bow, doc_list))


In [None]:
num_topics = 3

program_start = time.time()

# Fit a second topic model on the subset corpus; this replaces the earlier
# `lda_model`. random_state is fixed so repeated runs on the same corpus give
# the same topics.
lda_model = LdaMulticore(corpus=corpus, 
                             id2word=id2word, 
                             num_topics=num_topics, 
                             random_state=1,
                             chunksize=30,
                             passes=40,
                             alpha=0.31,  # sets our priors
                             eta=0.91,
                             eval_every=1,
                             per_word_topics=True,
                             workers=1)

program_end = time.time()
# Convert to minutes first, then round to two decimals. (Previously the
# closing parenthesis was misplaced, so the seconds were rounded to an integer
# and the "2" was passed to format() and ignored.)
elapsed_minutes = round((program_end - program_start) / 60, 2)
print('Total time taken to run is {} minutes.'.format(elapsed_minutes))

In [None]:
# Show the ten highest-weighted words of each topic in the current model.
pprint(lda_model.print_topics(num_words=10))

In [None]:
# Temporary lists used to assemble the per-tweet results frame.
tweetlist = []
categorylist = []
probabilitylist = []
topic_assignments = []

# `doc_list` already holds the filtered lemmas for every entry of
# `for_modeling` (same order), so reuse it instead of running each tweet
# through spaCy a second time.
assert len(doc_list) == len(for_modeling), (
    "doc_list is out of sync with for_modeling; re-run the lemmatisation cell"
)

# Calculate topic probabilities for each tweet & assign tweets to categories
for tweet, lemmas in zip(for_modeling, doc_list):
    doc_new = id2word.doc2bow(lemmas)

    # The model was built with per_word_topics=True, so indexing it returns a
    # tuple; element 0 is the list of (topic_id, probability) pairs.
    topic_probs = lda_model[doc_new][0]

    # The tweet's category is its single most probable topic.
    cat, prob = max(topic_probs, key=lambda pair: pair[1])

    topic_assignments.append(cat)
    tweetlist.append(tweet)
    categorylist.append(cat)
    probabilitylist.append(prob)

newdf_env_temp = pd.DataFrame({
    "Tweet": tweetlist,
    "Category": categorylist,
    "Probability": probabilitylist,
})

In [None]:
# First-pass topic assignments, shown for comparison with the subset results.
tweets_df

In [None]:
# Topic assignments from the second model, fitted on the subset.
newdf_env_temp

In [None]:
# Attach the first-pass topic label to each tweet of the subset.
# Both frames carry "Category" and "Probability"; with pandas' default
# suffixes they would come out as Category_x / Category_y, leaving no plain
# "Category" column for the group-by that follows. Keep the second-model
# columns unsuffixed and tag the first-pass ones explicitly.
newdf_env_final = newdf_env_temp.merge(
    tweets_df, on="Tweet", how="inner", suffixes=("", "_overall")
)

In [None]:
# Inspect the merged subset frame.
newdf_env_final

In [None]:
# Group the merged subset by its 'Category' column for the counts below.
newdf_envfinal_cats = newdf_env_final.groupby('Category')

In [None]:
# Report how many subset tweets landed in each category.
env_category_counts = newdf_envfinal_cats.size()
print("Number of tweets per category")
print(env_category_counts)

In [None]:
# NOTE(review): newdf_env_unique is derived from newdf_env, which is already
# restricted to Category == 1, so this filter appears to select every row.
# Confirm whether a sub-topic from newdf_env_final was intended instead.
newdf_final = newdf_env_unique[newdf_env_unique['Category']==1]

In [None]:
# Let pyLDAvis render its interactive output inline in the notebook.
pyLDAvis.enable_notebook()
# NOTE(review): the disabled call below uses the `pyLDAvis.gensim` path and a
# `words` variable that is not defined in this notebook; the module imported
# at the top is `pyLDAvis.gensim_models`.
# pyLDAvis.gensim.prepare(lda_model, corpus, words)