# Topic Visualization with pyLDAvis

In [1]:
import pandas as pd
import random

import string 
import re

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import pyLDAvis.sklearn


In [6]:
# Render pyLDAvis output inline in the notebook
pyLDAvis.enable_notebook()

# Tokens to discard after tokenization: NLTK English stopwords, every ASCII
# punctuation character, plus a few quote/ellipsis/dash fragments
eng_stopwords_and_punct = [
    *stopwords.words('english'),
    *string.punctuation,
    '’', '“', '”', "''", '""', '', '..', '...', '``', '—', '->', '’ ’',
]

# TweetTokenizer lowercases the tweet text, strips out usernames/handles
# and shortens runs of repeated characters in words
tweet_tokenizer = TweetTokenizer(
    reduce_len=True,
    strip_handles=True,
    preserve_case=False,
)

lemmatizer = WordNetLemmatizer()

# Functions
def clean_tokenize_lemmatize_tweet(tweet):
    '''
    Apply text cleaning, tokenization then lemmatization on the given tweet.

    tweet: the tweet text as a string

    Returns a list of lemmatized tokens. URLs, a leading retweet marker,
    '#' symbols, words containing digits, stopwords/punctuation and
    single-character tokens are removed along the way.
    '''
    # remove urls
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)
    # remove a leading retweet marker. Matched case-insensitively because
    # CountVectorizer lowercases each document before handing it to a custom
    # tokenizer, so the marker arrives here as "rt" rather than "RT".
    tweet = re.sub(r'^RT\s+', '', tweet, flags=re.IGNORECASE)
    # remove the # symbol but keep the hashtag's word
    tweet = tweet.replace('#', '')
    # remove words containing numbers
    tweet = re.sub(r'\w*\d\w*', '', tweet)
    # tokenize
    tokens = tweet_tokenizer.tokenize(tweet)
    # drop single-character tokens, stopwords and punctuation in one pass;
    # the cheap length test runs first so short tokens skip the list lookup
    tokens = [
        token for token in tokens
        if len(token) > 1 and token not in eng_stopwords_and_punct
    ]
    # lemmatize
    return [lemmatizer.lemmatize(token) for token in tokens]


def perform_topic_modeling(content, topic_num, top_word_count):
    '''
    Fit an LDA topic model on the provided corpus for the designated
    number of topics.

    content: the documents to model (callers in this notebook pass a
             pandas Series of tweet text)
    topic_num: number of topics (LDA components) to learn
    top_word_count: accepted but not used by this function

    Returns a (fitted LDA model, document-term matrix, fitted
    CountVectorizer) tuple.
    '''
    # Unigram and bigram counts built from the custom tweet tokenizer
    vectorizer = CountVectorizer(
        tokenizer=clean_tokenize_lemmatize_tweet,
        ngram_range=(1, 2),
        strip_accents='unicode',
        min_df=0.01,
        max_df=0.95,
    )

    # Unsupervised learning, so the whole corpus is used (no test/train split)
    doc_term_matrix = vectorizer.fit_transform(content)

    topic_model = LatentDirichletAllocation(
        n_components=topic_num,
        learning_method='online',
        max_iter=10,
        random_state=42,
    )

    # This can take a while on a large number of documents
    topic_model.fit(doc_term_matrix)

    return (topic_model, doc_term_matrix, vectorizer)



In [9]:
# Load every vetted US tweet, then pull out the tweet text for the whole
# country and for each region of interest
vetted_usa_tweets = pd.read_csv('../data/vetted_usa_tweets.csv')
us_tweets = vetted_usa_tweets['content']
west_tweets = vetted_usa_tweets.loc[vetted_usa_tweets['region'] == 'West', 'content']
midwest_tweets = vetted_usa_tweets.loc[vetted_usa_tweets['region'] == 'Midwest', 'content']
southwest_tweets = vetted_usa_tweets.loc[vetted_usa_tweets['region'] == 'Southwest', 'content']
southeast_tweets = vetted_usa_tweets.loc[vetted_usa_tweets['region'] == 'Southeast', 'content']
northeast_tweets = vetted_usa_tweets.loc[vetted_usa_tweets['region'] == 'Northeast', 'content']


## US Topics

In [12]:
# Fit a 15-topic model over every US tweet and visualize it
usa_tm_tuple = perform_topic_modeling(us_tweets, topic_num=15, top_word_count=30)
pyLDAvis.sklearn.prepare(*usa_tm_tuple)

## West Region Topics

In [13]:
# Fit a 10-topic model over the West Region tweets and visualize it
w_tm_tuple = perform_topic_modeling(west_tweets, topic_num=10, top_word_count=30)
pyLDAvis.sklearn.prepare(*w_tm_tuple)

## Southwest Region Topics

In [14]:
# Fit a 10-topic model over the Southwest Region tweets and visualize it
sw_tm_tuple = perform_topic_modeling(southwest_tweets, topic_num=10, top_word_count=30)
pyLDAvis.sklearn.prepare(*sw_tm_tuple)

## Midwest Region Topics

In [15]:
# Fit a 10-topic model over the Midwest Region tweets and visualize it
mw_tm_tuple = perform_topic_modeling(midwest_tweets, topic_num=10, top_word_count=30)
pyLDAvis.sklearn.prepare(*mw_tm_tuple)

## Northeast Region Topics

In [16]:
# Fit a 10-topic model over the Northeast Region tweets and visualize it
ne_tm_tuple = perform_topic_modeling(northeast_tweets, topic_num=10, top_word_count=30)
pyLDAvis.sklearn.prepare(*ne_tm_tuple)

## Southeast Region Topics

In [17]:
# Fit a 10-topic model over the Southeast Region tweets and visualize it
se_tm_tuple = perform_topic_modeling(southeast_tweets, topic_num=10, top_word_count=30)
pyLDAvis.sklearn.prepare(*se_tm_tuple)