# Topic Visualization with pyLDAvis

In [1]:
import pandas as pd
import random

import string 
import re

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, FreqDist
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import pyLDAvis.sklearn


In [2]:
# Switch pyLDAvis into notebook mode before any visualization is prepared
pyLDAvis.enable_notebook()

# Extra punctuation-like tokens that are not single characters of string.punctuation
other_punct = [
    '’', '“','”', "''", '""', '',
    '..', '...', '``', '—', '->', '’ ’',
]
# Everything the tokenizer output is filtered against: English stopwords,
# each character of string.punctuation, and the extra tokens listed above
eng_stopwords_and_punct = [*stopwords.words('english'), *string.punctuation, *other_punct]

lemmatizer = WordNetLemmatizer()

# TweetTokenizer will put all text in the tweet to lowercase, strip out usernames/handles
# and reduce repeated chars in words
tweet_tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True, preserve_case=False)

# Functions
def split_content_by_sentiment(region_df):
    '''
    Split the 'content' column of the provided dataframe by sentiment class.

    region_df: dataframe with a 'sentiment' column and a 'content' column

    Returns a list of three Series, in the order [positive, negative, neutral],
    each holding the 'content' values of the rows with that sentiment.
    '''
    sentiment_labels = ('positive', 'negative', 'neutral')
    return [
        region_df.loc[region_df['sentiment'] == label, 'content']
        for label in sentiment_labels
    ]

def clean_tokenize_lemmatize_tweet(tweet):
    '''
    Apply text cleaning, tokenization then lemmatization on the given tweet.

    tweet: the tweet text as a string

    Returns the list of lemmatized tokens left after URLs, the leading
    retweet marker, '#' symbols, words containing digits, stopwords,
    punctuation and single-character tokens have been removed.
    '''
    # remove urls (the 'http' alternative already covers 'https' links)
    tweet = re.sub(r'http\S+|www\S+', '', tweet)
    # remove the leading RT marker; matched case-insensitively because when this
    # function is used as a CountVectorizer tokenizer the text has already been
    # lowercased by the vectorizer's default preprocessing, so it arrives as 'rt'
    tweet = re.sub(r'^RT\s+', '', tweet, flags=re.IGNORECASE)
    # remove the # symbol
    tweet = re.sub('#', '', tweet)
    # remove words containing numbers
    tweet = re.sub(r'\w*\d\w*', '', tweet)
    # tokenize
    tokens = tweet_tokenizer.tokenize(tweet)
    # drop 1-char tokens, stopwords and punctuation in a single pass, then lemmatize
    return [
        lemmatizer.lemmatize(token)
        for token in tokens
        if len(token) > 1 and token not in eng_stopwords_and_punct
    ]


def perform_topic_modeling(content, topic_num, top_word_count):
    '''
    Perform topic modeling on the provided corpus for the designated
    number of topics.

    content: the documents handed to CountVectorizer.fit_transform
    topic_num: number of LDA components (topics) to fit
    top_word_count: accepted for callers, but not used inside this function

    Returns the tuple (fitted LDA model, document-term matrix, fitted CountVectorizer).
    '''
    # Unigram + bigram counts, tokenized by clean_tokenize_lemmatize_tweet
    vectorizer = CountVectorizer(tokenizer=clean_tokenize_lemmatize_tweet,
                                 ngram_range=(1,2),
                                 strip_accents='unicode',
                                 min_df=0.01,
                                 max_df=0.95)

    # Unsupervised learning, so the whole corpus is used (no test/train split)
    doc_term_matrix = vectorizer.fit_transform(content)

    topic_model = LatentDirichletAllocation(n_components=topic_num,
                                            learning_method='online',
                                            max_iter=10,
                                            random_state=42)

    # This might take a while when there is a large number of documents
    topic_model.fit(doc_term_matrix)

    return (topic_model, doc_term_matrix, vectorizer)


In [3]:
# Load all US Tweets, then carve out one DataFrame per region of interest
vetted_usa_tweets = pd.read_csv('../data/vetted_usa_tweets.csv')

west, midwest, southwest, southeast, northeast = (
    vetted_usa_tweets[vetted_usa_tweets['region'] == region_name]
    for region_name in ('West', 'Midwest', 'Southwest', 'Southeast', 'Northeast')
)

# 'content' Series for the entire US and for each region
us_tweets = vetted_usa_tweets['content']
west_tweets, midwest_tweets, southwest_tweets, southeast_tweets, northeast_tweets = (
    region_df['content']
    for region_df in (west, midwest, southwest, southeast, northeast)
)

# [positive, negative, neutral] 'content' Series for the entire US and for each region
(us_tweets_by_sentiment,
 west_tweets_by_sentiment,
 midwest_tweets_by_sentiment,
 southwest_tweets_by_sentiment,
 southeast_tweets_by_sentiment,
 northeast_tweets_by_sentiment) = map(
    split_content_by_sentiment,
    (vetted_usa_tweets, west, midwest, southwest, southeast, northeast),
)


## US Topics

In [4]:
# Fit 8 topics on all US Tweets, then build the pyLDAvis view from the (lda, dtm, cv) tuple
usa_tm_tuple = perform_topic_modeling(content=us_tweets, topic_num=8, top_word_count=30)
pyLDAvis.sklearn.prepare(*usa_tm_tuple)

### US Positive Tweets

In [5]:
# Fit 5 topics on the positive US Tweets, then build the pyLDAvis view
usa_pos_tm_tuple = perform_topic_modeling(content=us_tweets_by_sentiment[0], topic_num=5, top_word_count=30)
pyLDAvis.sklearn.prepare(*usa_pos_tm_tuple)

### US Negative Tweets

In [6]:
# Fit 3 topics on the negative US Tweets, then build the pyLDAvis view
usa_neg_tm_tuple = perform_topic_modeling(content=us_tweets_by_sentiment[1], topic_num=3, top_word_count=30)
pyLDAvis.sklearn.prepare(*usa_neg_tm_tuple)

### US Neutral Tweets

In [7]:
# Fit 5 topics on the neutral US Tweets, then build the pyLDAvis view
usa_neu_tm_tuple = perform_topic_modeling(content=us_tweets_by_sentiment[2], topic_num=5, top_word_count=30)
pyLDAvis.sklearn.prepare(*usa_neu_tm_tuple)

## West Region Topics

In [8]:
# Fit 5 topics on all West Region Tweets, then build the pyLDAvis view
w_tm_tuple = perform_topic_modeling(content=west_tweets, topic_num=5, top_word_count=30)
pyLDAvis.sklearn.prepare(*w_tm_tuple)

In [9]:
# Fit 3 topics on the positive West Region Tweets, then build the pyLDAvis view
w_pos_tm_tuple = perform_topic_modeling(content=west_tweets_by_sentiment[0], topic_num=3, top_word_count=30)
pyLDAvis.sklearn.prepare(*w_pos_tm_tuple)


In [10]:
# Fit 3 topics on the negative West Region Tweets, then build the pyLDAvis view
w_neg_tm_tuple = perform_topic_modeling(content=west_tweets_by_sentiment[1], topic_num=3, top_word_count=30)
pyLDAvis.sklearn.prepare(*w_neg_tm_tuple)

In [11]:
# Fit 3 topics on the neutral West Region Tweets, then build the pyLDAvis view
w_neu_tm_tuple = perform_topic_modeling(content=west_tweets_by_sentiment[2], topic_num=3, top_word_count=30)
pyLDAvis.sklearn.prepare(*w_neu_tm_tuple)

## Southwest Region Topics

In [21]:
# Fit 5 topics on all Southwest Region Tweets, then build the pyLDAvis view
sw_tm_tuple = perform_topic_modeling(content=southwest_tweets, topic_num=5, top_word_count=30)
pyLDAvis.sklearn.prepare(*sw_tm_tuple)

### Southwest Positive Tweets

In [12]:
# Fit 3 topics on the positive Southwest Region Tweets, then build the pyLDAvis view
sw_pos_tm_tuple = perform_topic_modeling(content=southwest_tweets_by_sentiment[0], topic_num=3, top_word_count=30)
pyLDAvis.sklearn.prepare(*sw_pos_tm_tuple)

### Southwest Negative Tweets

In [13]:
# Fit 3 topics on the negative Southwest Region Tweets, then build the pyLDAvis view
sw_neg_tm_tuple = perform_topic_modeling(content=southwest_tweets_by_sentiment[1], topic_num=3, top_word_count=30)
pyLDAvis.sklearn.prepare(*sw_neg_tm_tuple)

### Southwest Neutral Tweets

In [14]:
# Fit 3 topics on the neutral Southwest Region Tweets, then build the pyLDAvis view
sw_neu_tm_tuple = perform_topic_modeling(content=southwest_tweets_by_sentiment[2], topic_num=3, top_word_count=30)
pyLDAvis.sklearn.prepare(*sw_neu_tm_tuple)

## Midwest Region Topics

In [15]:
# Fit 5 topics on all Midwest Region Tweets, then build the pyLDAvis view
mw_tm_tuple = perform_topic_modeling(content=midwest_tweets, topic_num=5, top_word_count=30)
pyLDAvis.sklearn.prepare(*mw_tm_tuple)

### Midwest Positive Tweets

In [16]:
# Fit 3 topics on the positive Midwest Region Tweets, then build the pyLDAvis view
mw_pos_tm_tuple = perform_topic_modeling(content=midwest_tweets_by_sentiment[0], topic_num=3, top_word_count=30)
pyLDAvis.sklearn.prepare(*mw_pos_tm_tuple)

### Midwest Negative Tweets

In [17]:
# Fit 3 topics on the negative Midwest Region Tweets, then build the pyLDAvis view
mw_neg_tm_tuple = perform_topic_modeling(content=midwest_tweets_by_sentiment[1], topic_num=3, top_word_count=30)
pyLDAvis.sklearn.prepare(*mw_neg_tm_tuple)

### Midwest Neutral Tweets


In [18]:
# Fit 3 topics on the neutral Midwest Region Tweets, then build the pyLDAvis view
mw_neu_tm_tuple = perform_topic_modeling(content=midwest_tweets_by_sentiment[2], topic_num=3, top_word_count=30)
pyLDAvis.sklearn.prepare(*mw_neu_tm_tuple)

## Northeast Region Topics

In [19]:
# Fit 5 topics on all Northeast Region Tweets, then build the pyLDAvis view
ne_tm_tuple = perform_topic_modeling(content=northeast_tweets, topic_num=5, top_word_count=30)
pyLDAvis.sklearn.prepare(*ne_tm_tuple)

### Northeast Positive Tweets

In [25]:
# Fit 3 topics on the positive Northeast Region Tweets, then build the pyLDAvis view
ne_pos_tm_tuple = perform_topic_modeling(content=northeast_tweets_by_sentiment[0], topic_num=3, top_word_count=30)
pyLDAvis.sklearn.prepare(*ne_pos_tm_tuple)

### Northeast Negative Tweets


In [24]:
# Fit 3 topics on the negative Northeast Region Tweets, then build the pyLDAvis view
ne_neg_tm_tuple = perform_topic_modeling(content=northeast_tweets_by_sentiment[1], topic_num=3, top_word_count=30)
pyLDAvis.sklearn.prepare(*ne_neg_tm_tuple)

### Northeast Neutral Tweets

In [26]:
# Fit 3 topics on the neutral Northeast Region Tweets, then build the pyLDAvis view
ne_neu_tm_tuple = perform_topic_modeling(content=northeast_tweets_by_sentiment[2], topic_num=3, top_word_count=30)
pyLDAvis.sklearn.prepare(*ne_neu_tm_tuple)

## Southeast Region Topics

In [20]:
# Fit 5 topics on all Southeast Region Tweets, then build the pyLDAvis view
se_tm_tuple = perform_topic_modeling(content=southeast_tweets, topic_num=5, top_word_count=30)
pyLDAvis.sklearn.prepare(*se_tm_tuple)

### Southeast Positive Tweets

In [27]:
# Fit 3 topics on the positive Southeast Region Tweets, then build the pyLDAvis view
se_pos_tm_tuple = perform_topic_modeling(content=southeast_tweets_by_sentiment[0], topic_num=3, top_word_count=30)
pyLDAvis.sklearn.prepare(*se_pos_tm_tuple)

### Southeast Negative Tweets

In [28]:
# Fit 3 topics on the negative Southeast Region Tweets, then build the pyLDAvis view
se_neg_tm_tuple = perform_topic_modeling(content=southeast_tweets_by_sentiment[1], topic_num=3, top_word_count=30)
pyLDAvis.sklearn.prepare(*se_neg_tm_tuple)

### Southeast Neutral Tweets

In [29]:
# Fit 3 topics on the neutral Southeast Region Tweets, then build the pyLDAvis view
se_neu_tm_tuple = perform_topic_modeling(content=southeast_tweets_by_sentiment[2], topic_num=3, top_word_count=30)
pyLDAvis.sklearn.prepare(*se_neu_tm_tuple)