In [46]:
import numpy as np
import pandas as pd
import scipy.sparse as ss
import matplotlib.pyplot as plt
import nltk

import corextopic.corextopic as ct
import corextopic.vis_topic as vt 

from sklearn.feature_extraction.text import TfidfVectorizer
from corextopic import vis_topic as vt

%matplotlib inline

### Read in the cleaned tweets (lemmatized, stop words removed)

In [47]:
# Load the preprocessed tweets from a CSV located relative to the working directory.
# The 'text' column of this frame is what the vectorizer cell consumes.
my_clean_tweets_df = pd.read_csv("final_analysis_clean_tweets_lemmatized.csv")

In [48]:
# (rows, columns) of the loaded frame -- quick sanity check that the load worked.
my_clean_tweets_df.shape

(1735605, 16)

### Vectorize the tweets into a binary document-term matrix (unigrams and bigrams)

#### Drop terms that appear in more than 95% of tweets (`max_df=.95`) and terms that appear in fewer than 3 tweets (`min_df=3`).

In [49]:
# Build the corpus once. Missing texts are replaced with an empty string first:
# casting NaN straight to unicode would yield the literal token "nan", which
# would then leak into the vocabulary as if it were a real word.
tweet_texts = my_clean_tweets_df['text'].fillna('').values.astype('U')

# With binary=True, use_idf=False and norm=None the "tf-idf" matrix is really a
# 0/1 term-presence matrix over unigrams and bigrams.
vectorizer = TfidfVectorizer(
    max_df=.95,          # drop terms present in more than 95% of tweets
    min_df=3,            # drop terms present in fewer than 3 tweets
    max_features=None,
    ngram_range=(1, 2),
    norm=None,
    binary=True,
    use_idf=False,
    sublinear_tf=False
)

# Fit and transform in a single pass instead of converting and scanning the
# corpus twice.
tfidf = vectorizer.fit_transform(tweet_texts)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on old installations. Either way `vocab` is a plain list
# whose order matches the columns of `tfidf`.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = vectorizer.get_feature_names_out().tolist()
else:
    vocab = vectorizer.get_feature_names()
print(len(vocab))

1173509


### Define anchors to seed different topics

In [50]:
# Anchor words used to seed topics: one inner list per anchored topic, in topic
# order. Each group's words are unique (a duplicated "useless" in the third
# group was removed). "cause" is deliberately kept in both of the first two
# groups, as in the original lists.
anchors = [
    # Group 1: carbon dioxide / oxygen deprivation vocabulary.
    ["carbon", "dioxide", "restrict", "deprivation", "deprive",
     "flow", "cause", "suffocate", "toxic", "oxygen", "hypoxia", "exhale", "inhale"],
    # Group 2: bacteria, mold and moisture vocabulary.
    ["bacteria", "germ", "fungus", "fungal", "mold", "spore", "moisture", "humidity", "cause", "lung",
     "breed", "contaminate", "moist", "humid", "pneumonia", "snot", "trap"],
    # Group 3: mask ineffectiveness vocabulary.
    ["mask", "ineffective", "useless", "inadequate", "flaw", "insufficient",
     "pointless", "worthless", "futile"],
    # Group 4: school / communication vocabulary ("child abuse" is a bigram).
    ["school", "communicate", "communication", "child abuse", "student", "pupil"],
]


### Run a model with a given number of topics and specified anchor strength 

In [None]:
# Hyperparameters: number of latent topics and the weight given to anchor words.
topic_no, anchor_str = 200, 10

# Construct the CorEx model with a fixed seed and fit it in one chained
# expression; the fitted estimator is bound to `model`.
model = ct.Corex(n_hidden=topic_no, seed=42).fit(
    tfidf,
    words=vocab,
    anchors=anchors,
    anchor_strength=anchor_str,
)

### Get Total Correlation (the bigger, the more informative the topics)

In [None]:
# Overall total correlation of the fitted model.
model.tc

#### Total correlation for individual topics

In [None]:
# Total correlation contributed by each individual topic.
model.tcs

### Selecting the Number of Topics

#### One way to choose the number of topics is to observe the distribution of TCs for each topic to see how much each additional topic contributes to the overall TC. We should keep adding topics until additional topics do not significantly contribute to the overall TC.

In [None]:
# Bar chart with one bar per topic; bar height is that topic's total correlation.
fig, ax = plt.subplots(figsize=(10, 5))
n_topics_fitted = model.tcs.shape[0]
ax.bar(range(n_topics_fitted), model.tcs, color='#4e79a7', width=0.5)
ax.set_xlabel('Topic', fontsize=16)
ax.set_ylabel('Total Correlation (nats)', fontsize=16);

###  Which topics have TC above 0.5?

In [None]:
# Positions in model.tcs whose total correlation exceeds 0.5.
# These are 0-based indices, whereas the topic listings printed in this
# notebook label topics starting from 1 (index + 1).
np.argwhere(model.tcs > 0.5)

### List the top 10 words for each topic

In [None]:
# Print each topic (numbered from 1) with those of its top-10 n-grams whose
# second tuple field is positive.
for topic_number, ranked_entries in enumerate(model.get_topics(n_words=10), start=1):
    kept_ngrams = []
    for entry in ranked_entries:
        if entry[1] > 0:
            kept_ngrams.append(entry[0])
    print("Topic #{}: {}".format(topic_number, ", ".join(kept_ngrams)))

### Get topics using the get_topics() function.

In [None]:
# Print every topic's words, prefixing a word with '~' when its sign is not
# positive. Topics are numbered from 1 in the output.
topics = model.get_topics()
for topic_n, topic in enumerate(topics):
    # w: word, mi: mutual information, s: sign
    # Build the display words directly instead of unpacking zip(*topic):
    # unpacking raises ValueError when a topic has no words assigned, which
    # aborted the whole listing. An empty topic now prints as "N: ".
    words = [w if s > 0 else '~' + w for w, mi, s in topic]
    # Print topic
    topic_str = str(topic_n+1)+': '+', '.join(words)
    print(topic_str)

### Get top tweets (indices in df) per given topic

##### Documents are sorted according to log probabilities which is why the highest probability documents have a score of 0 ($e^0 = 1$) and other documents have negative scores (for example, $e^{-0.5} \approx 0.6$)

In [None]:
# Top 100 documents for the topic at index 0, ranked by log probability.
model.get_top_docs(topic=0, n_docs=100, sort_by='log_prob')

#### CorEx is a discriminative model, whereas LDA is a generative model. This means that while LDA outputs a probability distribution over each document, CorEx instead estimates the probability a document belongs to a topic given that document's words. As a result, the probabilities across topics for a given document do not have to add up to 1. The estimated probabilities of topics for each document can be accessed through log_p_y_given_x or p_y_given_x.

### Get a topic label for each tweet

#### We can also use a softmax to make a binary determination of which documents belong to each topic. These softmax labels can be accessed through labels.

In [None]:
# Binary topic-membership labels for the documents the model was fitted on.
model.labels

### Directly access the topic assignments for each word

In [None]:
# Topic assignment for each word.
model.clusters

### The most probable tweets for each topic 

In [None]:
# Print, for every topic (numbered from 1), the identifiers of its top documents.
top_docs = model.get_top_docs()
for topic_n, topic_docs in enumerate(top_docs):
    # Each entry is a (document, probability) pair; only the document is shown.
    # Using a comprehension rather than zip(*...) keeps this working when a
    # topic has no documents.
    docs = [doc for doc, _prob in topic_docs]
    # Join the individual document identifiers. The previous code joined the
    # characters of str(docs), which interleaved ', ' between every character
    # of the tuple's repr and produced unreadable output.
    topic_str = str(topic_n+1)+': '+', '.join(str(doc) for doc in docs)
    print(topic_str)

### Get summary files and visualizations

In [None]:
# Generate the model's summary files / visualizations, labelling columns with
# the vocabulary. NOTE(review): output presumably lands under the
# 'topic-model-example' prefix relative to the working directory -- confirm.
vt.vis_rep(model, column_label=vocab, prefix='topic-model-example')

### Merge the output topic assignment with original tweets

In [None]:
# Column names topic_1 .. topic_<topic_no>, one per latent topic.
topic_columns = ["topic_{}".format(k) for k in range(1, topic_no + 1)]

# Per-tweet topic indicators as floats, indexed like the tweet frame so the two
# line up row-for-row when concatenated.
topic_df = pd.DataFrame(
    model.transform(tfidf),
    index=my_clean_tweets_df.index,
    columns=topic_columns,
).astype(float)

# Original tweet columns followed by the topic indicator columns.
df = pd.concat([my_clean_tweets_df, topic_df], axis=1)

In [None]:
# Keep the tweets flagged for topic 1, save them to CSV, and show how many there are.
in_topic1 = df["topic_1"] == 1.0
topic1_tweets = df.loc[in_topic1]
topic1_tweets.to_csv("topic1_tweets_corex_partial.csv")
topic1_tweets.shape

In [None]:
# Keep the tweets flagged for topic 2, save them to CSV, and show how many there are.
in_topic2 = df["topic_2"] == 1.0
topic2_tweets = df.loc[in_topic2]
topic2_tweets.to_csv("topic2_tweets_corex_partial.csv")
topic2_tweets.shape

In [None]:
# Keep the tweets flagged for topic 3, save them to CSV, and show how many there are.
in_topic3 = df["topic_3"] == 1.0
topic3_tweets = df.loc[in_topic3]
topic3_tweets.to_csv("topic3_tweets_corex_partial.csv")
topic3_tweets.shape

In [None]:
# Keep the tweets flagged for topic 4, save them to CSV, and show how many there are.
in_topic4 = df["topic_4"] == 1.0
topic4_tweets = df.loc[in_topic4]
topic4_tweets.to_csv("topic4_tweets_corex_partial.csv")
topic4_tweets.shape

In [None]:
# Display the tweets flagged for topic 4 (pandas truncates the rendering of large frames).
topic4_tweets