In [280]:
import json
import pandas as pd
from nltk.stem import WordNetLemmatizer
import re, unicodedata
from wordcloud import WordCloud, ImageColorGenerator
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
import nltk
nltk.download('wordnet')
pd.set_option('display.max_colwidth', -1)
STOPWORDS = set(stopwords.words('english'))

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jianhenghou/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [281]:
def transform(words):
    contraction_mapping = {"ive":"i have","dont":"do not","cant":"can not","ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "havent":"have not","haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have" }

    word_list = []
    words = words.replace(".",". ")
    for word in words.split():
        new_word = word
        """Convert all characters to lowercase from list of tokenized words"""
        new_word = new_word.lower()
        """Remove punctuation except for "?" and "!" from list of tokenized words"""
        if contraction_mapping.__contains__(new_word):
            new_word = contraction_mapping[new_word]
        word_list.append(new_word)
    return ' '.join(word_list)

def normalize(words):
    word_list = []
    for word in words:
        """Remove non-ASCII characters from list of tokenized words"""
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
#         new_word = re.sub(r'[^\w\s]', '', new_word)

        """Replace all interger occurrences in list of tokenized words with textual representation"""
#         if new_word.isdigit():
#             new_word = 'digit'
        if new_word != '':
            word_list.append(new_word)
    return word_list

In [282]:
data = []
with open("../Crawling/PostProcessing/Citalopram_Merged_Discussion.jl") as f:
    df = pd.DataFrame(json.loads(line) for line in f)

In [283]:
def sent_preprocessing(sent):
    return "".join(word + ' ' for word in normalize([word for word in word_tokenize(transform(sent)) if word != None])).strip()

In [284]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4787 entries, 0 to 4786
Data columns (total 5 columns):
content_id    4787 non-null object
post_type     4787 non-null object
poster        4787 non-null object
text          4787 non-null object
timestamp     4787 non-null object
dtypes: object(5)
memory usage: 187.1+ KB


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [285]:
df["text"] = df.text.apply(lambda x: sent_preprocessing(x))
df.head()

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [286]:
from nltk import ngrams
def get_ngrams(line, n):
    
    result = ''
    token_sent = word_tokenize(line)
    tag_dic = {each[0]:each[1] for each in nltk.pos_tag(token_sent)}
    n_grams = ngrams(token_sent, n)

    """Lemmatize words in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()

    for grams in n_grams:

        condition = False
        if tag_dic[grams[-1]] == "NN" or tag_dic[grams[-1]] == "NNS":
            condition = True
        if set(grams).intersection(STOPWORDS) != set() or set(grams).intersection([',','.','?', '!', ':', '/', '_', '(', ')','...','}', '{', '', '%', 'hour', 'hours','day', 'days', 'week', 'weeks', 'month', 'months','year', 'years', 'hi', 'hey', 'time', 'times', 'today', 'thank', 'thanks','luck', 'morning','night','tonight','mg', 'mgs','', 'weekend', 'wish', 'hope', 'step', 'steps', 'everyone', 'others', 'people']) != set():
            condition = False
        if condition == True:
            phrase = ''
            for each in grams:
                phrase += '_'+ lemmatizer.lemmatize(each, pos='v')
#             if '10_' in phrase:
#                 print(phrase)
            result += ' ' + phrase[1:]
    return result[1:]

In [287]:
# for i in range(500):
#     x = df["text"].iloc[i]
#     get_ngrams(x, 2)

In [288]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx))
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

documents = df.text.apply(lambda x: get_ngrams(x, 2))

no_features = 2000

# # NMF is able to use tf-idf
# tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
# tfidf = tfidf_vectorizer.fit_transform(documents)
# tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
ct_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english',strip_accents = 'unicode')
data_vectorized = ct_vectorizer.fit_transform(documents)
ct_feature_names = ct_vectorizer.get_feature_names()

# num_topics = 10
# num_top_words = 10

# # Run NMF
# nmf = NMF(n_components=num_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
# lda_model = LatentDirichletAllocation(n_components=num_topics, max_iter=10, 
#                                 learning_method='online', 
#                                 random_state=0,
#                                 evaluate_every = -1,       # compute perplexity every n iters, default: Don't
#                                 n_jobs = -1,               # Use all available CPUs
#                                )
# lda_output = lda_model.fit_transform(data_vectorized)

# print("=====NMF=====")
# display_topics(nmf, tfidf_feature_names, num_top_words)
# print("=====LDA=====")
# display_topics(lda_model, ct_feature_names, num_top_words)

In [289]:
'''
non-merged version 1 - gram 
Best Model's Params:  {'learning_decay': 0.7, 'n_components': 5}
Best Log Likelihood Score:  -971217.1918830868
Model Perplexity:  444.18687560698623

merged version 1 - gram 
Best Model's Params:  {'learning_decay': 0.7, 'n_components': 5}
Best Log Likelihood Score:  -961311.6105043223
Model Perplexity:  405.8577617448416
'''
from sklearn.model_selection import GridSearchCV
# Define Search Param
search_params = {'n_components': [2, 3, 4, 5, 6, 7, 8, 9, 10], 'learning_decay': [.5, .7]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model = GridSearchCV(lda, param_grid=search_params, n_jobs=-1)

# Do the Grid Search
model.fit(data_vectorized)

# Best Model
best_lda_model = model.best_estimator_

# Model Parameters
print("Best Model's Params: ", model.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))



Best Model's Params:  {'learning_decay': 0.7, 'n_components': 2}
Best Log Likelihood Score:  -140170.6191131683
Model Perplexity:  720.1114049008153


In [290]:
# Show top n keywords for each topic
import numpy as np
def show_topics(vectorizer, lda_model, n_words):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

topic_keywords = show_topics(vectorizer=ct_vectorizer, lda_model=best_lda_model, n_words=10)        

# Topic - Keywords Dataframe
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9
Topic 0,side_effect,take_care,withdrawal_symptoms,panic_attack,cold_turkey,take_citalopram,take_cit,brain_zap,weight_gain,low_dose
Topic 1,side_effect,panic_attack,take_citalopram,many_people,keep_post,stick_plaster,one_thing,best_regard,dry_mouth,heighten_anxiety


In [291]:
%%time
from pyLDAvis import sklearn as sklearn_lda
import pickle 
import pyLDAvis
import os
pyLDAvis.enable_notebook()

LDAvis_prepared = sklearn_lda.prepare(best_lda_model, data_vectorized, ct_vectorizer, mds='tsne')

CPU times: user 3.19 s, sys: 178 ms, total: 3.37 s
Wall time: 32.9 s


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [33]:
# 1-gram
LDAvis_prepared

In [49]:
# 1-gram (replies merged to discussion)
LDAvis_prepared

In [292]:
# 2-gram (replies merged to discussion)
LDAvis_prepared

In [46]:
# LDAvis_data_filepath = os.path.join('./ldavis_prepared_'+str(num_topics))
# # # this is a bit time consuming - make the if statement True
# with open(LDAvis_data_filepath, 'wb') as f:
#         pickle.dump(LDAvis_prepared, f)

# # load the pre-prepared pyLDAvis data from disk
# with open(LDAvis_data_filepath) as f:
#     LDAvis_prepared = pickle.load(f)
# pyLDAvis.save_html(LDAvis_prepared, './ldavis_prepared_'+ str(num_topics) +'.html')