# 1. Data Scrape
Use the reddit pushshift API to scrape the raw data needed.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import requests
import sklearn
import seaborn as sns
from bs4 import BeautifulSoup
import time
import nltk
from nltk.stem import WordNetLemmatizer
import regex as re


%matplotlib inline

In [None]:
# access reddit posts via the reddit pushshift API ...

url = "https://api.pushshift.io/reddit/search/submission?"

In [None]:
params = {
    'subreddit': 'parenting',
    'size': 1_000,
}

In [None]:
# Request one page of submissions; `params` is sent as the query string.
# The timeout keeps the cell from hanging forever if the API stops responding.
res = requests.get(url, params=params, timeout=60)
print(res)
# Stop here on an HTTP error so the following cell does not try to parse an
# error response as JSON.
res.raise_for_status()

In [None]:
raw_data = res.json()

In [None]:
posts = raw_data['data']

In [None]:
# quick vis check ...
posts[349]

In [None]:
from datetime import datetime

# Perform a quick test of the timestamp conversion before implementing it:
# the integer is read as seconds since the epoch and formatted as "YYYY-MM".
test = 1230768000
datetime.fromtimestamp(test).strftime("%Y-%m")

### Before running the cell below, update the file path passed to `to_csv` so that it points to where you want to save the raw data from your scrape.

In [None]:

# Scrape the subreddit for the period starting 1 Jan 2015 and ending
# 31 Dec 2019, moving the `after` cursor forward in bi-weekly
# (1,209,600 second) steps.
wanted_cols = ['subreddit', 'created_utc', 'title', 'selftext', 'score']

# The accumulator has to exist before the first batch is added to it.
batches = []
full_text = pd.DataFrame(columns=wanted_cols)

for i in range(1420088400, 1577854799, 1209600):
    params = {
        'subreddit': 'parenting',
        'size': 1_000,
        'after': i
    }
    res = requests.get(url, params)
    # Fail on an HTTP error here rather than on an opaque JSON decode error.
    res.raise_for_status()
    posts = res.json()['data']

    if posts:
        # reindex (instead of [[...]]) tolerates a batch in which one of the
        # wanted columns is missing; the missing column is filled with NaN.
        batches.append(pd.DataFrame(posts).reindex(columns=wanted_cols))

        # DataFrame.append was removed in pandas 2.0; pd.concat replaces it.
        full_text = pd.concat(batches)
        # Written after every batch so an interrupted run keeps what it has.
        full_text.to_csv('../data/parenting_posts.csv')

    # Pause between requests.
    time.sleep(30)

# 2. EDA

In [None]:
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import seaborn as sns
import nltk
from nltk.stem import WordNetLemmatizer
import regex as re

%matplotlib inline

### Don't forget to update the file path in the cell below before reading in data!

In [None]:
# Read in the raw data from our .csv file

data = pd.read_csv('../data/parenting_posts.csv')
data.head()

In [None]:
data.info()

In [None]:
# checking for duplicate records in the data set
data['created_utc'].value_counts()


### Note that there are clearly duplicate observations in the data, so we'll want to remove all those that are dupes.

In [None]:
# removing the duplicate records from the data set ...
deduped_data = data.drop_duplicates(subset='created_utc')
deduped_data.info()

In [None]:
# additional deduping check ...
deduped_data['created_utc'].value_counts()

### Check your data to see if you have the column, "Unnamed: 0".  If yes, remove the column to further clean up the data frame.

In [None]:
# dropping old index column
# Re-assign instead of using inplace=True: deduped_data was derived from
# `data`, and an in-place drop on it can trigger SettingWithCopyWarning.
# errors='ignore' makes this a no-op when the column is not present.
deduped_data = deduped_data.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Check for any null values that could cause problems with our modeling ...
deduped_data['created_utc'].isnull().sum()

In [None]:
# Keep only the rows that have a time stamp. The explicit .copy() makes
# deduped_data an independent frame, so assigning columns to it afterwards
# does not act on (or warn about) a view of the original data.
deduped_data = deduped_data[deduped_data['created_utc'].notna()].copy()
deduped_data.isnull().sum()

### We want to be able to use the time stamp information for some exploratory analysis, so we'll want to transform it into a more readable format.  Before we can do that, we need to transform it from a string type to an integer type.  Once we do that, we can transform it as desired.

In [None]:
deduped_data['created_utc'] = deduped_data['created_utc'].astype(int)
deduped_data.info()

In [None]:
# Imported here as well so this cell runs without the scrape section.
from datetime import datetime

# .iloc[2] takes the third remaining row by position; a label lookup ([2])
# raises KeyError if the row labelled 2 was removed during de-duplication.
test = datetime.fromtimestamp(deduped_data['created_utc'].iloc[2]).strftime("%Y-%m")
test

In [None]:
# Helper to convert a time stamp to a more interpretable format ...
# Input/guidance from Teng Mao for this approach.
from datetime import datetime


def convert(systime):
    """Return the year and month of a Unix time stamp as a "YYYY-MM" string.

    systime: seconds since the epoch.

    The conversion calls datetime.fromtimestamp without a time zone, so the
    result is expressed in the local time zone of the machine running it.
    """
    return datetime.fromtimestamp(systime).strftime("%Y-%m")

In [None]:
deduped_data['yr_mo'] = deduped_data['created_utc'].apply(convert)
deduped_data.head()

In [None]:
# visually checking random values in middle of data ...
deduped_data[72000:72500]

In [None]:
deduped_data.shape

## Visualizations

I'd like to get some sense of what's happening with this reddit forum, in terms of usage, activity levels, trends, etc.  So we'll look at a few plots to give us a sense of what that activity looks like.  Please note that this is just a sample of the visual inspection executed, for brevity's sake.

In [None]:
# Histogram of the number of posts per "yr_mo" value, drawn with 60 bins.
hist_label = 'Volume of Unique "Parenting" Posts (reddit.com)\n 1 Jan 2015 - 31 Jan 2020 \n Unique Posts in Sample = 102,776'
hist_style = {'color': 'purple', 'alpha': 0.3, 'bins': 60}

plt.figure(figsize=(16, 9))
plt.hist(deduped_data['yr_mo'], **hist_style);
plt.xlabel(hist_label, fontsize=20)

# An empty tick list suppresses the x-axis tick labels.
blanks = []
plt.xticks(ticks=blanks);

In [None]:
# Scatter plot of each post's vote score against its "yr_mo" value.
scatter_label = 'Vote Scores for Unique "Parenting" Posts (reddit.com) \n 1 Jan 2015 - 31 Jan 2020'
marker_style = {
    'marker': 'D',
    'edgecolors': 'purple',
    'color': 'teal',
    'alpha': 0.3,
}

plt.figure(figsize=(16, 7))
plt.xlabel(scatter_label, fontsize=20)
plt.scatter(deduped_data['yr_mo'], deduped_data['score'], **marker_style)

# An empty tick list suppresses the x-axis tick labels.
blanks = []
plt.xticks(ticks=blanks);

In [None]:
plt.plot(deduped_data['score'])

In [None]:
# checking against raw data for comparison ...
plt.plot(data['score'])

In [None]:
# Simplifying the data set name from "deduped_data" to "ddd"
ddd = deduped_data

In [None]:
# test removal of sys text from a single record ...
ddd['selftext'][74409]

In [None]:
# target text for replacement: placeholder values found in 'selftext'
target_words = ['[deleted]', '[removed]']

# Assign the result back to the column. Calling replace(..., inplace=True) on
# ddd['selftext'] operates on an intermediate Series and is not guaranteed to
# update ddd itself.
ddd['selftext'] = ddd['selftext'].replace(target_words, "")
ddd

In [None]:
# Merge title and post (i.e. "selftext") content into single text field.
# This merged field will be our sole focus for the subsequent topic modeling exercise.
# Only the two text columns need their missing values blanked before they are
# concatenated; the other columns keep their original values and dtypes.
text_cols = ['title', 'selftext']
ddd = ddd.copy()
ddd[text_cols] = ddd[text_cols].fillna('')
ddd['all_text'] = ddd['title'] + " " + ddd['selftext']
ddd.head(3)

In [None]:
ddd['all_text'].isnull().sum()

In [None]:
ddd.shape

## Check your file path for where to save the cleaned data file before running the below cell.

In [None]:
ddd.to_csv('../data/working_data.csv')

# 3. Preprocessing and LDA Modeling

In [None]:
# Import various libraries for preprocessing steps ...
# General packages
import pandas as pd
import numpy as np
# `plt` must be the pyplot interface; binding the top-level matplotlib package
# to this name breaks any later plt.figure() / plt.plot() call.
import matplotlib.pyplot as plt
import seaborn as sns

# NLP-related libraries ...
from sklearn.feature_extraction.text import CountVectorizer
import spacy
import nltk
import gensim
from gensim.models.phrases import Phrases, Phraser
import pyLDAvis
import pyLDAvis.gensim
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('punkt')
from nltk.tokenize import RegexpTokenizer

from gensim.models.ldamodel import LdaModel
import gensim.corpora as corpora

In [None]:
# Read in the parenting posts dataset ...
# The path matches the location the cleaned data is written to by
# ddd.to_csv('../data/working_data.csv') in the EDA section; adjust both
# together if you keep your data elsewhere.
pp = pd.read_csv('../data/working_data.csv')
pp.head(2)

In [None]:
data = pp.all_text.values.tolist()

## 3.1 Preprocessing

In [None]:
# Ages of children referenced in the posts are materially relevant to our analysis,
# so we'll construct a tokenizer that allows us to keep numeric digits.

corpus = []


def alt_to_words(sentences):
    """Tokenize every sentence and append its token list to the global `corpus`.

    sentences: iterable of strings, one document per element.

    Returns the first three entries of `corpus` as a quick visual check.
    Results accumulate in `corpus`, so calling this twice on the same input
    stores every document twice.
    """
    # Build the tokenizer once; it does not depend on the sentence.
    tokenizer = RegexpTokenizer(r'\w+|\d+')
    corpus.extend(tokenizer.tokenize(sentence) for sentence in sentences)
    return corpus[0:3]

In [None]:
alt_to_words(data)

In [None]:
# Quick visual inspection to ensure we retained the desired information ...
corpus[0:3]

## Lemmatizing
Note that this is a technique/tool for reducing words to their root to facilitate NLP analysis and modeling.  For this project, I also experimented with Stemming (an alternate tool to Lemmatizing that does essentially the same thing, albeit a bit more severely, on average).  Because the results were not materially different when modeling with the stemmed vs lemmatized vocabularies, we'll present only the lemmatized version here for the sake of brevity.

Note that the needed libraries were imported above.

In [None]:
# WordNetLemmatizer looks words up in the WordNet corpus, which is not
# downloaded anywhere else in this notebook; fetch it before first use.
nltk.download('wordnet')

# Instantiate the lemmatizer
lemmer = WordNetLemmatizer()

In [None]:
# Lower-case each token before lemmatizing it: the WordNet lookup is
# case-sensitive, so a capitalised token (e.g. "Kids" at the start of a
# sentence) would otherwise be returned unchanged.
lemmed_corpus = [[lemmer.lemmatize(i.lower()) for i in sublist] for sublist in corpus]

In [None]:
lemmed_corpus[3]

In [None]:
print(corpus[1] == lemmed_corpus[1])

In [None]:
print(corpus[1])

In [None]:
# Side by side comparison ...
list(zip(corpus[1], lemmed_corpus[1]))

## Stopword removal

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')

In [None]:
# make sure all words in the corpus are lowercase before removing stopwards ...

lower_lemmed_corpus = [[x.lower() for x in sublist] for sublist in lemmed_corpus]


In [None]:
# remove the stopwords
# Build the stopword set once, outside the comprehension: calling
# stopwords.words('english') in the filter re-creates the whole list for every
# token, and a set gives constant-time membership tests.
english_stopwords = set(stopwords.words('english'))
low_lem_corpus_nosw = [[w for w in sublist if w not in english_stopwords] for sublist in lower_lemmed_corpus]


In [None]:
print(low_lem_corpus_nosw[23])

In [None]:
low_lem_corpus_nosw[0:2]

## Vectorize

In [None]:
# CountVectorizer() can't take in a list of lists, so converting input data
# to single list of words in order to get the features using CVect ...

lemset = [" ".join(doc) for doc in low_lem_corpus_nosw]

In [None]:
# Vectorize the lemmatized lists of words in the posts ...
# Note that you may want to tune the hyperparameters in the vectorizer via gridsearch,
# but that will come at a cost in terms of processing power and time to process.
# lowercase=False because the tokens were already lower-cased in an earlier cell;
# ngram_range=(1, 2) asks for single words and two-word sequences.

vec_lem = CountVectorizer(lowercase=False, 
                      ngram_range=(1, 2), 
                      max_df = 0.95, 
                      min_df = 0.0075)

In [None]:
lemmed_features = vec_lem.fit_transform(lemset)

In [None]:
lemmed_features.shape

In [None]:
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older releases.
# list(...) keeps vocab_lem a plain list in both cases.
if hasattr(vec_lem, 'get_feature_names_out'):
    vocab_lem = list(vec_lem.get_feature_names_out())
else:
    vocab_lem = vec_lem.get_feature_names()

## 3.2 Fitting Initial LDA Model

In [None]:
# Some initial steps to prep for gensim LDA modeling ...
# Need to define the index for all the words in the corpus
lem_word_index = corpora.Dictionary(low_lem_corpus_nosw)

In [None]:
# marry the index with the corpus to be used in the model
texts = low_lem_corpus_nosw
lem_corpus = [lem_word_index.doc2bow(text) for text in texts]

In [None]:
# Fit the initial gensim LDA model, using 24 topics as a point of departure.
# Be prepared to wait: fitting can take several hours, depending on the size
# of your data and your hardware capabilities.
lda_24_settings = dict(
    num_topics=24,
    random_state=1972,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)

lda_model_lem_24 = LdaModel(corpus=lem_corpus,
                            id2word=lem_word_index,
                            **lda_24_settings)

In [None]:
lda_model_lem_24.print_topics()

In [None]:
# Model performance: perplexity and coherance scores
print('Perplexity: ', lda_model_lem_24.log_perplexity(lem_corpus))

In [None]:
from gensim.models import CoherenceModel

In [None]:
# Calculate the model's coherence score
coherence_ldamodel_lem_24 = CoherenceModel(model=lda_model_lem_24, 
                                       texts=low_lem_corpus_nosw, 
                                       dictionary=lem_word_index, 
                                       coherence='c_v')
coherence_lda_lem_24 = coherence_ldamodel_lem_24.get_coherence()
print('Coherence Score: ', coherence_lda_lem_24)


## Cluster Visualization
If you are unfamiliar with the pyLDAvis library, it is advisable to spend some time with the documentation as it is a very powerful dimensionality reduction and visualization tool for viewing LDA topic modeling results.  Note that the more separate and defined the clusters are, and the bigger they are, the stronger the model is.  

Note also that it can take several minutes or longer to process, depending on the size of your dataset and the particulars of your hardware.

In [None]:
pyLDAvis.enable_notebook(sort=True)
vis = pyLDAvis.gensim.prepare(lda_model_lem_24, lem_corpus, lem_word_index)
pyLDAvis.display(vis)

At this point it is worth pointing out that I switched to the LDA model in scikit-learn in order to leverage GridSearchCV for finding the "best" number of topics for this corpus.  Please refer to the gridsearch notebook to see that code (but note that all the data wrangling, EDA, and preprocessing in this notebook need to be completed first).  I chose to switch to the scikit-learn LDA model for this process because it is a bit more straightforward and expedient, compared to doing the same thing with the gensim LDA library.  When time permits, it would be interesting to do a side-by-side comparison to see if/how results may vary, as there does appear to be a difference in the results from the two LDA models.

In any case, after gridsearching multiple N's for the number of topics, 15 topics was among the best parameters from the gridsearched scikit-learn LDA model, but plugging back into the gensim LDA model, 12 seemed to yield the strongest results, so we'll proceed with those parameters - results can be seen below.  

While not displayed here for the sake of notebook readability, I did also run several of the other N's through the gensim LDA model to validate that 12 was, in fact, the better fit, and all those results corroborated that this should be the case.  Just remember that cluster modeling is a relatively imprecise practice and there may not be a single, "best" option.  In all cases, the words associated with each topic yielded a pretty clear picture of what each topic would be about.  As N increased, the model fit scores worsened and clusters overlapped more, but there was more granularity in the topics yielded.  That said, if the ultimate goal of the exercise is to yield topics so that new content can be categorized and labeled by a machine learning algorithm, then cleaner separability (i.e. the least overlapping of clusters possible) likely would yield better results.

Since this client project did not require this secondary step/analysis, getting the clusters exactly perfect was less of an issue.  In any case, we'll proceed with the 12-topic example for the duration of the exercise.

## Gridsearching with ScikitLearn LDA model
**Note that this section of code is optional**; it is not necessary to run it to complete the rest of the notebook.  If you want to skip this section, you can proceed directly to the 12-topic model and continue from there.

In [None]:
# Gridsearching the LDA Model
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate values to search over
search_params = dict(
    n_components=[10, 12, 15, 19, 30],
    learning_decay=[.5, .7, .9],
)

# We want to gridsearch without any cross validation (for an unsupervised model):
# a single "split" whose train and test slices both select every row.
# found the below solution at: https://stackoverflow.com/questions/44636370/scikit-learn-gridsearchcv-without-cross-validation-unsupervised-learning
cv_off = [(slice(None), slice(None))]

# Base estimator whose parameters are varied by the search
lem_lda_gs = LatentDirichletAllocation(learning_method='online',
                                       random_state=1972,
                                       n_jobs=-1)

# Grid search wrapper around the base estimator
gscv_model = GridSearchCV(estimator=lem_lda_gs,
                          param_grid=search_params,
                          cv=cv_off,
                          n_jobs=-1)

# Run the search on the vectorized, lemmatized posts
gscv_model.fit(lemmed_features)

In [None]:
gscv_model.best_params_

In [None]:
gscv_model.best_score_

## 12-topic model

In [None]:
# Re-fit with 12 topics (down from 24) and consolidate the scoring into one cell.
# This will take some time again ...
lda_12_settings = dict(
    num_topics=12,
    random_state=1972,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)
lda_model_lem_12 = LdaModel(corpus=lem_corpus,
                            id2word=lem_word_index,
                            **lda_12_settings)

# Topics found by the model
lem_topics_12 = lda_model_lem_12.print_topics()

# Perplexity score on the training corpus
lem_top_12_perpscore = lda_model_lem_12.log_perplexity(lem_corpus)

# Coherence model (c_v measure), then the coherence score itself
coherence_ldamodel_lem_12 = CoherenceModel(coherence='c_v',
                                           dictionary=lem_word_index,
                                           texts=low_lem_corpus_nosw,
                                           model=lda_model_lem_12)
coherence_lda_lem_12 = coherence_ldamodel_lem_12.get_coherence()

print(f'Perplexity: {lem_top_12_perpscore}\n Coherence Score: {coherence_lda_lem_12}\n 12 Lemmed LDA Model Topics:\n {lem_topics_12}')



In [None]:
# Visualize the 12-topic model
pyLDAvis.enable_notebook(sort=True)
vis = pyLDAvis.gensim.prepare(lda_model_lem_12, lem_corpus, lem_word_index)
pyLDAvis.display(vis)

## Given the time it takes to process the bulk of the steps outlined above, you may find it desirable to persist the fitted model and pyLDAvis results in order to greatly accelerate your ability to demonstrate and review results with others.  Code to achieve those ends is provided below in steps 4 and 5.

# 4. Persisting the Model

In [None]:
import pickle

In [None]:
# Persist the bag-of-words corpus; `with` guarantees the file is closed (and
# therefore fully written) when the block ends.
with open('lem_corpus.pickle', 'wb') as corpus_file:
    pickle.dump(lem_corpus, corpus_file)

# Persist the prepared pyLDAvis data as well: the demo code loads
# 'vis_demo.pickle', and nothing else in this notebook writes that file.
with open('vis_demo.pickle', 'wb') as vis_file:
    pickle.dump(vis, vis_file)

In [None]:
# Need additional libraries in order to save and reload gensim "keyed vector" objects ...
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors

In [None]:
# Save the gensim Dictionary in the working directory under the file name the
# demo code loads ('lem_word_index.index'). A get_tmpfile() path points into
# the system temp directory, where the demo code does not look for it.
fname = 'lem_word_index.index'
lem_word_index.save(fname)


In [None]:
# Test save the 12-topic LDA model ...
lda_model_lem_12.save('lda12.model')

# 5. Demo Code

In [None]:
import gensim
from gensim.models.ldamodel import LdaModel
from gensim.test.utils import datapath
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors
import gensim.corpora as corpora
import pyLDAvis
import pyLDAvis.gensim
import pickle


In [None]:
# call pickle and gensim saved objects ...
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.

lda12 = LdaModel.load('lda12.model', mmap='r')

with open('lem_corpus.pickle', 'rb') as corpus_file:
    my_corpus = pickle.load(corpus_file)

# The word index is a gensim corpora.Dictionary, so load it with that class
# rather than with KeyedVectors.
my_index = corpora.Dictionary.load('lem_word_index.index', mmap='r')

with open('vis_demo.pickle', 'rb') as vis_file:
    vis_demo = pickle.load(vis_file)

In [None]:
pyLDAvis.display(vis_demo)