In [None]:
## Libraries
# All third-party and standard-library dependencies of the notebook are imported in this cell.
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from nltk.lm.preprocessing import flatten
from nltk.util import ngrams
import matplotlib.pyplot as plt
from PIL import Image
from wordcloud import WordCloud
import unicodedata
import stop_words  # NOTE(review): this binding is replaced two lines below by spaCy's module of the same name
import spacy
from spacy.lang.en import stop_words  # rebinds `stop_words`; from here on the name refers to spacy.lang.en.stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import gensim
from gensim.models import LsiModel
import gensim.corpora as corpora
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models
# Download NLTK's 'punkt' resource; presumably required by the nltk tokenizers imported above -- TODO confirm.
# NOTE(review): this is a network side effect placed in the middle of the import list.
nltk.download('punkt')
# NOTE(review): the six imports below repeat the gensim / pyLDAvis imports made just above the download call.
import gensim
from gensim.models import LsiModel
import gensim.corpora as corpora
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models

In [None]:
# Data Load & Preprocess
# Read the raw submissions; malformed CSV rows are reported as warnings instead of raising.
df = pd.read_csv('redditSubmissions.csv', on_bad_lines='warn')
# Drop every row that has a missing value in any column.
df.dropna(inplace=True)

# Normalize encodings: NFKD-decompose each title, then discard every non-ASCII character.
df['title'] = df['title'].apply(lambda x: unicodedata.normalize('NFKD', x).encode('ascii', 'ignore').decode('utf-8', 'ignore'))

# Lower Text
df['title'] = df['title'].str.lower()

# Remove numbers and punctuation
df['title'] = df['title'].str.replace(r'[^\w\s]', '', regex=True)
# Raw string: '\d' inside a normal string literal is an invalid escape sequence.
df['title'] = df['title'].str.replace(r'\d+', '', regex=True)

# Remove Stopwords
# Read the set through the fully qualified module path rather than through the
# `stop_words` name: after this assignment `stop_words` is the set itself, so
# `stop_words.STOP_WORDS` would raise AttributeError if the cell were run again.
stop_words = spacy.lang.en.stop_words.STOP_WORDS
df['title'] = df['title'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

df.head()

In [None]:
# Descriptive statistics for every column (include="all"), transposed so each variable is a row
df.describe(include="all").transpose()

In [None]:
# Count how often each distinct value occurs in the three metadata columns
frequency_table_all = {
    name: df[name].value_counts()
    for name in ('username', 'subreddit', 'reddit_id')
}

# Print each table under a heading that names its column
for column in frequency_table_all:
    table = frequency_table_all[column]
    print(f"Frequency Table for '{column}':\n", table, "\n")

In [None]:
# describe() output for each of the three metadata columns
summary_table_all = {
    name: df[name].describe()
    for name in ('username', 'subreddit', 'reddit_id')
}

# Print each summary under a heading that names its column
for column in summary_table_all:
    summary = summary_table_all[column]
    print(f"Summary for '{column}':\n", summary, "\n")

In [None]:
# Word cloud of the titles that contain "funny"
df_reddit = df.loc[df['title'].str.contains("funny")]
# Join the selected titles into one space-separated string for the generator
df1 = df_reddit['title'].str.cat(sep=' ')

wc = WordCloud()
wc.generate(df1)

# Render the cloud as an image without axes
plt.imshow(wc)
plt.axis("off")
plt.show()

In [None]:
# Model the data using chosen technique
# Tokenize every cleaned title into a list of word tokens
texts = [word_tokenize(title) for title in df['title']]

# Build the token -> id dictionary, then encode each title as a bag of words
dictionary = corpora.Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))

# LDA hyperparameters gathered in one place so they are easy to scan and tweak
lda_settings = dict(
    corpus=corpus,          # TDM
    id2word=dictionary,     # Dictionary
    num_topics=3,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)
lda_model = gensim.models.LdaModel(**lda_settings)

# Print the word composition of every topic (-1 requests all topics)
for idx, topic in lda_model.print_topics(num_topics=-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

In [None]:
# c_v coherence of the fitted LDA model, computed over the tokenized titles
coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

# log_perplexity() returns the per-word likelihood bound, not the perplexity
# itself; the perplexity estimate derived from that bound is 2 ** (-bound).
per_word_bound = lda_model.log_perplexity(corpus)
perplexity_lda = 2 ** (-per_word_bound)
print(f'\nCoherence Score: {coherence_lda}\nPer-word Likelihood Bound: {per_word_bound}\nPerplexity Score: {perplexity_lda}')

In [None]:
# LDA Visualization
# Prepare the pyLDAvis data from the model, corpus and dictionary (n_jobs=1), then display it
vis = pyLDAvis.gensim_models.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=dictionary,
    n_jobs=1,
)
pyLDAvis.display(vis)