### Work in progress on finding hot trends in Reddit data sets. Here I employ probabilistic Topic Modeling, especially Latent Dirichlet Allocation (LDA). The goal is to find the trends over time by using moving average method to compute performance history of each topic within each subreddits and to use BM25 ranking method (state-of-the-art TF-IDF) to develop new trend ranking for the hot reddit posts.

In [1]:
import nltk
import numpy as np
import os
import pandas as pd
import psycopg2
import re
import sys
import time
import string
import operator
 
from nltk.corpus import stopwords
from gensim import corpora, models
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

stemmer=PorterStemmer()
lemmatizer = nltk.WordNetLemmatizer()



%matplotlib inline
import matplotlib.pyplot as plt
import seaborn; seaborn.set() # For pretty plots
%config InlineBackend.figure_format='retina'

In [2]:
con = psycopg2.connect(database = 'Reddit', user = 'mrr-phys')

In [3]:
# Get the list of all the columns
sql_query = """
            SELECT column_name 
            FROM information_schema.columns
            WHERE table_name = 'main_comments';
            """
col_list = pd.read_sql(sql_query, con)
col_list

Unnamed: 0,column_name
0,id
1,subreddit_id
2,submission_id
3,content


In [4]:
subreddits = pd.read_sql("SELECT * FROM main_subreddits", con)['name'].tolist()
subreddits

['youtube',
 'Best_Of_YouTube',
 'youtube_recommended',
 'funny',
 'todayilearned',
 'mildlyinteresting',
 'announcements',
 'aww',
 'kiddet',
 'justforkids',
 'KidSafeVideos',
 'childrensbooks',
 'reallifedoodles',
 'BeAmazed',
 'Parenting',
 'DoesAnybodyElse']

In [8]:
main_submissions = pd.read_sql("SELECT * FROM main_submissions", con)
main_submissions = main_submissions.rename(columns = {'id': 'submission_id'})

In [9]:
main_comments = pd.read_sql("SELECT * FROM main_comments", con)

In [10]:
dg = main_comments.groupby(['subreddit_id', 'submission_id'])['content'].apply(lambda x: "%s" % ' '.join(x))
dg = dg.reset_index()

In [11]:
merged = pd.merge(main_submissions, dg, how='left', on=['subreddit_id', 'submission_id'])
merged.head()

Unnamed: 0,submission_id,subreddit_id,created,content_x,content_y
0,6mfhvh,2qh44,1499732467,,A bit more than a month ago I posted this: htt...
1,6mfayu,2qh44,1499730695,Hopefully you fine folks can help me remember ...,
2,6mf8x8,2qh44,1499730177,I have a playlist that is supposed to automati...,
3,6mf61v,2qh44,1499729431,"I mean seriously, The Youtube comment section ...",The YouTube comments system as a whole is so f...
4,6mf402,2qh44,1499728902,There was a little magic wand next to my video...,The revert button in the video editor manager ...


In [12]:
merged['content'] = merged[['content_x', 'content_y']].astype(str).apply(lambda x: ' '.join(x), axis=1)

In [13]:
all_submissions = merged['submission_id'].tolist()

In [14]:
data = []
for ids in all_submissions:
    data_local = merged[merged['submission_id'] == ids]['content'].tolist()
    data.append(data_local)

Get rid of the Emojis:

In [15]:
emoji_pattern = re.compile(
    u"(\ud83d[\ude00-\ude4f])|"  # emoticons
    u"(\ud83c[\udf00-\uffff])|"  # symbols & pictographs (1 of 2)
    u"(\ud83d[\u0000-\uddff])|"  # symbols & pictographs (2 of 2)
    u"(\ud83d[\ude80-\udeff])|"  # transport & map symbols
    u"(\ud83c[\udde0-\uddff])"   # flags (iOS)
    "+", flags = re.UNICODE)

Function that takes a post, cleans it and returns list of tokens:

In [26]:
def clean_data(document):
    if type(document) == str: conv_post = unicode(document, "utf-8")
    else: conv_post = document
    document = emoji_pattern.sub('', conv_post)
    document = re.sub(r'[^A-Za-z .-[\d+]]+', '', document)
    document = re.sub(r'\b\d+\b', '', document)
    document = document.replace('*', ' ')
    document = document.replace('/', ' ')
    document = document.replace('|', ' ')
    document = document.replace('.', ' ')
    document = document.replace('...', ' ')
    document = document.replace('^', ' ')
    return document


In [28]:
def normalise(word):
    #normalises words to lowercase and stems and lemmatizes it.
    word = word.lower()
    #word = lemmatizer.lemmatize(word)
    #word = stemmer.stem(word)    
    return word

In [18]:
def tokenize(data):
    tokened_data = [[normalise(word) for word in nltk.word_tokenize(clean_data(text)) if word not in string.punctuation] 
                    for doc in data for text in doc]
    return tokened_data

In [19]:
def get_words_frequency(data):
    from collections import defaultdict
    frequency = defaultdict(int)
    for doc in data:
        for token in doc:
            frequency[token] += 1
    return frequency

In [20]:
def extract_stopwords(frequency, h):
    stops = set(sorted(frequency, key=frequency.get, reverse=True)[:h]).union(stopwords.words('english'))
    stops = set(subreddits).union(stops)
    return stops

In [21]:
def extract_ngrams(doc, stops):
    candidates_unigram = [token for token in doc if token not in stops]
    #bigrams = nltk.ngrams(candidates_unigram,2)
    #candidates_bigram = ["_".join(word) for word in bigrams]
    return candidates_unigram #+ candidates_bigram 

In [22]:
def remove_infrequent_words(doc, frequency, l):
    return [token for token in doc if frequency[token] > l]

*** Preprocess data: ***

In [29]:
tokenized_data = tokenize(data)

In [30]:
words_frequency = get_words_frequency(tokenized_data)
my_stops = extract_stopwords(words_frequency,300)
ngrams_tokened_data = [extract_ngrams(tokened_doc, my_stops) for tokened_doc in tokenized_data]
cleaned_docs= [remove_infrequent_words(tokened_doc, words_frequency, 30) for tokened_doc in ngrams_tokened_data]

*** Build LDA model ***

In [31]:
# Create dictionary and corpus
dictionary = corpora.Dictionary(cleaned_docs)

In [32]:
corpus = [dictionary.doc2bow(doc) for doc in cleaned_docs]

In [33]:
# Build LDA model
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=20, passes=10)

In [34]:
#lda.print_topics(-1)

In [35]:
top_words = [[_ for _, word in lda.show_topic(topicno, topn=40)] for topicno in range(lda.num_topics)]

In [36]:
for topicno, words in enumerate(top_words):
    print("%i: %s" % (topicno, ' '.join(words[:25])))

0: summer milk meal heat fruit pressure drink cut hot salt comfort neighborhood smoking beach plate healthy amazed set habit instagram small juice dreams plant super
1: music song game movie amazon art tv games songs sound movies dp playing listen favorite original hear episode artist shows listening youtu voice alarm played
2: girl boy birthday nice face na happy fuck dude cool awesome cute weird guys friend gif gon christmas head haha bike laugh super amazing nose
3: world dinner org wikipedia women wiki en fucking american trump fuck fact men true news history hate country war pants sex white culture racist states
4: app mobile click google page phone button site desktop feature apps screen res chrome option mode version moderators computer android issue works fix update working
5: left top god clothes city picture damn tree beautiful chores fucking kitchen shop beer weed canada driver lady photo yep til memory painful leg mr
6: words word english bathroom language glass signs frenc

*** Evaluation, Diagnostics and Improvements of Topics: ***

Pointwise mutual information (PMI):

Log conditional probability (LCP):

Pooling: