In [215]:
import numpy as np
import pandas as pd
import xmltodict, re
from bs4 import BeautifulSoup

def read(_type):
    file  = "../../../datasets/datascience.stackexchange.com/{}.xml".format(_type)
    xml   = open(file).read()
    _dict = xmltodict.parse(xml)
    df    = pd.DataFrame(_dict.values()[0]['row'])
    df.columns = [ c[1:] for c in df.columns ]
    return df

pd.set_option('display.max_columns', 50)

In [77]:

tags  = read('Tags')
cmts  = read('Comments')
links = read('PostLinks')
users = read('Users')

# Badges.xml
# Comments.xml
# PostHistory.xml
# PostLinks.xml
# Posts.xml
# Tags.xml
# Users.xml
# Votes.xml

In [152]:
posts = read('Posts')
posts = posts.set_index('Id')
posts = pd.merge(posts, posts[['Tags']], right_index=True, left_on='ParentId', how='left')
posts = posts[['PostTypeId', 'Body', 'Title', 'Tags_x', 'Tags_y', 'ParentId']]
posts.head()

Unnamed: 0_level_0,PostTypeId,Body,Title,Tags_x,Tags_y,ParentId
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5,1,<p>I've always been interested in machine lear...,How can I do simple machine learning without h...,<machine-learning>,,
7,1,"<p>As a researcher and instructor, I'm looking...",What open-source books (or other materials) pr...,<education><open-source>,,
9,2,"<p>Not sure if this fits the scope of this SE,...",,,<machine-learning>,5.0
10,2,"<p>One book that's freely available is ""The El...",,,<education><open-source>,7.0
14,1,<p>I am sure data science as will be discussed...,Is Data Science the Same as Data Mining?,<data-mining><definitions>,,


In [216]:
cleanr = re.compile('<.*?>')

def remove_code(x):
    soup = BeautifulSoup(x, "html.parser")
    for code in soup("code"): code.string = ""
    return str(soup)

posts['BodyX'] = posts.Body
posts['BodyX'] = posts.BodyX.apply(remove_code)
posts['BodyX'] = posts.BodyX.str.replace(cleanr, '')

posts['Tags'] = posts.apply(lambda x: x.Tags_y if pd.isnull(x.Tags_x) else x.Tags_x, axis=1)
posts['Tags'] = posts.Tags.str.replace('><', ',').apply(lambda x: [] if pd.isnull(x) else x[1:-1].split(','))
posts.head()

Unnamed: 0_level_0,PostTypeId,Body,Title,Tags_x,Tags_y,ParentId,BodyX,Tags
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5,1,<p>I've always been interested in machine lear...,How can I do simple machine learning without h...,<machine-learning>,,,I've always been interested in machine learnin...,[machine-learning]
7,1,"<p>As a researcher and instructor, I'm looking...",What open-source books (or other materials) pr...,<education><open-source>,,,"As a researcher and instructor, I'm looking fo...","[education, open-source]"
9,2,"<p>Not sure if this fits the scope of this SE,...",,,<machine-learning>,5.0,"Not sure if this fits the scope of this SE, bu...",[machine-learning]
10,2,"<p>One book that's freely available is ""The El...",,,<education><open-source>,7.0,"One book that's freely available is ""The Eleme...","[education, open-source]"
14,1,<p>I am sure data science as will be discussed...,Is Data Science the Same as Data Mining?,<data-mining><definitions>,,,I am sure data science as will be discussed in...,"[data-mining, definitions]"


In [217]:
import nltk
from gensim import corpora, models, similarities, utils

stops = nltk.corpus.stopwords.words('english')

docs = posts.BodyX.values
docs = map(utils.simple_preprocess, docs)
docs = map(lambda x: [ w for w in x if w not in stops ], docs)

In [233]:
dic        = corpora.Dictionary(docs)
corpus_tmp = [ dic.doc2bow(doc) for doc in docs]
tfidf      = models.TfidfModel(corpus_tmp, normalize=True, id2word=dic)

lda = models.LdaModel(tfidf[corpus_tmp], num_topics=20, id2word=dic,)
for _,w in lda.print_topics( num_words=5 ): print w

0.005*"n_" + 0.005*"replaced" + 0.003*"cardinality" + 0.003*"counter" + 0.003*"siblings"
0.004*"pandas" + 0.003*"dataframes" + 0.003*"http" + 0.003*"org" + 0.003*"science"
0.018*"activation" + 0.014*"relu" + 0.012*"sigmoid" + 0.007*"hyper" + 0.006*"gpus"
0.012*"tensorflow" + 0.008*"keras" + 0.006*"mnist" + 0.005*"y_i" + 0.005*"cell"
0.006*"deviation" + 0.005*"policy" + 0.005*"incremental" + 0.005*"scatter" + 0.004*"axes"
0.007*"characters" + 0.005*"hat" + 0.004*"leakage" + 0.004*"explanations" + 0.004*"eeg"
0.007*"theta" + 0.003*"ann" + 0.003*"player" + 0.003*"ks" + 0.003*"silhouette"
0.005*"p_" + 0.005*"cheers" + 0.004*"w_" + 0.004*"confused" + 0.004*"hyperplane"
0.005*"averages" + 0.004*"s_" + 0.004*"width" + 0.003*"ast" + 0.003*"proportions"
0.008*"autoencoder" + 0.007*"imputation" + 0.005*"reward" + 0.005*"scored" + 0.004*"encoder"
0.006*"transition" + 0.004*"df" + 0.004*"gmm" + 0.004*"leak" + 0.003*"released"
0.008*"keras" + 0.007*"tensorflow" + 0.002*"use" + 0.002*"nmf" + 0.002*"

In [232]:
dic   = corpora.Dictionary(docs)
dic.filter_extremes(no_below=5, no_above=0.5)
corpus_bag = [ dic.doc2bow(doc) for doc in docs]

lda = models.LdaModel(corpus_bag, num_topics=20, id2word=dic,)
for _,w in lda.print_topics( num_words=5 ): print w

0.065*"data" + 0.015*"like" + 0.009*"science" + 0.008*"would" + 0.007*"big"
0.023*"data" + 0.011*"model" + 0.008*"would" + 0.008*"like" + 0.008*"values"
0.049*"amp" + 0.013*"data" + 0.013*"begin" + 0.011*"gt" + 0.009*"end"
0.035*"learning" + 0.018*"machine" + 0.011*"data" + 0.010*"activation" + 0.009*"gradient"
0.025*"would" + 0.013*"time" + 0.010*"like" + 0.008*"could" + 0.007*"want"
0.034*"clustering" + 0.033*"distance" + 0.021*"similarity" + 0.018*"cluster" + 0.014*"clusters"
0.010*"color" + 0.009*"use" + 0.008*"blue" + 0.008*"python" + 0.008*"using"
0.043*"mathbf" + 0.041*"bmatrix" + 0.033*"h_" + 0.022*"xgboost" + 0.016*"activation"
0.018*"regression" + 0.015*"linear" + 0.014*"http" + 0.014*"org" + 0.011*"pdf"
0.025*"tensorflow" + 0.023*"keras" + 0.016*"python" + 0.013*"code" + 0.012*"use"
0.019*"user" + 0.011*"based" + 0.010*"use" + 0.010*"would" + 0.010*"data"
0.022*"model" + 0.012*"training" + 0.011*"using" + 0.011*"class" + 0.011*"set"
0.054*"network" + 0.054*"layer" + 0.035*"i