# Wikipedia Categories

## Topic Modelling (LDA)

Turkish Wikipedia dump from: https://www.kaggle.com/mustfkeskin/turkish-wikipedia-dump

In [52]:
import re
import requests
import os
import random
import codecs
import cPickle
from gensim.models.ldamodel import LdaModel as Lda
from gensim import corpora
from nltk.stem.wordnet import WordNetLemmatizer

In [58]:
# Preparation of documents

trwikidumpfile = '../turkish-wikipedia-dump'

docs = {}

with open(trwikidumpfile) as trwikidump:
    data = ''
    title = ""
    for line in trwikidump:
        if '<doc' in line and 'title=' in line:
            title = re.sub(r'\<doc id\=\".*\" url\=\".*\" title=\"', '', line)
            title = title.replace('">', '')
        elif '</doc>' in line:
            docs[title] = data
            title = ''
            data = ''
        else:
            data += line.decode('utf8') + '\n'

In [48]:
# get Turkish stop words
re = requests.get('https://raw.githubusercontent.com/ahmetax/trstop/master/dosyalar/turkce-stop-words')
stopwords = re.text
stopwords = re.text.split()

In [57]:
len(docs.values())

AttributeError: 'file' object has no attribute 'values'

In [61]:
# Topic modelling based on 
# https://github.com/abhijeet3922/Topic-Modelling-on-Wiki-corpus/blob/master/wiki_topic_model.py

# Function to remove stop words from sentences & lemmatize verbs. 
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stopwords])
    normalized = " ".join(lemma.lemmatize(word,'v') for word in stop_free.split())
    x = normalized.split()
    y = [s for s in x if len(s) > 2]
    return y

doc_complete = docs.values()

# Randomly sample 70000 articles from the corpus     
docs_all = random.sample(doc_complete, 70000)
#docs = open("docs_wiki.pkl",'wb')
#cPickle.dump(docs_all,docs)

# Use 60000 articles for training.
docs_train = docs_all[:60000]


# Cleaning all the 60,000 simplewiki articles
lemma = WordNetLemmatizer()
doc_clean = [clean(doc) for doc in docs_train]

# Creating the term dictionary of our courpus, where every unique term is assigned an index. 
dictionary = corpora.Dictionary(doc_clean)

# Filter the terms which have occured in less than 3 articles and more than 40% of the articles 
dictionary.filter_extremes(no_below=4, no_above=0.4)

#words,ids = dictionary.filter_n_most_frequent(50)
#print words,"\n\n",ids

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

#Creating the object for LDA model using gensim library & Training LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=50, id2word = dictionary, passes=50, iterations=500)
ldafile = open('lda_model_sym_wiki.pkl','wb')
cPickle.dump(ldamodel,ldafile)
ldafile.close()

#Print all the 50 topics
for topic in ldamodel.print_topics(num_topics=50, num_words=10):
    print topic[0]+1, " ", topic[1],"\n"



1   0.010*"kız" + 0.009*"babası" + 0.009*"gün" + 0.008*"geri" + 0.008*"annesi" + 0.008*"son" + 0.007*"genç" + 0.007*"kızı" + 0.007*"oğlu" + 0.007*"birlikte" 

2   0.012*"akciğer" + 0.008*"yunan" + 0.007*"resim" + 0.006*"atina" + 0.005*"yunanistan" + 0.005*"scott" + 0.004*"kanser" + 0.004*"italya'ya" + 0.004*"jackson" + 0.004*"yunanistan'ın" 

3   0.024*"hava" + 0.019*"bağlı" + 0.012*"hüseyin" + 0.012*"olan," + 0.010*"iran" + 0.009*"silahlı" + 0.009*"sahip," + 0.009*"nüfuslu" + 0.008*"kuvvetleri" + 0.008*"cumhurbaşkanlığı" 

4   0.021*"bulunan" + 0.017*"han" + 0.016*"yüzölçümü" + 0.016*"yahudi" + 0.014*"()," + 0.014*"itibarı" + 0.012*"tarihi" + 0.010*"güney" + 0.010*"havalimanı" + 0.009*"yılı" 

5   0.027*"ilk" + 0.019*"sezonunda" + 0.019*"gol" + 0.019*"sezon" + 0.017*"transfer" + 0.016*"maçta" + 0.016*"oldu." + 0.014*"millî" + 0.013*"forma" + 0.013*"lig" 

6   0.024*"köy" + 0.014*"adı" + 0.011*"eski" + 0.011*"köyü" + 0.011*"göç" + 0.010*"ilçe" + 0.009*"bulunmaktadır." + 0.008*"bağlı" +