# Konu Modelleme Görselleştirme Denemesi
1. LDA ile konu modelleme
1. Konu modellerini pyLDAvis ile görselleştirme
1. LDA sonuçlarını t-SNE ([Bokeh Kütüphanesi](https://bokeh.org)) ile görselleştirme

In [None]:
%pylab inline

import pandas as pd
import pickle as pk
from scipy import sparse as sp


Abstractlar üzerinde çalışıyorum yine. Ama bu sefer iterasyon, ön işlemler falan çok olduğu için 1000 makale üzerinde deniyorum.

In [None]:
# Load the 1000-article sample. To run on the full collection instead, swap in
# 'http://mugeakbulut.com/phd/iSearch_full.xlsx'.
p_df = pd.read_excel(
    'http://mugeakbulut.com/phd/1000.xlsx',
    header=0,
    index_col=False,
    keep_default_na=True,
)

# Raw abstracts as an array, one entry per article.
docs = array(p_df['Abstract'])

# Makale Pre-process ve vector haline getirme

In [None]:
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

def docs_preprocessor(docs):
    """Tokenize, filter and lemmatize a collection of raw documents.

    Each document is lower-cased and split into runs of word characters.
    Purely numeric tokens are dropped (tokens that merely contain digits are
    kept), as are tokens of three characters or fewer. The remaining tokens
    are lemmatized with WordNet.

    Parameters
    ----------
    docs : sequence of str
        Raw document texts. The input is left untouched; earlier versions
        overwrote its elements in place.

    Returns
    -------
    list of list of str
        One token list per input document, in input order.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    lemmatizer = WordNetLemmatizer()

    processed = []
    for doc in docs:
        # Lower-case first, then split into word tokens.
        tokens = tokenizer.tokenize(doc.lower())
        # Filter before lemmatizing, matching the original step order.
        processed.append([
            lemmatizer.lemmatize(token)
            for token in tokens
            if not token.isdigit() and len(token) > 3
        ])

    return processed

In [None]:
# Always start from the raw abstracts so this cell can be re-run safely:
# feeding the already-tokenized `docs` back in would fail on `.lower()`.
docs = docs_preprocessor(array(p_df['Abstract']))

Bigram/trigram hesaplama

In [None]:
from gensim.models import Phrases
# Learn the phrase models: bigrams from the raw tokens (min_count=10), then a
# second pass over the bigram-merged stream so that longer phrases can form.
# NOTE(review): the second model relies on gensim's default min_count rather
# than 10 — confirm that this is intended.
bigram = Phrases(docs, min_count=10)
trigram = Phrases(bigram[docs])

for idx in range(len(docs)):
    # Transform the original tokens once, before anything is appended, and
    # feed the trigram model the bigram-merged tokens it was trained on.
    bigram_doc = bigram[docs[idx]]
    trigram_doc = trigram[bigram_doc]

    # Phrases detected by the first pass.
    bigram_tokens = [token for token in bigram_doc if '_' in token]

    # Phrases that only the second pass produced; skipping tokens already
    # present after the first pass avoids appending every bigram twice.
    seen_after_bigram = set(bigram_doc)
    trigram_tokens = [token for token in trigram_doc
                      if '_' in token and token not in seen_after_bigram]

    docs[idx].extend(bigram_tokens)
    docs[idx].extend(trigram_tokens)

Rare ve common tokens sil

In [None]:
from gensim.corpora import Dictionary

# Build the dictionary representation of the articles
dictionary = Dictionary(docs)
print('Initial makalelerdeki tekil kelime sayısı:', len(dictionary))

# Drop words that occur in fewer than 10 documents or in more than 20% of the articles.
dictionary.filter_extremes(no_below=10, no_above=0.2)
print('Rare ve common wordler silindikten sonra tekil kelime sayısı:', len(dictionary))

**Budama**

Önce her bir dokümanın kelimelerden oluşan bir temsilini elde et

In [None]:
# Bag-of-words representation of every article, in the same order as `docs`.
corpus = list(map(dictionary.doc2bow, docs))

In [None]:
# Report the vocabulary size (unique tokens) and the number of articles in the corpus.
print('Tekil token sayısı: %d' % len(dictionary))
print('Makale sayısı: %d' % len(corpus))

# Train LDA model...

In [None]:
from gensim.models import LdaModel

In [None]:
# Set training parameters.
num_topics = 4
chunksize = 500 # size of the doc looked at every pass. Bunu biraz abartmış olaiblirim.
passes = 20 # number of passes through documents
iterations = 400
eval_every = 1  # Böyle iyi. Öteki türlü çok zaman alıyor


temp = dictionary[0]  # sözlük load
id2word = dictionary.id2token

%time model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                       alpha='auto', eta='auto', \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every)

In [None]:
import pyLDAvis.gensim
# Enable pyLDAvis output inside the notebook.
pyLDAvis.enable_notebook()

import warnings
# Silence DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [None]:
# Build the pyLDAvis view from the trained model, the bag-of-words corpus and
# the dictionary; as the last expression of the cell its result is displayed.
pyLDAvis.gensim.prepare(model, corpus, dictionary)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Keep the preprocessed token list of every article next to its metadata.
p_df['tokenz'] = docs

# Split each token list into a first and a second half. Integer division
# replaces the `int0` NumPy alias, which no longer exists in NumPy 2.0.
docs1 = p_df['tokenz'].apply(lambda tokens: tokens[:len(tokens) // 2])
docs2 = p_df['tokenz'].apply(lambda tokens: tokens[len(tokens) // 2:])

In [None]:
# Bag-of-words corpora for the first and the second half of every article.
corpus1 = [dictionary.doc2bow(doc) for doc in docs1]
corpus2 = [dictionary.doc2bow(doc) for doc in docs2]

# Apply the trained LDA model to each half-corpus
lda_corpus1 = model[corpus1]
lda_corpus2 = model[corpus2]

In [None]:
from collections import OrderedDict
def get_doc_topic_dist(model, corpus, kwords=False):
    """Build a dense document-by-topic weight matrix.

    ``model[doc]`` only reports some of the topics for a document, so every
    topic that is not reported gets a weight of 0.

    Parameters
    ----------
    model : object
        Trained topic model exposing ``num_topics`` and returning
        ``(topic_id, weight)`` pairs from ``model[doc]``.
    corpus : iterable
        Documents in the form accepted by ``model[doc]``.
    kwords : bool, optional
        When True, also collect the index of the heaviest topic of every
        document.

    Returns
    -------
    tuple
        ``(dist, keys)``. ``dist`` is an array with one row per document and
        one column per topic. ``keys`` holds the dominant topic index of each
        document, or is an empty list when ``kwords`` is False.
    """
    # Take the topic count from the model itself rather than from a notebook
    # global, so the helper works with any trained model.
    n_topics = model.num_topics

    top_dist = []
    keys = []

    for doc in corpus:
        weights = np.zeros(n_topics)
        for topic_id, weight in model[doc]:
            weights[topic_id] = weight
        top_dist.append(weights)
        if kwords:
            keys.append(weights.argmax())

    return np.array(top_dist), keys

In [None]:
# Topic distributions for the two halves of every abstract.
# get_doc_topic_dist applies the model itself, so it must receive the
# bag-of-words corpora; passing the already-transformed lda_corpus1/2 would
# make the model re-infer topics from topic distributions.
top_dist1, _ = get_doc_topic_dist(model, corpus1)
top_dist2, _ = get_doc_topic_dist(model, corpus2)

print("Intra similarity: cosine similarity for corresponding parts of a doc(higher is better):")
print(np.mean([cosine_similarity(c1.reshape(1, -1), c2.reshape(1, -1))[0][0] for c1, c2 in zip(top_dist1, top_dist2)]))

# 400 random (first-half, second-half) index pairs used as the baseline.
random_pairs = np.random.randint(0, len(p_df['Abstract']), size=(400, 2))

print("Inter similarity: cosine similarity between random parts (lower is better):")
print(np.mean([cosine_similarity(top_dist1[i[0]].reshape(1, -1), top_dist2[i[1]].reshape(1, -1)) for i in random_pairs]))

In [None]:
def explore_topic(lda_model, topic_number, topn, output=True):
    """Return the top ``topn`` terms of a topic, optionally printing them.

    Parameters
    ----------
    lda_model : object
        Model whose ``show_topic(topic_number, topn=...)`` yields
        ``(term, weight)`` pairs.
    topic_number : int
        Index of the topic to inspect.
    topn : int
        Number of terms to request.
    output : bool, optional
        When True, print one formatted ``term  weight`` line per term.

    Returns
    -------
    list
        The terms, in the order the model returned them.
    """
    ranked = list(lda_model.show_topic(topic_number, topn=topn))

    if output:
        for word, weight in ranked:
            print(u'{:20} {:.3f}'.format(word, round(weight, 3)))

    return [word for word, _ in ranked]

In [None]:
# Print the ten strongest terms of every topic and keep the first five of
# each as a short topic summary.
topic_summaries = []
print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')
for i in range(num_topics):
    print('Konu '+str(i)+' |---------------------\n')
    top_terms = explore_topic(model, topic_number=i, topn=10, output=True)
    topic_summaries += [top_terms[:5]]
    print()  # blank separator line; a bare `print` is a no-op in Python 3

In [None]:
# Placeholder topic labels keyed by topic index; the centroid terms of each
# topic could be written here instead.
top_labels = {0: 'Şu an', 1:'Tamamen', 2:'Uyduruyorum', 3:'Uydurdum'}

In [None]:
import re
import nltk

from nltk.corpus import stopwords

stops = set(stopwords.words('english'))

def paper_to_wordlist( paper, remove_stopwords=True ):
    '''
    Convert a text into a list of cleaned, lemmatized words.

    Steps: replace every non-letter with a space, lower-case and split on
    whitespace, optionally drop the stop words in `stops`, drop words of two
    characters or fewer, and lemmatize what is left.

    Parameters
    ----------
    paper : str
        Raw text of one document.
    remove_stopwords : bool, optional
        When True (the default) words found in `stops` are removed. The flag
        used to be ignored; the default keeps the previous behaviour.

    Returns
    -------
    list of str
        The processed words, in order of appearance.
    '''
    # One lemmatizer per call instead of one per token.
    lemmatizer = WordNetLemmatizer()
    # 1. Remove non-letters
    paper_text = re.sub("[^a-zA-Z]"," ", paper)
    # 2. Convert words to lower case and split them
    words = paper_text.lower().split()
    # 3. Remove stop words (only when requested)
    if remove_stopwords:
        words = [w for w in words if w not in stops]
    # 4. Remove short words and 5. lemmatize the rest
    return [lemmatizer.lemmatize(w) for w in words if len(w) > 2]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer over 1- to 3-grams, tokenized with paper_to_wordlist.
tvectorizer = TfidfVectorizer(
    input='content',
    analyzer='word',
    lowercase=True,
    stop_words='english',
    tokenizer=paper_to_wordlist,
    ngram_range=(1, 3),
    min_df=40,
    max_df=0.20,
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

In [None]:
# Fit the vectorizer on the raw abstracts and convert the result to a dense
# array (one row per abstract).
dtm = tvectorizer.fit_transform(p_df['Abstract']).toarray()

In [None]:
# Dense document-topic weights for the whole corpus. This cell used to repeat
# the body of get_doc_topic_dist line by line; it now calls the helper.
top_dist, _ = get_doc_topic_dist(model, corpus)

In [None]:
# Document-topic weights plus the dominant topic index of every article.
top_dist, lda_keys = get_doc_topic_dist(model, corpus, True)

# Vocabulary of the fitted TF-IDF vectorizer. Newer scikit-learn releases only
# provide get_feature_names_out(); fall back to the old name otherwise.
if hasattr(tvectorizer, 'get_feature_names_out'):
    features = tvectorizer.get_feature_names_out()
else:
    features = tvectorizer.get_feature_names()

In [None]:
# Short text representation: the four highest-weighted TF-IDF terms per article.
top_ws = []
for row in dtm:
    # argsort already yields integer indices, so no `int0` cast is needed
    # (that NumPy alias no longer exists in NumPy 2.0).
    top_idx = np.argsort(row)[::-1][:4]
    top_ws.append(' '.join(features[i] for i in top_idx))

# Assigning a Series aligns on the index, like the single-column DataFrame did.
p_df['Text_Rep'] = pd.Series(top_ws)
p_df['clusters'] = pd.Series(lda_keys)
# Rows left without a topic get the catch-all cluster 10. Reassigning the
# column avoids the chained `inplace=True`, which may act on a temporary copy.
p_df['clusters'] = p_df['clusters'].fillna(10)

cluster_colors = {0: 'blue', 1: 'green', 2: 'yellow', 3: 'red', 4: 'skyblue', 5:'salmon', 6:'orange', 7:'maroon', 8:'crimson', 9:'black', 10:'gray'}

p_df['colors'] = p_df['clusters'].apply(lambda cluster: cluster_colors[cluster])

In [None]:
from sklearn.manifold import TSNE
# Project the document-topic weights to 2-D. t-SNE is stochastic, so a fixed
# seed keeps the layout reproducible between runs.
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(top_dist)

In [None]:
# Store the two embedding coordinates as separate columns of the frame.
p_df['X_tsne'], p_df['Y_tsne'] = X_tsne[:, 0], X_tsne[:, 1]

In [None]:
from bokeh.plotting import figure, show, output_notebook, save#, output_file
from bokeh.models import HoverTool, value, LabelSet, Legend, ColumnDataSource
# Send Bokeh output to the notebook.
output_notebook()

In [None]:
# Column data for the scatter plot: t-SNE coordinates, per-article colour,
# topic label, topic index, title and the top-term text representation.
# NOTE(review): top_labels (where it is defined above) only has keys 0-3, so
# the fill-in cluster value 10 would raise KeyError in the `label` lookup.
source = ColumnDataSource(dict(
    x=p_df['X_tsne'],
    y=p_df['Y_tsne'],
    color=p_df['colors'],
    label=p_df['clusters'].apply(lambda l: top_labels[l]),
#     msize= p_df['marker_size'],
    topic_key= p_df['clusters'],
    title= p_df[u'Title'],
    content = p_df['Text_Rep']
))

In [None]:
# Scatter plot of the t-SNE embedding, coloured and labelled per topic.
# NOTE(review): `plot_width`/`plot_height` and `legend=` look like older Bokeh
# spellings — confirm they are accepted by the installed Bokeh version.
title = 'Konuların T-SNE görselleştirmesi'

plot_lda = figure(plot_width=1000, plot_height=600, title=title,
                  tools="pan,wheel_zoom,box_zoom,reset,hover", 
                  x_axis_type=None, y_axis_type=None,
                  min_border=1)

plot_lda.scatter(x='x', y='y', legend='label', source=source,
                 color='color', alpha=0.8, size=10)#'msize', )

# hover tools: show title, top terms and topic index of the hovered point
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"Bilgi": "Başlık: @title, Anahtar Kelimeler: @content - Konu: @topic_key "}
plot_lda.legend.location = "top_left"

show(plot_lda)


# save(plot_lda, '{}.html'.format(title))