In [198]:
import pandas as pd
import numpy as np
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import spacy
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import nltk
from nltk.corpus import stopwords
from pymongo import*
import json
import glob 
import pyLDAvis
import pyLDAvis.gensim
from nltk.tokenize import word_tokenize

import os
import pickle

In [226]:
def get_articles(articles_limit):
    """Fetch French articles whose text mentions "données" or "data".

    The MongoDB connection string is read from the ``MONGODB_URI``
    environment variable so that credentials never live in the notebook.

    Args:
        articles_limit: Maximum number of documents to retrieve.

    Returns:
        pandas.DataFrame: A single ``text`` column holding ``str`` values.
        When no document matches, the frame is empty but still has the
        ``text`` column.

    Raises:
        RuntimeError: If ``MONGODB_URI`` is not set or is empty.
    """
    connection = os.environ.get("MONGODB_URI")
    if not connection:
        raise RuntimeError(
            "Set the MONGODB_URI environment variable to the MongoDB connection string."
        )

    # Alternative query without the keyword filter:
    # {'meta.source.language': 'fr'}
    query = {
        'meta.source.language': 'fr',
        # Case-insensitive match on either keyword.
        'text': {'$regex': 'données|data', '$options': 'i'},
    }
    projection = {"_id": 0, "text": 1}

    # Using the client as a context manager closes the connection on exit;
    # the cursor is fully consumed inside the block for that reason.
    with MongoClient(connection) as client:
        col = client.get_database('media_analysis')["articles"]
        records = list(col.find(query, projection).limit(articles_limit))

    # Pinning the column keeps `df['text']` valid even for an empty result.
    df = pd.DataFrame(records, columns=["text"])
    df['text'] = df['text'].map(str)
    return df

def get_fr_stopwords():
    """Build the French stop-word list used to filter tokens.

    Starts from NLTK's French stop words and appends the entries loaded
    from ``other_words.json`` (resolved against the current working
    directory; presumably a JSON array of strings -- confirm against the file).

    Returns:
        list: NLTK's French stop words followed by the extra entries.
    """
    fr_stopwords = stopwords.words('french')
    # Be explicit about UTF-8: with the platform default encoding, accented
    # French words can be mis-decoded and then never match any token.
    with open("other_words.json", 'r', encoding="utf-8") as f:
        extra_words = json.load(f)
    fr_stopwords += extra_words
    return fr_stopwords

In [227]:
# Load up to 100,000 matching articles into a DataFrame with a `text` column.
df = get_articles(100000)
# Module-level stop-word list; remove_stopwords() reads it by this name.
fr_stopwords = get_fr_stopwords()
# Module-level spaCy French pipeline; lemmatization() reads it by this name.
nlp = spacy.load("fr_core_news_md")

In [228]:
def text_generator(sentences):
    """Lazily tokenize texts with gensim's ``simple_preprocess``.

    Yields one token list per element of ``sentences``; ``deacc=False`` is
    passed so accent stripping is not requested.
    """
    for raw_text in sentences:
        tokens = gensim.utils.simple_preprocess(raw_text, deacc=False)
        yield tokens

def remove_stopwords(texts, stop_words=None):
    """Drop stop words from every tokenized document.

    Args:
        texts: Iterable of documents, each an iterable of token strings.
        stop_words: Optional iterable of words to remove. When omitted, the
            module-level ``fr_stopwords`` is used.

    Returns:
        list[list[str]]: One filtered token list per input document, in the
        same order as ``texts``.
    """
    if stop_words is None:
        stop_words = fr_stopwords
    # Hash lookup instead of scanning the whole stop-word list per token.
    blocked = frozenset(stop_words)
    return [[word for word in doc if word not in blocked] for doc in texts]

def bigrams(words, bi_min=15, tri_min=10):
    """Train a gensim bigram detector on tokenized documents.

    ``bi_min`` is forwarded to ``Phrases`` as ``min_count``. ``tri_min`` is
    accepted but not used anywhere in this function.

    Returns:
        The ``Phraser`` built from the trained ``Phrases`` model.
    """
    return gensim.models.phrases.Phraser(
        gensim.models.Phrases(words, min_count=bi_min)
    )

def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document's tokens are joined with spaces, run through the
    module-level spaCy pipeline ``nlp``, and reduced to the lemmas of tokens
    whose ``pos_`` is listed in ``allowed_postags``.

    Returns:
        list[list[str]]: One lemma list per input document.
    """
    def _lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_lemmas(tokens) for tokens in texts]

def get_corpus(df,no_below_filter=10,no_above_filter=0.25):
    """Turn ``df.text`` into a bag-of-words corpus for topic modelling.

    Pipeline: tokenize -> remove stop words -> merge detected bigrams ->
    build a gensim ``Dictionary`` -> convert each document with ``doc2bow``.

    ``no_below_filter`` and ``no_above_filter`` are currently unused: the
    ``filter_extremes`` step that would consume them is commented out.

    Returns:
        tuple: ``(corpus, id2word, bigram)`` -- the bag-of-words corpus, the
        dictionary, and the bigram-merged token lists.
    """
    tokenized = remove_stopwords(list(text_generator(df.text)))

    phraser = bigrams(tokenized)
    bigram = [phraser[doc_tokens] for doc_tokens in tokenized]

    # Lemmatization step, currently switched off:
    # bigram = lemmatization(bigram)

    id2word = gensim.corpora.Dictionary(bigram)
    print("with extremes: ", len(id2word))
    # Frequency-based pruning, currently switched off:
    # id2word.filter_extremes(no_below=no_below_filter, no_above=no_above_filter)
    # print("without extremes: ", len(id2word))
    # id2word.compactify()

    corpus = [id2word.doc2bow(doc_tokens) for doc_tokens in bigram]
    return corpus, id2word, bigram

In [229]:
# Build the training corpus, its dictionary, and the bigram-merged token lists.
# NOTE(review): no_below_filter / no_above_filter have no effect while the
# filter_extremes step inside get_corpus stays commented out.
train_corpus, train_id2word, bigram_train = get_corpus(df,no_below_filter=10,no_above_filter=0.1)

with extremes:  217455


In [230]:
# Number of topics for this run; later cells reuse `num_topics` in file names.
num_topics = 13
# Train LDA on the bag-of-words corpus; random_state is fixed so reruns use the same seed.
lda_model = gensim.models.ldamodel.LdaModel(corpus=train_corpus,
                                            id2word=train_id2word,
                                            num_topics=num_topics,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=500,
                                            passes=10, alpha="auto")

# NOTE: per the gensim docs, log_perplexity() returns the per-word likelihood
# bound, not perplexity itself (perplexity = 2 ** (-bound)), so a value closer
# to zero is better. It is evaluated here on the training corpus.
print('\nPerplexity: ', lda_model.log_perplexity(train_corpus)) 
# c_v topic coherence, computed from the bigram-merged token lists.
coherence_model_lda = CoherenceModel(model=lda_model, texts=bigram_train, dictionary=train_id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)


Perplexity:  -13.042411549538947
Coherence Score:  0.42611471876111295


In [232]:
# Render the pyLDAvis view for the current model and persist it twice:
# as a standalone HTML page and as a pickle of the prepared data.
# Create the output folder first, so a missing directory cannot fail the
# export after the expensive training run.
os.makedirs('./melike_new_results', exist_ok=True)

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, train_corpus, train_id2word, mds="mmds", R=30)
pyLDAvis.save_html(vis, "./melike_new_results/v2_" + str(num_topics) +"donnes"+".html")

# Pickle path has no extension; the topic count is appended to the base name.
LDAvis_data_filepath = './melike_new_results/v2_lda_donnes' + str(num_topics)
with open(LDAvis_data_filepath, 'wb') as f:
    pickle.dump(vis, f)



In [None]:
# num_topics = 12
# lda_model = gensim.models.ldamodel.LdaModel(corpus=train_corpus,
#                                                 id2word=train_id2word,
#                                                 num_topics=num_topics,
#                                                 random_state=100,
#                                                 update_every=1,
#                                                 chunksize=500,
#                                                 passes=100, alpha="auto")
# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim.prepare(lda_model, train_corpus, train_id2word, mds="mmds", R=30)
# pyLDAvis.save_html(vis, "./melike_new_results/v1_" + str(num_topics) +".html")

# LDAvis_data_filepath = os.path.join('./melike_new_results/v1_lda_' + str(num_topics))
# with open(LDAvis_data_filepath, 'wb') as f:
#         pickle.dump(vis, f)

# print('Perplexity: ', lda_model.log_perplexity(train_corpus))  # per-word likelihood bound, not perplexity itself; closer to zero is better (perplexity = 2 ** -bound).
# coherence_model_lda = CoherenceModel(model=lda_model, texts=bigram_train, dictionary=train_id2word, coherence='c_v')
# coherence_lda = coherence_model_lda.get_coherence()
# print('Coherence Score: ', coherence_lda, "\n")

In [207]:
# Report fit metrics for whatever model `lda_model` currently refers to.
# NOTE(review): execution counts are out of order; the output recorded for this
# cell matches the 15-topic run of the sweep, not the 13-topic model -- re-run
# the notebook top to bottom to confirm.
print('Perplexity: ', lda_model.log_perplexity(train_corpus))  # per-word likelihood bound, not perplexity itself; closer to zero is better (perplexity = 2 ** -bound).
# c_v topic coherence, computed from the bigram-merged token lists.
coherence_model_lda = CoherenceModel(model=lda_model, texts=bigram_train, dictionary=train_id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda, "\n")

Perplexity:  -13.215110533978534
Coherence Score:  0.4911813981387432 



In [205]:
# Sweep the topic count over 15, 20, 25, 30, 35 and print both metrics per run.
# Each iteration rebinds `lda_model`, so after the loop it refers to the
# 35-topic model -- later cells that use `lda_model` see that one.
for i in range(15,36,5):
    print("Topic number: ", i)
    # Same hyper-parameters for every run; only num_topics varies.
    lda_model = gensim.models.ldamodel.LdaModel(corpus=train_corpus,
                                                id2word=train_id2word,
                                                num_topics=i,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=500,
                                                passes=10, alpha="auto")
    print('Perplexity: ', lda_model.log_perplexity(train_corpus))  # per-word likelihood bound, not perplexity itself; closer to zero is better (perplexity = 2 ** -bound).
    # c_v topic coherence, computed from the bigram-merged token lists.
    coherence_model_lda = CoherenceModel(model=lda_model, texts=bigram_train, dictionary=train_id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('Coherence Score: ', coherence_lda, "\n")

Topic number:  15
Perplexity:  -13.215110449963428
Coherence Score:  0.4911813981387432 

Topic number:  20
Perplexity:  -14.490720746529115
Coherence Score:  0.49811920635144097 

Topic number:  25
Perplexity:  -15.731970392592807
Coherence Score:  0.48933430664996225 

Topic number:  30
Perplexity:  -16.996611818136227
Coherence Score:  0.46926235811521605 

Topic number:  35
Perplexity:  -18.264485482993887
Coherence Score:  0.4523513031304942 



In [231]:
# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim.prepare(lda_model, train_corpus, train_id2word, mds="mmds", R=30)
# pyLDAvis.save_html(vis, "./melike__new_results/v1.html")
# vis

