In [259]:
import pandas as pd
import os
import sys


from multiprocessing import cpu_count
from loguru import logger
from pathlib import Path
from time import time, strftime, gmtime


In [4]:
data_folder = Path.home() / 'Data' / 'cc_news'
model_input_folder = data_folder / 'model_output' 

In [9]:
# Configuring the logger
config = {"handlers": [{"sink": sys.stdout,"colorize": True,
          "format": "<green>{time:YYYY-MM-DD HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> - <level>{message}</level>"}]}
logger.configure(**config)

[1]

In [5]:
df = pd.read_csv(model_input_folder / 'data_topic_model_ready.csv')

In [16]:
from pprint import pprint

In [143]:
pprint(df.sample(1).iloc[0].text)

('Juliet L. Shiebany, 1960-2019 - Obituaries - Devils Lake Journal - Devils '
 'Lake, ND - Devils Lake, ND Sections '
 'NewsSportsEntertainmentLifeObituariesCarsJobsClassifiedsMap '
 'DirectoryHomesDealsSubscribe Site Archive Log in Subscribe Now Juliet L. '
 'Shiebany, 1960-2019 Wednesday Dec 18, 2019 at 12:01 AM Juliet L. Shiebany '
 'was granted her angel wings on Thursday, December 5, 2019. There will be a '
 'private burial with family, and a Celebration of Life is scheduled for Jan. '
 '31. Born July 11, 1960 in Florida, she moved and spent her childhood until '
 'college in St. Louis. She then moved west to Columbia to attend the '
 'University of Missouri, where she graduated with a degree in Journalism and '
 'met her husband of almost 40 years, Ali Shiebany. Columbia became her '
 'permanent home where she spread her roots and started her family. After a '
 'few years she was overjoyed to announce the birth of her daughter Sara. '
 'Julie showered all those around her in a be

In [151]:
news_politics = df[df.main_topic == 'politics']
news_politics_data = news_politics.text.values.tolist()


['Domestic dispute leads to firing of deputy sheriff | FOX 46 Charlotte NewsWeatherTrafficGood DaySportsContests More Expand / Collapse search ☰ Search site News LocalNationalWorldPoliticsUnusualFOX 46 News AppLinksWeather ClosingsClosings AdminTrafficGas PricesWeather AppGood Day Nick\'s PicksPage\'s Pep RallyTeachers Getting ResultsConsumerHealthPets & AnimalsWatch LiveSports PanthersHornetsAuto RacingFOX Sports AppEntertainment ContestsWhat\'s On FOXViral StoriesAbout Us PersonalitiesTV ScheduleWork For UsFCC Public FileContact UsCopies of NewscastsClosed Captions Domestic dispute leads to firing of deputy sheriff Published 5 mins ago Updated 1 min ago Charlotte FOX 46 Charlotte Facebook Twitter Print Email article ( CMPD ) CHARLOTTE, N.C. - A deputy sheriff in Mecklenburg County has been arrested on an assault charge stemming from a domestic incident, the Mecklenburg County Sheriff\'s Office announced on Wednesday. Minique Jackson faces charges of assault with a deadly weapon. Jack

In [159]:
from gensim.utils import simple_preprocess
from gensim.models import Phrases
from gensim.models.phrases import Phraser

In [153]:
# Tokenize removing punctuations with gensim
def sent_to_words(sentences):
    for sentence in sentences:
        yield(simple_preprocess(str(sentence), deacc=True))

In [154]:
news_politics_data_tokens = list(sent_to_words(news_politics_data))

In [157]:
print(news_politics_data_tokens[42])

['is', 'this', 'final', 'jeopardy', 'for', 'democrats', 'subscribe', 'nowfor', 'full', 'windsopen', 'city', 'settingsfull', 'forecastusa', 'todayphoto', 'videoscrime', 'newsthe', 'job', 'adsdeath', 'noticespublic', 'noticesbusiness', 'directoryusa', 'today', 'todayphoto', 'videoscrime', 'newsthe', 'job', 'adsdeath', 'noticespublic', 'noticesbusiness', 'directoryusa', 'today', 'accountaccess', 'billreport', 'delivery', 'issuespause', 'guidehelp', 'centersign', 'outhave', 'an', 'existing', 'account', 'sign', 'inalready', 'have', 'subscription', 'activate', 'your', 'accountdon', 'have', 'an', 'account', 'create', 'oneget', 'the', 'newsshare', 'this', 'story', 'let', 'friends', 'in', 'your', 'social', 'network', 'know', 'what', 'you', 'are', 'reading', 'this', 'final', 'jeopardy', 'for', 'democrats', 'what', 'the', 'wager', 'wrong', 'answers', 'will', 'leave', 'the', 'party', 'with', 'nothing', 'against', 'the', 'president', 'post', 'to', 'link', 'has', 'been', 'sent', 'to', 'your', 'frien

In [187]:
bigram = Phrases(news_politics_data_tokens, threshold=10)
trigram = Phrases(bigram[news_politics_data_tokens], threshold=10)

In [188]:
bigram_model = Phraser(bigram)
trigram_model = Phraser(trigram)

In [189]:
print(trigram_model[bigram_model[news_politics_data_tokens[500]]])

['bash', 'putin', 'the', 'senate', 'is', 'willing', 'the', 'trick', 'is', 'not', 'impugning', 'trump', 'the', 'new_york_times_skip', 'to', 'contentskip', 'to', 'site_intoday_paperpolitics', 'bash', 'putin', 'the', 'senate', 'is', 'willing', 'the', 'trick', 'is', 'not', 'impugning', 'trump', 'reading', 'the', 'main_storysupported_bycontinue_reading', 'the', 'main', 'storynews', 'analysisbash', 'putin', 'the', 'senate', 'is', 'willing', 'the', 'trick', 'is', 'not', 'impugning', 'trump', 'senate', 'bill', 'to', 'defend', 'national_security', 'against', 'russia', 'looks_like', 'an', 'artifact', 'from', 'different', 'age', 'months_ago', 'when', 'both_parties', 'agreed', 'on', 'the', 'threat', 'from', 'moscow', 'senator_lindsey_graham', 'republican', 'of', 'south_carolina', 'is', 'the', 'lead', 'author', 'of', 'the', 'bill', 'credit', 'anna_moneymaker', 'the', 'new_york_timesby', 'david', 'sangerdec', 'etwashington', 'as', 'the', 'house', 'of', 'representatives_began_debating', 'wednesday', 

In [193]:
from nltk.corpus import stopwords

In [195]:
stop_words = stopwords.words('english')
stop_words.extend(['news'])

In [225]:
corpus_no_stopwords = [[toke for toke in simple_preprocess(str(doc)) if toke not in stop_words] for doc in news_politics_data_tokens]

In [226]:
bigrammed = [bigram_model[doc] for doc in corpus_no_stopwords]

In [227]:
trigrammed = [trigram_model[bigram_model[doc]] for doc in corpus_no_stopwords]

In [253]:
start_time = time()
lemmatized= []
for sent in trigrammed:
    doc = nlp(" ".join(sent)) 
    lemmatized.append([token.lemma_ for token in doc])
elapsed = strftime("%H:%M:%S", gmtime(time() - start_time))
logger.info(f'It took {elapsed} to run this script!')
os.system("play /usr/share/sounds/sound-icons/trumpet-1.wav") # Beeps an alert. May not run on your machine

[32m2020-01-12 10:49:58[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m7[0m - [1mIt took 00:05:01 to run this script![0m


NameError: name 'os' is not defined

In [271]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import LdaMulticore, LdaModel

In [298]:
# Filter out words that occur less than 20 documents, or more than 70% of the documents.
id_2_word.filter_extremes(no_below=5, no_above=0.7)

In [306]:
# Create Dictionary
id_2_word = Dictionary(lemmatized)
# Filter out words that occur less than 20 documents, or more than 50% of the documents.
id_2_word.filter_extremes(no_below=10, no_above=0.5)

In [307]:
len(id_2_word)

17199

In [308]:
# Term Document Frequency
corpus = [id_2_word.doc2bow(doc) for doc in lemmatized]

In [309]:
start_time = time()
lda_model =  LdaModel(corpus,
                          num_topics = 30, 
                          id2word = id_2_word,
                          random_state=42,
                          passes = 10,
                          alpha='auto',
                          eta='auto',
                          per_word_topics=True,
                          eval_every=None
                          )
elapsed = strftime("%H:%M:%S", gmtime(time() - start_time))
logger.info(f'It took {elapsed} to run this script!')
os.system("play /usr/share/sounds/sound-icons/trumpet-1.wav")

[32m2020-01-12 12:17:21[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m13[0m - [1mIt took 00:00:42 to run this script![0m


0

In [310]:
pprint(lda_model.print_topics())

[(19,
  '0.032*"local" + 0.014*"weather" + 0.010*"report" + 0.008*"isd" + '
  '0.007*"section" + 0.007*"place_ad" + 0.007*"late" + '
  '0.006*"san_diego_union_tribune" + 0.006*"contact_information" + '
  '0.006*"ad_directory"'),
 (17,
  '0.015*"live" + 0.010*"top_stories" + 0.010*"special_effect" + '
  '0.009*"president_donald_trump" + 0.009*"las_vegas" + 0.009*"never" + '
  '0.009*"anti" + 0.008*"krqe" + 0.007*"house" + 0.007*"dench"'),
 (20,
  '0.031*"historic_day" + 0.024*"impeachment_vote_marks" + 0.023*"congress" + '
  '0.019*"local" + 0.011*"wednesday" + 0.010*"crime" + '
  '0.008*"russell_falcon_nexstar" + 0.008*"morgan_wright" + '
  '0.008*"anna_wiernicki_posted_dec" + 0.007*"inside"'),
 (28,
  '0.010*"fox" + 0.009*"national" + 0.009*"maryland" + 0.008*"fair" + '
  '0.008*"politics_business_technology" + 0.008*"shopping" + 0.008*"world" + '
  '0.008*"usweekly" + 0.008*"weather_odd" + '
  '0.008*"health_living_travel_science"'),
 (18,
  '0.027*"video" + 0.017*"local" + 0.011*"po

In [315]:
top_topics = lda_model.top_topics(texts=lemmatized, coherence='c_v')

In [323]:
pprint([tokens for tokens, coh in top_topics if coh > 0.7])

[[(0.024817713, 'email_print'),
  (0.02426203, 'facebookshare_tweet'),
  (0.014217977, 'continue'),
  (0.013282056, 'world'),
  (0.013065503, 'share'),
  (0.012759838, 'syria'),
  (0.012582033, 'submit'),
  (0.0124691995, 'woman'),
  (0.012387017, 'science_technology'),
  (0.012122037, 'business'),
  (0.012115659, 'house_fbi_illegal'),
  (0.012115654, 'standards_masthead_privacy_policy'),
  (0.012115649, 'money_entertainment_faith_health'),
  (0.012115649, 'service_connect'),
  (0.012115649, 'wire_conservative'),
  (0.012115647, 'rss_topbuzz_twitter_youtube'),
  (0.012115647, 'op_ed_terms'),
  (0.012115647, 'feedme_flipboard_instagram_parler'),
  (0.012115644, 'sports_wj'),
  (0.012115643, 'terrorism_israel')],
 [(0.01701509, 'post'),
  (0.015470671, 'link'),
  (0.01496721, 'violation'),
  (0.009700586, 'new'),
  (0.008970108, 'send'),
  (0.008353036, 'need'),
  (0.007994513, 'know'),
  (0.0077568647, 'ban'),
  (0.0076808278, 'read'),
  (0.007633126, 'lose'),
  (0.0075618555, 'log'),
 

In [313]:
from gensim.models import CoherenceModel

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=lemmatized, dictionary=id_2_word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.724782210936226

Coherence Score:  0.5783244657876211


In [328]:
list(range(0, 55, 5)[1:])

[5, 10, 15, 20, 25, 30, 35, 40, 45, 50]

In [329]:
start_time = time()
coherence_scores = {}
for k in list(range(0, 55, 5)[1:]):
    lda_model =  LdaModel(corpus,
                              num_topics = k, 
                              id2word = id_2_word,
                              random_state=42,
                              passes = 10,
                              alpha='auto',
                              eta='auto',
                              per_word_topics=True,
                              eval_every=None
                          )
     
    coherence_model_lda = CoherenceModel(model=lda_model, texts=lemmatized, dictionary=id_2_word, coherence='c_v')
    coherence_scores[k] = coherence_model_lda.get_coherence()
    print(f"done with => {k}")
elapsed = strftime("%H:%M:%S", gmtime(time() - start_time))
logger.info(f'It took {elapsed} to run this script!')
os.system("play /usr/share/sounds/sound-icons/trumpet-1.wav") 

done with => 5
done with => 10
done with => 15
done with => 20
done with => 25
done with => 30
done with => 35
done with => 40
done with => 45
done with => 50
[32m2020-01-12 13:42:40[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m19[0m - [1mIt took 00:08:00 to run this script![0m


0

In [330]:
coherence_scores

{5: 0.6134195424830422,
 10: 0.5703540321227571,
 15: 0.5439440604477297,
 20: 0.5271261605905009,
 25: 0.5874181850818787,
 30: 0.5783244657876211,
 35: 0.5255316259562004,
 40: 0.5311503708615435,
 45: 0.543467617771982,
 50: 0.5340368451439997}

In [337]:
path_to_mallet_binary = "/home/ozan/Mallet/bin/mallet"

In [338]:
from gensim.models.wrappers import LdaMallet


In [339]:
lda_model_mallet = LdaMallet(path_to_mallet_binary, corpus=corpus, num_topics=25, id2word=id_2_word)

In [340]:
coherence_model_mallet = CoherenceModel(model=lda_model_mallet, texts=lemmatized, dictionary=id_2_word, coherence='c_v')

In [341]:
coherence_model_lda.get_coherence()

0.5340368451439997

In [345]:
coherence_scores = {}
for k in range(20, 35):
    lda_model =  LdaModel(corpus,
                              num_topics = k, 
                              id2word = id_2_word,
                              random_state=42,
                              passes = 10,
                              alpha='auto',
                              eta='auto',
                              per_word_topics=True,
                              eval_every=None
                          )
     
    coherence_model_lda = CoherenceModel(model=lda_model, texts=lemmatized, dictionary=id_2_word, coherence='c_v')
    coherence_scores[k] = coherence_model_lda.get_coherence()
    print(f"done with => {k}")
elapsed = strftime("%H:%M:%S", gmtime(time() - start_time))
logger.info(f'It took {elapsed} to run this script!')
os.system("play /usr/share/sounds/sound-icons/trumpet-1.wav") 

done with => 20
done with => 21
done with => 22
done with => 23
done with => 24
done with => 25
done with => 26
done with => 27
done with => 28
done with => 29
done with => 30
done with => 31
done with => 32
done with => 33
done with => 34
[32m2020-01-12 14:33:14[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1mIt took 00:32:51 to run this script![0m


0

In [347]:
coherence_scores

{20: 0.5271261605905009,
 21: 0.5283489776490575,
 22: 0.5933197738736351,
 23: 0.5467649445937937,
 24: 0.5285951550220905,
 25: 0.5874181850818787,
 26: 0.5851504855567649,
 27: 0.5602890752313544,
 28: 0.5442390466632456,
 29: 0.5549566175011115,
 30: 0.5783244657876211,
 31: 0.5563600039195533,
 32: 0.5827256244179466,
 33: 0.5891212577212019,
 34: 0.5641129215158969}

In [348]:
lda_model =  LdaModel(corpus,
                              num_topics = 22, 
                              id2word = id_2_word,
                              random_state=42,
                              passes = 10,
                              alpha='auto',
                              eta='auto',
                              per_word_topics=True,
                              eval_every=None
                          )


In [368]:
df_topics = pd.DataFrame()
for i, row in enumerate(lda_model[corpus]):
    row = sorted(row[0], key=lambda x: (x[1]), reverse=True)
    
    for j, (topic_num, prop_topic) in enumerate(row):
        if j == 0:  # => dominant topic
            wp = lda_model.show_topic(topic_num)
            topic_keywords = ", ".join([word for word, prop in wp])
            df_topics = df_topics.append(pd.Series([int(topic_num), round(prop_topic,2), topic_keywords]), ignore_index=True)
        else:
            break   

In [370]:
df_topics.columns = ['top_topic', 'perc_contrib', 'topic_kw']


In [390]:
top_10 = df_topics.top_topic.value_counts().nlargest(10).index.tolist()
top_10

[9.0, 5.0, 10.0, 11.0, 18.0, 3.0, 1.0, 8.0, 16.0, 7.0]

In [394]:
for t in top_10:
    wp = lda_model.show_topic(int(t))
    topic_keywords = ", ".join([word for word, prop in wp])
    print(wp, topic_keywords, "\n")

[('president', 0.026044648), ('vote', 0.024901558), ('house', 0.02467349), ('democrats', 0.014591674), ('say', 0.013838652), ('impeach', 0.009608715), ('article', 0.008304843), ('president_donald_trump', 0.0066399537), ('office', 0.0062946184), ('support', 0.006290204)] president, vote, house, democrats, say, impeach, article, president_donald_trump, office, support 

[('state', 0.013070506), ('say', 0.010057605), ('new', 0.0064726924), ('would', 0.004508413), ('post', 0.0043799174), ('people', 0.0043043816), ('candidate', 0.0042224154), ('election', 0.0041520307), ('year', 0.0040505063), ('bill', 0.003665372)] state, say, new, would, post, people, candidate, election, year, bill 

[('say', 0.01095082), ('one', 0.008053726), ('december', 0.005253373), ('day', 0.0051396526), ('year', 0.0046062986), ('time', 0.0045323484), ('report', 0.004464143), ('see', 0.004312752), ('get', 0.004309129), ('former', 0.0037976447)] say, one, december, day, year, time, report, see, get, former 

[('sign'

In [356]:
sent_topics

0                                                   10
1                                               0.2931
2    say, one, december, day, year, time, report, s...
dtype: object

In [360]:
lda_model.show_topic(10, topn=20)

[('say', 0.01095082),
 ('one', 0.008053726),
 ('december', 0.005253373),
 ('day', 0.0051396526),
 ('year', 0.0046062986),
 ('time', 0.0045323484),
 ('report', 0.004464143),
 ('see', 0.004312752),
 ('get', 0.004309129),
 ('former', 0.0037976447),
 ('people', 0.003736288),
 ('make', 0.0037228055),
 ('https_co', 0.0035546585),
 ('fbi', 0.0034681538),
 ('world', 0.0032284926),
 ('new', 0.0030671554),
 ('go', 0.0030512789),
 ('may', 0.0030391335),
 ('page', 0.0030044757),
 ('good', 0.0029962612)]

In [359]:
pprint(lda_model.print_topics(num_words))

[(19,
  '0.015*"local" + 0.013*"sport" + 0.013*"violent_crime" + 0.009*"password" + '
  '0.009*"welcome_log" + 0.009*"account" + 0.009*"report" + 0.009*"york" + '
  '0.008*"barr" + 0.008*"username_password_forgot_password"'),
 (20,
  '0.015*"local" + 0.009*"crime" + 0.008*"subscribe" + 0.007*"inside" + '
  '0.007*"books_music_theater_classical" + '
  '0.007*"games_puzzles_horoscopes_life" + '
  '0.007*"politics_education_education" + '
  '0.007*"sports_seahawks_huskies_cougars" + 0.007*"music_tv_streaming_comics" '
  '+ 0.007*"tv_radio_entertainment_movies"'),
 (6,
  '0.013*"nownightly_newsmeet" + 0.013*"tunedspecial_featuresmore" + '
  '0.013*"latinonbcblknbc_outstay" + 0.013*"searchsectionsu_mediadecision" + '
  '0.013*"archivesknow" + 0.013*"vetsparent_toolkitnbc" + '
  '0.013*"filmsnbc_left" + 0.013*"value_follow_nbc" + '
  '0.013*"nbccnbcnbc_comnbc_learnpeacock_productionsnext" + 0.013*"newsmeet"'),
 (17,
  '0.027*"individual_mandate" + 0.020*"law" + 0.018*"part" + 0.018*"share" +