<h1 align="center">Topic modeling on Medium articles</h1>

In [2]:
## All minimum setup and libraries required

import numpy as np
import pandas as pd 
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

## TENSORFLOW        
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer   ## Generate dictionary of word encodings
from tensorflow.keras.preprocessing.sequence import pad_sequences

## GENSIM and NLTK
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk
nltk.download('wordnet')

print("Tensorflow\t-\t",tf.__version__)
print("NLTK\t\t-\t",nltk.__version__)
print("Gensim\t\t-\t",nltk.__version__)

/kaggle/input/medium-articles-with-content/Medium_AggregatedData.csv
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Tensorflow	-	 2.3.0
NLTK		-	 3.2.4
Gensim		-	 3.2.4


In [3]:
path = "../input/medium-articles-with-content/Medium_AggregatedData.csv"
dataframe_full = pd.read_csv(path)
dataframe_imp = pd.read_csv(path)
print("Dataset have been read")

Dataset have been read


In [4]:
dataframe_full.head()

Unnamed: 0,audioVersionDurationSec,codeBlock,codeBlockCount,collectionId,createdDate,createdDatetime,firstPublishedDate,firstPublishedDatetime,imageCount,isSubscriptionLocked,...,slug,name,postCount,author,bio,userId,userName,usersFollowedByCount,usersFollowedCount,scrappedDate
0,0,,0.0,638f418c8464,2018-09-18,2018-09-18 20:55:34,2018-09-18,2018-09-18 20:57:03,1,False,...,blockchain,Blockchain,265164.0,Anar Babaev,,f1ad85af0169,babaevanar,450.0,404.0,20181104
1,0,,0.0,638f418c8464,2018-09-18,2018-09-18 20:55:34,2018-09-18,2018-09-18 20:57:03,1,False,...,samsung,Samsung,5708.0,Anar Babaev,,f1ad85af0169,babaevanar,450.0,404.0,20181104
2,0,,0.0,638f418c8464,2018-09-18,2018-09-18 20:55:34,2018-09-18,2018-09-18 20:57:03,1,False,...,it,It,3720.0,Anar Babaev,,f1ad85af0169,babaevanar,450.0,404.0,20181104
3,0,,0.0,,2018-01-07,2018-01-07 17:04:37,2018-01-07,2018-01-07 17:06:29,13,False,...,technology,Technology,166125.0,George Sykes,,93b9e94f08ca,tasty231,6.0,22.0,20181104
4,0,,0.0,,2018-01-07,2018-01-07 17:04:37,2018-01-07,2018-01-07 17:06:29,13,False,...,robotics,Robotics,9103.0,George Sykes,,93b9e94f08ca,tasty231,6.0,22.0,20181104


In [5]:
x = dataframe_full['name'][10]
y = dataframe_full['publicationdescription'][15]
print(x)
print(y)
print(dataframe_full.shape)
print(dataframe_full['name'][10])
print(dataframe_full['name'][11])
print(dataframe_full['name'][12])
print(dataframe_full['title'][10])

Big Data Training Mumbai
Non-obvious meditation advice from people on the battlefront of daily creation
(279577, 50)
Big Data Training Mumbai
Robotics
Meditation
Ascent of data Science, SAS and Big data Analyst Trainings Programs


# Step_1: Preprocessing and cleaning

**There are ~300000 entries**

In [6]:
print(dataframe_full.columns)

Index(['audioVersionDurationSec', 'codeBlock', 'codeBlockCount',
       'collectionId', 'createdDate', 'createdDatetime', 'firstPublishedDate',
       'firstPublishedDatetime', 'imageCount', 'isSubscriptionLocked',
       'language', 'latestPublishedDate', 'latestPublishedDatetime',
       'linksCount', 'postId', 'readingTime', 'recommends',
       'responsesCreatedCount', 'socialRecommendsCount', 'subTitle',
       'tagsCount', 'text', 'title', 'totalClapCount', 'uniqueSlug',
       'updatedDate', 'updatedDatetime', 'url', 'vote', 'wordCount',
       'publicationdescription', 'publicationdomain',
       'publicationfacebookPageName', 'publicationfollowerCount',
       'publicationname', 'publicationpublicEmail', 'publicationslug',
       'publicationtags', 'publicationtwitterUsername', 'tag_name', 'slug',
       'name', 'postCount', 'author', 'bio', 'userId', 'userName',
       'usersFollowedByCount', 'usersFollowedCount', 'scrappedDate'],
      dtype='object')


The required columns are:
* language
* subTitle
* tagsCount
* text
* title
* url
* wordCount
* publicationdescription
* tag_name
* name

but the most important columns are primarily:
* subTitle
* text
* title

In [7]:
required_col = ['language','subTitle','tagsCount','text','title','url','wordCount','publicationdescription'
               ,'tag_name','name']
most_imp_col = ['subTitle','text','title']

In [8]:
# article_titles = dataframe_full['title']
# art_grp_1 = article_titles[16:25]
# print(art_grp_1)
print(dataframe_full.language.unique())

['en' 'th' 'ja' 'zh' 'ru' 'pt' 'es' 'zh-Hant' 'id' 'my' 'de' 'tr' 'fr'
 'ko' 'it' 'lo' 'un' 'vi' 'cs' 'sk' 'is' 'sv' 'bn' 'mn' 'da' 'no' 'bg'
 'ar' 'pl' 'nl' 'ro' 'ca' 'hu' 'hi' 'ka' 'el' 'ms' 'uk' 'si' 'sr' 'lt'
 'la' 'fa' 'ml' 'sl' 'mr' 'az' 'lv' 'te' 'mk' 'nn' 'fi']


In [9]:
## Number of rows english rows

english_titles = dataframe_full[dataframe_full['language'] == 'en']
# english_titles.head()
print(english_titles.shape)

(257655, 50)


In [10]:
## Number of rows dropped after removing null value rows

print(dataframe_imp.shape)
dataframe_imp.dropna(how = 'all')
print(dataframe_imp.shape)

(279577, 50)
(279577, 50)


So nothing is missing in any rows

In [11]:
## After dropping non-english and columns that are not required really

dataframe_imp.drop(dataframe_imp[dataframe_imp['language'] != 'en'].index, inplace = True)

dataframe_imp = dataframe_imp.drop(['audioVersionDurationSec', 'codeBlock', 'codeBlockCount',
       'collectionId', 'createdDate', 'createdDatetime', 'firstPublishedDate',
       'firstPublishedDatetime', 'imageCount', 'isSubscriptionLocked',
       'language', 'latestPublishedDate', 'latestPublishedDatetime',
       'linksCount', 'postId', 'readingTime', 'recommends',
       'responsesCreatedCount', 'socialRecommendsCount','tagsCount','totalClapCount', 'uniqueSlug',
       'updatedDate', 'updatedDatetime', 'url', 'vote', 'wordCount',
       'publicationdescription', 'publicationdomain',
       'publicationfacebookPageName', 'publicationfollowerCount',
       'publicationname', 'publicationpublicEmail', 'publicationslug',
       'publicationtags', 'publicationtwitterUsername', 'tag_name', 'slug',
       'name', 'postCount', 'author', 'bio', 'userId', 'userName',
       'usersFollowedByCount', 'usersFollowedCount', 'scrappedDate'], axis=1)

dataframe_imp['index'] = dataframe_imp.index

dataframe_imp.shape

(257655, 4)

In [12]:
dataframe_imp.head()

Unnamed: 0,subTitle,text,title,index
0,A major private IT company implements blockcha...,"Private Business, Government and Blockchain\n\...","Private Business, Government and Blockchain",0
1,A major private IT company implements blockcha...,"Private Business, Government and Blockchain\n\...","Private Business, Government and Blockchain",1
2,A major private IT company implements blockcha...,"Private Business, Government and Blockchain\n\...","Private Business, Government and Blockchain",2
3,Introduction,EPQ draft 1 (4844 words)\nhttps://upload.wikim...,EPQ draft 1 (4844 words),3
4,Introduction,EPQ draft 1 (4844 words)\nhttps://upload.wikim...,EPQ draft 1 (4844 words),4


In [13]:
print(dataframe_imp.title[15])
print(dataframe_imp.subTitle[15])
# print(dataframe_imp.text[15])      ## Text is too huge to be displayed
print(dataframe_imp.index[15])

Can a robot love us better than another human can?
I discussed this with Michelle Tsng on my Podcast “Crazy Wisdom”.
15


**after dropping all the non english rows and after dropping all non essential columns `dataframe_imp` is the required dataframe****

In [14]:
title_list = dataframe_imp['title'].astype(str)   ## using astype(str) eliminates the floting type error
title_list.describe()
# title_list.to_numpy()

count                          257655
unique                          64417
top       10 new things to read in AI
freq                              186
Name: title, dtype: object

### Perform lemmatization and stem preprocessing steps on the data set

In [15]:
## Stemmer initialization
stemmer = SnowballStemmer("english")

In [16]:
## Functions for lemmatization, removal of Stopwords

def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [17]:
### Code to check the function

doc_sample = dataframe_imp[dataframe_imp['index'] == 1000].values[0][2]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
    
print(dataframe_imp[dataframe_imp['index'] == 1000].values[0][2])
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
Machine Learning Made Easy: What it is and How it Works
['Machine', 'Learning', 'Made', 'Easy:', 'What', 'it', 'is', 'and', 'How', 'it', 'Works']


 tokenized and lemmatized document: 
['machin', 'learn', 'easi', 'work']


### Processed titles

In [18]:
title_list = dataframe_imp['title'].astype(str)   ## using astype(str) eliminates the floting type error
title_list.describe()

count                          257655
unique                          64417
top       10 new things to read in AI
freq                              186
Name: title, dtype: object

In [19]:
## The titles are preprocessed and saved into processd_docs

processed_titles = title_list.map(preprocess)
processed_titles[30:40]

34    [meta, model, meta, meta, model, deep, learn]
35    [meta, model, meta, meta, model, deep, learn]
36               [tip, data, scienc, team, succeed]
37               [tip, data, scienc, team, succeed]
38               [tip, data, scienc, team, succeed]
39                                   [trust, trust]
40                                   [trust, trust]
41                                   [trust, trust]
42                                   [trust, trust]
43                                   [trust, trust]
Name: title, dtype: object

# Step_2: Creation of the Bag of words
Bag of words is a frequency count of the words occuring in the `preprocessed_docs`

In [20]:
## bow --> Bag of Words

bow = gensim.corpora.Dictionary(processed_titles)

## Finding out words with a min_occurance = 10

min_occurance = 10
count = 0
for k, v in bow.iteritems():
    print(k, v)
    count += 1
    if count > min_occurance:    # We can limit the selection based on the frequency
        break

0 blockchain
1 busi
2 govern
3 privat
4 draft
5 word
6 analyst
7 ascent
8 data
9 program
10 scienc


### Filtering tokens based on
* less than 15 occurances in titles
* more than 0.5 of total titles
* after the above two steps, keep only the first 100000 most frequent tokens.


In [21]:
bow.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
print(bow)

Dictionary(5031 unique tokens: ['blockchain', 'busi', 'govern', 'privat', 'draft']...)


### Generating the doc2bow dictionary

For each title we create a dictionary reporting how many words and how many times those words appear. This is saved to the `bow_corpus`.

##### **NOTE:** This step gives a simmillar result for a very small corpus such as title of the articles, but it is important while working on the actual body of the articles.

In [22]:
bow_corpus = [bow.doc2bow(doc) for doc in processed_titles]
bow_corpus[:10]

[[(0, 1), (1, 1), (2, 1), (3, 1)],
 [(0, 1), (1, 1), (2, 1), (3, 1)],
 [(0, 1), (1, 1), (2, 1), (3, 1)],
 [(4, 1), (5, 1)],
 [(4, 1), (5, 1)],
 [(4, 1), (5, 1)],
 [(4, 1), (5, 1)],
 [(6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1)],
 [(6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1)],
 [(6, 1), (7, 1), (8, 2), (9, 1), (10, 1), (11, 1)]]

### Preview of the BOW for the preprocessed titles

In [23]:
## A example of the BOW for the 1000th title

bow_example = bow_corpus[1000]
for i in range(len(bow_example)):
    print("Word {} (\"{}\") appears {} time.".format(bow_example[i][0], 
           bow[bow_example[i][0]], 
           bow_example[i][1]))

Word 126 ("fake") appears 1 time.
Word 294 ("photo") appears 1 time.
Word 610 ("believ") appears 1 time.
Word 611 ("game") appears 1 time.
Word 612 ("generat") appears 1 time.
Word 613 ("mous") appears 1 time.


# Step_3: TF-IDF 
TF-IDF stands for term frequency–inverse document frequency. The higher the TF-IDF score the rarer a word is in a given corpus and vice-versa. We will be using the TF-IDF model for the gensim models library.

In [24]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# from pprint import pprint

for i in corpus_tfidf:
    print(i)
    break

[(0, 0.37920465251741053), (1, 0.3610050132437884), (2, 0.5619103680400833), (3, 0.640418574223968)]


# Step_4: Running LDA algo on the bag of words
Testing LDA(Latent Dirichlet allocation) on the BOW. We will be training our LDA model using `gensim.models.LdaMulticore` and save it to `lda_model`

In [28]:
## LDA when the num_topics = 10

lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=bow, passes=2, workers=2)

**For each topic, we will explore the words occuring in that topic and its relative weight.**

In [29]:
for idx, topic in lda_model.print_topics(-1):
    print('\nTopic: {} \nWords: {}'.format(idx, topic))


Topic: 0 
Words: 0.260*"learn" + 0.155*"machin" + 0.080*"deep" + 0.018*"regress" + 0.015*"guid" + 0.013*"introduct" + 0.012*"model" + 0.012*"linear" + 0.011*"python" + 0.009*"beginn"

Topic: 1 
Words: 0.028*"start" + 0.028*"drive" + 0.027*"tensorflow" + 0.026*"model" + 0.022*"self" + 0.021*"problem" + 0.020*"blockchain" + 0.017*"get" + 0.017*"meet" + 0.015*"team"

Topic: 2 
Words: 0.039*"robot" + 0.035*"custom" + 0.031*"human" + 0.027*"digit" + 0.024*"experi" + 0.017*"think" + 0.016*"year" + 0.015*"interact" + 0.014*"bot" + 0.014*"case"

Topic: 3 
Words: 0.057*"futur" + 0.036*"design" + 0.033*"work" + 0.027*"thing" + 0.024*"real" + 0.024*"time" + 0.023*"convers" + 0.022*"read" + 0.021*"better" + 0.020*"need"

Topic: 4 
Words: 0.258*"data" + 0.088*"scienc" + 0.026*"scientist" + 0.026*"python" + 0.025*"visual" + 0.025*"analyt" + 0.015*"analysi" + 0.014*"person" + 0.012*"social" + 0.010*"explor"

Topic: 5 
Words: 0.086*"chatbot" + 0.026*"autom" + 0.026*"develop" + 0.026*"predict" + 0.025

<h3 style="color:red;">TODO: Visualization needs to be done to rank the topics based on the weights of words.</h3>