In [None]:

# Fetch the NLTK resources this notebook relies on: 'punkt' for word_tokenize,
# 'stopwords' for the English stop-word list and 'wordnet' for WordNetLemmatizer.
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
  
# Text cleaning, data handling and gensim topic-modelling dependencies.
from nltk.corpus import stopwords
import pandas as pd
from string import punctuation
import re
from gensim.models import TfidfModel, LsiModel, CoherenceModel, LdaModel
import numpy as np
from gensim.corpora import Dictionary
from textblob import TextBlob

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
# Colab only: mount Google Drive so the dataset under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Load the patents dataset from the mounted Drive.
# NOTE(review): the path is hard-coded to one user's Drive layout; adjust it when running elsewhere.
articles = pd.read_csv("/content/drive/My Drive/Topic modelling/Patents data.csv")

In [None]:
# Keep only the columns needed for topic modelling.
# Re-assigning with errors='ignore' (instead of an in-place drop) makes this cell
# safe to re-run: the original raised KeyError once the columns were already gone.
articles = articles.drop(
    columns=['patent_date', 'patent_number', 'patent_abstract'],
    errors='ignore',
)


In [None]:
# Display the frame after the patent metadata columns have been dropped.
articles

Unnamed: 0,text
0,"""""""Barometer"""" neuron for a neural network"""
1,"""Electronic neural network for solving """"trave..."
2,3 layer liquid crystal neural network with out...
3,3-brain architecture for an intelligent decisi...
4,3-brain architecture for an intelligent decisi...
...,...
3517,Wireless communication system and method and s...
3518,Wireless network coverage based on quality of ...
3519,Wireless network hybrid simulation
3520,Wireless neural network and a wireless neural ...


In [None]:
def cleantext(text):
    """Tokenise one document and keep only informative lowercase words.

    Steps: trim surrounding punctuation, lowercase, turn sentence punctuation
    and line breaks/tabs into spaces, tokenise with NLTK, then keep purely
    alphabetic tokens longer than two characters that are not English
    stop words.

    Parameters
    ----------
    text : str
        Raw document text. Any non-string value (e.g. the NaN pandas uses for
        a missing cell) is treated as an empty document.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    # Missing cells arrive as float NaN / None; treat them as empty documents
    # instead of failing on `.strip`.
    if not isinstance(text, str):
        return []

    text = text.strip(punctuation).lower()
    # Replace separators with a space rather than deleting them: deleting glued
    # neighbouring words together ("neural\nnetwork" -> "neuralnetwork").
    text = re.sub(r'[!?,.\:;\n\t]+', ' ', text)

    # Build the stop-word set once per call; the original re-read the whole
    # stop-word list for every single token.
    stop_words = set(stopwords.words('english'))

    tokens = nltk.tokenize.word_tokenize(text)
    return [
        w for w in tokens
        if w.isalpha() and len(w) > 2 and w not in stop_words
    ]

In [None]:
def tfidf_maker(articles,clean_method):
    """Tokenise every document and build the matching gensim Dictionary.

    Parameters
    ----------
    articles : pandas.DataFrame
        Frame with a 'text' column holding one document per row.
    clean_method : int
        1 - basic cleaning followed by WordNet lemmatisation.
        2 - basic cleaning; the dictionary is then pruned with
            ``filter_extremes(no_below=5, no_above=0.90)``, i.e. tokens found
            in fewer than 5 documents or in more than 90% of the documents
            are removed from the dictionary.
        3 - basic cleaning, then only tokens tagged as nouns by TextBlob
            (NN, NNP, NNS, NNPS) are kept.

    Returns
    -------
    (Dictionary, list of list of str)
        The dictionary and the per-document token lists. For method 2 the
        token lists are NOT pruned; only the dictionary is.

    Raises
    ------
    ValueError
        If ``clean_method`` is not 1, 2 or 3 (the original silently returned
        None, which only failed later when the caller unpacked the result).
    """
    if clean_method not in (1, 2, 3):
        raise ValueError(
            "clean_method must be 1, 2 or 3, got %r" % (clean_method,)
        )

    # Basic cleaning is shared by all three methods.
    cleaned_docs = (cleantext(text) for text in articles['text'])

    if clean_method == 1:
        # One lemmatizer for the whole corpus instead of one per document.
        lemmatizer = nltk.stem.WordNetLemmatizer()
        token = [[lemmatizer.lemmatize(w) for w in words] for words in cleaned_docs]
    elif clean_method == 2:
        token = list(cleaned_docs)
    else:
        noun_tags = ('NN', 'NNP', 'NNS', 'NNPS')
        token = [
            [word for word, pos in TextBlob(' '.join(words)).tags if pos in noun_tags]
            for words in cleaned_docs
        ]

    my_dict = Dictionary(token)  # maps each distinct token to an integer id
    if clean_method == 2:
        # Drop very rare (< 5 documents) and near-ubiquitous (> 90% of documents) tokens.
        my_dict.filter_extremes(no_below=5, no_above=0.90)
    return my_dict,token

In [None]:

# Determining optimum number of topics using coherence values
def maxCoherence(corpus, isLsi,my_dict,token, min_topics=1, max_topics=10, step=1):
    """Return the number of topics that maximises c_v coherence.

    One model is trained for each topic count in
    ``range(min_topics, max_topics, step)`` (``max_topics`` is exclusive) and
    scored with gensim's CoherenceModel.

    Parameters
    ----------
    corpus : iterable
        Vectorised corpus passed straight to the model constructor.
    isLsi : bool
        True to evaluate LsiModel, False to evaluate LdaModel.
    my_dict : Dictionary
        id -> word mapping used by the model and the coherence measure.
    token : list of list of str
        Tokenised documents, required by the 'c_v' coherence measure.
    min_topics, max_topics, step : int, optional
        Topic counts to evaluate; the defaults reproduce the original 1..9 sweep.

    Returns
    -------
    int
        The best-scoring number of topics. The original returned the list
        *index* of the best score, which is one less than the topic count
        (and 0, an invalid topic count, when a single topic scored best).

    Raises
    ------
    ValueError
        If the requested range contains no topic counts.
    """
    topic_counts = list(range(min_topics, max_topics, step))
    if not topic_counts:
        raise ValueError("no topic counts to evaluate: empty range")

    coherence_values = []
    for num_topics in topic_counts:
        if (isLsi) :
            model = LsiModel(corpus, id2word=my_dict, num_topics=num_topics)
        else:
            model = LdaModel(corpus, id2word=my_dict, num_topics=num_topics)
        coherencemodel = CoherenceModel(model=model, texts=token, dictionary=my_dict, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
        print(coherence_values)  # running progress log, one line per evaluated topic count

    # Map the position of the best score back to the topic count it belongs to.
    best_position = coherence_values.index(max(coherence_values))
    return topic_counts[best_position]

In [None]:
# Get dominant topic and corresponding keywords for each article
def getkeywords(model, corpus): 
    """Return the top-5 keywords of each document's dominant topic.

    Parameters
    ----------
    model : topic model
        Object supporting ``model[corpus]`` (yielding, per document, a sequence
        of ``(topic_id, weight)`` pairs) and ``model.show_topic(topic_id, topn)``.
    corpus : iterable
        Vectorised documents, in the same order as the rows they describe.

    Returns
    -------
    pandas.DataFrame
        Single column ``0`` with exactly one row per document: the dominant
        topic's keywords joined with ", ", or None when the model assigns the
        document no topic at all.
    """
    keywords_by_topic = {}   # topic id -> keyword string, computed once per topic
    doc_keywords = []

    for row in model[corpus]:
        if len(row) == 0:
            # Keep a placeholder so later rows stay aligned with their
            # documents; the original skipped such documents, shifting every
            # following keyword row up by one.
            doc_keywords.append(None)
            continue

        # Dominant topic = highest weight (first one wins on ties).
        topic_num, _weight = max(row, key=lambda pair: pair[1])
        if topic_num not in keywords_by_topic:
            wp = model.show_topic(topic_num, topn=5)  # top 5 (word, weight) pairs
            keywords_by_topic[topic_num] = ", ".join([word for word, prop in wp])
        doc_keywords.append(keywords_by_topic[topic_num])

    # Build the frame once; DataFrame.append was removed in pandas 2.0 and was
    # quadratic when called row by row.
    return pd.DataFrame({0: doc_keywords}, dtype=object)

In [None]:
def models_method(clean_method):
    """Train an LSI and an LDA model on the TF-IDF weighted corpus.

    The documents come from the notebook-level ``articles`` frame and are
    tokenised with ``tfidf_maker`` using ``clean_method``. The number of topics
    for each model is whatever ``maxCoherence`` returns for that model type.

    Returns
    -------
    tuple
        ``(lsi_model, lda_model, tfidf_corpus)`` where ``tfidf_corpus`` is the
        TF-IDF transformed bag-of-words corpus both models were trained on.
    """
    dictionary, documents = tfidf_maker(articles, clean_method)

    # Bag-of-words vectors, then TF-IDF re-weighting of those vectors.
    bow_corpus = [dictionary.doc2bow(doc) for doc in documents]
    tfidf_corpus = TfidfModel(bow_corpus)[bow_corpus]

    # Gensim: LSI
    lsi_topics = maxCoherence(tfidf_corpus, isLsi=True, my_dict=dictionary, token=documents)
    lsi_model = LsiModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=lsi_topics)

    # Gensim: LDA
    lda_topics = maxCoherence(tfidf_corpus, isLsi=False, my_dict=dictionary, token=documents)
    lda_model = LdaModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=lda_topics)

    return lsi_model, lda_model, tfidf_corpus

In [None]:
# Cleaning method 1 (lemmatised tokens): train both models, then attach the
# top-5 keywords of each document's dominant topic to the dataframe.
# (The stray `from __future__ import division` was removed: it is a no-op on
# Python 3 and nothing in this cell divides.)
lsi_model_1,lda_model_1,tfidf = models_method(1)
articles['LSI Clean Keywords'] = getkeywords(model=lsi_model_1, corpus=tfidf)
articles['LDA Clean Keywords'] = getkeywords(model=lda_model_1, corpus=tfidf)

[0.1695129409900938]
[0.1695129409900938, 0.22134925005534337]
[0.1695129409900938, 0.22134925005534337, 0.2708191942654471]
[0.1695129409900938, 0.22134925005534337, 0.2708191942654471, 0.2591903783406908]
[0.1695129409900938, 0.22134925005534337, 0.2708191942654471, 0.2591903783406908, 0.3976668300772702]
[0.1695129409900938, 0.22134925005534337, 0.2708191942654471, 0.2591903783406908, 0.3976668300772702, 0.36438783582002077]
[0.1695129409900938, 0.22134925005534337, 0.2708191942654471, 0.2591903783406908, 0.3976668300772702, 0.36438783582002077, 0.34973964443469213]
[0.1695129409900938, 0.22134925005534337, 0.2708191942654471, 0.2591903783406908, 0.3976668300772702, 0.36438783582002077, 0.34973964443469213, 0.323545224389016]
[0.1695129409900938, 0.22134925005534337, 0.2708191942654471, 0.2591903783406908, 0.3976668300772702, 0.36438783582002077, 0.34973964443469213, 0.323545224389016, 0.36359178064829667]
[0.14924432231144985]
[0.14924432231144985, 0.19102963061667672]
[0.149244322

In [None]:
# Spot-check the first rows after attaching the method-1 keyword columns.
articles.head(3)

Unnamed: 0,text,LSI Clean Keywords,LDA Clean Keywords
0,"""""""Barometer"""" neuron for a neural network""","recognition, pattern, speech, neural, network","network, neural, system, learning, method"
1,"""Electronic neural network for solving """"trave...","processing, image, network, neural, apparatus","system, neural, processing, network, process"
2,3 layer liquid crystal neural network with out...,"processing, image, network, neural, apparatus","neural, network, system, speech, recognition"


In [None]:
# Cleaning method 2 (dictionary pruned with filter_extremes): train both models.
lsi_model_2,lda_model_2,tfidf = models_method(2)
# Attach the top-5 keywords of each document's dominant topic, one column per model.
articles['LSI Clean Keywords 2'] = getkeywords(model=lsi_model_2, corpus=tfidf)
articles['LDA Clean Keywords 2'] = getkeywords(model=lda_model_2, corpus=tfidf)

[0.1525778522171698]
[0.1525778522171698, 0.31621061509413284]
[0.1525778522171698, 0.31621061509413284, 0.23481758109714912]
[0.1525778522171698, 0.31621061509413284, 0.23481758109714912, 0.24977782111065325]
[0.1525778522171698, 0.31621061509413284, 0.23481758109714912, 0.24977782111065325, 0.41721847457632516]
[0.1525778522171698, 0.31621061509413284, 0.23481758109714912, 0.24977782111065325, 0.41721847457632516, 0.42716544288355457]
[0.1525778522171698, 0.31621061509413284, 0.23481758109714912, 0.24977782111065325, 0.41721847457632516, 0.42716544288355457, 0.2768968206321861]
[0.1525778522171698, 0.31621061509413284, 0.23481758109714912, 0.24977782111065325, 0.41721847457632516, 0.42716544288355457, 0.2768968206321861, 0.36968739673372986]
[0.1525778522171698, 0.31621061509413284, 0.23481758109714912, 0.24977782111065325, 0.41721847457632516, 0.42716544288355457, 0.2768968206321861, 0.36968739673372986, 0.3153517203277904]
[0.12236802729907556]
[0.12236802729907556, 0.2040248215983

In [None]:
# Spot-check the first rows after attaching the method-2 keyword columns.
articles.head(3)

Unnamed: 0,text,LSI Clean Keywords,LDA Clean Keywords,LSI Clean Keywords 2,LDA Clean Keywords 2
0,"""""""Barometer"""" neuron for a neural network""","recognition, pattern, speech, neural, network","network, neural, system, learning, method","network, neural, system, method, using","recognition, network, neural, pattern, system"
1,"""Electronic neural network for solving """"trave...","processing, image, network, neural, apparatus","system, neural, processing, network, process","network, neural, system, method, using","networks, system, neural, network, method"
2,3 layer liquid crystal neural network with out...,"processing, image, network, neural, apparatus","neural, network, system, speech, recognition","network, neural, system, method, using","neural, network, circuit, optical, learning"


In [None]:
# Cleaning method 3 (nouns only) needs the POS tagger model for TextBlob's tagging.
# Removed the unused `from fractions import Fraction` and the
# `from __future__ import division` that followed other statements in the cell
# (a no-op on Python 3, and a SyntaxError wherever the cell is compiled as one unit).
nltk.download('averaged_perceptron_tagger')
lsi_model_3,lda_model_3,tfidf = models_method(3)
# Attach the top-5 keywords of each document's dominant topic, one column per model.
articles['LSI Clean Keywords 3'] = getkeywords(model=lsi_model_3, corpus=tfidf)
articles['LDA Clean Keywords 3'] = getkeywords(model=lda_model_3, corpus=tfidf)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[0.26174636046263694]
[0.26174636046263694, 0.4399516048795701]
[0.26174636046263694, 0.4399516048795701, 0.3673018411455235]
[0.26174636046263694, 0.4399516048795701, 0.3673018411455235, 0.5399177400432231]
[0.26174636046263694, 0.4399516048795701, 0.3673018411455235, 0.5399177400432231, 0.48775682964990397]
[0.26174636046263694, 0.4399516048795701, 0.3673018411455235, 0.5399177400432231, 0.48775682964990397, 0.548776632351767]
[0.26174636046263694, 0.4399516048795701, 0.3673018411455235, 0.5399177400432231, 0.48775682964990397, 0.548776632351767, 0.5209840145863566]
[0.26174636046263694, 0.4399516048795701, 0.3673018411455235, 0.5399177400432231, 0.48775682964990397, 0.548776632351767, 0.5209840145863566, 0.446321807762332]
[0.26174636046263694, 0.4399516048795701, 0.3673018411455235, 0.539917740043

In [None]:
# Spot-check the first rows after attaching the method-3 keyword columns.
articles.head(3)

Unnamed: 0,text,LSI Clean Keywords,LDA Clean Keywords,LSI Clean Keywords 2,LDA Clean Keywords 2,LSI Clean Keywords 3,LDA Clean Keywords 3
0,"""""""Barometer"""" neuron for a neural network""","recognition, pattern, speech, neural, network","network, neural, system, learning, method","network, neural, system, method, using","recognition, network, neural, pattern, system","networks, network, control, recognition, method","recognition, process, network, system, method"
1,"""Electronic neural network for solving """"trave...","processing, image, network, neural, apparatus","system, neural, processing, network, process","network, neural, system, method, using","networks, system, neural, network, method","networks, network, control, recognition, method","recognition, process, network, system, method"
2,3 layer liquid crystal neural network with out...,"processing, image, network, neural, apparatus","neural, network, system, speech, recognition","network, neural, system, method, using","neural, network, circuit, optical, learning","networks, network, control, recognition, method","recognition, process, network, system, method"


In [None]:
# Combine the keywords from LSI and LDA (all three cleaning methods) into one
# 'keyword' column.
# Select the source columns by name: the positional slice `columns[2:]` skipped
# 'LSI Clean Keywords' (only 'text' precedes the keyword columns) and would also
# pick up derived columns if this cell were re-run.
keyword_columns = [col for col in articles.columns if 'Clean Keywords' in str(col)]
articles['keyword'] = articles[keyword_columns].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1)

In [None]:
from collections import Counter 


def _top_words(keyword_string, n=5):
    """Return the n most frequent comma-separated keywords, joined with ','."""
    # The per-model keyword strings are ', '-separated, so tokens must be
    # stripped: otherwise ' network' and 'network' are counted as two
    # different words. Empty tokens (rows with no keywords) are dropped.
    words = [w.strip() for w in keyword_string.split(',')]
    words = [w for w in words if w]
    return ','.join(word for word, _count in Counter(words).most_common(n))


# One pass over the column instead of row-by-row .loc assignment.
articles['Top 5 Words'] = articles['keyword'].apply(_top_words)

In [None]:
# Drop the intermediate 'keyword' helper column (the concatenation of every keyword column).
articles = articles.drop(columns=['keyword']) #every keyword
# Write the result to CSV without the index.
# NOTE(review): the file name says "BBC" although the data loaded above is the patents dataset — confirm the intended name.
articles.to_csv('BBC_Keywords.csv',index=False,encoding='utf-8') #write to csv

In [None]:
# Display the final frame: text, the six per-model keyword columns and 'Top 5 Words'.
articles

Unnamed: 0,text,LSI Clean Keywords,LDA Clean Keywords,LSI Clean Keywords 2,LDA Clean Keywords 2,LSI Clean Keywords 3,LDA Clean Keywords 3,Top 5 Words
0,"""""""Barometer"""" neuron for a neural network""","recognition, pattern, speech, neural, network","network, neural, system, learning, method","network, neural, system, method, using","recognition, network, neural, pattern, system","networks, network, control, recognition, method","recognition, process, network, system, method","system, method, neural, network,network"
1,"""Electronic neural network for solving """"trave...","processing, image, network, neural, apparatus","system, neural, processing, network, process","network, neural, system, method, using","networks, system, neural, network, method","networks, network, control, recognition, method","recognition, process, network, system, method","network, method, neural, system, process"
2,3 layer liquid crystal neural network with out...,"processing, image, network, neural, apparatus","neural, network, system, speech, recognition","network, neural, system, method, using","neural, network, circuit, optical, learning","networks, network, control, recognition, method","recognition, process, network, system, method","network, system, method,neural, recognition"
3,3-brain architecture for an intelligent decisi...,"processing, image, network, neural, apparatus","network, neural, system, learning, method","control, system, image, processing, apparatus","network, neural, model, using, system","recognition, control, system, speech, network","networks, detection, system, images, method","system,network, neural, method, learning"
4,3-brain architecture for an intelligent decisi...,"processing, image, network, neural, apparatus","network, neural, system, learning, method","control, system, image, processing, apparatus","network, neural, model, using, system","recognition, control, system, speech, network","networks, detection, system, images, method","system,network, neural, method, learning"
...,...,...,...,...,...,...,...,...
3517,Wireless communication system and method and s...,"processing, image, network, neural, apparatus","pattern, recognition, system, network, neural",,"networks, system, neural, network, method","recognition, control, system, speech, network","system, image, systems, control, network","network, system, neural, control,pattern"
3518,Wireless network coverage based on quality of ...,"processing, image, network, neural, apparatus","neural, network, training, electronic, method",,"networks, system, neural, network, method",,"networks, detection, system, images, method","method, network,networks, system,neural"
3519,Wireless network hybrid simulation,"recognition, pattern, speech, neural, network","pattern, recognition, system, network, neural",,"network, neural, data, system, method",,"system, image, systems, control, network","system, network, neural,pattern, recognition"
3520,Wireless neural network and a wireless neural ...,"processing, image, network, neural, apparatus","learning, neural, network, system, method",,"networks, system, neural, network, method",,"system, image, systems, control, network","network, neural, system, method,learning"
