In [2]:
#pip install gensim
import os
import re
import pandas as pd
from string import punctuation
import nltk
from nltk.corpus import stopwords
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LsiModel, LdaModel,CoherenceModel
from textblob import TextBlob
from collections import Counter 

In [3]:
#reading the data file using pandas
df = pd.read_csv("BBC-articles.csv")

In [4]:
#stopwords
stop_words = stopwords.words('english')
#initializing lemmatizer


In [18]:
#cleaning the text by removing the punctuation at ends of the string, making it lower, 
#removing whitespace characters and extra spaces
#clean method specifies which method to use for cleaning.
def clean_text(text,clean_method):
    text = text.strip(punctuation).lower()
    text = re.sub(r'[^a-zA-Z]',' ',text)
    text = re.sub(r' +',' ',text)
    #tokenizing using the nltk tokenize for generating words in an article
    words = nltk.tokenize.word_tokenize(text)
    words = [w for w in words if w not in stop_words and len(w)>2]
    if(clean_method == 1):
        #initializing the lemmatizer
        lemma = nltk.stem.WordNetLemmatizer()
        lemmatized = [lemma.lemmatize(w) for w in words]
        return lemmatized
    elif(clean_method == 2):
        #joining back to a string to extract nouns using textblob methods
        modified_text=' '.join([w for w in words])
        blob_object = TextBlob(modified_text)
        #Limiting the word list with nouns
        word_list_nouns = [word for word,pos in blob_object.tags if (pos == 'NN' or pos == 'NNP' or pos == 'NNS' or pos == 'NNPS')]
        return word_list_nouns
    


In [6]:
# Determining optimum number of topics using coherence values 
def maxCoherence(model,corpus,dict_1,token_1):
    coherence_values = []
    min_topics, max_topics, step = 1, 5, 1
    for i in range(min_topics, max_topics, step):
        m = model(corpus,id2word = dict_1,num_topics = i)       
        coherencemodel = CoherenceModel(model=m,texts = token_1, dictionary=dict_1, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
    return coherence_values.index(max(coherence_values))



In [26]:
#to build model based on the parameters
def model(model,corpus,dict_1,num_topics):
    return model(corpus,id2word = dict_1, num_topics = num_topics)

In [16]:
def tfidf_preparation(method,clean_method):
    #method is to prepare td-idf according to 1 and 2 questions and clean method is to prepare according to 3rd question
    #cleaning the text and creating a new list to store it
    #method is used to solved 3 questions at once by passing parameters
    token = []
    for i in range(0,len(df)):
        token.append(clean_text(df['text'][i],clean_method))
    #creating a dictionary from tokens
    if(method == 1):
        dict_1 = Dictionary(token)
        #creating document term matrix using doc2bow method from dictionary
        dtm = [dict_1.doc2bow(doc) for doc in token]
        # TF-IDF Vectorization
        tfidf = TfidfModel(dtm)
        tfidf = tfidf[dtm]
        dict1 = dict_1
        return tfidf,dict1,token
    elif(method == 2):
        dict_1 = Dictionary(token)
        dict_1.filter_extremes(no_below=5, no_above=0.90) # filtering the top10% and words less than 5 times in docs
        dtm = [dict_1.doc2bow(doc) for doc in token]
        tfidf = TfidfModel(dtm) #tf-idf vectorization
        tfidf = tfidf[dtm]
        dict1 = dict_1
        return tfidf,dict1,token
    elif(method == 3):
        dict_1 = Dictionary(token)
        #creating document term matrix using doc2bow method from dictionary
        dtm = [dict_1.doc2bow(doc) for doc in token]
        # TF-IDF Vectorization
        tfidf = TfidfModel(dtm)
        tfidf = tfidf[dtm]
        dict1 = dict_1
        return tfidf,dict1,token
    
    

In [9]:
# Get dominant topic and corresponding keywords for each article
def keywords_generation(model,tfidf):
    keywords_df = pd.DataFrame()   
    for i, row in enumerate(model[tfidf]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the central topic and Keywords for each article
        for j, (topic_num, topic) in enumerate(row):
            if j == 0:  # row is sorted so every first element in the list is the dominant keyword
                keywords = model.show_topic(topic_num, topn = 5) # top 5 keywords
                words = ' '.join([w for w,p in keywords])
                keywords_df = keywords_df.append(pd.Series([words]), ignore_index=True)
 
    return keywords_df
         
    


In [21]:
#calculating coherence
lsi_coherence1 = maxCoherence(LsiModel,tfidf_preparation(1,1)[0],tfidf_preparation(1,1)[1],tfidf_preparation(1,1)[2])
lda_coherence2 = maxCoherence(LdaModel,tfidf_preparation(1,1)[0],tfidf_preparation(1,1)[1],tfidf_preparation(1,1)[2])
lsi_coherence3 = maxCoherence(LsiModel,tfidf_preparation(2,1)[0],tfidf_preparation(2,1)[1],tfidf_preparation(2,1)[2])
lda_coherence4 = maxCoherence(LdaModel,tfidf_preparation(2,1)[0],tfidf_preparation(2,1)[1],tfidf_preparation(2,1)[2])
lsi_coherence5 = maxCoherence(LsiModel,tfidf_preparation(3,2)[0],tfidf_preparation(3,2)[1],tfidf_preparation(3,2)[2])
lda_coherence6 = maxCoherence(LdaModel,tfidf_preparation(3,2)[0],tfidf_preparation(3,2)[1],tfidf_preparation(3,2)[2])


In [22]:
print("optimal number of topics for BBC articles are", lsi_coherence1,lda_coherence2,lsi_coherence3,lda_coherence4,lsi_coherence5,lda_coherence6)

optimal number of topics for BBC articles are 3 3 1 2 1 3


In [27]:
#building models
lsi_model1 = model(LsiModel,tfidf_preparation(1,1)[0],tfidf_preparation(1,1)[1],lsi_coherence1)
lda_model2 = model(LdaModel,tfidf_preparation(1,1)[0],tfidf_preparation(1,1)[1],lda_coherence2)
lsi_model3 = model(LsiModel,tfidf_preparation(2,1)[0],tfidf_preparation(2,1)[1],lsi_coherence3)
lda_model4 = model(LdaModel,tfidf_preparation(2,1)[0],tfidf_preparation(2,1)[1],lda_coherence4)
lsi_model5 = model(LsiModel,tfidf_preparation(3,2)[0],tfidf_preparation(3,2)[1],lsi_coherence5)
lda_model6 = model(LdaModel,tfidf_preparation(3,2)[0],tfidf_preparation(3,2)[1],lda_coherence6)

# 1.  after normal cleaning of the text corpus (punctuation removal, stopword removal, etc.),

In [28]:
df['LSI_keywords1'] = keywords_generation(lsi_model1,tfidf_preparation(1,1)[0])
df['LDA_keywords1'] = keywords_generation(lda_model2,tfidf_preparation(1,1)[0])

# 2. with term frequency filter, to exclude the top 10% of the most frequent words and words that appear less than 5 times in the

In [29]:
df['LSI_keywords2'] = keywords_generation(lsi_model3,tfidf_preparation(2,1)[0])
df['LDA_keywords2'] = keywords_generation(lda_model4,tfidf_preparation(2,1)[0])

# 3. with a part of speech filter, to limit your TD-IDF matrix to nouns only. 

In [30]:
df['LSI_keywords3'] = keywords_generation(lsi_model5,tfidf_preparation(3,2)[0])
df['LDA_keywords3'] = keywords_generation(lda_model6,tfidf_preparation(3,2)[0])

In [32]:
df.head(5)

Unnamed: 0,category,text,LSI_keywords1,LDA_keywords1,LSI_keywords2,LDA_keywords2,LSI_keywords3,LDA_keywords3
0,tech,tv future in the hands of viewers with home th...,labour election blair tax game,mobile phone film search game,labour election game film blair,mobile phone sale market profit,election blair government party people,data phones bank phone virus
1,business,worldcom boss left books alone former worldc...,labour election blair tax game,mobile phone film search game,labour election game film blair,film blair election award search,election blair government party people,data phones bank phone virus
2,sport,tigers wary of farrell gamble leicester say ...,labour election blair tax game,mobile phone film search game,labour election game film blair,film blair election award search,election blair government party people,film sales awards oil dollar
3,sport,yeading face newcastle in fa cup premiership s...,labour election blair tax game,party blair election kilroy tax,labour election game film blair,mobile phone sale market profit,election blair government party people,data phones bank phone virus
4,entertainment,ocean s twelve raids box office ocean s twelve...,labour election blair tax game,mobile phone film search game,labour election game film blair,film blair election award search,election blair government party people,film sales awards oil dollar


# Finding most common keywords among all

In [35]:
df['keyword'] = df[df.columns[2:]].apply(lambda x: ' '.join(x.dropna().astype(str)),axis=1)

In [44]:
# Getting 5 most common keywords from all the LSI and LDA Keywords
top_5_words = []
for i in df.index:
    common_keywords = df['keyword'][i].split(' ')
    most_occur = Counter(common_keywords).most_common(5) 
    top_5_words = ' '.join([word[0] for word in most_occur])

df['common_5_words'] = top_5_words

In [48]:
df.head(2)

Unnamed: 0,category,text,LSI_keywords1,LDA_keywords1,LSI_keywords2,LDA_keywords2,LSI_keywords3,LDA_keywords3,keyword,common_5_words
0,tech,tv future in the hands of viewers with home th...,labour election blair tax game,mobile phone film search game,labour election game film blair,mobile phone sale market profit,election blair government party people,data phones bank phone virus,labour election blair tax game mobile phone fi...,election blair film game party
1,business,worldcom boss left books alone former worldc...,labour election blair tax game,mobile phone film search game,labour election game film blair,film blair election award search,election blair government party people,data phones bank phone virus,labour election blair tax game mobile phone fi...,election blair film game party


In [49]:
df.to_csv('final_bbc.csv')

 From the above methods, LDA_keywords3(which is parts of speech filter(only nouns)) seems the best among all the methods because the words are data,phones,phone and virus. 4 out of the 5 keywords are related to tech category