# Trend Mining 

## Latent Dirichlet Allocation (LDA)

In this notebook you will train and build an LDA model.
- Configurations for this notebook can be found in the **LDA.yaml** file inside the **Config** folder
- Make sure you follow the setup instructions in **Readme.md** and have installed all the packages required for this task

### Load packages

In [None]:
import os
import yaml
import spacy
import pyLDAvis
import warnings
import numpy as np
import pandas as pd
import pyLDAvis.sklearn
from yaspin import yaspin
from ast import literal_eval
import statsmodels.api as sma
from collections import Counter
from yaml.loader import SafeLoader
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

pyLDAvis.enable_notebook()
warnings.filterwarnings('ignore')
# !python -m spacy download en_core_web_sm  # Run this once after installation; afterwards you can comment it out again

### Load Config files

In [None]:
# Load the general miner configuration. Later cells read the keys
# OUTPUT_PATH, STORAGE_PATH and the *_DATA_CSV file names from it.
with open('../Config/Miners.yaml') as f:
    config = yaml.load(f, Loader=SafeLoader)
print('General Config:', config)

In [None]:
# Load the LDA-specific configuration. Later cells read the keys
# ALLOWED_POSTAGS, MODEL_HYPER_PARAMETERS, SEARCH_PARAMS,
# ABASTRACT_FOR_TOPIC_NUMBER and YEAR from it.
with open('../Config/LDA.yaml') as f:
    LDAConfig = yaml.load(f, Loader=SafeLoader)
print('LDA Config:', LDAConfig)

### Common function and class

In [None]:
def readFile(file, path):
    """Load a CSV file located under the parent of the current working directory.

    Args:
        file (str): Name of the CSV file.
        path (str): Folder, relative to the parent of the working directory,
            that contains the file.

    Returns:
        pandas.DataFrame | None: The loaded data with the first CSV column used
        as index, or None when the file could not be read (the error is printed).
    """
    spinner = None
    try:
        spinner = yaspin()
        base_dir = os.path.dirname(os.path.abspath(os.getcwd()))
        # os.path.join picks the separator of the running platform, so the
        # notebook is no longer tied to Windows-style backslashes.
        complete_path = os.path.join(base_dir, path, file)
        file_data = pd.read_csv(complete_path, index_col=0)
        spinner.write("✔️ File loaded.")
        return file_data
    except Exception as e:
        # Best effort on purpose: the notebook reports the problem and carries on.
        print('Error reading file',e)
        return None
    finally:
        # Release the spinner on the failure path as well.
        if spinner is not None:
            spinner.stop()

#### Common class

In [None]:
class LDA():
    """This is the class implementation for calculating LDA
    """
    def __init__(self, data_frame):
        """Keep the input data and prepare empty slots for every pipeline result.

        Args:
            data_frame (pandas.DataFrame): Data set the LDA pipeline works on.
        """
        self.data_frame = data_frame
        # Each result slot starts as an empty string and is overwritten by the
        # pipeline step that produces it.
        result_slots = (
            "dirName",
            "tokenized",
            "lemmatized",
            "vectorizer",
            "vectorized",
            "lda_model",
            "best_lda_model",
            "best_model_output",
            "df_document_topic",
            "df_topic_distribution",
            "df_topic_keywords",
            "hot",
            "cold",
        )
        for slot in result_slots:
            setattr(self, slot, "")
        # Shared spinner used by the steps to report progress.
        self.spinner = yaspin()
    
    

    def createOutputDir(self, dirName):
        """Create the folder that stores the generated reports and figures.

        The folder is ``<parent of cwd>/<OUTPUT_PATH>/LDA/<dirName>`` where
        ``OUTPUT_PATH`` comes from the general configuration.

        Args:
            dirName (str): Name of the output folder
        """
        self.dirName = dirName
        # Build the path with os.path.join so it is valid on every platform.
        complete_path = os.path.join(
            os.path.dirname(os.path.abspath(os.getcwd())),
            config['OUTPUT_PATH'],
            'LDA',
            self.dirName,
        )
        if os.path.exists(complete_path):
            self.spinner.write("✔️ Output directory already exists.")
        else:
            # exist_ok guards against the folder appearing between check and creation.
            os.makedirs(complete_path, exist_ok=True)
            self.spinner.write("✔️ Folder created for output storage")
    def mergeTokenizedData(self):
        """This function converts the string representation of tokenized data into list
        """
        tokenized_rows = []
        for index, row in self.data_frame.iterrows(): 
            tokenized_rows.append(literal_eval(row["Tokenized_data"]))
        self.tokenized = tokenized_rows
        self.spinner.write("✔️ Tokenized data merged")
    
    def lemmatization(self, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
        """Lemmatize the tokenized documents, keeping only selected parts of speech.

        The result (one space-joined string per document) is stored in
        ``self.lemmatized``.

        Args:
            allowed_postags (list, optional): Part-of-speech tags to keep. Defaults to ['NOUN', 'ADJ', 'VERB', 'ADV'].
        """
        # Parser and NER are disabled; only lemmas and POS tags are read below.
        nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
        lemmatized_documents = []
        for tokens in self.tokenized:
            kept_lemmas = []
            for token in nlp(" ".join(tokens)):
                if token.pos_ not in allowed_postags:
                    continue
                lemma = token.lemma_
                # A '-PRON-' placeholder lemma is replaced by an empty string.
                kept_lemmas.append('' if lemma == '-PRON-' else lemma)
            lemmatized_documents.append(" ".join(kept_lemmas))
        self.lemmatized = lemmatized_documents
        self.spinner.write("✔️ Lemmitization applied.")
        
    def vectorization(self):
        """Convert the lemmatized documents into a document-term count matrix.

        The fitted vectorizer is kept in ``self.vectorizer`` and the resulting
        matrix in ``self.vectorized``.
        """
        vectorizer_options = {
            'analyzer': 'word',
            'min_df': 10,                         # drop terms found in fewer than 10 documents
            'stop_words': 'english',              # remove English stop words
            'lowercase': True,                    # convert all words to lowercase
            'token_pattern': '[a-zA-Z0-9]{3,}',   # tokens of at least 3 alphanumeric chars
        }
        self.vectorizer = CountVectorizer(**vectorizer_options)
        document_term_matrix = self.vectorizer.fit_transform(self.lemmatized)
        self.vectorized = document_term_matrix
        self.spinner.write("✔️ Data vectorized")

    def computeSparsicity(self):
        """This function computes the sparsicity
        """
        data_dense = self.vectorized.todense()
        print("Sparsicity: ", ((data_dense > 0).sum()/data_dense.size)*100, "%")

    def buildLDAModel(self, hyper_parameters):
        """Fit an LDA model and print its parameters, log-likelihood and perplexity.

        The fitted model is stored in ``self.lda_model``.

        Args:
            hyper_parameters (dict): Values for ``n_components``, ``max_iter``,
                ``learning_method``, ``random_state``, ``batch_size``,
                ``evaluate_every``, ``n_jobs``, ``doc_topic_prior``,
                ``learning_decay`` and ``topic_word_prior``. Every value is cast
                to the type the estimator expects.
        """
        self.spinner.start()
        self.spinner.write("🤖 Model building")
        try:
            self.lda_model = LatentDirichletAllocation(
                n_components=int(hyper_parameters['n_components']),        # Number of topics
                max_iter=int(hyper_parameters['max_iter']),                # Max learning iterations
                learning_method=str(hyper_parameters['learning_method']),
                random_state=int(hyper_parameters['random_state']),        # Random state
                batch_size=int(hyper_parameters['batch_size']),            # n docs in each learning iter
                evaluate_every=int(hyper_parameters['evaluate_every']),    # compute perplexity every n iters
                n_jobs=int(hyper_parameters['n_jobs']),                    # number of parallel jobs
                doc_topic_prior=float(hyper_parameters['doc_topic_prior']),
                learning_decay=float(hyper_parameters['learning_decay']),
                topic_word_prior=float(hyper_parameters['topic_word_prior']),
            )
            # Only the fitted model is needed here; the document-topic matrix
            # is not used, so the extra transform pass is skipped.
            self.lda_model.fit(self.vectorized)
        finally:
            # Never leave the spinner running when fitting fails.
            self.spinner.stop()
        # See model parameters
        print('Model Parameters',self.lda_model.get_params())
        # Log Likelihood: Higher the better
        print("Log Likelihood: ", self.lda_model.score(self.vectorized))
        # Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
        print("Perplexity: ", self.lda_model.perplexity(self.vectorized))
        
    def visualizeLDAvis(self):
        """Generate the pyLDAvis report for ``self.lda_model`` and save it as HTML.

        The report is written to
        ``<parent of cwd>/<OUTPUT_PATH>/LDA/<dirName>/<dirName>_lda.html``.
        """
        panel = pyLDAvis.sklearn.prepare(self.lda_model, self.vectorized, self.vectorizer, mds='tsne')
        # Join the path components so the separator matches the platform.
        report_path = os.path.join(
            os.path.dirname(os.path.abspath(os.getcwd())),
            config['OUTPUT_PATH'],
            'LDA',
            self.dirName,
            f'{self.dirName}_lda.html',
        )
        pyLDAvis.save_html(panel, report_path)
        self.spinner.write('✔️ Report saved')
         
    def buildImprovisedLDAModel(self, search_params):
        """Tune an LDA model with GridSearchCV and save its pyLDAvis report.

        The best estimator is stored in ``self.best_lda_model`` and its
        document-topic matrix in ``self.best_model_output``.

        Args:
            search_params (dict): Parameter grid passed to ``GridSearchCV``.
        """
        self.spinner.start()
        self.spinner.write('🤖 Building improvised model')
        try:
            model = GridSearchCV(LatentDirichletAllocation(), param_grid=search_params)
            model.fit(self.vectorized)
            # With the default refit=True the best estimator is already fitted
            # on the full data, so it only has to be applied, not trained again.
            self.best_lda_model = model.best_estimator_
            self.best_model_output = self.best_lda_model.transform(self.vectorized)
        finally:
            # Never leave the spinner running when the search fails.
            self.spinner.stop()
        print("Best Models Params: ", model.best_params_)
        print("Best Log Likelihood Score: ", model.best_score_)
        print("Model Perplexity: ", self.best_lda_model.perplexity(self.vectorized))
        panel = pyLDAvis.sklearn.prepare(self.best_lda_model, self.vectorized, self.vectorizer, mds='tsne')
        # Join the path components so the separator matches the platform.
        report_path = os.path.join(
            os.path.dirname(os.path.abspath(os.getcwd())),
            config['OUTPUT_PATH'],
            'LDA',
            self.dirName,
            f'{self.dirName}_best_lda.html',
        )
        pyLDAvis.save_html(panel, report_path)
        self.spinner.write('✔️ Report saved')
         
    def wordsInTopics(self):
        """Display first 10 words in each topic
        """
        print('First 10 words in each topic:')
        featureNames = self.vectorizer.get_feature_names()
        for idx, topic in enumerate(self.best_lda_model.components_):
            print ("Topic ", idx, " ".join(featureNames[i] for i in topic.argsort()[:-10 - 1:-1]))       
    
    def calculateDominantTopic(self):
        """This function calculates which topic is dominant for each data point/row in the dataframe
        """
        # Create Document - Topic Matrix
        lda_output = self.best_lda_model.transform(self.vectorized)
        topicnames = ["Topic" + str(i) for i in range(self.best_lda_model.n_components)]
        docnames = ["Doc" + str(i) for i in range(len(self.data_frame))]
        self.df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
        dominant_topic = np.argmax( self.df_document_topic.values, axis=1)
        self.df_document_topic['dominant_topic'] = dominant_topic
        self.data_frame['dominant_topic'] = dominant_topic
        return self.data_frame.head(4)

    def getTopicDistribution(self):
        """This function displays the distribution of data/rows/papers per topic
        """
        self.df_topic_distribution = self.df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")
        self.df_topic_distribution.columns = ['Topic Num', 'Num Documents']
        print('Topic distribution')
        return self.df_topic_distribution.sort_values(by=['Topic Num'])

    def topKeywordsInEachTopic(self, n_words=20):
        """This function displays top keywords in each topic

        Args:
            n_words (int, optional): Number of words you want to display. Defaults to 20.
        """
        # Show top n keywords for each topic
        keywords = np.array(self.vectorizer.get_feature_names())
        topic_keywords = []
        for topic_weights in self.best_lda_model.components_:
            top_keyword_locs = (-topic_weights).argsort()[:n_words]
            topic_keywords.append(keywords.take(top_keyword_locs))
        self.df_topic_keywords = pd.DataFrame(topic_keywords)
        self.df_topic_keywords.columns = ['Word '+str(i) for i in range(self.df_topic_keywords.shape[1])] 
        self.df_topic_keywords['Topic'] = ['Topic '+str(i) for i in range(self.df_topic_keywords.shape[0])]
        self.df_topic_keywords.set_index('Topic')
        print(f'Top {n_words} words in each topic')
        return self.df_topic_keywords

    def printAbstractForTopic(self, topic=0):
        """This function prints the abstract for the given topic

        Args:
            topic (int, optional): Topic number for which you want to display the abstract. Defaults to 0.
        """
        abstract = self.data_frame[self.data_frame.dominant_topic == topic].Abstract_clean
        print(f'Abstract belonging to topic number {topic}')
        return abstract

    def topCitedTopics(self, year):
        """This function calculates the top cited topics according to total cites, topic age, paper count, cite per year and cite per topic
        """
        cite_sum = []
        topic_age = []

        for i in range(self.best_lda_model.n_components):
            group_rows = self.data_frame[self.data_frame.dominant_topic == i]
            cite_sum.append(group_rows.Cites.sum())
            topic_age.append((year - group_rows.Date.astype('datetime64[ns]').dt.year).sum())
            
        self.df_topic_distribution['Cite Sum'] = cite_sum
        self.df_topic_distribution['Topic Age'] = topic_age
        self.df_topic_distribution['Paper Count'] = self.df_topic_distribution['Num Documents']
        self.df_topic_distribution['Cite Per Year'] = self.df_topic_distribution['Cite Sum'] / self.df_topic_distribution['Topic Age']
        self.df_topic_distribution['Cite Per Topic'] = self.df_topic_distribution['Cite Sum'] / self.df_topic_distribution['Paper Count']

        # Top cited per year
        top_cited_per_year = self.df_topic_distribution[self.df_topic_distribution['Cite Per Year'] == self.df_topic_distribution['Cite Per Year'].max()]
        print('Top cited per year')
        print(self.df_topic_keywords[self.df_topic_keywords.Topic == 'Topic '+str(top_cited_per_year['Topic Num'].values[0])])

        # Most cited
        most_cited = self.df_topic_distribution[self.df_topic_distribution['Cite Sum'] == self.df_topic_distribution['Cite Sum'].max()]
        print('Most cited')
        print(self.df_topic_keywords[self.df_topic_keywords.Topic == 'Topic '+str(most_cited['Topic Num'].values[0])])

        # Oldest topic
        oldest_topic = self.df_topic_distribution[self.df_topic_distribution['Topic Age'] == self.df_topic_distribution['Topic Age'].max()]
        print('Oldest topic')
        print(self.df_topic_keywords[self.df_topic_keywords.Topic == 'Topic '+str(oldest_topic['Topic Num'].values[0])])

        # Most popular topic
        most_popular = self.df_topic_distribution[self.df_topic_distribution['Paper Count'] == self.df_topic_distribution['Paper Count'].max()]
        self.df_topic_keywords[self.df_topic_keywords.Topic == 'Topic '+str(most_popular['Topic Num'].values[0])]

    def getTopFive(self):
        """This function calculates the top five cited topics according to total cites, topic age, paper count, cite per year and cite per topic
        """
        # Top 5 cited per year
        sorted_cite_per_year = self.df_topic_distribution.sort_values(by='Cite Per Year', ascending=False)
        top_five_topic_numbers = sorted_cite_per_year[:5]
        print('Top 5 cited topics per year')
        for index, row in top_five_topic_numbers.iterrows():
            words = self.df_topic_keywords[self.df_topic_keywords.Topic == 'Topic '+str(int(row['Topic Num']))]
            print(words)

        # Top 5 most cited 
        sorted_cited = self.df_topic_distribution.sort_values(by='Cite Sum', ascending=False)
        top_five_topic_numbers = sorted_cited[:5]
        print('Top 5 Most cited topics')
        for index, row in top_five_topic_numbers.iterrows():
            words = self.df_topic_keywords[self.df_topic_keywords.Topic == 'Topic '+str(int(row['Topic Num']))]
            print(words)

        # Top 5 oldest topic
        sorted_topic_age = self.df_topic_distribution.sort_values(by='Topic Age', ascending=False)
        top_five_topic_numbers = sorted_topic_age[:5]
        print('Top 5 Oldest topics')
        for index, row in top_five_topic_numbers.iterrows():
            words = self.df_topic_keywords[self.df_topic_keywords.Topic == 'Topic '+str(int(row['Topic Num']))]
            print(words)

        # Top 5 most popular
        sorted_paper_count = self.df_topic_distribution.sort_values(by='Paper Count', ascending=False)
        top_five_topic_numbers = sorted_paper_count[:5]
        print('Top 5 most cited topics')
        for index, row in top_five_topic_numbers.iterrows():
            words = self.df_topic_keywords[self.df_topic_keywords.Topic == 'Topic '+str(int(row['Topic Num']))]
            print(words)
    
    def hotAndColdTopicByDate(self):
        medians = []
        for i in range(self.best_lda_model.n_components):
            group_rows = self.data_frame[self.data_frame.dominant_topic == i]
            median = group_rows.Date.astype('datetime64[ns]').quantile(0.5, interpolation="midpoint")
            medians.append(median)
        
        median_dates = pd.DataFrame(medians, columns=['Date'])
        median_dates['Date'].dt.date 

        self.hot = median_dates['Date'].idxmax() 
        hot_words = self.df_topic_keywords[self.df_topic_keywords.Topic == 'Topic '+str(self.hot)]
        print('Hot Words')
        print(hot_words)

        hot_topics = self.data_frame[self.data_frame['dominant_topic'] == self.hot]
        print('Hot topic titles:')
        print(hot_topics.Title_clean)

        self.cold = median_dates['Date'].idxmin() 
        cold_words = self.df_topic_keywords[self.df_topic_keywords.Topic == 'Topic '+str(self.cold)]
        print('Cold Words')
        print(cold_words)

        cold_topics = self.data_frame[self.data_frame['dominant_topic'] == self.cold]
        print('Cold topic titles')
        print(cold_topics.Title_clean)
   
    def plotTopicTrend(self):
        """Plot the number of documents per year for every topic and save the figure.

        Adds a ``Year`` column (derived from ``Date``) to ``self.data_frame``
        and writes ``<dirName>_topic_trends.png`` into the output folder.
        """
        self.data_frame['Year'] = pd.DatetimeIndex(self.data_frame['Date']).year

        yearly_counts = []
        for topic in range(self.best_lda_model.n_components):
            topic_years = self.data_frame.loc[self.data_frame.dominant_topic == topic, 'Year']
            # Rows without a valid date have no year and cannot be placed on the axis.
            yearly_counts.append(Counter(topic_years.dropna().astype(int)))

        # Rows: years (ascending); columns: topic numbers; missing combinations count as 0.
        topic_trend = pd.DataFrame(yearly_counts).fillna(0).T.sort_index()

        ax = topic_trend.plot(figsize=(20, 10), title='Topic trends')
        # Fix the tick positions before labelling them; otherwise the labels are
        # attached to whatever ticks matplotlib chose automatically.
        ax.set_xticks(list(topic_trend.index))
        ax.set_xticklabels(topic_trend.index)
        complete_path = os.path.join(
            os.path.dirname(os.path.abspath(os.getcwd())),
            config['OUTPUT_PATH'],
            'LDA',
            self.dirName,
        )
        ax.get_figure().savefig(os.path.join(complete_path, f"{self.dirName}_topic_trends.png"))
        self.spinner.write('✔️ Figure saved')

    def plotHotVsCold(self):
        """Plot yearly document counts of the hot topic against the cold topic.

        Uses ``self.hot`` and ``self.cold`` as set by ``hotAndColdTopicByDate``
        and writes ``<dirName>_hot_vs_cold.png`` into the output folder.
        """
        # Derive the years from ``Date`` here instead of relying on a ``Year``
        # column that only exists after another method has been run.
        years = pd.Series(
            pd.DatetimeIndex(self.data_frame['Date']).year.to_numpy(),
            index=self.data_frame.index,
        )
        dominant_topic = self.data_frame['dominant_topic']

        # Rows: years (ascending); a year missing for one topic counts as 0.
        combined = pd.DataFrame({
            'Hot Topic': years[dominant_topic == self.hot].value_counts(),
            'Cold Topic': years[dominant_topic == self.cold].value_counts(),
        }).fillna(0).sort_index()

        ax = combined.plot(figsize=(20, 10), title='Hot vs Cold')
        # Fix the tick positions before labelling them; otherwise the labels are
        # attached to whatever ticks matplotlib chose automatically.
        ax.set_xticks(list(combined.index))
        ax.set_xticklabels([int(year) for year in combined.index])
        complete_path = os.path.join(
            os.path.dirname(os.path.abspath(os.getcwd())),
            config['OUTPUT_PATH'],
            'LDA',
            self.dirName,
        )
        ax.get_figure().savefig(os.path.join(complete_path, f"{self.dirName}_hot_vs_cold.png"))
        self.spinner.write('✔️ Figure saved')

    def trendAnalysisUsingTheta(self):
        """Detect rising and falling topics from the yearly mean topic weights.

        For every topic the yearly mean of its document-topic weight
        (``self.best_model_output``) is regressed on the year. Topics whose
        slope is significant at the 5% level are plotted: positive slopes as
        hot topics (``<dirName>_hot_based_on_theta.png``), negative slopes as
        cold topics (``<dirName>_cold_based_on_theta.png``).
        """
        theta_df = pd.DataFrame(self.best_model_output)
        theta_df['Years'] = pd.DatetimeIndex(self.data_frame['Date']).year.to_numpy()

        # Mean topic weight per year, ordered chronologically.
        theta_mean_by_year = theta_df.groupby('Years').mean().reset_index()
        topic_columns = [col for col in theta_mean_by_year.columns if col != 'Years']

        # The regression needs an intercept: without it the line is forced
        # through the origin and, because weights and years are both positive,
        # every slope comes out positive.
        design = sma.add_constant(theta_mean_by_year[['Years']], has_constant='add')
        model_details = []
        for topic in topic_columns:
            fitted_model = sma.OLS(theta_mean_by_year[topic], design).fit()
            model_details.append({
                'topic': topic,
                'pvalue': fitted_model.pvalues['Years'],
                'coef': fitted_model.params['Years'],
            })

        model_details_df = pd.DataFrame(model_details, columns=['topic', 'pvalue', 'coef'])
        positive_slope = model_details_df[model_details_df['coef'] >= 0]
        negative_slope = model_details_df[model_details_df['coef'] < 0]
        print(positive_slope.shape, negative_slope.shape)

        # Summary of significant trends for several significance levels.
        trends = []
        for level in [0.01, 0.03, 0.05]:
            positive_group = positive_slope[positive_slope['pvalue'] <= level]
            negative_group = negative_slope[negative_slope['pvalue'] <= level]
            trends.append({
                'P-level': level,
                'Negative Trend': len(negative_group),
                'Positive Trend': len(positive_group),
                'Hot Topics': positive_group.topic.values,
                'Cold Topics': negative_group.topic.values,
            })
        trends = pd.DataFrame(trends)

        at_five_percent = trends[trends['P-level'] == 0.05].iloc[0]
        hot_columns = list(at_five_percent['Hot Topics'])
        cold_columns = list(at_five_percent['Cold Topics'])

        complete_path = os.path.join(
            os.path.dirname(os.path.abspath(os.getcwd())),
            config['OUTPUT_PATH'],
            'LDA',
            self.dirName,
        )

        def plot_trend(columns, file_suffix, empty_message):
            # Plot the yearly weights of `columns` and save them, or report that there are none.
            if len(columns) == 0:
                print(empty_message)
                return
            # Select into a new frame so the shared table is never modified.
            trend = theta_mean_by_year[columns + ['Years']]
            ax = trend.plot(x='Years', figsize=(20, 10))
            # Fix the tick positions before labelling them.
            ax.set_xticks(list(trend['Years']))
            ax.set_xticklabels(trend['Years'].astype(int))
            ax.get_figure().savefig(os.path.join(complete_path, f"{self.dirName}_{file_suffix}.png"))
            self.spinner.write('✔️ Figure saved')

        plot_trend(hot_columns, 'hot_based_on_theta', 'No hot topic')
        plot_trend(cold_columns, 'cold_based_on_theta', 'No cold topic')

### Reddit

In [None]:
# Load the Reddit data set; readFile prints the error and yields None on failure.
reddit_data = readFile(config['REDDIT_DATA_CSV'], config['STORAGE_PATH'])

In [None]:
reddit_lda = LDA(reddit_data)

In [None]:
# Reports and figures of this run are stored in the 'Reddit' output folder.
reddit_lda.createOutputDir('Reddit')

In [None]:
# Pre-processing: parse tokens, lemmatize, then vectorize (in this order).
reddit_lda.mergeTokenizedData()

In [None]:
reddit_lda.lemmatization(allowed_postags=LDAConfig['ALLOWED_POSTAGS'])

In [None]:
reddit_lda.vectorization()

In [None]:
reddit_lda.computeSparsicity()

In [None]:
# Baseline model with the configured hyper-parameters.
reddit_lda.buildLDAModel(LDAConfig['MODEL_HYPER_PARAMETERS'])

In [None]:
reddit_lda.visualizeLDAvis()

In [None]:
# Grid search; every step below works on the best model found here.
reddit_lda.buildImprovisedLDAModel(LDAConfig['SEARCH_PARAMS'])

In [None]:
reddit_lda.wordsInTopics()

In [None]:
# The next three tables are read by the citation and hot/cold steps below.
reddit_lda.calculateDominantTopic()

In [None]:
reddit_lda.getTopicDistribution()

In [None]:
reddit_lda.topKeywordsInEachTopic()

In [None]:
# NOTE(review): the following steps read the Abstract_clean, Cites, Date and
# Title_clean columns - confirm this data set provides them.
reddit_lda.printAbstractForTopic(LDAConfig['ABASTRACT_FOR_TOPIC_NUMBER'])

In [None]:
reddit_lda.topCitedTopics(LDAConfig['YEAR'])

In [None]:
reddit_lda.getTopFive()

In [None]:
# Sets the hot and cold topics used by plotHotVsCold.
reddit_lda.hotAndColdTopicByDate()

In [None]:
# Adds the Year column that plotHotVsCold reads.
reddit_lda.plotTopicTrend()

In [None]:
reddit_lda.plotHotVsCold()

In [None]:
reddit_lda.trendAnalysisUsingTheta()

### Stackoverflow

In [None]:
# Load the Stackoverflow data set; readFile prints the error and yields None on failure.
stackoverflow_data = readFile(config['STACKOVERFLOW_DATA_CSV'], config['STORAGE_PATH'])

In [None]:
stackoverflow_lda = LDA(stackoverflow_data)

In [None]:
# Reports and figures of this run are stored in the 'Stackoverflow' output folder.
stackoverflow_lda.createOutputDir('Stackoverflow')

In [None]:
# Pre-processing: parse tokens, lemmatize, then vectorize (in this order).
stackoverflow_lda.mergeTokenizedData()

In [None]:
stackoverflow_lda.lemmatization(allowed_postags=LDAConfig['ALLOWED_POSTAGS'])

In [None]:
stackoverflow_lda.vectorization()

In [None]:
stackoverflow_lda.computeSparsicity()

In [None]:
# Baseline model with the configured hyper-parameters.
stackoverflow_lda.buildLDAModel(LDAConfig['MODEL_HYPER_PARAMETERS'])

In [None]:
stackoverflow_lda.visualizeLDAvis()

In [None]:
# Grid search; every step below works on the best model found here.
stackoverflow_lda.buildImprovisedLDAModel(LDAConfig['SEARCH_PARAMS'])

In [None]:
stackoverflow_lda.wordsInTopics()

In [None]:
# The next three tables are read by the citation and hot/cold steps below.
stackoverflow_lda.calculateDominantTopic()

In [None]:
stackoverflow_lda.getTopicDistribution()

In [None]:
stackoverflow_lda.topKeywordsInEachTopic()

In [None]:
# NOTE(review): the following steps read the Abstract_clean, Cites, Date and
# Title_clean columns - confirm this data set provides them.
stackoverflow_lda.printAbstractForTopic(LDAConfig['ABASTRACT_FOR_TOPIC_NUMBER'])

In [None]:
stackoverflow_lda.topCitedTopics(LDAConfig['YEAR'])

In [None]:
stackoverflow_lda.getTopFive()

In [None]:
# Sets the hot and cold topics used by plotHotVsCold.
stackoverflow_lda.hotAndColdTopicByDate()

In [None]:
# Adds the Year column that plotHotVsCold reads.
stackoverflow_lda.plotTopicTrend()

In [None]:
stackoverflow_lda.plotHotVsCold()

In [None]:
stackoverflow_lda.trendAnalysisUsingTheta()

### Scopus

In [None]:
# Load the Scopus data set; readFile prints the error and yields None on failure.
scopus_data = readFile(config['SCOPUS_DATA_CSV'], config['STORAGE_PATH'])

In [None]:
scopus_lda = LDA(scopus_data)

In [None]:
# Reports and figures of this run are stored in the 'Scopus' output folder.
scopus_lda.createOutputDir('Scopus')

In [None]:
# Pre-processing: parse tokens, lemmatize, then vectorize (in this order).
scopus_lda.mergeTokenizedData()

In [None]:
scopus_lda.lemmatization(allowed_postags=LDAConfig['ALLOWED_POSTAGS'])

In [None]:
scopus_lda.vectorization()

In [None]:
scopus_lda.computeSparsicity()

In [None]:
# Baseline model with the configured hyper-parameters.
scopus_lda.buildLDAModel(LDAConfig['MODEL_HYPER_PARAMETERS'])

In [None]:
scopus_lda.visualizeLDAvis()

In [None]:
# Grid search; every step below works on the best model found here.
scopus_lda.buildImprovisedLDAModel(LDAConfig['SEARCH_PARAMS'])

In [None]:
scopus_lda.wordsInTopics()

In [None]:
# The next three tables are read by the citation and hot/cold steps below.
scopus_lda.calculateDominantTopic()

In [None]:
scopus_lda.getTopicDistribution()

In [None]:
scopus_lda.topKeywordsInEachTopic()

In [None]:
# NOTE(review): the following steps read the Abstract_clean, Cites, Date and
# Title_clean columns - confirm this data set provides them.
scopus_lda.printAbstractForTopic(LDAConfig['ABASTRACT_FOR_TOPIC_NUMBER'])

In [None]:
scopus_lda.topCitedTopics(LDAConfig['YEAR'])

In [None]:
scopus_lda.getTopFive()

In [None]:
# Sets the hot and cold topics used by plotHotVsCold.
scopus_lda.hotAndColdTopicByDate()

In [None]:
# Adds the Year column that plotHotVsCold reads.
scopus_lda.plotTopicTrend()

In [None]:
scopus_lda.plotHotVsCold()

In [None]:
scopus_lda.trendAnalysisUsingTheta()