# Trend Mining 

## Document-term matrix and dendrogram clustering

In this notebook you will be able to analyze the document-term matrix and perform dendrogram clustering. 
- Configurations for this notebook can be found in **Dendogram.yaml** file inside the **Config** folder
- Make sure you follow the setup instructions in **Readme.md** and have installed all the packages required for this task

### Load packages

In [None]:
import os
import nltk
import yaml
import warnings
import pandas as pd
import seaborn as sns
from yaspin import yaspin
import matplotlib.pyplot as plt
from nltk.corpus import stopwords 
from yaml.loader import SafeLoader
from nltk.stem import PorterStemmer
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist  
from sklearn.feature_extraction.text import CountVectorizer

# Tokenizer data needed by nltk.word_tokenize. NLTK 3.9+ looks up the
# 'punkt_tab' resource instead of the older pickled 'punkt' package, so both
# are fetched to keep the notebook working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
# English stop-word list used by DTM.remove_stop_words.
nltk.download('stopwords')
# NOTE: this silences every warning for the rest of the session, including
# deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')

### Load config file

In [None]:
# General settings shared by the miners (data file names, storage and output
# folders). The path is relative to the notebook's working directory.
with open('../Config/Miners.yaml') as f:
    # safe_load is PyYAML's shorthand for load(..., Loader=SafeLoader).
    config = yaml.safe_load(f)
print('General Config:', config)

In [None]:
# Settings specific to this notebook (custom stop words per source, the column
# to vectorize, the frequency threshold for the bar chart).
with open('../Config/Dendogram.yaml') as f:
    # safe_load is PyYAML's shorthand for load(..., Loader=SafeLoader).
    dendogramConfig = yaml.safe_load(f)
print('Dendogram Config:', dendogramConfig)

### Common Functions and Class

In [None]:
def readFile(file, path):
    """Load a CSV file from a folder under the project root.

    The project root is the parent of the current working directory. The
    first CSV column is used as the DataFrame index.

    Args:
        file (str): CSV file name.
        path (str): Folder, relative to the project root, that holds the file.

    Returns:
        pandas.DataFrame | None: The parsed data, or None when the file could
        not be read (the error is printed instead of being raised).
    """
    spinner = yaspin()
    try:
        # os.path.join keeps the path valid on every OS; the previous
        # hard-coded backslashes only worked on Windows.
        project_root = os.path.dirname(os.path.abspath(os.getcwd()))
        complete_path = os.path.join(project_root, path, file)
        file_data = pd.read_csv(complete_path, index_col=0)
        spinner.write("✔️ File loaded.")
        return file_data
    except Exception as e:
        # Best-effort by design: report the problem and let the caller see None.
        print('Error reading file', e)
        return None
    finally:
        # Release the spinner on both the success and the failure path.
        spinner.stop()

##### Common Class

In [None]:
class DTM:
    """Build a document-term matrix and a dendrogram for a text corpus.

    The instance wraps a pandas DataFrame that must provide the
    ``Title_clean`` and ``Abstract_clean`` text columns. The pipeline methods
    are meant to be called in this order, each step relying on the columns or
    attributes produced by the previous one:

    ``createOutputDir`` -> ``remove_stop_words`` -> ``combine_title_and_abs``
    -> ``stemming`` -> ``document_term_matrix`` -> ``frequent_terms`` ->
    ``sort_frequent_terms`` -> ``keep_top_words`` ->
    ``visualize_frequent_words`` / ``dendogram_clusting`` -> ``saveFile``

    Files are resolved relative to the parent of the current working
    directory; figure output additionally uses ``config['OUTPUT_PATH']`` from
    the module-level ``config`` mapping.
    """

    def __init__(self, data_frame):
        """Store the data and initialise the empty intermediate results.

        Args:
            data_frame (pandas.DataFrame): data the pipeline operates on; it
                is modified in place by the pipeline methods.
        """
        self.data_frame = data_frame
        self.vec_df = pd.DataFrame()                 # document-term matrix
        self.frequent_words = pd.DataFrame()         # word / frequency pairs
        self.sorted_frequent_words = pd.DataFrame()  # same, sorted descending
        self.top_words = pd.DataFrame()              # rows above the threshold
        self.dirName = ""                            # output sub-folder name
        self.spinner = yaspin()

        print(f'Data has {len(data_frame)} rows')

    @staticmethod
    def _project_path(*parts):
        """Join ``parts`` onto the parent of the current working directory."""
        project_root = os.path.dirname(os.path.abspath(os.getcwd()))
        # os.path.join instead of hard-coded backslashes keeps this portable.
        return os.path.join(project_root, *parts)

    def _output_dir(self):
        """Return the folder that receives this instance's figures."""
        return self._project_path(config['OUTPUT_PATH'], 'Dendogram', self.dirName)

    def createOutputDir(self, dirName):
        """Create the folder that stores the output graphs and images.

        Args:
            dirName (str): Name of the output folder, created below
                ``<OUTPUT_PATH>/Dendogram``. It is remembered and reused as
                the prefix of the saved figure names.
        """
        self.dirName = dirName
        output_dir = self._output_dir()
        if os.path.exists(output_dir):
            self.spinner.write("✔️ Output directory already exists.")
        else:
            os.makedirs(output_dir)
            self.spinner.write("✔️ Folder created for output storage")

    def saveFile(self, filename, path):
        """Save the dataframe, including all added columns, as CSV.

        An existing file with the same name is overwritten.

        Args:
            filename (str): file name
            path (str): folder, relative to the project root
        """
        target = self._project_path(path, filename)
        if os.path.exists(target):
            # to_csv truncates the existing file, so no explicit delete is needed.
            self.spinner.write(f"🔁 Replacing already existing {filename} file")

        self.data_frame.to_csv(target)
        print()
        self.spinner.write(f'✔️ {filename} saved in {path} directory')

    def get_data(self):
        """Return the dataframe itself (not a copy).

        Returns:
            dataframe: data that is operated upon
        """
        return self.data_frame

    def print_data_head(self, rows=3):
        """Print the top rows of the data.

        Args:
            rows (int, optional): number of rows to print. Defaults to 3.
        """
        print("Data head with top", rows, "rows")
        print(self.data_frame.head(rows))

    def print_data_tail(self, rows=3):
        """Print the last rows of the data.

        Args:
            rows (int, optional): number of rows to print. Defaults to 3.
        """
        print("Data tail with last", rows, "rows")
        print(self.data_frame.tail(rows))

    def print_dtm(self, rows=3):
        """Print the top rows of the document-term matrix.

        Args:
            rows (int, optional): number of rows to print. Defaults to 3.
        """
        print("Vectorized data with top", rows, "rows")
        print(self.vec_df.head(rows))

    def print_frequent_words(self, rows=3):
        """Print the first rows of the (unsorted) word-frequency table.

        Args:
            rows (int, optional): number of rows to print. Defaults to 3.
        """
        print("Frequent top", rows, "rows")
        print(self.frequent_words.head(rows))

    def print_sorted_frequent_words(self, rows=3):
        """Print the most frequent words, indexed by word.

        Args:
            rows (int, optional): number of rows to print. Defaults to 3.
        """
        print(f'Top {rows} most frequent words:')
        # set_index returns a new frame; the result has to be used for the
        # word index to show up. The stored frame is left untouched.
        print(self.sorted_frequent_words.set_index('word').head(rows))

    def print_top_words(self, rows=3):
        """Print the first rows of the words kept by ``keep_top_words``.

        Args:
            rows (int, optional): number of rows to print. Defaults to 3.
        """
        print("Top", rows, "words")
        print(self.top_words.head(rows))

    def remove_stop_words(self, custom_stopwords=None):
        """Remove stop words from the cleaned abstract and title columns.

        Adds the ``Abstrat_without_stopwords`` and ``Title_without_stopwords``
        columns to the dataframe. Missing text is treated as an empty string.
        Errors are printed rather than raised.

        Args:
            custom_stopwords (iterable of str, optional): extra stop words to
                remove on top of NLTK's English list. Defaults to None.
        """
        try:
            stop_words = set(stopwords.words("english"))
            # `or []` also covers a config key that is present but empty (None).
            stop_words = stop_words.union(custom_stopwords or [])
            print('total stop words:', len(stop_words))

            def strip_stop_words(text):
                return ' '.join(word for word in text.split() if word not in stop_words)

            # NaN cells would otherwise fail on .split() and abort the step.
            abstracts = self.data_frame['Abstract_clean'].fillna('').astype(str)
            titles = self.data_frame['Title_clean'].fillna('').astype(str)
            # The misspelled column name is kept: it is written to the saved CSV.
            self.data_frame['Abstrat_without_stopwords'] = abstracts.apply(strip_stop_words)
            self.data_frame['Title_without_stopwords'] = titles.apply(strip_stop_words)
            self.spinner.write('✔️ Stop words removed')
        except Exception as e:
            print('Error removing stop words:', e)

    def combine_title_and_abs(self):
        """Combine the stop-word-free title and abstract into one column.

        Adds the ``Merged_title_and_abs`` column to the dataframe.
        """
        titles = self.data_frame["Title_without_stopwords"]
        abstracts = self.data_frame["Abstrat_without_stopwords"]
        # A separator is required: plain concatenation fused the last title
        # word with the first abstract word into one bogus token.
        self.data_frame['Merged_title_and_abs'] = (titles + ' ' + abstracts).str.strip()
        self.spinner.write('✔️ Data combined')

    def stemming(self):
        """Tokenize the merged text and stem every token.

        Adds the ``Tokenized_data`` (list of tokens) and ``Stem_data`` (list
        of Porter stems) columns to the dataframe.
        """
        porter_stemmer = PorterStemmer()
        self.data_frame['Tokenized_data'] = self.data_frame['Merged_title_and_abs'].apply(nltk.word_tokenize)
        self.data_frame['Stem_data'] = self.data_frame['Tokenized_data'].apply(
            lambda tokens: [porter_stemmer.stem(token) for token in tokens]
        )
        self.spinner.write('✔️ Stemming applied to data')

    def document_term_matrix(self, column_name):
        """Generate the document-term matrix and store it in ``vec_df``.

        Args:
            column_name (str): dataframe column holding one list of tokens
                per document; the tokens are joined with spaces before being
                counted.
        """
        vec = CountVectorizer()
        documents = self.data_frame[column_name].apply(lambda tokens: ' '.join(tokens)).tolist()
        X = vec.fit_transform(documents)
        # get_feature_names() was removed in scikit-learn 1.2; fall back to it
        # only on versions that predate get_feature_names_out().
        get_names = getattr(vec, 'get_feature_names_out', None) or vec.get_feature_names
        self.vec_df = pd.DataFrame(X.toarray(), columns=get_names())
        self.spinner.write(f'✔️ Document term matrix created for {column_name}')

    def frequent_terms(self):
        """Compute the total count of every term in the document-term matrix.

        Stores a ``word`` / ``frequency`` table in ``frequent_words``.
        """
        # Build a fresh frame each time: assigning columns into the previous
        # one raised ValueError when the vocabulary size had changed.
        self.frequent_words = pd.DataFrame({
            'word': list(self.vec_df.columns),
            'frequency': self.vec_df.sum().tolist(),
        })
        self.spinner.write('✔️ Frequent words calculated')

    def sort_frequent_terms(self):
        """Sort the frequent terms by descending frequency.

        Stores the result in ``sorted_frequent_words``.
        """
        self.sorted_frequent_words = self.frequent_words.sort_values(by=['frequency'], ascending=False)
        self.spinner.write('✔️ Frequent word sorted')

    def keep_top_words(self, max_frequency=100):
        """Keep the words whose frequency reaches the threshold.

        Stores the result in ``top_words``.

        Args:
            max_frequency (int, optional): minimum frequency a word needs in
                order to be kept (despite the parameter name). Defaults to 100.
        """
        meets_threshold = self.sorted_frequent_words['frequency'] >= max_frequency
        self.top_words = self.sorted_frequent_words[meets_threshold]
        # The value is a frequency threshold, not a number of words.
        self.spinner.write(f'✔️ Words with frequency >= {max_frequency} kept')

    def visualize_frequent_words(self):
        """Save a bar chart of the top words as ``<dirName>_frequent_terms.png``."""
        output_dir = self._output_dir()
        os.makedirs(output_dir, exist_ok=True)
        # An explicit figure avoids changing the global default figure size
        # and avoids drawing on top of whatever figure is current.
        plt.figure(figsize=(20, 40))
        sns.barplot(x="frequency", y="word", data=self.top_words)
        plt.savefig(os.path.join(output_dir, f"{self.dirName}_frequent_terms.png"))
        # Report success only once the file has actually been written.
        self.spinner.write('✔️ Figure saved')

    def dendogram_clusting(self):
        """Generate a dendrogram and save it as ``<dirName>_dendogram.png``.

        Documents are clustered with Ward linkage on the Euclidean distances
        between their rows of the document-term matrix. Leaves are labelled
        with the stop-word-free titles.
        """
        self.spinner.start()
        try:
            self.spinner.write('✨ Generating dendogram cluster')
            distance_matrix = pdist(self.vec_df, metric='euclidean')
            plt.figure(figsize=(25, 200))
            plt.title('Hierarchical Clustering Dendrogram')
            sch.dendrogram(
                sch.linkage(distance_matrix, method='ward'),
                orientation="right",
                labels=self.data_frame['Title_without_stopwords'].tolist(),
                leaf_font_size=9,
            )
        finally:
            # Do not leave the spinner running if clustering fails.
            self.spinner.stop()
        output_dir = self._output_dir()
        os.makedirs(output_dir, exist_ok=True)
        plt.savefig(os.path.join(output_dir, f"{self.dirName}_dendogram.png"))
        self.spinner.write('✔️ Figure saved')

### Reddit

In [None]:
# Load the Reddit CSV named in the general config (first column becomes the
# index). readFile prints the error and yields None if the file cannot be read.
reddit_data = readFile(config['REDDIT_DATA_CSV'], config['STORAGE_PATH'])
reddit_data.head(3)

In [None]:
# Wrap the frame in the DTM helper; the constructor prints the row count.
reddit_data_DTM = DTM(reddit_data)

In [None]:
# Create the "Reddit" figure folder if needed and remember the name for file prefixes.
reddit_data_DTM.createOutputDir("Reddit")

In [None]:
# Remove NLTK English stop words plus the Reddit-specific ones from the config.
reddit_data_DTM.remove_stop_words(dendogramConfig['REDDIT_STOPWORDS'])

In [None]:
# Adds the Merged_title_and_abs column.
reddit_data_DTM.combine_title_and_abs()

In [None]:
# Adds the Tokenized_data and Stem_data columns.
reddit_data_DTM.stemming()

In [None]:
# Count terms per document for the configured column.
reddit_data_DTM.document_term_matrix(dendogramConfig['COLUMN_TO_TOKENIZE']) 

In [None]:
# Total count of every term across all documents.
reddit_data_DTM.frequent_terms()  

In [None]:
reddit_data_DTM.print_frequent_words(3)

In [None]:
# Order the terms by descending frequency.
reddit_data_DTM.sort_frequent_terms()  

In [None]:
reddit_data_DTM.print_sorted_frequent_words(3)

In [None]:
# Keep the terms whose frequency is at least the configured threshold.
reddit_data_DTM.keep_top_words(dendogramConfig['TOP_WORDS_MAX_FREQUENCY'])  

In [None]:
reddit_data_DTM.print_top_words(3)

In [None]:
# Saves the bar chart as Reddit_frequent_terms.png in the output folder.
reddit_data_DTM.visualize_frequent_words()

In [None]:
# Saves the dendrogram as Reddit_dendogram.png in the output folder.
reddit_data_DTM.dendogram_clusting()

In [None]:
# Writes the frame, including the columns added above, back over the CSV it was loaded from.
reddit_data_DTM.saveFile(config['REDDIT_DATA_CSV'], config['STORAGE_PATH'])

### Stackoverflow

In [None]:
# Load the Stackoverflow CSV named in the general config (first column becomes
# the index). readFile prints the error and yields None if the file cannot be read.
stackoverflow_data = readFile(config['STACKOVERFLOW_DATA_CSV'], config['STORAGE_PATH'])
stackoverflow_data.head(3)

In [None]:
# Wrap the frame in the DTM helper; the constructor prints the row count.
stackoverflow_data_DTM = DTM(stackoverflow_data)

In [None]:
# Create the "Stackoverflow" figure folder if needed and remember the name for file prefixes.
stackoverflow_data_DTM.createOutputDir("Stackoverflow")

In [None]:
# Remove NLTK English stop words plus the Stackoverflow-specific ones from the config.
stackoverflow_data_DTM.remove_stop_words(dendogramConfig['STACKOVERFLOW_STOPWORDS'])

In [None]:
# Adds the Merged_title_and_abs column.
stackoverflow_data_DTM.combine_title_and_abs()

In [None]:
# Adds the Tokenized_data and Stem_data columns.
stackoverflow_data_DTM.stemming()

In [None]:
# Count terms per document for the configured column.
stackoverflow_data_DTM.document_term_matrix(dendogramConfig['COLUMN_TO_TOKENIZE'])

In [None]:
# Total count of every term across all documents.
stackoverflow_data_DTM.frequent_terms()  

In [None]:
# No argument: print_frequent_words falls back to its default of 3 rows.
stackoverflow_data_DTM.print_frequent_words()

In [None]:
# Order the terms by descending frequency.
stackoverflow_data_DTM.sort_frequent_terms()  

In [None]:
stackoverflow_data_DTM.print_sorted_frequent_words(3)

In [None]:
# Keep the terms whose frequency is at least the configured threshold.
stackoverflow_data_DTM.keep_top_words(dendogramConfig['TOP_WORDS_MAX_FREQUENCY'])  

In [None]:
stackoverflow_data_DTM.print_top_words(3)

In [None]:
# Saves the bar chart as Stackoverflow_frequent_terms.png in the output folder.
stackoverflow_data_DTM.visualize_frequent_words()

In [None]:
# Saves the dendrogram as Stackoverflow_dendogram.png in the output folder.
stackoverflow_data_DTM.dendogram_clusting()

In [None]:
# Writes the frame, including the columns added above, back over the CSV it was loaded from.
stackoverflow_data_DTM.saveFile(config['STACKOVERFLOW_DATA_CSV'], config['STORAGE_PATH'])

### Scopus

In [None]:
# Load the Scopus CSV named in the general config (first column becomes the
# index). readFile prints the error and yields None if the file cannot be read.
scopus_data = readFile(config['SCOPUS_DATA_CSV'], config['STORAGE_PATH'])
scopus_data.head(3)

In [None]:
# Wrap the frame in the DTM helper; the constructor prints the row count.
scopus_data_DTM = DTM(scopus_data)

In [None]:
# Create the "Scopus" figure folder if needed and remember the name for file prefixes.
scopus_data_DTM.createOutputDir("Scopus")

In [None]:
# Remove NLTK English stop words plus the Scopus-specific ones from the config.
scopus_data_DTM.remove_stop_words(dendogramConfig['SCOPUS_STOPWORDS'])

In [None]:
# Adds the Merged_title_and_abs column.
scopus_data_DTM.combine_title_and_abs()

In [None]:
# Adds the Tokenized_data and Stem_data columns.
scopus_data_DTM.stemming()

In [None]:
# Count terms per document for the configured column.
scopus_data_DTM.document_term_matrix(dendogramConfig['COLUMN_TO_TOKENIZE'])

In [None]:
# Total count of every term across all documents.
scopus_data_DTM.frequent_terms()  

In [None]:
scopus_data_DTM.print_frequent_words(3)

In [None]:
# Order the terms by descending frequency.
scopus_data_DTM.sort_frequent_terms()  

In [None]:
scopus_data_DTM.print_sorted_frequent_words(3)

In [None]:
# Keep the terms whose frequency is at least the configured threshold.
scopus_data_DTM.keep_top_words(dendogramConfig['TOP_WORDS_MAX_FREQUENCY'])  

In [None]:
scopus_data_DTM.print_top_words(3)

In [None]:
# Saves the bar chart as Scopus_frequent_terms.png in the output folder.
scopus_data_DTM.visualize_frequent_words()

In [None]:
# Saves the dendrogram as Scopus_dendogram.png in the output folder.
scopus_data_DTM.dendogram_clusting() 

In [None]:
# Writes the frame, including the columns added above, back over the CSV it was loaded from.
scopus_data_DTM.saveFile(config['SCOPUS_DATA_CSV'], config['STORAGE_PATH'])