# Trend Mining

## Word Clouds

In this notebook you will be able to generate word clouds for **Reddit**, **Scopus**, and **Stackoverflow**. 
- The word-cloud settings for this notebook are in the **WordClouds.yaml** file, and the shared data/output paths are in **Miners.yaml**; both files are inside the **Config** folder
- Make sure you follow the setup instructions in **Readme.md** and have installed all the packages required for this task

### Load Packages

In [None]:
import os
import yaml
import warnings
import pandas as pd
from yaspin import yaspin
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from yaml.loader import SafeLoader

### Load Config file

In [None]:
# General settings (file names, storage and output paths) shared by the miners.
# safe_load is PyYAML's shorthand for load(..., Loader=SafeLoader).
with open('../Config/Miners.yaml') as f:
    config = yaml.safe_load(f)
print('General Config:', config)

In [None]:
# Word-cloud specific settings (column to plot, maximum words, scale).
# safe_load is PyYAML's shorthand for load(..., Loader=SafeLoader).
with open('../Config/WordClouds.yaml') as f:
    wordCloudConfig = yaml.safe_load(f)
print('Word cloud Config:', wordCloudConfig)

### Common Functions and Class

In [None]:
def readFile(file, path):
    """Load a CSV file located relative to the parent of the working directory.

    The file is looked up at ``<parent of cwd>/<path>/<file>`` and read with
    its first column as the index.

    Args:
        file (str): Name of the CSV file.
        path (str): Folder, relative to the parent of the current working
            directory, that contains the file.

    Returns:
        pandas.DataFrame | None: The loaded data, or ``None`` when the file
        could not be read (the error is printed instead of raised).
    """
    try:
        spinner = yaspin()
        # os.path.join picks the right separator for the host OS; the previous
        # hard-coded backslashes only produced a valid path on Windows.
        parent_dir = os.path.dirname(os.path.abspath(os.getcwd()))
        complete_path = os.path.join(parent_dir, path, file)
        file_data = pd.read_csv(complete_path, index_col=0)
        spinner.write("✔️ File loaded.")
        spinner.stop()
        return file_data
    except Exception as e:
        # Best-effort by design: report the problem and let the caller decide.
        print('Error reading file', e)
        return None

#### Common class

In [None]:
class WordCloudGenerator():
    """Generate word-cloud images for a data frame and save them to disk.

    Images are written to
    ``<parent of cwd>/<config['OUTPUT_PATH']>/WordClouds/<dirName>``, where
    ``dirName`` is set by ``createOutputDir``. Call ``createOutputDir`` before
    any of the plotting methods so the target folder exists.
    """

    def __init__(self, data_frame):
        """Store the data to plot.

        Args:
            data_frame (pandas.DataFrame): Data whose text columns are turned
                into word clouds.
        """
        self.data_frame = data_frame
        self.dirName = ""
        self.spinner = yaspin()

    def _output_path(self):
        """Return the output folder for the current ``dirName``."""
        parent_dir = os.path.dirname(os.path.abspath(os.getcwd()))
        # os.path.join keeps the path valid on every OS; the previous
        # hard-coded backslashes only worked on Windows.
        return os.path.join(parent_dir, config['OUTPUT_PATH'], 'WordClouds', self.dirName)

    @staticmethod
    def _join_text(series):
        """Join the values of a column into one space-separated string.

        Missing values are dropped and the rest are cast to ``str`` so that a
        single empty cell does not make ``str.join`` raise ``TypeError``.
        """
        return ' '.join(series.dropna().astype(str))

    def createOutputDir(self, dirName):
        """This function creates the folder to store the output graphs and images

        Args:
            dirName (str): Name of the output folder
        """
        self.dirName = dirName
        complete_path = self._output_path()
        if os.path.exists(complete_path):
            self.spinner.write("✔️ Output directory already exists.")
        else:
            # exist_ok guards against the folder appearing between the check
            # above and this call.
            os.makedirs(complete_path, exist_ok=True)
            self.spinner.write("✔️ Folder created for output storage")

    def make_word_clouds(self, column, max_words=50, scale=2):
        """This function generates the word cloud

        The figure is saved as ``<dirName>_word_cloud.png`` in the output
        folder and then displayed.

        Args:
            column (str): Column in the data frame for which word cloud need to be generated
            max_words (int, optional): Maximum number of words in the cloud. Defaults to 50.
            scale (int, optional): The scale of the cloud. Defaults to 2.
        """
        strings = self._join_text(self.data_frame[column])
        # Forward the caller's settings; they used to be ignored in favour of
        # hard-coded values.
        wordcloud = WordCloud(scale=scale, max_words=max_words, background_color="white").generate(strings)
        plt.figure(figsize=(20, 20))
        plt.imshow(wordcloud)
        plt.axis("off")
        plt.savefig(os.path.join(self._output_path(), f"{self.dirName}_word_cloud.png"))
        self.spinner.write('✔️ Figure saved')
        plt.show()

    def date_based_comparasion_cloud(self):
        """This function is used to generate the date based comparison cloud for titles

        Rows are split at the midpoint-interpolated median of the ``Date``
        column: rows strictly before it form the "old" cloud, rows on or after
        it form the "new" cloud. Both clouds are built from the
        ``Title_without_stopwords`` column, drawn side by side, saved as
        ``<dirName>_comparison_cloud.png`` and then displayed.
        """
        # Convert once and reuse for both the median and the two masks.
        dates = self.data_frame['Date'].astype('datetime64[ns]')
        mid_date = dates.quantile(0.5, interpolation="midpoint")
        new_titles = self.data_frame[dates >= mid_date]
        old_titles = self.data_frame[dates < mid_date]
        self.spinner.write(f'Mid date: {mid_date}')
        for i, titles in enumerate((old_titles, new_titles)):
            try:
                strings = self._join_text(titles['Title_without_stopwords'])
                wordcloud = WordCloud(scale=2, max_words=50, background_color="white").generate(strings)
                # The two clouds occupy the top row of a 2x2 grid.
                plt.subplot(2, 2, i + 1)
                plt.imshow(wordcloud, interpolation="bilinear")
                plt.axis("off")
                if i == 0:
                    plt.title(f'Old Titles before or during {mid_date.date()}')
                else:
                    plt.title(f'New Titles after {mid_date.date()}')
            except Exception as e:
                # Still best-effort (one half failing must not abort the other),
                # but report the reason instead of swallowing it silently.
                self.spinner.write(f'Skipped a comparison cloud: {e}')
                continue

        plt.savefig(os.path.join(self._output_path(), f"{self.dirName}_comparison_cloud.png"))
        self.spinner.write('✔️ Figure saved')
        plt.show()

### Reddit

In [None]:
# Load the Reddit CSV named in the general config from the storage folder.
reddit_data = readFile(config['REDDIT_DATA_CSV'], config['STORAGE_PATH'])

In [None]:
# Wrap the loaded data in a generator dedicated to Reddit.
reddit_cloud = WordCloudGenerator(reddit_data)

In [None]:
# Make sure the 'Reddit' output folder exists before any figure is saved.
reddit_cloud.createOutputDir('Reddit')

In [None]:
# Word cloud for the configured column, using the word-cloud config values.
reddit_cloud.make_word_clouds( wordCloudConfig['COLUMN_FOR_WORD_CLOUD'],
                               max_words=wordCloudConfig['MAX_WORDS'],
                               scale=wordCloudConfig['SCALE'])


In [None]:
# Old-vs-new title clouds, split at the median date of the data.
reddit_cloud.date_based_comparasion_cloud()

### Stackoverflow

In [None]:
stackoverflow_data = readFile(config['STACKOVERFLOW_DATA_CSV'], config['STORAGE_PATH'])

In [None]:
stackoverflow_data = WordCloudGenerator(stackoverflow_data)

In [None]:
stackoverflow_data.createOutputDir('Stackoverflow')

In [None]:
stackoverflow_data.make_word_clouds( wordCloudConfig['COLUMN_FOR_WORD_CLOUD'],
                               max_words=wordCloudConfig['MAX_WORDS'],
                               scale=wordCloudConfig['SCALE'])

In [None]:
stackoverflow_data.date_based_comparasion_cloud()

### Scopus

In [None]:
scopus_data = readFile(config['SCOPUS_DATA_CSV'], config['STORAGE_PATH'])

In [None]:
scopus_data = WordCloudGenerator(scopus_data)

In [None]:
scopus_data.createOutputDir('Scopus')

In [None]:
scopus_data.make_word_clouds( wordCloudConfig['COLUMN_FOR_WORD_CLOUD'],
                               max_words=wordCloudConfig['MAX_WORDS'],
                               scale=wordCloudConfig['SCALE'])

In [None]:
scopus_data.date_based_comparasion_cloud()