# Trend Mining

## Miners

In this notebook you will be able to mine **Reddit**, **Scopus**, and **Stackoverflow**. 
- Configurations for this notebook can be found in the **Miners.yaml** file inside the **Config** folder.
- Make sure you follow the setup instructions in **Readme.md** and have installed all the packages required for this task.


### Load Packages

In [None]:
import re
import os
import yaml
import praw  
import requests
import warnings
import pandas as pd
import pybliometrics 
from yaspin import yaspin
from datetime import datetime   
from yaml.loader import SafeLoader 
from pybliometrics.scopus import ScopusSearch
from pybliometrics.scopus.utils import config 

warnings.filterwarnings('ignore')

### Load Config file

In [None]:
# Read the miner settings from the YAML file in the Config folder.
# NOTE: this rebinds the name `config` that the import cell also imports from
# pybliometrics.scopus.utils; from here on `config` is the loaded YAML content.
with open('../Config/Miners.yaml') as config_file:
    config = yaml.load(config_file, Loader=SafeLoader)

# Echo the loaded settings as the cell output.
config

### Common Functions

In [None]:
def createFile(file, path):
    """Prepare the storage directory for a data file that is about to be written.

    The storage directory is resolved relative to the parent of the current
    working directory. If the directory is missing it is created; if it exists
    and already contains ``file``, that file is removed so a fresh copy can be
    saved in its place.

    Args:
        file (str): Name of the file that is going to be stored.
        path (str): Directory, relative to the parent of the working directory,
            where the files will be stored e.g. "Data".
    """
    root_dir = os.path.dirname(os.path.abspath(os.getcwd()))
    # os.path.join keeps the paths portable; a hard-coded "\\" separator only
    # produces valid paths on Windows.
    folder_path = os.path.join(root_dir, path)
    file_path = os.path.join(folder_path, file)

    if not os.path.exists(folder_path):
        # makedirs (unlike mkdir) also handles nested storage paths.
        os.makedirs(folder_path, exist_ok=True)
        print(f'{path} folder created for csv file storage')
        return

    if os.path.exists(file_path):
        # Remove the stale data file so the new one replaces it.
        print('Removing already existing', file, 'file')
        os.remove(file_path)
    else:
        print(file + ' does not exist yet, ' + 'it will be downloaded')
 

In [None]:
def saveFile(file, filename, path):
    """Write a dataframe as a CSV file into the storage directory.

    The storage directory is resolved relative to the parent of the current
    working directory and must already exist.

    Args:
        file (pd.DataFrame): Data to be written.
        filename (str): Name of the CSV file to create.
        path (str): Directory, relative to the parent of the working directory,
            where the file is stored e.g. "Data".
    """
    root_dir = os.path.dirname(os.path.abspath(os.getcwd()))
    # os.path.join instead of a hard-coded "\\" so this also works off Windows.
    file.to_csv(os.path.join(root_dir, path, filename))
    print(f'{filename} saved in {path} directory')

In [None]:
def readFile(file, path):
    """Load a CSV file from the storage directory into a dataframe.

    The storage directory is resolved relative to the parent of the current
    working directory. The first CSV column is used as the dataframe index.

    Args:
        file (str): Name of the CSV file to read.
        path (str): Directory, relative to the parent of the working directory,
            where the file is stored e.g. "Data".

    Returns:
        pd.DataFrame | None: The loaded data, or None when the file could not
        be read (the error is printed instead of raised).
    """
    root_dir = os.path.dirname(os.path.abspath(os.getcwd()))
    # os.path.join instead of a hard-coded "\\" so this also works off Windows.
    complete_path = os.path.join(root_dir, path, file)

    spinner = yaspin()
    try:
        file_data = pd.read_csv(complete_path, index_col=0)
        spinner.write("✔️ File loaded.")
        return file_data
    except Exception as e:
        # Best effort: report the problem and let the caller deal with None.
        print('Error reading file', e)
        return None
    finally:
        spinner.stop()

### Reddit Data

In [None]:
def getData(subreddit, reddit_client_id, reddit_client_secret, reddit_user_agent):
    """Mine the "hot" posts of a subreddit and store them as a CSV file.

    The posts are collected into a dataframe and written to the file named by
    config['REDDIT_DATA_CSV'] inside config['STORAGE_PATH']. Nothing is written
    (and an existing file is left untouched) when mining fails.

    NOTE: a later cell of this notebook defines another function that is also
    called getData (for Scopus); whichever cell ran last wins.

    Args:
        subreddit (str): Name of the subreddit to be mined
        reddit_client_id (str) : Client ID
        reddit_client_secret (str) : Client Secret
        reddit_user_agent (str) : User agent
    """
    # Created before the try block so that the except/finally clauses can
    # always reference it, even when the Reddit client cannot be built.
    spinner = yaspin()
    spinner.start()
    columns = ['AuthorId', 'Q_id', 'Title', 'Abstract', 'Answers', 'Cites', 'Date']

    try:
        reddit = praw.Reddit(
            client_id=reddit_client_id,
            client_secret=reddit_client_secret,
            user_agent=reddit_user_agent,
            check_for_async=False,
        )
        subreddit_handle = reddit.subreddit(subreddit)
        spinner.write(f"⛏️ Mining {subreddit_handle} subreddit")

        # NOTE(review): post.created is converted with the local timezone;
        # confirm whether created_utc would be the better source.
        posts = [
            [
                post.author, post.id, post.title,
                post.selftext, post.num_comments, post.score,
                datetime.fromtimestamp(post.created),
            ]
            for post in subreddit_handle.hot(limit=None)
        ]
        reddit_data = pd.DataFrame(posts, columns=columns)
        spinner.ok("✔️ Data Mined")
    except Exception as e:
        spinner.fail("❌ Failed to mine")
        spinner.write(str(e))
        # Without data there is nothing to save; returning here also keeps
        # createFile from deleting a previously mined file.
        return
    finally:
        spinner.stop()

    try:
        createFile(config['REDDIT_DATA_CSV'], config['STORAGE_PATH'])
        saveFile(reddit_data, config['REDDIT_DATA_CSV'], config['STORAGE_PATH'])
        spinner.ok("✔️ File saved")
    except Exception as e:
        spinner.fail("❌ Failed to save file")
        spinner.write(str(e))
    finally:
        spinner.stop()

In [None]:
# Mine the configured subreddit with the Reddit credentials from the config.
getData(
    config['REDDIT_SUBREDDIT'],
    config['REDDIT_CLIENT_ID'],
    config['REDDIT_CLIENT_SECRET'],
    config['REDDIT_USER_AGENT'],
)

In [None]:
def reddit_cleaner(data):
    """Strip markup, digits and punctuation from a text value and lowercase it.

    The value is converted with str() first, so non-string inputs (e.g. NaN)
    are cleaned as their string representation.

    Args:
        data (string): Data string that needs to be cleaned

    Returns:
        str: Cleaned string (None if cleaning raised; the error is printed)
    """
    try:
        res = str(data)
        res = re.sub(r'\[[^]]*\]', '', res)  # remove everything in []
        # Every later step works on `res`, not on `data`; otherwise the
        # bracket removal above would be thrown away.
        res = re.sub(r'<a.*?>*</a>', '', res)  # remove anchor tags with content
        res = re.sub(r'[0-9]', '', res)  # remove numbers
        res = re.sub(r'&quot', '', res)  # remove &quot
        res = re.sub(r'<.*?>', '', res)  # remove all HTML tags
        res = re.sub(r'//.*\n', '', res)  # remove "//" up to the end of the line
        res = re.sub(r'\{\n.*\}\n', '', res)  # remove "{ ... }" blocks spanning lines
        res = re.sub(r'[\r\n]', '', res)  # remove line breaks
        res = re.sub(r'"', '', res)  # remove quotes
        res = re.sub(r'[^\w\s]', ' ', res)  # replace punctuation with spaces
        return res.lower()

    except Exception as e:
        print('Error cleaning data', e)

In [None]:
def clean_reddit_data():
    """Clean the mined Reddit CSV and save the result over the old file.

    Reads config['REDDIT_DATA_CSV'] from config['STORAGE_PATH'], adds the
    'Title_clean' and 'Abstract_clean' columns by applying reddit_cleaner,
    drops rows without abstract or date as well as rows containing any null
    value, and writes the result back. When cleaning fails the existing file
    is left untouched.
    """
    spinner = yaspin()
    spinner.start()

    try:
        spinner.write('🧹 Data cleaning')
        reddit_data = readFile(config['REDDIT_DATA_CSV'], config['STORAGE_PATH'])
        if reddit_data is None:
            # readFile reports failures by returning None.
            raise ValueError('Reddit data file could not be read')

        reddit_data['Title_clean'] = reddit_data['Title'].apply(reddit_cleaner)
        reddit_data['Abstract_clean'] = reddit_data['Abstract'].apply(reddit_cleaner)

        # Rows with an empty abstract
        abstract_missing = reddit_data['Abstract'].isna() | (reddit_data['Abstract'] == '')
        # Rows with no date (isna() instead of `== None`, which never matches)
        date_missing = (
            reddit_data['Date'].isna()
            | (reddit_data['Date'] == '')
            | (reddit_data['Date'] == 0)
        )
        reddit_data = reddit_data[~(abstract_missing | date_missing)]

        # Drop rows that still contain a null in any column
        reddit_data = reddit_data.dropna(axis=0)
        spinner.ok("✔️ Data cleaned")

    except Exception as e:
        spinner.fail("❌ Failed to clean data")
        spinner.write(str(e))
        # Do not touch the stored file when there is nothing valid to save.
        return
    finally:
        spinner.stop()

    try:
        spinner.write("🔁 Old file will be replaced\n")
        createFile(config['REDDIT_DATA_CSV'], config['STORAGE_PATH'])
        saveFile(reddit_data, config['REDDIT_DATA_CSV'], config['STORAGE_PATH'])
        spinner.ok("✔️ File saved")
    except Exception as e:
        spinner.fail("❌ Failed to save file")
        spinner.write(str(e))
    finally:
        spinner.stop()

In [None]:
# Clean the stored Reddit CSV (read, clean, re-save).
clean_reddit_data()

### Stackoverflow Data

In [None]:
def getTotal(stk_query_string, stackoverflow_api_key):
    """Fetch and report the total number of Stackoverflow results for a query.

    Args:
        stk_query_string (str): query string
        stackoverflow_api_key (str): api key

    Returns:
        The value of the 'total' field of the API response, or None when the
        request failed (the error is reported through the spinner).
    """
    spinner = yaspin()
    spinner.start()
    try:
        spinner.write('Fetching total')
        # Passing the values as params lets requests URL-encode them, so
        # queries containing spaces, '&' or '#' are sent intact.
        params = {
            'order': 'desc',
            'sort': 'activity',
            'q': stk_query_string,
            'filter': 'total',
            'site': 'stackoverflow',
            'key': stackoverflow_api_key,
        }
        res = requests.get(
            'https://api.stackexchange.com/2.2/search/advanced',
            params=params,
            timeout=30,  # never hang the notebook on a stalled connection
        )
        total_num = res.json()['total']
        spinner.write(f'✔️ total: {total_num}')
        return total_num
    except Exception as e:
        spinner.fail("❌ Failed to get total")
        spinner.write(str(e))
        return None
    finally:
        spinner.stop()

In [None]:
# Report how many Stackoverflow questions match the configured query.
getTotal(
    config['STACKOVERFLOW_QUERY_STRING'],
    config['STACKOVERFLOW_API_KEY'],
)

In [None]:
def fetch_data(query, page_number, stackoverflow_api_key):
    """Fetch one page of Stackoverflow search results.

    Args:
        query (str): query string.
        page_number (int): page number to be mined.
        stackoverflow_api_key (str): api key
    Returns:
        pd.DataFrame: response of the API stored in the pandas data frame
        (built directly from the JSON response), or None when the request
        failed (the error is printed).
    """
    try:
        # Passing the values as params lets requests URL-encode them, so
        # queries containing spaces, '&' or '#' are sent intact.
        params = {
            'order': 'desc',
            'sort': 'activity',
            'q': query,
            'filter': 'withbody',
            'site': 'stackoverflow',
            'key': stackoverflow_api_key,
            'page': page_number,
        }
        res = requests.get(
            'https://api.stackexchange.com/2.2/search/advanced',
            params=params,
            timeout=30,  # never hang the notebook on a stalled connection
        )
        payload = res.json()
        if 'error_message' in payload:
            # Surface the API's own error text instead of an opaque
            # DataFrame construction error.
            raise RuntimeError(payload['error_message'])
        return pd.DataFrame(payload)

    except Exception as e:
        print(e)
        return None

In [None]:
def getBody():
    """Mine Stackoverflow for the configured query and store the result as CSV.

    Pages are fetched with fetch_data using config['STACKOVERFLOW_QUERY_STRING']
    and config['STACKOVERFLOW_API_KEY'] until the API reports that there are no
    more. The questions are flattened into one row each and written to
    config['STACKOVERFLOW_DATA_CSV'] inside config['STORAGE_PATH']. When mining
    fails nothing is written and an existing file is left untouched.
    """
    spinner = yaspin()
    spinner.start()
    try:
        spinner.write('⛏️ Mining Stackoverflow')

        # Collect the pages first and concatenate once at the end instead of
        # re-copying the growing dataframe for every page.
        pages = []
        page_number = 1
        while True:
            page = fetch_data(
                config['STACKOVERFLOW_QUERY_STRING'],
                page_number,
                config['STACKOVERFLOW_API_KEY'],
            )
            if page is None:
                # fetch_data reports failures by returning None.
                raise RuntimeError(f'Failed to fetch page {page_number}')
            if page.empty:
                break
            pages.append(page)
            if not page.iloc[-1]['has_more']:
                break
            page_number += 1

        if not pages:
            raise ValueError('The query returned no data')

        df = pd.concat(pages, ignore_index=True)
        spinner.write(f'ℹ️ Data fetch completed with {len(df)} records')

        # Organize Data
        spinner.write('🗃️ Organizing data')
        user_data = []

        for item in df['items']:
            # Not every question carries an owner entry; fall back to id 0.
            owner = item.get('owner') or {}
            tags = item.get('tags', [])
            created = datetime.fromtimestamp(item['creation_date'])

            user_data.append({
                'AuthorId': owner.get('user_id', 0),
                'Q_id': item.get('question_id', ''),
                'Title': item.get('title', ''),
                'Abstract': item.get('body', ''),
                'Views': item.get('view_count', 0),
                'Answers': item.get('answer_count', 0),
                'Cites': item.get('score', 0),
                'Tags_n': len(tags),
                'Tags': ';'.join(tags),
                # Date and CR_Date both hold the creation date.
                'Date': created,
                'CR_Date': created,
                'LA_Date': datetime.fromtimestamp(item['last_activity_date']),
            })

        stack_data = pd.DataFrame(data=user_data)
        spinner.ok("✔️ Data Mined")
    except Exception as e:
        spinner.fail("❌ Failed to mine stackoverflow")
        spinner.write(str(e))
        # Without data there is nothing to save; returning here also keeps
        # createFile from deleting a previously mined file.
        return
    finally:
        spinner.stop()

    try:
        createFile(config['STACKOVERFLOW_DATA_CSV'], config['STORAGE_PATH'])
        saveFile(stack_data, config['STACKOVERFLOW_DATA_CSV'], config['STORAGE_PATH'])
        spinner.ok("✔️ File saved")
    except Exception as e:
        spinner.fail("❌ Failed to save file")
        spinner.write(str(e))
    finally:
        spinner.stop()

In [None]:
# Mine Stackoverflow for the configured query and store the result as CSV.
getBody()

In [None]:
def stackoverflow_cleaner(data, is_abstract):
    """Strip markup, digits and punctuation from a text value and lowercase it.

    The value is converted with str() first. For abstracts only the text found
    inside <p>...</p> elements is kept (joined with spaces) before cleaning, so
    an abstract without such elements cleans to an empty string.

    Args:
        data (str): data to be cleaned
        is_abstract (bool): flag to indicate if this function is applied on abstract or title

    Returns:
        str: cleaned data
    """
    res = str(data)
    if is_abstract:
        # get only the paragraph text for abstracts
        res = ' '.join(re.findall(r'<p>(.*?)</p>', res))

    # Raw strings keep the regex escapes (\w, \s, ...) valid Python literals.
    res = re.sub(r'<a.*?>*</a>', '', res)  # remove anchor tags with content
    res = re.sub(r'[0-9]', '', res)  # remove numbers
    res = re.sub(r'&quot', '', res)  # remove &quot
    res = re.sub(r'<.*?>', '', res)  # remove all HTML tags
    res = re.sub(r'//.*\n', '', res)  # remove "//" up to the end of the line
    res = re.sub(r'\{\n.*\}\n', '', res)  # remove "{ ... }" blocks spanning lines
    res = re.sub(r'[\r\n]', '', res)  # remove line breaks
    res = re.sub(r'"', '', res)  # remove quotes
    res = re.sub(r'[^\w\s]', ' ', res)  # replace punctuation with spaces
    return res.lower()


In [None]:
def cleanData():
    """Clean the mined Stackoverflow CSV and save the result over the old file.

    Reads config['STACKOVERFLOW_DATA_CSV'] from config['STORAGE_PATH'], adds the
    'Abstract_clean' and 'Title_clean' columns by applying
    stackoverflow_cleaner, drops rows where the abstract (raw or cleaned) or
    the date is missing as well as rows containing any null value, and writes
    the result back. When cleaning fails the existing file is left untouched.
    """
    spinner = yaspin()
    spinner.start()
    try:
        spinner.write('🧹 Data cleaning')
        stack_data = readFile(config['STACKOVERFLOW_DATA_CSV'], config['STORAGE_PATH'])
        if stack_data is None:
            # readFile reports failures by returning None.
            raise ValueError('Stackoverflow data file could not be read')

        stack_data['Abstract_clean'] = stack_data['Abstract'].apply(
            stackoverflow_cleaner, is_abstract=True
        )
        stack_data['Title_clean'] = stack_data['Title'].apply(
            stackoverflow_cleaner, is_abstract=False
        )

        # Rows where the abstract has an empty value, before or after cleaning
        abstract_missing = (
            stack_data['Abstract'].isna()
            | (stack_data['Abstract'] == '')
            | (stack_data['Abstract_clean'] == '')
        )
        # Rows with no date (isna() instead of `== None`, which never matches)
        date_missing = (
            stack_data['Date'].isna()
            | (stack_data['Date'] == '')
            | (stack_data['Date'] == 0)
        )
        stack_data = stack_data[~(abstract_missing | date_missing)]

        # Drop rows that still contain a null in any column
        stack_data = stack_data.dropna(axis=0, how="any")
        spinner.ok("✔️ Data cleaned")
    except Exception as e:
        spinner.fail("❌ Failed to clean data")
        spinner.write(str(e))
        # Do not touch the stored file when there is nothing valid to save.
        return
    finally:
        spinner.stop()

    try:
        spinner.write("🔁 Old file will be replaced\n")
        createFile(config['STACKOVERFLOW_DATA_CSV'], config['STORAGE_PATH'])
        saveFile(stack_data, config['STACKOVERFLOW_DATA_CSV'], config['STORAGE_PATH'])
        spinner.ok("✔️ File saved")
    except Exception as e:
        spinner.fail("❌ Failed to save file")
        spinner.write(str(e))
    finally:
        spinner.stop()

In [None]:
# Clean the stored Stackoverflow CSV (read, clean, re-save).
cleanData()

### Scopus Data

In [None]:
# Show the Scopus API key from the config so it can be copied into the input
# prompt mentioned in the message below.
# NOTE(review): this leaves the key visible in the saved notebook output.
print(f"🔑 Please enter the following key in the input \n")
print(config['SCOPUS_API_KEY'])

In [None]:
# Create the pybliometrics configuration.
# NOTE(review): presumably this is the step that prompts for the API key
# printed above - confirm against the installed pybliometrics version.
pybliometrics.scopus.utils.create_config()

In [None]:
def getData(query):
    """Mine the Scopus database and store a subset of the result columns as CSV.

    The selected columns are written to config['SCOPUS_DATA_CSV'] inside
    config['STORAGE_PATH']. When mining fails nothing is written and an
    existing file is left untouched.

    NOTE: this redefines the getData function that the Reddit section of this
    notebook declares; after this cell has run, getData refers to this one.

    Args:
        query (str): query or string that will be used as a criteria while mining
    """
    wanted_columns = [
        'eid', 'doi', 'title', 'creator', 'publicationName', 'coverDate', 'description',
        'authkeywords', 'citedby_count', 'pageRange', 'aggregationType', 'subtypeDescription',
        'author_count', 'author_names', 'author_ids', 'affilname', 'affiliation_country',
    ]

    spinner = yaspin()
    spinner.start()
    try:
        spinner.write('⛏️ Mining scopus')
        scopus_res = ScopusSearch(query, download=True, view='COMPLETE')
        spinner.write(f'ℹ️ Total entries {scopus_res.get_results_size()}')

        scopus_data = pd.DataFrame(scopus_res.results)
        spinner.write(f'ℹ️ Dataframe shape {scopus_data.shape}')

        scopus_data_subset = scopus_data[wanted_columns]
        spinner.ok("✔️ Data Mined")
    except Exception as e:
        spinner.fail("❌ Failed to mine scopus")
        spinner.write(str(e))
        # Without data there is nothing to save; returning here also keeps
        # createFile from deleting a previously mined file.
        return
    finally:
        spinner.stop()

    try:
        createFile(config['SCOPUS_DATA_CSV'], config['STORAGE_PATH'])
        saveFile(scopus_data_subset, config['SCOPUS_DATA_CSV'], config['STORAGE_PATH'])
        spinner.ok("✔️ File saved")
    except Exception as e:
        spinner.fail("❌ Failed to save file")
        spinner.write(str(e))
    finally:
        spinner.stop()

In [None]:
# Build the Scopus search query from the two configured search strings and mine it.
# The backslashes are written as "\\" so they are valid string escapes; the query
# text contains a literal backslash after "(" and before the first ")".
# NOTE(review): confirm that these backslashes are really meant to be sent to Scopus.
sco_query = f"TITLE-ABS-KEY(\\ '{config['SCOPUS_QUERY_STRING_1']}' \\) AND ALL ( '{config['SCOPUS_QUERY_STRING_2']}')"
# sco_query = "TITLE(config['SCOPUS_QUERY_STRING_1']) AND PUBYEAR > 2021"
getData(sco_query)

In [None]:
def scopus_cleaner(data):
    """Strip symbols, markup, digits and punctuation from a text value and lowercase it.

    The value is converted with str() first, so non-string inputs (e.g. NaN)
    are cleaned as their string representation.

    Args:
        data (string): Data string that needs to be cleaned

    Returns:
        str: Cleaned string
    """
    res = str(data)
    # Every step works on `res` so that no earlier cleaning is thrown away.
    res = re.sub(r'[©®™%]', '', res)  # remove ©,®,™,% sign
    # Remove the copyright boilerplate while its punctuation and casing are
    # still intact; accepts both "All right reserved." and "All rights reserved."
    res = re.sub(r'all rights? reserved\.?', ' ', res, flags=re.IGNORECASE)
    res = re.sub(r'<a.*?>*</a>', '', res)  # remove anchor tags with content
    res = re.sub(r'[0-9]', '', res)  # remove numbers
    res = re.sub(r'<.*?>', '', res)  # remove all HTML tags
    res = re.sub(r'//.*\n', '', res)  # remove "//" up to the end of the line
    res = re.sub(r'\{\n.*\}\n', '', res)  # remove "{ ... }" blocks spanning lines
    res = re.sub(r'[\r\n]', '', res)  # remove line breaks
    res = re.sub(r'"', '', res)  # remove quotes
    res = re.sub(r'[^\w\s]', ' ', res)  # replace punctuation with spaces
    # Lowercase the cleaned text (not the raw input).
    return res.lower()

In [None]:
def clean():
    """Clean the mined Scopus CSV and save the result over the old file.

    Reads config['SCOPUS_DATA_CSV'] from config['STORAGE_PATH'], adds the
    'Abstract_clean' and 'Title_clean' columns by applying scopus_cleaner to
    each abstract ('description') and title, drops rows containing any null
    value and renames a few important columns (description -> Abstract,
    coverDate -> Date, citedby_count -> Cites, title -> Title). When cleaning
    fails the existing file is left untouched.
    """
    spinner = yaspin()
    spinner.start()
    try:
        spinner.write('🧹 Data cleaning')
        scopus_data_subset = readFile(config['SCOPUS_DATA_CSV'], config['STORAGE_PATH'])
        if scopus_data_subset is None:
            # readFile reports failures by returning None.
            raise ValueError('Scopus data file could not be read')

        scopus_data_subset['Abstract_clean'] = scopus_data_subset['description'].apply(scopus_cleaner)
        scopus_data_subset['Title_clean'] = scopus_data_subset['title'].apply(scopus_cleaner)

        # NOTE(review): this drops every row with a null in ANY column, not
        # only in the abstract/title - confirm that this is intended.
        scopus_data_subset = scopus_data_subset.dropna(axis=0)
        scopus_data_subset = scopus_data_subset.rename(
            columns={
                'description': 'Abstract',
                'coverDate': 'Date',
                'citedby_count': 'Cites',
                'title': 'Title',
            }
        )
        spinner.ok("✔️ Data cleaned")
    except Exception as e:
        spinner.fail("❌ Failed to clean data")
        spinner.write(str(e))
        # Do not touch the stored file when there is nothing valid to save.
        return
    finally:
        spinner.stop()

    try:
        spinner.write("🔁 Old file will be replaced\n")
        createFile(config['SCOPUS_DATA_CSV'], config['STORAGE_PATH'])
        saveFile(scopus_data_subset, config['SCOPUS_DATA_CSV'], config['STORAGE_PATH'])
        spinner.ok("✔️ File saved")
    except Exception as e:
        spinner.fail("❌ Failed to save file")
        spinner.write(str(e))
    finally:
        spinner.stop()

    # TODO: Remove papers that are summaries of conference proceedings.


In [None]:
# Clean the stored Scopus CSV (read, clean, rename columns, re-save).
clean()