## Objective: Pre-process Wikipedia articles nominated for deletion

In [1]:
from bs4 import BeautifulSoup
import boto3
import config as cfg
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models import Phrases
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from pprint import pprint
import yaml

In [2]:
%run "../libraries/aws_utils.ipynb"

In [3]:
%run "../libraries/general_utils.ipynb"

In [27]:
# Load the pipeline configuration; the cells below read the S3 bucket name and
# object keys from it. The encoding is stated explicitly so the read does not
# depend on the platform's default locale.
with open('../data_engineering/config.yml', 'r', encoding='utf-8') as file:
    config_files = yaml.safe_load(file)

In [5]:
# NOTE(review): no other cell visible in this notebook references NUM_TOPICS —
# confirm it is still needed here.
NUM_TOPICS = 5 # number of topics to fit on LDA model

## Load wiki articles

In [6]:
# S3 resource used for every read below, authenticated with the reader
# credentials held in the local `config` module.
s3_reader = boto3.resource(
    's3',
    region_name='us-east-1',
    aws_access_key_id=cfg.aws_reader['accessCode'],
    aws_secret_access_key=cfg.aws_reader['secretCode'],
)

In [7]:
# Table joining scraped articles to their AFD log entries.
article_to_afd_join_key = read_parquet_file(
    s3_reader,
    config_files['INTEREDIARY_OUTPUT_BUCKET'],
    config_files['JOINED_ARTICLE_SCRAPE_DATES_AND_AFD_NAMES'],
)
# presumably checks that (article_id, file_name) identifies each row — verify in general_utils
test_primary_key(article_to_afd_join_key, ['article_id', 'file_name'])

In [8]:
# Per-article pronoun counts and the gender inferred from them.
pronoun_data = read_parquet_file(
    s3_reader,
    config_files['INTEREDIARY_OUTPUT_BUCKET'],
    config_files['INFERRED_GENDER_BY_PRONOUN_COUNT'],
)
# presumably checks that (article_id, file_name) identifies each row — verify in general_utils
test_primary_key(pronoun_data, ['article_id', 'file_name'])

In [9]:
# Attach the AFD outcome and the discussion HTML to every article row.
original_rows = len(pronoun_data)
pronoun_data = pronoun_data.merge(
    article_to_afd_join_key[['article_id', 'file_name', 'afd_result', 'discussion']],
    on=['article_id', 'file_name'],
)
# The join must neither drop nor duplicate rows.
assert pronoun_data.shape[0] == original_rows

In [10]:
# Raw scraped HTML (article text and reference text) for each article.
article_text = read_parquet_file(
    s3_reader,
    config_files['INTEREDIARY_OUTPUT_BUCKET'],
    config_files['SCRAPED_ARTICLE_TEXT_AND_REFERENCE_TEXT'],
)
# presumably checks that (article_id, file_name) identifies each row — verify in general_utils
test_primary_key(article_text, ['article_id', 'file_name'])

In [11]:
# Attach the scraped article HTML to every article row.
original_rows = len(pronoun_data)
pronoun_data = pronoun_data.merge(
    article_text[['article_id', 'file_name', 'articles_text']],
    on=['article_id', 'file_name'],
)
# The join must neither drop nor duplicate rows.
assert pronoun_data.shape[0] == original_rows

In [12]:
pronoun_data[0:3]

Unnamed: 0,article_id,file_name,scraped_path,num_male_tokens,num_female_tokens,num_non_binary_tokens,num_neo_tokens,max_pronoun_column,afd_result,discussion,articles_text
0,A.S.D._Villabiagio,daily_afd_log/2023-01-03/2022_December_23.txt,individual_afd_page_html/2023-01-01/A.S.D._Vil...,1,0,0,0,male,keep,"<div class=""boilerplate afd vfd xfd-closed arc...","<!DOCTYPE html>\n<html class=""client-nojs"" lan..."
1,Aaron_Kemmer,daily_afd_log/2023-01-19/2023_January_8.txt,individual_afd_page_html/2023-01-01/Aaron_Kemm...,5,0,2,0,male,delete,"<div class=""boilerplate afd vfd xfd-closed arc...","<!DOCTYPE html>\n<html class=""client-nojs"" lan..."
2,Abbas_Sajwani,daily_afd_log/2023-01-07/2022_December_27.txt,individual_afd_page_html/2023-01-01/Abbas_Sajw...,1,0,0,0,male,delete,"<div class=""boilerplate afd vfd xfd-closed arc...","<!DOCTYPE html>\n<html class=""client-nojs"" lan..."


## Pre-process Wikipedia articles
* extract text from HTML
* remove standard Wikipedia banner messages that are not actual article text

In [13]:
# Parse each page's raw HTML once; the cells below work on the parsed tree.
pronoun_data['article_soup'] = pronoun_data['articles_text'].apply(
    lambda raw_html: BeautifulSoup(raw_html, "html.parser")
)

### Extract text

In [14]:
# Every <div class="mw-body-content"> container found on each page.
pronoun_data['article_body'] = pronoun_data['article_soup'].apply(
    lambda page: page.find_all('div', class_='mw-body-content')
)
# Text of the first such container, with its text pieces joined by a space.
# NOTE(review): the [0] lookup raises IndexError for a page that has no such container.
pronoun_data['article_body_text'] = pronoun_data['article_body'].apply(
    lambda containers: containers[0].get_text(separator=' ')
)

### Remove standard Wikipedia banner messages

In [15]:
def get_afd_warning_element(soup):
    '''
    Collects the text of every Articles for Deletion (AFD) banner on a page.

    Parameters:
        soup (BeautifulSoup): The parsed HTML page to search.

    Returns:
        list: One string per AFD banner found, in the order `soup.find_all`
            yields them. The list is empty when the page has no AFD banner.

    Description:
        Candidate banners are the <div> elements with class 'mbox-text-span'.
        A candidate is kept when its text contains the exact (case-sensitive)
        phrase 'This article is being considered for deletion'. The value
        stored for a kept banner is its text, extracted with a single space
        as the separator between text pieces.
    '''
    afd_marker = 'This article is being considered for deletion'
    return [
        banner.get_text(separator=' ')
        for banner in soup.find_all('div', class_='mbox-text-span')
        if afd_marker in banner.text
    ]
    
def get_notability_warning_element(soup):
    '''
    Collects the text of maintenance / notability banners on a page.

    Parameters:
        soup (BeautifulSoup): The parsed HTML page to search.

    Returns:
        list: One string per matching banner element. Each element is
            reported at most once per searched class, even when its text
            contains several of the trigger phrases. The list is empty when
            nothing matches.

    Description:
        The <div> elements with class 'multiple-issues-text' are searched
        first, then those with class 'mbox-text-span'. An element matches
        when its text contains any of the trigger phrases, compared
        case-insensitively. The value stored for a match is the element's
        text, extracted with a single space as the separator between text
        pieces.

        Within one class, results are ordered by the first trigger phrase an
        element matches (in the order the phrases are listed below), and then
        by the order `soup.find_all` yields the elements.
    '''
    classes = ['multiple-issues-text', 'mbox-text-span']
    # Lower-cased once up front so the case-insensitive comparison is not
    # redone for every (phrase, element) pair.
    trigger_phrases = [phrase.lower() for phrase in (
        'This article has multiple issues',
        'deletion policy', 'notability guideline', 'nominated for deletion',
        'You can help Wikipedia by expanding it',
        'This article does not cite any sources',
        'improve this article',
        'needs additional citations',
        'Please help improve',
        'link rot',
        'no other articles link to it',
        'The neutrality of this article is disputed',
    )]
    found_warnings = []

    for this_class in classes:
        candidates = soup.find_all('div', class_=this_class)
        lowered_texts = [candidate.text.lower() for candidate in candidates]
        # Positions of candidates already reported for this class, so a
        # banner that contains several trigger phrases is not added again.
        reported = set()
        # Phrase-major iteration keeps first matches in phrase order.
        for phrase in trigger_phrases:
            for position, candidate in enumerate(candidates):
                if position in reported or phrase not in lowered_texts[position]:
                    continue
                reported.add(position)
                found_warnings.append(candidate.get_text(separator=' '))
    return found_warnings

In [16]:
# Banner texts found on each page: AFD notices and notability / maintenance notices.
pronoun_data['afd_warning_element'] = pronoun_data['article_soup'].apply(get_afd_warning_element)
pronoun_data['notability_warning_element'] = pronoun_data['article_soup'].apply(get_notability_warning_element)


In [17]:
pronoun_data['afd_warning_element'].value_counts().reset_index()[0:5]

Unnamed: 0,index,afd_warning_element
0,[],105
1,[This article is being considered for deletion...,2
2,[This article is being considered for deletion...,2
3,[This article is being considered for deletion...,2
4,[This article is being considered for deletion...,2


In [18]:
pronoun_data['notability_warning_element'].value_counts().reset_index()[0:5]

Unnamed: 0,index,notability_warning_element
0,[],77
1,[This article is being considered for deletion...,2
2,[The topic of this article may not meet Wikip...,2
3,[This article is being considered for deletion...,2
4,[This article is being considered for deletion...,2


In [19]:
def remove_warning(body_text, warning_text):
    '''
    Removes warning texts from the body text.

    Parameters:
        body_text (str): The original body text.
        warning_text (list): A list of warning texts to be removed from the body text.

    Returns:
        str: The body text with every occurrence of each warning text removed.
            If `body_text` is not a string, `warning_text` is not iterable, or
            one of its entries is not a string, removal stops at that point
            and the text as processed so far is returned (best effort).

    Description:
        Each warning text is removed in list order with `str.replace()`.
        Surrounding whitespace and punctuation are left untouched, so gaps
        may remain where a warning used to be.

    Example:
        # Original body text
        body_text = "This article has multiple issues. Please help improve it. KEEP"

        # Warning texts to be removed
        warning_texts = ["This article has multiple issues", "Please help improve it."]

        # Remove warning texts
        modified_body_text = remove_warning(body_text, warning_texts)

        # Print the modified body text
        print(modified_body_text)
        # Output: ".  KEEP"
    '''
    try:
        for this_warning_text in warning_text:
            body_text = body_text.replace(this_warning_text, "")
    except (AttributeError, TypeError):
        # Best effort for missing or non-string inputs: keep what has been
        # cleaned so far. Only these two errors are expected here; anything
        # else (e.g. KeyboardInterrupt) must propagate.
        pass
    return body_text

In [20]:
# Strip each article's AFD banner text from its body text.
# The row values are read by column label: integer keys such as x[0] on a
# label-indexed row rely on a positional fallback that pandas has deprecated.
pronoun_data['article_body_text_wo_warning'] = pronoun_data[
    ['article_body_text', 'afd_warning_element']
].apply(
    lambda row: remove_warning(row['article_body_text'], row['afd_warning_element']),
    axis=1,
)

In [21]:
# Strip each article's notability / maintenance banner text from the text
# already cleaned of AFD banners.
# The row values are read by column label: integer keys such as x[0] on a
# label-indexed row rely on a positional fallback that pandas has deprecated.
pronoun_data['article_body_text_wo_warning'] = pronoun_data[
    ['article_body_text_wo_warning', 'notability_warning_element']
].apply(
    lambda row: remove_warning(
        row['article_body_text_wo_warning'], row['notability_warning_element']
    ),
    axis=1,
)

In [22]:
def remove_common_warning_messages(text_to_clean):
    '''
    Blanks out boilerplate phrases that Wikipedia templates add to article text.

    Parameters:
        text_to_clean (str): The text to clean

    Returns:
        str: The text with each boilerplate phrase replaced by a single space.

    Description:
        Two fixed, case-sensitive phrases are targeted: the template footer
        'Learn how and when to remove this template message' and the inline
        marker 'citation needed'. Every occurrence of either phrase is
        replaced with one space; the rest of the text is returned unchanged.
    '''
    template_footer = 'Learn how and when to remove this template message'
    citation_marker = 'citation needed'
    return text_to_clean.replace(template_footer, ' ').replace(citation_marker, ' ')

In [23]:
# Blank out the template boilerplate phrases left in the banner-free text.
pronoun_data['article_body_text_wo_common_warning'] = (
    pronoun_data['article_body_text_wo_warning'].apply(remove_common_warning_messages)
)



In [24]:
# Drop the "[edit]" section-link markers and turn newlines into spaces.
pronoun_data['article_body_text_wo_warning_remove_edit'] = (
    pronoun_data['article_body_text_wo_common_warning']
    .apply(lambda text: text.replace("[edit]", "").replace("\n", " "))
)

## Write out pre-processed data

In [25]:
# S3 client used to upload the result, authenticated with the writer
# credentials held in the local `config` module.
s3_writer = boto3.client(
    's3',
    region_name='us-east-1',
    aws_access_key_id=cfg.aws_writer['accessCode'],
    aws_secret_access_key=cfg.aws_writer['secretCode'],
)

In [30]:
# `io` is imported here because this notebook's import cell does not import it;
# without this the cell only works if an earlier %run happened to leave `io`
# in the namespace.
import io

# Serialise the frame to Parquet in memory, then upload the bytes to S3.
out_buffer = io.BytesIO()
# The two BeautifulSoup-derived columns are dropped before serialising.
pronoun_data.drop(['article_soup', 'article_body'], axis=1).to_parquet(out_buffer, index=False)
s3_writer.put_object(
    Bucket=config_files['INTEREDIARY_OUTPUT_BUCKET'],
    Key=config_files['PREPROCESSED_ARTICLE_TEXT'],
    Body=out_buffer.getvalue(),
)

{'ResponseMetadata': {'RequestId': '0AEEP32RJT6AQ7Z0',
  'HostId': 'Lyl4vOnDw599/hY/PU1AqF51zp2PfdoowYat2U8ue4Fz0es/QPtlFHLbeLhRbm2nAR9nQEiYO3I=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'Lyl4vOnDw599/hY/PU1AqF51zp2PfdoowYat2U8ue4Fz0es/QPtlFHLbeLhRbm2nAR9nQEiYO3I=',
   'x-amz-request-id': '0AEEP32RJT6AQ7Z0',
   'date': 'Sun, 16 Jul 2023 16:46:24 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"75a7769aeb7b13f103a3c16e385ecc7f"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"75a7769aeb7b13f103a3c16e385ecc7f"',
 'ServerSideEncryption': 'AES256'}