## Objectives: 
1) Gather names of people whose Wikipedia articles have been nominated for deletion

2) Gather free text discussion about why those articles were nominated.

In [1]:
import awswrangler as wr
from bs4 import BeautifulSoup
import boto3
import config as cfg
import datetime
import io
import pandas as pd
import re
import spacy

In [2]:
# S3 bucket that holds the scraped Articles for Deletion (AfD) logs.
INPUT_BUCKET = 'afd-scraped'
# Key prefix used to select which scraped logs are listed from INPUT_BUCKET.
PREFIX = "daily_afd_log/2023"
# S3 bucket and object key that the extracted results are written to.
OUTPUT_BUCKET = 'women-in-red-intermediary'
OUTPUT_FILE = 'afd_names_and_discussion.parquet'

In [3]:
# spaCy pipeline used by identify_people() for named-entity recognition.
nlp = spacy.load("en_core_web_sm")

In [4]:
# boto3 S3 resource built with the reader credentials from the local config
# module; get_list_of_s3_files() uses this module-level object.
s3 = boto3.resource('s3',
                    region_name='us-east-1',
                    aws_access_key_id=cfg.aws_reader['accessCode'],
                    aws_secret_access_key=cfg.aws_reader['secretCode'])

In [5]:
# Sanity check: show the type of the object returned by boto3.resource.
type(s3)

boto3.resources.factory.s3.ServiceResource

## Functions

In [6]:
def create_afd_containers(bs4_result_set):
    """
    Maps each AFD (Articles for Deletion) article name to the HTML of its discussion.

    Every tag in the result set is passed to ``get_afd_article_name``. Tags for
    which no article name can be found are skipped. If two tags resolve to the
    same article name, the later tag replaces the earlier one.

    Args:
        bs4_result_set (bs4.element.ResultSet): Tags to process, e.g. the
            result of ``get_div_tags``.

    Returns:
        dict: Article names mapped to ``str(tag)``, i.e. the tag serialized
              as an HTML string.

    Example:
        >>> div_tags = get_div_tags(html_page)
        >>> afd_containers = create_afd_containers(div_tags)
        >>> sorted(afd_containers)
        ['First article', 'Second article']
    """

    afd_containers = {}

    for current_tag in bs4_result_set:
        article_name = get_afd_article_name(current_tag)
        # 'Not Found' is the sentinel get_afd_article_name returns on failure.
        if article_name != "Not Found":
            afd_containers[article_name] = str(current_tag)

    return afd_containers

In [7]:
def get_afd_article_name(div_tag):
    """
    Returns the name of the AFD (Articles for Deletion) article if found in the provided div tag object.
    In historical data we have seen that the name of the article is found in the 'id' attribute
    of the first <span> tag with class="mw-headline"; underscores in the id are replaced by spaces.

    Parameters:
        div_tag (bs4.element.Tag): The div tag object.

    Returns:
        str: The name of the AFD article if found, otherwise returns 'Not Found'.
    """
    try:
        article_name = div_tag.find_all('span', class_="mw-headline")[0]['id']
        return article_name.replace("_", " ")
    except (AttributeError, IndexError, KeyError, TypeError):
        # Only lookup failures mean "no name here": the input is not a tag,
        # it has no headline span, or the span has no usable 'id'. Anything
        # else (e.g. KeyboardInterrupt) is allowed to propagate.
        return 'Not Found'

In [8]:
def get_afd_result(div_tag_str):
    """
    Extracts and standardizes the outcome of the AFD (Articles for Deletion) discussions
    from the HTML code of a div tag.
    In historical data we've found the outcome stored in a sentence "The result was <b>[OUTCOME]</b>"

    Only outcomes made of letters, digits, underscores and spaces are matched.
    The outcome is lower-cased, and known variants are mapped to a standard
    term (currently "withdrew" -> "withdrawn").

    Parameters:
        div_tag_str (str): The HTML code of a div tag.

    Returns:
        str or None: The standardized result of the AFD if found, otherwise returns None.
    """
    standardization_lookup = {"withdrew": "withdrawn"}

    result = re.search(r"The result was <b>([A-Za-z0-9_ ]+)</b>", div_tag_str)
    if result is None:
        # No recognizable outcome sentence in this discussion.
        return None

    result_term = result.group(1).lower()
    # Fall back to the raw term when no standardized spelling is registered.
    return standardization_lookup.get(result_term, result_term)

In [9]:
def extract_people_metadata_from_logs(logs_df, s3_bucket, s3_reader=None):
    '''
    Given a Pandas DataFrame of log files and an S3 bucket name, 
    extracts metadata about people mentioned in the logs.

    Each file is read from S3 and split into per-article discussion containers.
    For every article found, one row is produced with the output of
    ``identify_people`` plus the columns ``file_name``, ``discussion`` (the
    container HTML) and ``afd_result``. Each file key is printed as it is
    processed.
    
    :param logs_df: A Pandas DataFrame containing a ``file_name`` column of S3 object keys.
    :type logs_df: pandas.DataFrame
    :param s3_bucket: The name of the S3 bucket containing the log files.
    :type s3_bucket: str
    :param s3_reader: Object passed to ``read_s3_file`` as its S3 reader.
        Defaults to the module-level ``s3`` resource.
    :return: A Pandas DataFrame containing metadata about people mentioned in the logs,
        or None when no article was found in any file. Every row keeps index 0.
    :rtype: pandas.DataFrame or None
    '''
    if s3_reader is None:
        # Same module-level resource that get_list_of_s3_files relies on.
        s3_reader = s3

    # Collect per-article frames and concatenate once at the end; calling
    # pd.concat inside the loop re-copies all accumulated rows every time.
    per_article_frames = []

    for file in logs_df['file_name'].values:
        print(file)

        # read_s3_file expects (s3_reader, bucket_name, file_key).
        object_content = read_s3_file(s3_reader, s3_bucket, file)
        div_tags = get_div_tags(object_content)

        afd_content = create_afd_containers(div_tags)

        for name, discussion in afd_content.items():
            results = pd.DataFrame(identify_people(name), index=[0])
            results['file_name'] = file
            results['discussion'] = discussion
            results['afd_result'] = get_afd_result(discussion)
            per_article_frames.append(results)

    if not per_article_frames:
        return None
    return pd.concat(per_article_frames)


In [10]:
def get_list_of_s3_files(BUCKET, PREFIX):
    """
    List the keys of all objects in an S3 bucket whose key starts with a prefix.

    The lookup goes through the module-level ``s3`` resource.

    :param BUCKET: Name of the S3 bucket to search
    :type BUCKET: str
    :param PREFIX: Key prefix that returned objects must start with
    :type PREFIX: str

    :return: Keys of the matching objects
    :rtype: list[str]

    Example Usage:
    >>> s3_files = get_list_of_s3_files('my-s3-bucket', 'path/to/my/files/')
    >>> print(s3_files)
    ['path/to/my/files/file1.txt', 'path/to/my/files/file2.txt', 'path/to/my/files/file3.txt']
    """
    matching_keys = []
    for s3_object in s3.Bucket(BUCKET).objects.filter(Prefix=PREFIX):
        matching_keys.append(s3_object.key)
    return matching_keys

In [11]:
def get_div_tags(html_page):
    """
    Find the closed AFD discussion containers in an HTML page.

    The page is parsed with the "html.parser" backend and searched for <div>
    tags whose class is "boilerplate afd vfd xfd-closed archived".

    Args:
        html_page (str): The HTML page to parse.

    Returns:
        The BeautifulSoup result set of matching <div> tags (empty when none match).

    Example:
        >>> html_page = '<div class="boilerplate afd vfd xfd-closed archived"><p>Some text.</p></div>'
        >>> get_div_tags(html_page)
        [<div class="boilerplate afd vfd xfd-closed archived"><p>Some text.</p></div>]
    """
    closed_discussion_class = 'boilerplate afd vfd xfd-closed archived'
    parsed_page = BeautifulSoup(html_page, "html.parser")
    return parsed_page.find_all("div", attrs={"class": closed_discussion_class})


In [12]:
def get_most_recent_log(object_df):
    """
    Find the latest 'scrape_date' recorded for every 'log_date'.

    :param object_df: pandas.DataFrame
        A DataFrame with at least the columns 'scrape_date' and 'log_date'.
    :return: pandas.DataFrame
        One row per distinct 'log_date', with columns 'log_date' and
        'scrape_date', where 'scrape_date' is the maximum value seen for
        that 'log_date'.
    """
    # as_index=False keeps 'log_date' as a regular column in the result.
    per_log_date = object_df.groupby('log_date', as_index=False)
    return per_log_date['scrape_date'].max()

In [13]:
def identify_people(afd_article_name):
    """
    Run the module-level spaCy pipeline (``nlp``) over an AFD (Articles for Deletion) article
    name and summarize the named entities it finds.

    :param afd_article_name: The name of the AFD article to analyze.
    :type afd_article_name: str
    :return: A dictionary summarizing the identified entities.
    :rtype: dict

    The dictionary contains the following keys:
        - 'entity': The article name that was analyzed, unchanged.
        - 'found_person': True when at least one entity is labelled "PERSON".
        - 'num_entities': The number of distinct entity labels found (not the number of entities).
        - 'is_multiple_entity_types': True when a "PERSON" entity was found and at least one
                                       other entity label is also present.

    Example usage:
    >>> summary = identify_people("John Doe")
    >>> sorted(summary)
    ['entity', 'found_person', 'is_multiple_entity_types', 'num_entities']
    """
    # Distinct labels are enough for all three derived fields.
    distinct_labels = {ent.label_ for ent in nlp(afd_article_name).ents}
    label_count = len(distinct_labels)
    has_person = "PERSON" in distinct_labels

    return {
        'entity': afd_article_name,
        'found_person': has_person,
        'num_entities': label_count,
        'is_multiple_entity_types': has_person and label_count > 1,
    }


In [14]:
def read_s3_file(s3_reader, bucket_name, file_key):
    """
    Fetch an object from S3 and return its body decoded as UTF-8 text.

    :param s3_reader: Object exposing ``Object(bucket_name, file_key)``,
        such as a boto3 S3 resource.
    :param bucket_name: The name of the S3 bucket.
    :type bucket_name: str
    :param file_key: The unique key of the file in the S3 bucket.
    :type file_key: str
    :return: The contents of the file as a string.
    :rtype: str
    """
    response = s3_reader.Object(bucket_name, file_key).get()
    raw_bytes = response['Body'].read()
    return raw_bytes.decode('utf-8')

In [15]:
def test_primary_key(df, primary_key_col):
    """
    Checks if a specified column or set of columns constitutes a primary key for a given pandas dataframe,
    i.e. that no value (or combination of values) appears in more than one row.

    :param df: Pandas dataframe to be checked for a primary key.
    :type df: pandas.DataFrame
    :param primary_key_col: Name or list of names of column(s) to be checked for being a primary key.
    :type primary_key_col: str or list[str]
    :raises AssertionError: If the specified column or set of columns is not a primary key for the given dataframe.
    :raises KeyError: If a requested column does not exist in the dataframe.
    :return: None

    Example Usage:
    >>> import pandas as pd
    >>> data = {'Name': ['John', 'Alex', 'Mike', 'John'], 'Age': [24, 26, 27, 24], 'Gender': ['M', 'M', 'M', 'M']}
    >>> df = pd.DataFrame(data)
    >>> test_primary_key(df, 'Name')
    AssertionError: Name is not the primary key
    """
    # Raise explicitly instead of using `assert`: assert statements are
    # removed under `python -O`, which would silently disable this check.
    if df[primary_key_col].duplicated().any():
        raise AssertionError(f'{primary_key_col} is not the primary key')


# This is a validation helper, not a pytest test case. Opt out of collection,
# because the `test_` prefix would otherwise make pytest try to run it.
test_primary_key.__test__ = False


## Collect a list of Articles for Deletion logs in this bucket with the desired prefix

In [16]:
# Keys of every object in INPUT_BUCKET whose key starts with PREFIX.
objects = get_list_of_s3_files(INPUT_BUCKET, PREFIX)

In [17]:
# One row per S3 key. In the listing shown below, keys have the form
# daily_afd_log/<scrape date>/<log date>.txt.
objects_pd = pd.DataFrame({"file_name": objects})
# Second path segment: the date the snapshot was scraped, parsed to a timestamp.
objects_pd['scrape_date'] = objects_pd['file_name'].apply(lambda x: pd.to_datetime(x.split("/")[1]))
# Third path segment: the log's file name, kept as a string (extension included).
objects_pd['log_date'] = objects_pd['file_name'].apply(lambda x: x.split("/")[2])

In [18]:
# Display the full listing of scraped log files.
objects_pd

Unnamed: 0,file_name,scrape_date,log_date
0,daily_afd_log/2023-01-01/2022_December_21.txt,2023-01-01,2022_December_21.txt
1,daily_afd_log/2023-01-01/2022_December_22.txt,2023-01-01,2022_December_22.txt
2,daily_afd_log/2023-01-01/2022_December_23.txt,2023-01-01,2022_December_23.txt
3,daily_afd_log/2023-01-01/2022_December_24.txt,2023-01-01,2022_December_24.txt
4,daily_afd_log/2023-01-01/2022_December_25.txt,2023-01-01,2022_December_25.txt
...,...,...,...
820,daily_afd_log/2023-05-27/2023_May_23.txt,2023-05-27,2023_May_23.txt
821,daily_afd_log/2023-05-27/2023_May_24.txt,2023-05-27,2023_May_24.txt
822,daily_afd_log/2023-05-27/2023_May_25.txt,2023-05-27,2023_May_25.txt
823,daily_afd_log/2023-05-27/2023_May_26.txt,2023-05-27,2023_May_26.txt


## We captured snapshots of Articles for Deletion logs on multiple dates, so let's filter to the most recent snapshot for a given log

In [19]:
# Latest scrape_date available for each log_date.
most_recent_log = get_most_recent_log(objects_pd)

In [20]:
# Keep only the rows whose (log_date, scrape_date) pair is the latest snapshot
# of that log; merge() defaults to an inner join, so older snapshots drop out.
objects_pd = objects_pd.merge(most_recent_log, on = ['log_date','scrape_date'])

In [21]:
# Display the listing after filtering to the most recent snapshot per log.
objects_pd

Unnamed: 0,file_name,scrape_date,log_date
0,daily_afd_log/2023-01-01/2022_December_21.txt,2023-01-01,2022_December_21.txt
1,daily_afd_log/2023-01-02/2022_December_22.txt,2023-01-02,2022_December_22.txt
2,daily_afd_log/2023-01-03/2022_December_23.txt,2023-01-03,2022_December_23.txt
3,daily_afd_log/2023-01-04/2022_December_24.txt,2023-01-04,2022_December_24.txt
4,daily_afd_log/2023-01-05/2022_December_25.txt,2023-01-05,2022_December_25.txt
...,...,...,...
97,daily_afd_log/2023-05-27/2023_May_23.txt,2023-05-27,2023_May_23.txt
98,daily_afd_log/2023-05-27/2023_May_24.txt,2023-05-27,2023_May_24.txt
99,daily_afd_log/2023-05-27/2023_May_25.txt,2023-05-27,2023_May_25.txt
100,daily_afd_log/2023-05-27/2023_May_26.txt,2023-05-27,2023_May_26.txt


In [22]:
# Confirm that each log_date now appears only once; raises AssertionError otherwise.
test_primary_key(objects_pd, 'log_date')

## The H3 tag contains only the name of the person or organization associated with the article nominated for deletion. Let's also collect the Articles for Deletion discussion itself, which the code below reads from each closed-discussion `<div>` container.

In [23]:
# Parse every selected log into one row per nominated article. The function
# prints each file key as it is processed (see the output below).
people_metadata = extract_people_metadata_from_logs(objects_pd, INPUT_BUCKET)

daily_afd_log/2023-01-01/2022_December_21.txt
daily_afd_log/2023-01-02/2022_December_22.txt
daily_afd_log/2023-01-03/2022_December_23.txt
daily_afd_log/2023-01-04/2022_December_24.txt
daily_afd_log/2023-01-05/2022_December_25.txt
daily_afd_log/2023-01-06/2022_December_26.txt
daily_afd_log/2023-01-07/2022_December_27.txt
daily_afd_log/2023-01-08/2022_December_28.txt
daily_afd_log/2023-01-09/2022_December_29.txt
daily_afd_log/2023-01-10/2022_December_30.txt
daily_afd_log/2023-01-11/2022_December_31.txt
daily_afd_log/2023-01-12/2023_January_1.txt
daily_afd_log/2023-01-13/2023_January_2.txt
daily_afd_log/2023-01-14/2023_January_3.txt
daily_afd_log/2023-01-15/2023_January_4.txt
daily_afd_log/2023-01-16/2023_January_5.txt
daily_afd_log/2023-01-17/2023_January_6.txt
daily_afd_log/2023-01-18/2023_January_7.txt
daily_afd_log/2023-01-19/2023_January_8.txt
daily_afd_log/2023-01-20/2023_January_9.txt
daily_afd_log/2023-01-21/2023_January_10.txt
daily_afd_log/2023-01-21/2023_January_11.txt
daily_af

In [24]:
# Display the first five rows of the result.
people_metadata[0:5]

Unnamed: 0,entity,found_person,num_entities,is_multiple_entity_types,file_name,discussion,afd_result
0,Margaret Louise Skourlis,True,1,False,daily_afd_log/2023-01-01/2022_December_21.txt,"<div class=""boilerplate afd vfd xfd-closed arc...",delete
0,Featherston Drive Public School,False,1,False,daily_afd_log/2023-01-01/2022_December_21.txt,"<div class=""boilerplate afd vfd xfd-closed arc...",delete
0,Michael D. Mehta,True,1,False,daily_afd_log/2023-01-01/2022_December_21.txt,"<div class=""boilerplate afd vfd xfd-closed arc...",delete
0,Index of World War II articles,False,1,False,daily_afd_log/2023-01-01/2022_December_21.txt,"<div class=""boilerplate afd vfd xfd-closed arc...",delete
0,Radical love (social psychology),False,0,False,daily_afd_log/2023-01-01/2022_December_21.txt,"<div class=""boilerplate afd vfd xfd-closed arc...",soft delete


In [25]:
# (rows, columns) of the result.
people_metadata.shape

(2107, 7)

In [26]:
# Frequency of each extracted AfD outcome.
people_metadata['afd_result'].value_counts()

delete                    938
keep                      370
soft delete               222
redirect                  209
merge                     104
no consensus               96
speedy keep                61
speedy delete              30
draftify                   27
withdrawn                   9
delete and redirect         5
speedy deleted              3
procedural close            2
procedural keep             2
iar draftify                1
userfy and delete           1
speedy delete and salt      1
keep but rename             1
soft redirect               1
weak keep                   1
soft keep                   1
wrong venue                 1
speedy deleted as g11       1
withdrawn by nominator      1
deleted bc socking          1
Name: afd_result, dtype: int64

## Write results

In [29]:
# Separate boto3 S3 client for writing, built with the writer credentials
# from the local config module (the reads above use the reader credentials).
s3_writer = boto3.client('s3',
                    region_name='us-east-1',
                    aws_access_key_id=cfg.aws_writer['accessCode'],
                    aws_secret_access_key=cfg.aws_writer['secretCode'])

In [30]:
# Serialize the results to Parquet in an in-memory buffer (the DataFrame
# index is not written), then upload the bytes to OUTPUT_BUCKET under OUTPUT_FILE.
out_buffer = io.BytesIO()
people_metadata.to_parquet(out_buffer, index=False)
s3_writer.put_object( Bucket=OUTPUT_BUCKET, Key=OUTPUT_FILE, Body=out_buffer.getvalue())

{'ResponseMetadata': {'RequestId': 'NW277YAGATAJZW0C',
  'HostId': 'WE6niZhZ2DasTYmhDPDYzThDRyW8Ti2Ql6avAoAgsSmvFdb+9uxjpRlbiCMe6xSYDxYm+e/Brfg=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'WE6niZhZ2DasTYmhDPDYzThDRyW8Ti2Ql6avAoAgsSmvFdb+9uxjpRlbiCMe6xSYDxYm+e/Brfg=',
   'x-amz-request-id': 'NW277YAGATAJZW0C',
   'date': 'Sat, 27 May 2023 15:46:28 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"d9258d20f1351923a536016cf1567655"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"d9258d20f1351923a536016cf1567655"',
 'ServerSideEncryption': 'AES256'}