## Objectives: 
1) Gather names of people whose Wikipedia articles have been nominated for deletion

2) Gather free text discussion about why those articles were nominated.

In [1]:
import awswrangler as wr
from bs4 import BeautifulSoup
import boto3
import config as cfg
import datetime
import pandas as pd
import spacy

In [2]:
# Where the scraped Articles for Deletion (AfD) logs are read from.
INPUT_BUCKET = "afd-scraped"
PREFIX = "daily_afd_log/2023"

# Where the extracted names and discussions are written.
OUTPUT_BUCKET = "women-in-red-intermediary"
OUTPUT_FILE = "afd_names_and_discussion.csv"

In [3]:
# Load the spaCy pipeline once at module level; identify_people() reads this
# global to run named-entity recognition on article names.
nlp = spacy.load("en_core_web_sm")

In [4]:
# S3 resource built from the reader credentials in the local config module.
# get_list_of_s3_files() and read_s3_file() look up this module-level `s3`
# at call time rather than taking it as an argument.
s3 = boto3.resource('s3',
                    region_name='us-east-1',
                    aws_access_key_id=cfg.aws_reader['accessCode'],
                    aws_secret_access_key=cfg.aws_reader['secretCode'])

## Functions

In [5]:
def create_h3_containers(bs4_result_set):
    """
    Collects, for each h3 tag, the sibling nodes that follow it up to the next h3 tag.

    Args:
        bs4_result_set (bs4.element.ResultSet): The h3 tags of a page, in document order.

    Returns:
        dict: A dictionary keyed by the h3 text (with "[edit]" removed). Each value is the list
              of sibling nodes that follow that h3, ending just before the next h3 in
              ``bs4_result_set`` when that h3 is one of its siblings. If the next h3 lives in a
              different parent, all remaining siblings of the current h3 are kept. Headings
              that share the same text overwrite each other (last one wins).

    Example:
        >>> h3_tags = soup.find_all("h3")
        >>> h3_containers = create_h3_containers(h3_tags)
        >>> print(h3_containers["Section 1"])
        [<p>Some text</p>, <ul><li>Item 1</li><li>Item 2</li></ul>, <p>More text</p>]
    """

    h3_containers = {}
    TEXT_TO_REMOVE = "[edit]"

    for i, current_h3 in enumerate(bs4_result_set):
        next_h3 = bs4_result_set[i + 1] if i < len(bs4_result_set) - 1 else None

        section_nodes = []
        for sibling in current_h3.next_siblings:
            # Stop at the next heading instead of merely skipping it; otherwise the
            # content of every later section leaks into this one. Identity is used
            # because tag equality is structural and two distinct headings with the
            # same markup would compare equal.
            if sibling is next_h3:
                break
            section_nodes.append(sibling)

        clean_h3_name = current_h3.text.replace(TEXT_TO_REMOVE, "")
        h3_containers[clean_h3_name] = section_nodes

    return h3_containers

In [6]:
def get_afd_result_from_containers(bs4_result_set):
    """
    Looks up, for each h3 tag, the closest preceding sibling <p> tag.

    In the captured output of this notebook that paragraph holds the outcome of the
    articles for deletion discussion (e.g. "The result was delete").

    Args:
        bs4_result_set (bs4.element.ResultSet): A result set containing h3 tags.

    Returns:
        dict: A dictionary keyed by the h3 text (with "[edit]" removed) whose values are the
        preceding sibling <p> tags. h3 tags without a preceding sibling <p> are left out.

    Example:
        >>> h3_tags = soup.find_all("h3")
        >>> afd_results = get_afd_result_from_containers(h3_tags)
        >>> print(afd_results["Section 1"])
        <p>The result was delete</p>
    """

    edit_marker = "[edit]"
    results_by_heading = {}

    for heading in bs4_result_set:
        result_paragraph = heading.find_previous_sibling('p')
        if not result_paragraph:
            # No paragraph before this heading: nothing to record.
            continue
        heading_name = heading.text.replace(edit_marker, "")
        results_by_heading[heading_name] = result_paragraph

    return results_by_heading

In [7]:
def extract_people_metadata_from_logs(logs_df, s3_bucket):
    '''
    Given a Pandas DataFrame of log files and an S3 bucket name, 
    extracts metadata about people mentioned in the logs.

    For every log file, each h3 heading becomes one output row holding the
    entity-recognition summary of the heading text, the source file name, the
    flattened HTML found after the heading, and the result paragraph found
    before the heading (None when there is no such paragraph).
    
    :param logs_df: A Pandas DataFrame containing a 'file_name' column of S3 object keys.
    :type logs_df: pandas.DataFrame
    :param s3_bucket: The name of the S3 bucket containing the log files.
    :type s3_bucket: str
    :return: A Pandas DataFrame with one row per heading (every row keeps index
             label 0), or None when no headings were found at all.
    :rtype: pandas.DataFrame or None
    '''

    # Collect the per-heading frames and concatenate once at the end; growing a
    # DataFrame with pd.concat inside the loop copies all previous rows each time.
    per_heading_frames = []

    for file in logs_df['file_name'].values:
        print(file)  # progress indicator

        object_content = read_s3_file(s3_bucket, file)
        h3_tags = parse_h3_tags(object_content)
        h3_content = create_h3_containers(h3_tags)
        afd_result = get_afd_result_from_containers(h3_tags)

        for name, content in h3_content.items():
            results = pd.DataFrame(identify_people(name), index=[0])
            results['file_name'] = file
            # Flatten the list of content between h3 tags into one HTML string.
            results['discussion'] = ''.join(str(x) for x in content)

            # A missing result must stay None; str(None) would store the
            # literal text 'None' in the output.
            result_tag = afd_result.get(name)
            results['afd_result'] = None if result_tag is None else str(result_tag)

            per_heading_frames.append(results)

    if not per_heading_frames:
        return None

    return pd.concat(per_heading_frames)


In [8]:
def get_list_of_s3_files(BUCKET, PREFIX):
    """
    Lists the keys of every object in an S3 bucket whose key starts with a prefix.

    The lookup goes through the module-level ``s3`` resource.

    :param BUCKET: A string representing the name of an S3 bucket
    :type BUCKET: str
    :param PREFIX: A string representing the prefix to search for in the S3 objects' keys
    :type PREFIX: str

    :return: A list of S3 object keys that match the given prefix
    :rtype: list[str]
    
    Example Usage:
    >>> s3_files = get_list_of_s3_files('my-s3-bucket', 'path/to/my/files/')
    >>> print(s3_files)
    ['path/to/my/files/file1.txt', 'path/to/my/files/file2.txt', 'path/to/my/files/file3.txt']
    """
    matching_keys = []
    for s3_object in s3.Bucket(BUCKET).objects.filter(Prefix=PREFIX):
        matching_keys.append(s3_object.key)
    return matching_keys

In [9]:
def parse_h3_tags(html_page):
    """
    Parses an HTML page with the "html.parser" backend and returns every <h3> tag in it.

    Args:
        html_page (str): The HTML page to parse.

    Returns:
        The result of ``find_all("h3")``: a list-like collection of BeautifulSoup Tag objects,
        one per <h3> tag found in the HTML page.

    Example:
        >>> html_page = '<html><body><h1>Title</h1><h3>First article</h3><p>Some text.</p><h3>Second article</h3><p>More text.</p></body></html>'
        >>> parse_h3_tags(html_page)
        [<h3>First article</h3>, <h3>Second article</h3>]
    """
    return BeautifulSoup(html_page, "html.parser").find_all("h3")


In [10]:
def get_most_recent_log(object_df):
    """
    Finds the latest 'scrape_date' recorded for every 'log_date'.

    :param object_df: pandas.DataFrame
        A DataFrame with at least the columns 'scrape_date' and 'log_date'.
    :return: pandas.DataFrame
        A DataFrame with columns 'log_date' and 'scrape_date', one row per distinct
        'log_date', holding the maximum 'scrape_date' seen for it.
    """
    scrape_dates_by_log = object_df.groupby('log_date')['scrape_date']
    newest_scrape = scrape_dates_by_log.max()
    # Turn the 'log_date' group index back into an ordinary column.
    return newest_scrape.reset_index()

In [11]:
def identify_people(afd_article_name):
    """
    Run the module-level spaCy pipeline (``nlp``) over an AFD (Articles for Deletion) article
    name and summarise the entity labels it finds.

    :param afd_article_name: The name of the AFD article to analyze.
    :type afd_article_name: str
    :return: A dictionary with information about the identified entities.
    :rtype: dict

    The dictionary contains the following keys:
        - 'entity': The article name that was analyzed, unchanged.
        - 'found_person': True when at least one entity is labelled "PERSON".
        - 'num_entities': The number of distinct entity labels found (not the number of entities).
        - 'is_multiple_entity_types': True when a "PERSON" entity was found and more than one
                                       distinct entity label is present.

    Example usage:
    >>> identify_people("Wikipedia:Articles for deletion/John Doe")
    {'entity': 'Wikipedia:Articles for deletion/John Doe',
     'found_person': True,
     'num_entities': 1,
     'is_multiple_entity_types': False}
    """
    entity_labels = {entity.label_ for entity in nlp(afd_article_name).ents}
    label_count = len(entity_labels)
    has_person = "PERSON" in entity_labels

    return {
        'entity': afd_article_name,
        'found_person': has_person,
        'num_entities': label_count,
        'is_multiple_entity_types': has_person and label_count > 1,
    }


In [12]:
def read_s3_file(bucket_name, file_key):
    """
    Downloads an S3 object through the module-level ``s3`` resource and decodes it as UTF-8.

    :param bucket_name: The name of the S3 bucket.
    :type bucket_name: str
    :param file_key: The unique key of the file in the S3 bucket.
    :type file_key: str
    :return: The contents of the file as a string.
    :rtype: str
    """
    response = s3.Object(bucket_name, file_key).get()
    raw_bytes = response['Body'].read()
    return raw_bytes.decode('utf-8')

In [13]:
def test_primary_key(df, primary_key_col):
    """
    Checks if a specified column or set of columns constitutes a primary key for a given pandas dataframe. 

    The check is an explicit comparison rather than an ``assert`` statement, so it still
    runs when Python is started with ``-O``. Errors unrelated to uniqueness (for example a
    column that does not exist) propagate unchanged instead of being reported as a
    primary-key violation.

    :param df: Pandas dataframe to be checked for a primary key.
    :type df: pandas.DataFrame
    :param primary_key_col: Name or list of names of column(s) to be checked for being a primary key.
    :type primary_key_col: str or list[str]
    :raises AssertionError: If the specified column or set of columns is not a primary key for the given dataframe.
    :raises KeyError: If a requested column is missing from the dataframe.
    :return: None

    Example Usage:
    >>> import pandas as pd
    >>> data = {'Name': ['John', 'Alex', 'Mike', 'John'], 'Age': [24, 26, 27, 24], 'Gender': ['M', 'M', 'M', 'M']}
    >>> df = pd.DataFrame(data)
    >>> test_primary_key(df, 'Name')
    AssertionError: Name is not the primary key
    """
    if df[primary_key_col].duplicated().any():
        raise AssertionError(f'{primary_key_col} is not the primary key')


# The name starts with "test_", so pytest would try to collect this helper as a
# test case and fail on its parameters; opt it out of collection explicitly.
test_primary_key.__test__ = False


## Collect the list of Articles for Deletion logs in the input bucket that match the desired prefix

In [14]:
# Keys of every object in the input bucket that starts with the configured prefix.
objects = get_list_of_s3_files(INPUT_BUCKET, PREFIX)

In [15]:
objects_pd = pd.DataFrame({"file_name": objects})

# Split each key on "/" once: part 1 is parsed as the scrape date, part 2 is
# kept verbatim as the log identifier (file extension included).
key_parts = objects_pd['file_name'].apply(lambda key: key.split("/"))
objects_pd['scrape_date'] = key_parts.apply(lambda parts: pd.to_datetime(parts[1]))
objects_pd['log_date'] = key_parts.apply(lambda parts: parts[2])

In [16]:
objects_pd

Unnamed: 0,file_name,scrape_date,log_date
0,daily_afd_log/2023-01-01/2022_December_21.txt,2023-01-01,2022_December_21.txt
1,daily_afd_log/2023-01-01/2022_December_22.txt,2023-01-01,2022_December_22.txt
2,daily_afd_log/2023-01-01/2022_December_23.txt,2023-01-01,2022_December_23.txt
3,daily_afd_log/2023-01-01/2022_December_24.txt,2023-01-01,2022_December_24.txt
4,daily_afd_log/2023-01-01/2022_December_25.txt,2023-01-01,2022_December_25.txt
...,...,...,...
500,daily_afd_log/2023-05-01/2023_April_27.txt,2023-05-01,2023_April_27.txt
501,daily_afd_log/2023-05-01/2023_April_28.txt,2023-05-01,2023_April_28.txt
502,daily_afd_log/2023-05-01/2023_April_29.txt,2023-05-01,2023_April_29.txt
503,daily_afd_log/2023-05-01/2023_April_30.txt,2023-05-01,2023_April_30.txt


## We captured snapshots of Articles for Deletion logs on multiple dates, so let's filter to the most recent snapshot for a given log

In [17]:
# One row per log_date, carrying the latest scrape_date seen for that log.
most_recent_log = get_most_recent_log(objects_pd)

In [18]:
# Keep only the rows whose (log_date, scrape_date) pair appears in
# most_recent_log, i.e. the newest snapshot of each log.
objects_pd = objects_pd.merge(most_recent_log, on = ['log_date','scrape_date'])

In [19]:
objects_pd

Unnamed: 0,file_name,scrape_date,log_date
0,daily_afd_log/2023-01-01/2022_December_21.txt,2023-01-01,2022_December_21.txt
1,daily_afd_log/2023-01-02/2022_December_22.txt,2023-01-02,2022_December_22.txt
2,daily_afd_log/2023-01-03/2022_December_23.txt,2023-01-03,2022_December_23.txt
3,daily_afd_log/2023-01-04/2022_December_24.txt,2023-01-04,2022_December_24.txt
4,daily_afd_log/2023-01-05/2022_December_25.txt,2023-01-05,2022_December_25.txt
...,...,...,...
71,daily_afd_log/2023-05-01/2023_April_27.txt,2023-05-01,2023_April_27.txt
72,daily_afd_log/2023-05-01/2023_April_28.txt,2023-05-01,2023_April_28.txt
73,daily_afd_log/2023-05-01/2023_April_29.txt,2023-05-01,2023_April_29.txt
74,daily_afd_log/2023-05-01/2023_April_30.txt,2023-05-01,2023_April_30.txt


In [20]:
# Sanity check: after filtering, every log_date must appear exactly once.
test_primary_key(objects_pd, 'log_date')

## The H3 tag only contains the name of the person or organization associated with the article nominated for deletion. Let's collect the articles for deletion discussion, found between H3 tags.

In [21]:
# Download and parse every selected log; prints each file name as it goes.
people_metadata = extract_people_metadata_from_logs(objects_pd, INPUT_BUCKET)

daily_afd_log/2023-01-01/2022_December_21.txt
daily_afd_log/2023-01-02/2022_December_22.txt
daily_afd_log/2023-01-03/2022_December_23.txt
daily_afd_log/2023-01-04/2022_December_24.txt
daily_afd_log/2023-01-05/2022_December_25.txt
daily_afd_log/2023-01-06/2022_December_26.txt
daily_afd_log/2023-01-07/2022_December_27.txt
daily_afd_log/2023-01-08/2022_December_28.txt
daily_afd_log/2023-01-09/2022_December_29.txt
daily_afd_log/2023-01-10/2022_December_30.txt
daily_afd_log/2023-01-11/2022_December_31.txt
daily_afd_log/2023-01-12/2023_January_1.txt
daily_afd_log/2023-01-13/2023_January_2.txt
daily_afd_log/2023-01-14/2023_January_3.txt
daily_afd_log/2023-01-15/2023_January_4.txt
daily_afd_log/2023-01-16/2023_January_5.txt
daily_afd_log/2023-01-17/2023_January_6.txt
daily_afd_log/2023-01-18/2023_January_7.txt
daily_afd_log/2023-01-19/2023_January_8.txt
daily_afd_log/2023-01-20/2023_January_9.txt
daily_afd_log/2023-01-21/2023_January_10.txt
daily_afd_log/2023-01-21/2023_January_11.txt
daily_af

In [22]:
people_metadata[0:5]

Unnamed: 0,entity,found_person,num_entities,is_multiple_entity_types,file_name,discussion,afd_result
0,Margaret Louise Skourlis,True,1,False,daily_afd_log/2023-01-01/2022_December_21.txt,"\n<dl><dd><span id=""Margaret_Louise_Skourlis"">...",<p>The result was <b>delete</b>. Consider this...
0,Featherston Drive Public School,False,1,False,daily_afd_log/2023-01-01/2022_December_21.txt,"\n<div class=""other-afds"" style=""width:33%; bo...","<p>The result was <b>delete</b>. <span style=""..."
0,Michael D. Mehta,True,1,False,daily_afd_log/2023-01-01/2022_December_21.txt,"\n<dl><dd><span id=""Michael_D._Mehta""></span><...","<p>The result was <b>delete</b>. ♠<a href=""/wi..."
0,Index of World War II articles,False,1,False,daily_afd_log/2023-01-01/2022_December_21.txt,"\n<dl><dd><span id=""Index_of_World_War_II_arti...",<p>The result was <b>delete</b>. Whether index...
0,Radical love (social psychology),False,0,False,daily_afd_log/2023-01-01/2022_December_21.txt,"\n<dl><dd><span id=""Radical_love_(social_psych...",<p>The result was <b>soft delete</b>. Based on...


## Write results

In [23]:
# Rebind the module-level `s3` to a resource built from the writer credentials.
# NOTE: get_list_of_s3_files() and read_s3_file() read this same global, so any
# call to them after this cell runs will use the writer credentials as well.
s3 = boto3.resource('s3',
                    region_name='us-east-1',
                    aws_access_key_id=cfg.aws_writer['accessCode'],
                    aws_secret_access_key=cfg.aws_writer['secretCode'])

In [24]:
# Serialise the results to CSV text in memory (row index omitted) and upload
# it to the output bucket under the configured key.
csv_string = people_metadata.to_csv(index=False)
s3.Bucket(OUTPUT_BUCKET).put_object(Key=OUTPUT_FILE, Body=csv_string)

s3.Object(bucket_name='women-in-red-intermediary', key='afd_names_and_discussion.csv')