In [1]:
from bs4 import BeautifulSoup
import boto3
import config as cfg
import datetime
import io
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import re
import yaml

## Objective: Join together a data set listing articles nominated for deletion and a data set listing scraped Wikipedia articles

In [2]:
%run "../libraries/aws_utils.ipynb"

In [3]:
%run "../libraries/general_utils.ipynb"

In [4]:
with open('config.yml', 'r') as file:
   config_files = yaml.safe_load(file)

In [5]:
s3_reader = boto3.resource('s3',
                    region_name='us-east-1',
                    aws_access_key_id=cfg.aws_reader['accessCode'],
                    aws_secret_access_key=cfg.aws_reader['secretCode'])

In [7]:
scraped_files = read_parquet_file(s3_reader, 
                                  config_files['INTEREDIARY_OUTPUT_BUCKET'], 
                      config_files['ARTICLE_SCRAPE_DATES'])

In [8]:
scraped_files[0:3]

Unnamed: 0,article_id,scrape_date
0,"""Marvel_vs._DC""_cards",[2023-04-30T00:00:00.000000]
1,$teven_Cannon,"[2023-02-22T00:00:00.000000, 2023-02-23T00:00:..."
2,-_(album),"[2023-03-02T00:00:00.000000, 2023-03-03T00:00:..."


In [9]:
afd_metadata = read_parquet_file(s3_reader, 
                                  config_files['INTEREDIARY_OUTPUT_BUCKET'], 
                      config_files['AFD_ARTICLE_NAMES'])

In [10]:
afd_metadata[0:3]

Unnamed: 0,entity,found_person,num_entities,is_multiple_entity_types,file_name,discussion,afd_result
0,Margaret Louise Skourlis,True,1,False,daily_afd_log/2023-01-01/2022_December_21.txt,"<div class=""boilerplate afd vfd xfd-closed arc...",delete
1,Featherston Drive Public School,False,1,False,daily_afd_log/2023-01-01/2022_December_21.txt,"<div class=""boilerplate afd vfd xfd-closed arc...",delete
2,Michael D. Mehta,True,1,False,daily_afd_log/2023-01-01/2022_December_21.txt,"<div class=""boilerplate afd vfd xfd-closed arc...",delete


## Identify articles about people vs other entities

In [11]:
people_to_process = afd_metadata[(afd_metadata['found_person']) & afd_metadata['num_entities']==1]
initial_people_count = people_to_process.shape[0]

## Extract article_id, a unique identifier of the article name, from the Articles for Deletion Metadata so that we can join it onto the scraped_articles table

In [12]:
def get_a_tags(html):
    """
    Parses an HTML document and returns a list of all <a> tags found in the HTML.

    Parameters:
        html (str): The HTML document to parse.

    Returns:
        list: A list of dictionaries representing the <a> tags found in the HTML document. Each dictionary contains the attributes and values of the respective <a> tag.
    """
    soup = BeautifulSoup(html, "html.parser")
    href_tags = soup.find_all("a")
    return href_tags

def get_href_tags(a_tags):
    """
    Extracts the href attribute from a list of <a> tags and returns a list of valid non-empty href values.

    Parameters:
        a_tags (list): A list of dictionaries representing the <a> tags.

    Returns:
        list: A list of valid non-empty href values extracted from the <a> tags.
    """
    href_tags = [x.get('href') for x in a_tags]
    href_tags = [x for x in href_tags if x is not None]
    return href_tags

def get_title_links(href_tags):
    """
    Filters a list of href values to return only the links that are Wikipedia page titles.

    Parameters:
        href_tags (list): A list of href values.

    Returns:
        list: A list of unique Wikipedia page title links derived from the href values.
    """
    title_links =  [x for x in href_tags if 'https://en.wikipedia.org/w/index.php?title=' in x]
    title_links =  [x for x in title_links if 'Special:' not in x]
    title_links = list(set([x.split("&action")[0] for x in title_links]))
    return title_links

In [13]:
people_to_process['a_tags'] = people_to_process['discussion'].apply(lambda x: get_a_tags(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [14]:
people_to_process['href_tags'] = people_to_process['a_tags'].apply(lambda x: get_href_tags(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [15]:
people_to_process['title_links'] = people_to_process['href_tags'].apply(lambda x: get_title_links(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [16]:
people_to_process[0:3]

Unnamed: 0,entity,found_person,num_entities,is_multiple_entity_types,file_name,discussion,afd_result,a_tags,href_tags,title_links
0,Margaret Louise Skourlis,True,1,False,daily_afd_log/2023-01-01/2022_December_21.txt,"<div class=""boilerplate afd vfd xfd-closed arc...",delete,"[[talk page], [deletion review], [soft-delete]...","[/wiki/Help:Using_talk_pages, /wiki/Wikipedia:...",[https://en.wikipedia.org/w/index.php?title=Ma...
2,Michael D. Mehta,True,1,False,daily_afd_log/2023-01-01/2022_December_21.txt,"<div class=""boilerplate afd vfd xfd-closed arc...",delete,"[[talk page], [deletion review], [PMC], [(talk...","[/wiki/Help:Using_talk_pages, /wiki/Wikipedia:...",[https://en.wikipedia.org/w/index.php?title=Mi...
6,Sangsadia Nirbachan 1991,True,1,False,daily_afd_log/2023-01-01/2022_December_21.txt,"<div class=""boilerplate afd vfd xfd-closed arc...",merge,"[[talk page], [deletion review], [Military awa...","[/wiki/Help:Using_talk_pages, /wiki/Wikipedia:...",[https://en.wikipedia.org/w/index.php?title=Bi...


## Assess how well this logic work to obtain an article_id
* Hope to find only 1 title link per row

In [17]:
people_to_process['len_title_links'] = people_to_process['title_links'].apply(lambda x: len(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [18]:
people_to_process['len_title_links'].value_counts()

1     874
2      35
3      11
4       4
37      1
6       1
7       1
11      1
0       1
Name: len_title_links, dtype: int64

In [19]:
people_to_process = people_to_process[people_to_process['len_title_links']==1]

In [20]:
people_to_process['article_id'] = people_to_process['title_links'].apply(lambda x: x[0].split("title=")[1])

In [21]:
dropped_rows = initial_people_count - people_to_process.shape[0]
dropped_rows_pct = dropped_rows / initial_people_count

print(f'Dropped rows: {dropped_rows}')
print(f'Dropped percent: {100*round(dropped_rows_pct,2)}')

Dropped rows: 55
Dropped percent: 6.0


In [22]:
test_primary_key(people_to_process, ['article_id', 'file_name'])

## Join two data frames

In [23]:
scraped_files['article_id'] = scraped_files['article_id'].astype(str)
people_to_process['article_id'] = people_to_process['article_id'].astype(str)
articles_with_scraping_metadata = scraped_files.merge(people_to_process,
                                                    on = ['article_id'],
                                                     how="left"
                                            )

## Write results

In [24]:
s3_writer = boto3.client('s3',
                    region_name='us-east-1',
                    aws_access_key_id=cfg.aws_writer['accessCode'],
                    aws_secret_access_key=cfg.aws_writer['secretCode'])

In [25]:
out_buffer = io.BytesIO()
output_cols = ['article_id','scrape_date',
               'entity','found_person','num_entities',
               'is_multiple_entity_types','file_name','discussion',
               'afd_result','title_links']
articles_with_scraping_metadata[pd.isnull(articles_with_scraping_metadata['entity'])==False][output_cols].to_parquet(out_buffer, index=False)
s3_writer.put_object( Bucket=config_files['INTEREDIARY_OUTPUT_BUCKET'], 
                     Key=config_files['JOINED_ARTICLE_SCRAPE_DATES_AND_AFD_NAMES'], 
                     Body=out_buffer.getvalue())

{'ResponseMetadata': {'RequestId': '97NRH0P08N5RS2SJ',
  'HostId': '4LqVzDndXgGNvVoc1SZRLflilqQOBoW3+PaPMwJZWnzzZkUE9xJuFXlDe3UsEQa8h1WcCo5s7EM=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '4LqVzDndXgGNvVoc1SZRLflilqQOBoW3+PaPMwJZWnzzZkUE9xJuFXlDe3UsEQa8h1WcCo5s7EM=',
   'x-amz-request-id': '97NRH0P08N5RS2SJ',
   'date': 'Sun, 28 May 2023 15:13:06 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"916a9d371aad898f194eeaea59ab46bc"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"916a9d371aad898f194eeaea59ab46bc"',
 'ServerSideEncryption': 'AES256'}