In [1]:
from bs4 import BeautifulSoup
import boto3
import config as cfg
import datetime 
import io
import numpy as np
import pandas as pd
import re
import time
from urllib.parse import urlparse
import yaml

In [2]:
# Execute the shared AWS helper notebook so the names it defines are available in this kernel.
# NOTE(review): read_parquet_file / read_s3_file are used below but not defined in this
# notebook — presumably they come from here; confirm.
%run "../libraries/aws_utils.ipynb"

In [3]:
# Execute the shared general-purpose helper notebook so the names it defines are available here.
# NOTE(review): test_primary_key is used below but not defined in this notebook —
# presumably it comes from here; confirm.
%run "../libraries/general_utils.ipynb"

In [4]:
# Parse the local YAML settings file; the resulting mapping is indexed below
# for bucket names and object keys.
with open('config.yml', mode='r') as file:
    config_files = yaml.safe_load(file.read())

## Read in base files

In [5]:
# S3 resource handle used for all reads in this notebook.
# Credentials are taken from the local `config` module rather than hardcoded here.
s3_reader = boto3.resource(
    's3',
    region_name='us-east-1',
    aws_access_key_id=cfg.aws_reader['accessCode'],
    aws_secret_access_key=cfg.aws_reader['secretCode'],
)

In [6]:
# Load the per-article scrape results and keep only rows whose `is_kept` flag is true.
scraped_articles = read_parquet_file(
    s3_reader,
    config_files['INTEREDIARY_OUTPUT_BUCKET'],
    config_files['DELETED_VS_POPULATED_AFD_ARTICLES'],
)
scraped_articles = scraped_articles.loc[scraped_articles['is_kept']]

# NOTE(review): presumably verifies that article_id uniquely identifies each row — confirm helper semantics.
test_primary_key(scraped_articles, ['article_id'])

In [7]:
# Load the AfD nomination metadata (one row per article / daily-log file pair).
afd_nomination_metadata = read_parquet_file(
    s3_reader,
    config_files['INTEREDIARY_OUTPUT_BUCKET'],
    config_files['JOINED_ARTICLE_SCRAPE_DATES_AND_AFD_NAMES'],
)

# NOTE(review): presumably verifies that (article_id, file_name) is unique — confirm helper semantics.
test_primary_key(afd_nomination_metadata, ['article_id', 'file_name'])

In [8]:
# Preview the first three nomination records.
afd_nomination_metadata.head(3)

Unnamed: 0,article_id,scrape_date,entity,found_person,num_entities,is_multiple_entity_types,file_name,discussion,afd_result,title_links
0,2022_Glen_Waverley_Suicide,"[2023-01-01T00:00:00.000000, 2023-01-02T00:00:...",2022 Glen Waverley Suicide,True,1.0,False,daily_afd_log/2023-01-03/2022_December_23.txt,"<div class=""boilerplate afd vfd xfd-closed arc...",delete,[https://en.wikipedia.org/w/index.php?title=20...
1,A.S.D._Villabiagio,"[2023-01-01T00:00:00.000000, 2023-01-02T00:00:...",A.S.D. Villabiagio,True,1.0,False,daily_afd_log/2023-01-03/2022_December_23.txt,"<div class=""boilerplate afd vfd xfd-closed arc...",keep,[https://en.wikipedia.org/w/index.php?title=A....
2,Aaron_Kemmer,"[2023-01-01T00:00:00.000000, 2023-01-02T00:00:...",Aaron Kemmer,True,1.0,False,daily_afd_log/2023-01-19/2023_January_8.txt,"<div class=""boilerplate afd vfd xfd-closed arc...",delete,[https://en.wikipedia.org/w/index.php?title=Aa...


In [9]:
# Limit scope to AfD-nominated articles for which a populated Wikipedia page was found:
# the inner join drops nominations that have no matching row in scraped_articles.
# Shapes are printed before and after to show how many rows the join removes.
print(afd_nomination_metadata.shape)
afd_nomination_metadata = pd.merge(
    afd_nomination_metadata,
    scraped_articles,
    how='inner',
    on='article_id',
)
print(afd_nomination_metadata.shape)

(782, 10)
(566, 13)


In [10]:
# Re-run the key check on the merged frame, to catch any row duplication introduced by the join.
test_primary_key(
    afd_nomination_metadata,
    ['article_id', 'file_name'],
)

In [11]:
# Preview the merged frame, which now also carries the columns from scraped_articles.
afd_nomination_metadata.head(3)

Unnamed: 0,article_id,scrape_date,entity,found_person,num_entities,is_multiple_entity_types,file_name,discussion,afd_result,title_links,scraped_date,is_kept,scraped_path
0,A.S.D._Villabiagio,"[2023-01-01T00:00:00.000000, 2023-01-02T00:00:...",A.S.D. Villabiagio,True,1.0,False,daily_afd_log/2023-01-03/2022_December_23.txt,"<div class=""boilerplate afd vfd xfd-closed arc...",keep,[https://en.wikipedia.org/w/index.php?title=A....,2023-01-01,True,individual_afd_page_html/2023-01-01/A.S.D._Vil...
1,Aaron_Kemmer,"[2023-01-01T00:00:00.000000, 2023-01-02T00:00:...",Aaron Kemmer,True,1.0,False,daily_afd_log/2023-01-19/2023_January_8.txt,"<div class=""boilerplate afd vfd xfd-closed arc...",delete,[https://en.wikipedia.org/w/index.php?title=Aa...,2023-01-01,True,individual_afd_page_html/2023-01-01/Aaron_Kemm...
2,Abbas_Sajwani,"[2023-01-01T00:00:00.000000, 2023-01-02T00:00:...",Abbas Sajwani,True,1.0,False,daily_afd_log/2023-01-07/2022_December_27.txt,"<div class=""boilerplate afd vfd xfd-closed arc...",delete,[https://en.wikipedia.org/w/index.php?title=Ab...,2023-01-01,True,individual_afd_page_html/2023-01-01/Abbas_Sajw...


## Load the scraped Wikipedia pages of articles nominated for deletion, then split each page into article text and reference links

In [12]:
# Record the wall-clock start so the following extraction cell can be timed.
start_time = time.time()

In [13]:
# Split every scraped article's raw HTML into the article body and the reference section.
# One S3 read per article; the recorded run took ~90 seconds for 566 records.
extract_columns = ["article_id", "file_name", "scraped_path", "wiki_article",
                   "articles_text", "references_text", "has_references"]

# Collect one plain dict per article and build the DataFrame once at the end.
# (DataFrame.append was removed in pandas 2.0, and growing a frame row by row is quadratic.)
extract_records = []

for _, row in afd_nomination_metadata.iterrows():

    wiki_article = read_s3_file(s3_reader, config_files['RAW_BUCKET'],
                                row['scraped_path'])

    article_soup = BeautifulSoup(wiki_article, "html.parser")

    # get the span tag with ID references, where reference links are stored
    reference_tag = article_soup.find('span', id='References')

    # Defaults for articles where no reference section can be extracted.
    article_text = wiki_article
    references_text = None
    has_references = False

    if reference_tag is not None:
        # Cut the raw HTML at the serialized heading tag. BeautifulSoup's serialization
        # may not match the raw markup byte-for-byte; in that case `marker` is empty and
        # the article is treated as having no extractable references, instead of
        # raising IndexError as an unchecked split(...)[1] would.
        before_marker, marker, after_marker = wiki_article.partition(str(reference_tag))
        if marker:
            has_references = True
            article_text = before_marker  # article text before references
            # Remove wikipedia navigation links that follow the reference list
            references_text = after_marker.split("<h2>Navigation menu")[0]

    extract_records.append({"article_id": row['article_id'],
                            "file_name": row['file_name'],
                            "scraped_path": row['scraped_path'],
                            "wiki_article": wiki_article,
                            "articles_text": article_text,
                            "references_text": references_text,
                            "has_references": has_references})

# Passing `columns` keeps the schema stable even when there are no records;
# the cast keeps has_references a real boolean column so it can be used as a mask.
afd_text_extracts = (pd.DataFrame(extract_records, columns=extract_columns)
                       .astype({"has_references": bool}))

In [14]:
# Print the wall-clock seconds elapsed since start_time was captured.
end_time = time.time()
execution_time = end_time - start_time
print(execution_time)

89.38166213035583


In [15]:
# (rows, columns) of the extraction result; one row is produced per row of afd_nomination_metadata.
afd_text_extracts.shape

(566, 7)

In [16]:
# Number of articles with vs. without a detected "References" section.
afd_text_extracts.loc[:, 'has_references'].value_counts()

True     521
False     45
Name: has_references, dtype: int64

In [17]:
# Same counts expressed as a fraction of all extracted articles.
afd_text_extracts['has_references'].value_counts().div(len(afd_text_extracts))

True     0.920495
False    0.079505
Name: has_references, dtype: float64

## For articles that have references, extract individual reference links

In [18]:
# Reset the wall-clock start so the reference-link extraction below can be timed.
start_time = time.time()

In [19]:
def extract_domain(url):
    """
    Return the network-location (host) component of a URL.

    Parameters:
        url (str): The URL to inspect.

    Returns:
        str: The ``netloc`` part as parsed by ``urllib.parse.urlparse``.
             This is an empty string when the URL carries no host part,
             e.g. relative links such as "/wiki/Main_Page" or "#cite_note-1".

    Example Usage:
        extract_domain("https://www.example.com/some-page")  # 'www.example.com'
    """
    return urlparse(url).netloc

In [20]:
# Extract every hyperlink from each article's reference section.
# The recorded run took ~13 seconds for the 521 articles that have references.

# Some reference links look like "web.archive.org/web/[timestamp]/[ACTUAL_URL]"
# because archive.org stores snapshots of web pages over time; the capture group
# recovers the actual URL. Both http:// and https:// archive links are handled.
archive_org_pattern = r'https?://web\.archive\.org/web/\d+/(.*)'

# Substrings that mark a link as Wikipedia/Wikimedia-internal rather than an external source.
wiki_keywords = ['/wiki/', 'wikipedia.org', 'wikimedia', 'mediawiki', 'creativecommons']

link_columns = ["article_id", "file_name", "scraped_path", "references_text", "a_tag",
                "href", "href_after_archive_org", "domain", "is_wiki_page"]

# Collect one frame per article and concatenate once at the end.
# (DataFrame.append was removed in pandas 2.0, and repeated appends are quadratic.)
link_frames = []

for _, row in afd_text_extracts[afd_text_extracts['has_references']].iterrows():

    references_soup = BeautifulSoup(row['references_text'], "html.parser")

    # get <a> tags with links
    a_tags = references_soup.find_all("a")

    reference_links = pd.DataFrame({"article_id": row['article_id'],
                                    "file_name": row['file_name'],
                                    "scraped_path": row['scraped_path'],
                                    "references_text": row['references_text'],
                                    "a_tag": a_tags})

    # Extract the URL link from the <a> tag; anchors without an href are discarded.
    reference_links['href'] = reference_links['a_tag'].apply(lambda tag: tag.get('href'))
    # .copy() so the column assignments below act on an independent frame, not a slice.
    reference_links = reference_links.loc[reference_links['href'].notna()].copy()

    if reference_links.empty:
        # Nothing to contribute; skipping also keeps empty, dtype-less frames out of the concat.
        continue

    reference_links['href_after_archive_org'] = reference_links['href'].apply(
        lambda href: re.sub(archive_org_pattern, r'\1', href))

    reference_links['domain'] = reference_links['href_after_archive_org'].apply(extract_domain)

    # 'True' if any wiki keyword is found in the original 'href', otherwise 'False'
    reference_links['is_wiki_page'] = reference_links['href'].apply(
        lambda href: any(keyword in href for keyword in wiki_keywords))

    link_frames.append(reference_links)

# Each per-article frame keeps its own index, so index values repeat across articles.
# pd.concat raises on an empty list, so fall back to an empty frame with the expected columns.
if link_frames:
    reference_link_results = pd.concat(link_frames)
else:
    reference_link_results = pd.DataFrame(columns=link_columns)

In [21]:
# Print the wall-clock seconds elapsed since start_time was last reset.
end_time = time.time()
execution_time = end_time - start_time
print(execution_time)

12.82423710823059


In [22]:
# Spot-check that archive.org snapshot links were unwrapped to their original URL.
# regex=False makes this a literal substring match; as a regex the dots in
# "web.archive.org" would match any character.
reference_link_results[
    reference_link_results['href'].str.contains("web.archive.org", regex=False)
].head(3)

Unnamed: 0,article_id,file_name,scraped_path,references_text,a_tag,href,href_after_archive_org,domain,is_wiki_page
6,Aaron_Kemmer,daily_afd_log/2023-01-19/2023_January_8.txt,individual_afd_page_html/2023-01-01/Aaron_Kemm...,"<span class=""mw-editsection""><span class=""mw-e...","[""Made In Space - Aaron Kemmer""]",https://web.archive.org/web/20160420005254/htt...,http://www.madeinspace.us/aaron-kemmer/,www.madeinspace.us,False
56,Aaron_Kemmer,daily_afd_log/2023-01-19/2023_January_8.txt,individual_afd_page_html/2023-01-01/Aaron_Kemm...,"<span class=""mw-editsection""><span class=""mw-e...",[Archived],https://web.archive.org/web/20180110124346/htt...,https://www.bloomberg.com/news/articles/2018-0...,www.bloomberg.com,False
4,Adam_Cella,daily_afd_log/2023-01-21/2023_January_10.txt,individual_afd_page_html/2023-01-11/Adam_Cella...,"<span class=""mw-editsection""><span class=""mw-e...",[Archived],https://web.archive.org/web/20151105092616/htt...,http://www.sherdog.com/fighter/Adam-Cella-69956,www.sherdog.com,False


In [23]:
# Ten most frequent domains among links not flagged as wiki pages.
# An empty domain means the href had no host part (e.g. relative or in-page links).
(
    reference_link_results
    .loc[reference_link_results['is_wiki_page'].eq(False), 'domain']
    .value_counts()
    .reset_index()
    .head(10)
)

Unnamed: 0,index,domain
0,,8406
1,cricketarchive.com,441
2,novelasyseries.univision.com,209
3,www.theguardian.com,140
4,www.worldcat.org,134
5,www.youtube.com,94
6,doi.org,73
7,www.uefa.com,70
8,news.bbc.co.uk,68
9,www.bbc.co.uk,67


## Write results

In [24]:
# S3 client used for uploads; it uses the writer credentials from the local `config` module,
# which are separate from the reader credentials used above.
s3_writer = boto3.client(
    's3',
    region_name='us-east-1',
    aws_access_key_id=cfg.aws_writer['accessCode'],
    aws_secret_access_key=cfg.aws_writer['secretCode'],
)

In [25]:
# Serialize the article/reference text extracts to Parquet in memory, then upload the bytes to S3.
# The put_object response is left as the cell's last expression so it is displayed.
out_buffer = io.BytesIO()
afd_text_extracts.to_parquet(out_buffer, index=False)
s3_writer.put_object(
    Body=out_buffer.getvalue(),
    Bucket=config_files['INTEREDIARY_OUTPUT_BUCKET'],
    Key=config_files['SCRAPED_ARTICLE_TEXT_AND_REFERENCE_TEXT'],
)

{'ResponseMetadata': {'RequestId': '2NQBJTWT7HV3MD0Q',
  'HostId': 'CY3OslgWFuyEavlQFpopO0C6+71n/mS761U8OLoD8QyAMYC24aigknk9vAlt2z/59t0E36rIpKluiTBGDt0cUw==',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'CY3OslgWFuyEavlQFpopO0C6+71n/mS761U8OLoD8QyAMYC24aigknk9vAlt2z/59t0E36rIpKluiTBGDt0cUw==',
   'x-amz-request-id': '2NQBJTWT7HV3MD0Q',
   'date': 'Wed, 31 May 2023 23:25:06 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"e10581aabb966717e89030f331ce3f85"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"e10581aabb966717e89030f331ce3f85"',
 'ServerSideEncryption': 'AES256'}

In [26]:
# Drop the parsed <a> tag column; the relevant details (href, domain) were already extracted from it.
# errors='ignore' keeps this cell re-runnable after the column has been removed.
reference_link_results = reference_link_results.drop(columns=['a_tag'], errors='ignore')
# Only keep meaningful results: links whose URL actually has a domain.
reference_link_results = reference_link_results[reference_link_results['domain'] != ""]

# Use a fresh buffer. Reusing the buffer from the previous upload would leave the
# earlier Parquet payload at the start of this object's bytes.
out_buffer = io.BytesIO()
reference_link_results.to_parquet(out_buffer, index=False)
s3_writer.put_object(
    Bucket=config_files['INTEREDIARY_OUTPUT_BUCKET'],
    Key=config_files['SCRAPED_REFERENCE_LINKS'],
    Body=out_buffer.getvalue(),
)

{'ResponseMetadata': {'RequestId': 'D74Y88Z4FMRHYV0H',
  'HostId': 'psK7VU4dSMma06PRVdOZa2xxwd/6L8TTgHlncZL6G/Svna1KlDUQ6To1LJHqO5/jQud1rOzp6AY=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'psK7VU4dSMma06PRVdOZa2xxwd/6L8TTgHlncZL6G/Svna1KlDUQ6To1LJHqO5/jQud1rOzp6AY=',
   'x-amz-request-id': 'D74Y88Z4FMRHYV0H',
   'date': 'Wed, 31 May 2023 23:26:12 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"e25fb657fe34d6dca96c4e0386bf8776"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"e25fb657fe34d6dca96c4e0386bf8776"',
 'ServerSideEncryption': 'AES256'}