In [1]:
from bs4 import BeautifulSoup
import boto3
import config as cfg
import datetime 
import io
import numpy as np
import pandas as pd
import time
import yaml

## Objective: Identify which Articles for Deletion (AfD) articles were still populated when we scraped them versus those that had already been deleted

In [2]:
# Execute the shared AWS helper notebook so its definitions are available here.
# NOTE(review): presumably this (or general_utils) defines read_parquet_file /
# read_s3_file, which are called below but not defined in this notebook - confirm.
%run "../libraries/aws_utils.ipynb"

In [3]:
# Execute the shared general-purpose helper notebook so its definitions are available here.
# NOTE(review): which helpers used below come from this notebook is not visible here - confirm.
%run "../libraries/general_utils.ipynb"

In [4]:
# Load the YAML configuration; later cells look up bucket names and object keys in it
# (e.g. config_files['RAW_BUCKET']).
# The encoding is given explicitly so parsing does not depend on the platform default,
# and the handle is not called `file` to avoid shadowing the name reused further down.
with open('config.yml', 'r', encoding='utf-8') as config_stream:
    config_files = yaml.safe_load(config_stream)

In [5]:
# S3 resource handle built from the reader credentials held in the local `config` module.
s3_reader = boto3.resource(
    's3',
    region_name='us-east-1',
    aws_access_key_id=cfg.aws_reader['accessCode'],
    aws_secret_access_key=cfg.aws_reader['secretCode'],
)

In [6]:
# In-memory byte buffer.
# NOTE(review): `buffer` is not referenced by any other cell visible in this notebook;
# it may be read as a global by helpers loaded via %run - confirm before removing.
buffer = io.BytesIO()

In [7]:
# Read the joined table of AfD articles and their scrape dates from the
# intermediate bucket (the preview below shows one row per article_id, with a
# list of scrape dates in `scrape_date`).
scraped_files = read_parquet_file(
    s3_reader,
    config_files['INTEREDIARY_OUTPUT_BUCKET'],
    config_files['JOINED_ARTICLE_SCRAPE_DATES_AND_AFD_NAMES'],
)

In [8]:
# Preview the first three rows.
scraped_files.head(3)

Unnamed: 0,article_id,scrape_date,entity,found_person,num_entities,is_multiple_entity_types,file_name,discussion,afd_result,title_links
0,2022_Glen_Waverley_Suicide,"[2023-01-01T00:00:00.000000, 2023-01-02T00:00:...",2022 Glen Waverley Suicide,True,1.0,False,daily_afd_log/2023-01-03/2022_December_23.txt,"<div class=""boilerplate afd vfd xfd-closed arc...",delete,[https://en.wikipedia.org/w/index.php?title=20...
1,A.S.D._Villabiagio,"[2023-01-01T00:00:00.000000, 2023-01-02T00:00:...",A.S.D. Villabiagio,True,1.0,False,daily_afd_log/2023-01-03/2022_December_23.txt,"<div class=""boilerplate afd vfd xfd-closed arc...",keep,[https://en.wikipedia.org/w/index.php?title=A....
2,Aaron_Kemmer,"[2023-01-01T00:00:00.000000, 2023-01-02T00:00:...",Aaron Kemmer,True,1.0,False,daily_afd_log/2023-01-19/2023_January_8.txt,"<div class=""boilerplate afd vfd xfd-closed arc...",delete,[https://en.wikipedia.org/w/index.php?title=Aa...


In [9]:
# Number of rows in the input table.
len(scraped_files)

782

## Check if articles are deleted or populated

In [10]:
def check_if_article_is_deleted(file_text):
    """Classify the text of a scraped Wikipedia page as deleted or still populated.

    Parameters
    ----------
    file_text : str
        Text content of the scraped article page.

    Returns
    -------
    str
        'deleted' if the text contains Wikipedia's "does not have an article
        with this exact name" notice, otherwise 'kept'.
    """
    ARTICLE_DELETED_STRING = 'Wikipedia does not have an article with this exact name'
    # Inspect the argument that was passed in. The previous version read the
    # global name `file`, so it only worked when the caller happened to store
    # the page text in a variable of exactly that name.
    if ARTICLE_DELETED_STRING in file_text:
        return 'deleted'
    return 'kept'

In [11]:
# Record the wall-clock start time; the elapsed time is computed after the S3 scan below.
start_time = time.time()

In [12]:
# For every article, walk its scrape dates in order and record the first
# snapshot in which the page was still populated ("kept"). If no snapshot is
# populated (or the article has no scrape dates at all), the article is
# recorded with is_kept=False and no date/path.
#
# Rows are collected in a list and the DataFrame is built once at the end:
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copies the frame on every iteration.
RESULT_COLUMNS = ["article_id", "scraped_date", "is_kept", "scraped_path"]
records = []

# using a for loop for now; it runs relatively fast (2 minutes across 700 records)
for _, row in scraped_files.iterrows():

    # Default outcome; overwritten as soon as a populated snapshot is found.
    # Initialising it per row prevents a previous row's result from leaking
    # into a row that has no scrape dates.
    record = {"article_id": row['article_id'],
              "scraped_date": None,
              "is_kept": False,
              "scraped_path": None}

    for scrape_date in row['scrape_date']:
        # pd.Timestamp formats the date regardless of the datetime64 precision
        # (numpy's .astype(datetime.datetime) yields an int for ns precision).
        scrape_day = pd.Timestamp(scrape_date).strftime('%Y-%m-%d')
        scraped_article_path = f"individual_afd_page_html/{scrape_day}/{row['article_id']}.txt"
        file = read_s3_file(s3_reader, config_files['RAW_BUCKET'],
                            scraped_article_path)

        if check_if_article_is_deleted(file) == "kept":
            record["scraped_date"] = pd.Timestamp(scrape_date)
            record["is_kept"] = True
            record["scraped_path"] = scraped_article_path
            break  # the first populated snapshot is enough

    records.append(record)

# Passing `columns` keeps the schema stable even when there are no input rows.
# The index is now 0..n-1 instead of all zeros; it is dropped on write anyway.
results = pd.DataFrame(records, columns=RESULT_COLUMNS)
# Keep a datetime dtype (NaT for missing) even if no article has a populated snapshot.
results["scraped_date"] = pd.to_datetime(results["scraped_date"])

In [13]:
# Elapsed wall-clock time, in seconds, since start_time was recorded.
end_time = time.time()
execution_time = end_time - start_time

In [14]:
# Report the elapsed time (seconds, as returned by time.time() differences).
print(f'Run time: {execution_time}')

Run time: 106.89912104606628


In [15]:
# Preview the first three result rows.
results.head(3)

Unnamed: 0,article_id,scraped_date,is_kept,scraped_path
0,2022_Glen_Waverley_Suicide,NaT,False,
0,A.S.D._Villabiagio,2023-01-01 00:00:00,True,individual_afd_page_html/2023-01-01/A.S.D._Vil...
0,Aaron_Kemmer,2023-01-01 00:00:00,True,individual_afd_page_html/2023-01-01/Aaron_Kemm...


In [16]:
# Number of articles found populated (True) versus never found populated (False).
results['is_kept'].value_counts()

True     566
False    216
Name: is_kept, dtype: int64

In [17]:
# Same counts expressed as a share of all rows in `results`.
results['is_kept'].value_counts() / results.shape[0]

True     0.723785
False    0.276215
Name: is_kept, dtype: float64

## Write results

In [18]:
# S3 client built from the writer credentials held in the local `config` module.
s3_writer = boto3.client(
    's3',
    region_name='us-east-1',
    aws_access_key_id=cfg.aws_writer['accessCode'],
    aws_secret_access_key=cfg.aws_writer['secretCode'],
)

In [19]:
# Serialize `results` to Parquet in memory (index dropped) and upload the bytes
# to the intermediate bucket. The put_object response is the cell's output.
out_buffer = io.BytesIO()
results.to_parquet(out_buffer, index=False)
s3_writer.put_object(
    Bucket=config_files['INTEREDIARY_OUTPUT_BUCKET'],
    Key=config_files['DELETED_VS_POPULATED_AFD_ARTICLES'],
    Body=out_buffer.getvalue(),
)

{'ResponseMetadata': {'RequestId': '8ER8Q0BD2M97ZAT3',
  'HostId': 'Zgp9ZEkNBAKc1liH6tFFZMFil3ga3caUFAuOTgPMZ8HaoXZuUki775KkJGheb6O72N5TueelISI=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'Zgp9ZEkNBAKc1liH6tFFZMFil3ga3caUFAuOTgPMZ8HaoXZuUki775KkJGheb6O72N5TueelISI=',
   'x-amz-request-id': '8ER8Q0BD2M97ZAT3',
   'date': 'Sun, 28 May 2023 17:05:47 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"c663203b90c31eb3f43c81a514689dae"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"c663203b90c31eb3f43c81a514689dae"',
 'ServerSideEncryption': 'AES256'}