In [25]:
import awswrangler as wr
from bs4 import BeautifulSoup
import boto3
import config as cfg
import datetime
import io
import numpy as np
import pandas as pd
import re
import spacy

In [2]:
# Execute the shared helper notebook in this kernel's namespace.
# NOTE(review): get_list_of_s3_files and read_s3_file are called below but not
# defined in this notebook — presumably they come from here; confirm.
%run "../libraries/aws_utils.ipynb"

In [74]:
# --- Input: where the scraped pages are read from ---------------------------
INPUT_BUCKET = "afd-scraped"
# Passed to the listing helper together with INPUT_BUCKET
# (presumably restricts the listing to keys under this prefix — confirm).
PREFIX = "individual_afd_page_html/2023"

# --- Output: bucket and object key used for the parquet upload --------------
OUTPUT_BUCKET = "women-in-red-intermediary"
OUTPUT_FILE = "afd_article_scrape_dates.parquet"

In [4]:
# Load spaCy's "en_core_web_sm" pipeline.
# NOTE(review): `nlp` is not referenced by any cell visible in this notebook —
# confirm it is still needed before paying the load cost on every run.
nlp = spacy.load("en_core_web_sm")

In [5]:
# Read-side S3 resource, authenticated with the `aws_reader` entry of the
# local config module (keys are not hardcoded in the notebook).
s3 = boto3.resource(
    's3',
    region_name='us-east-1',
    aws_access_key_id=cfg.aws_reader['accessCode'],
    aws_secret_access_key=cfg.aws_reader['secretCode'],
)

## Compile list of scraped articles

In [6]:
# List the scraped files in INPUT_BUCKET for PREFIX.
# NOTE(review): get_list_of_s3_files is not defined in this notebook; the cells
# below treat its result as an iterable of "/"-separated key strings — confirm.
objects = get_list_of_s3_files(s3, INPUT_BUCKET, PREFIX)

In [45]:
# One row per S3 key. Keys are split on "/": segment 1 is parsed as the scrape
# date and segment 2 is the file name from which the article id is derived.
objects_pd = pd.DataFrame({"file_name": objects}).assign(
    # Vectorized parse of the date segment (replaces a per-row apply/lambda).
    scrape_date=lambda keys: pd.to_datetime(
        keys["file_name"].str.split("/").str[1]
    ),
    # Strip only the trailing ".txt" extension. A plain str.replace(".txt", "")
    # removes every occurrence, which corrupts titles that contain ".txt"
    # themselves (e.g. "Robots.txt.txt" would become "Robots").
    article_id=lambda keys: (
        keys["file_name"]
        .str.split("/")
        .str[2]
        .str.replace(r"\.txt$", "", regex=True)
    ),
)

In [46]:
# Sanity check: (rows, columns) of the per-file table built above.
objects_pd.shape

(21330, 3)

## Get list of scraped dates for each article

In [47]:
# Collapse to one row per article: gather every scrape date seen for that
# article into a list, then order each list chronologically.
scrape_dates_by_article = (
    objects_pd
    .groupby('article_id')['scrape_date']
    .apply(list)
    .reset_index()
    .assign(scrape_date=lambda per_article: per_article['scrape_date'].map(sorted))
)

In [48]:
# Preview the first five articles and their scrape-date lists.
scrape_dates_by_article.head(5)

Unnamed: 0,article_id,scrape_date
0,"""Marvel_vs._DC""_cards",[2023-04-30 00:00:00]
1,$teven_Cannon,"[2023-02-22 00:00:00, 2023-02-23 00:00:00, 202..."
2,-_(album),"[2023-03-02 00:00:00, 2023-03-03 00:00:00, 202..."
3,100_Days_Campaign,[2023-03-08 00:00:00]
4,15.ai,[2023-01-16 00:00:00]


## Write output

In [75]:
# Write-side S3 client, authenticated with the separate `aws_writer` entry of
# the local config module.
s3_writer = boto3.client(
    's3',
    region_name='us-east-1',
    aws_access_key_id=cfg.aws_writer['accessCode'],
    aws_secret_access_key=cfg.aws_writer['secretCode'],
)

In [86]:
# Serialize the per-article table to parquet in memory (no temp file on disk);
# the DataFrame index is not written.
out_buffer = io.BytesIO()
scrape_dates_by_article.to_parquet(out_buffer, index=False)

In [88]:
# Upload the serialized parquet bytes; the S3 response is left as the cell's
# last expression so it is displayed.
s3_writer.put_object(
    Bucket=OUTPUT_BUCKET,
    Key=OUTPUT_FILE,
    Body=out_buffer.getvalue(),
)

{'ResponseMetadata': {'RequestId': 'Z9WA60C5VZQDZEKT',
  'HostId': 'LzbcUukUHquaaRiuObxqT7dv7mSrjIwi356EpNmA1NmH1xM6wiQYQ+AId/poU/Hdo3aqmtocOng=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'LzbcUukUHquaaRiuObxqT7dv7mSrjIwi356EpNmA1NmH1xM6wiQYQ+AId/poU/Hdo3aqmtocOng=',
   'x-amz-request-id': 'Z9WA60C5VZQDZEKT',
   'date': 'Sat, 27 May 2023 14:14:24 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"c5d6a70383e811262d9091cdbf6c54cb"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"c5d6a70383e811262d9091cdbf6c54cb"',
 'ServerSideEncryption': 'AES256'}

In [91]:
# Round-trip check of what was just serialized. `df` was not defined by any
# cell in this notebook, so a fresh "Restart & Run All" failed here with a
# NameError; read the parquet bytes back from the in-memory buffer instead.
# NOTE(review): the original defining cell is missing — if it read the object
# back from S3 rather than from the buffer, restore that read here.
df = pd.read_parquet(io.BytesIO(out_buffer.getvalue()))
df

Unnamed: 0,article_id,scrape_date
0,"""Marvel_vs._DC""_cards",[2023-04-30T00:00:00.000000]
1,$teven_Cannon,"[2023-02-22T00:00:00.000000, 2023-02-23T00:00:..."
2,-_(album),"[2023-03-02T00:00:00.000000, 2023-03-03T00:00:..."
3,100_Days_Campaign,[2023-03-08T00:00:00.000000]
4,15.ai,[2023-01-16T00:00:00.000000]
...,...,...
3867,Ángel_Gaspar,[2023-03-05T00:00:00.000000]
3868,École_Française_Internationale_de_Djeddah,"[2023-01-12T00:00:00.000000, 2023-01-13T00:00:..."
3869,École_Française_Internationale_de_Riyad,"[2023-01-16T00:00:00.000000, 2023-01-17T00:00:..."
3870,Ömer_Aysan_Barış,[2023-05-03T00:00:00.000000]


## For each article, check if we have full text or if article was already deleted

In [15]:
def check_if_article_is_deleted(file_text):
    """Classify a scraped page as 'deleted' or 'kept'.

    Args:
        file_text: Text of the scraped page (a str).

    Returns:
        'deleted' if the text contains Wikipedia's "does not have an article
        with this exact name" notice, otherwise 'kept'.
    """
    ARTICLE_DELETED_STRING = 'Wikipedia does not have an article with this exact name'
    # Test the argument itself. Checking a global named `file` instead made the
    # result depend on whatever a previous cell happened to leave in the kernel.
    return 'deleted' if ARTICLE_DELETED_STRING in file_text else 'kept'

In [16]:
# Smoke test on the first five keys only. `outcome` is overwritten on every
# iteration, so afterwards it holds the result for the last file read.
for file_name in objects_pd['file_name'].values[0:5]:
    # NOTE(review): read_s3_file is not defined in this notebook; its return
    # value is passed on as the page text — confirm it returns a str.
    file = read_s3_file(s3, INPUT_BUCKET, file_name)
    outcome = check_if_article_is_deleted(file)

In [17]:
# Show the classification left over from the last file checked by the loop.
# NOTE(review): the original cell text was garbled and did not parse; this is
# reconstructed from the recorded string output of the cell — confirm.
outcome

'deleted'