### Reading the filenames for the whole database (205,538 files)

In [1]:
import os
# Directory holding the XML article files of the dataset.
data_path = '/home/ec2-user/SageMaker/data/refugee_dataset_v1/'

# Every directory entry is treated as one article file.
files = os.listdir(data_path)

# Report the number of files (with thousands separator), then preview the
# first five names on the following line.
print(
    f'Cantidad de archivos: {len(files):,}',
    files[:5],
    sep='\n',
)

Cantidad de archivos: 205,538
['1352134238.xml', '1400269650.xml', '1323706441.xml', '1527231392.xml', '1335009129.xml']


### Reading XML files to retrieve PublisherName (Globe and Mail or Toronto Star)

In [3]:
import re

# Extract the publisher name of every file, in the same order as `files`.
# The pattern is compiled once and is non-greedy, so it stops at the first
# closing tag even if several elements share one line.
publisher_pattern = re.compile(r'<PublisherName>(.*?)</PublisherName>')

publishers = []
for file in files:
    # Use a context manager so each file handle is closed right away instead
    # of leaving one open handle per file for the garbage collector.
    with open(os.path.join(data_path, file)) as xml_file:
        match = publisher_pattern.search(xml_file.read())
    if match is None:
        # Fail with the offending filename rather than an AttributeError on None.
        raise ValueError(f'No <PublisherName> element found in {file}')
    publishers.append(match.group(1))


### Randomly choosing 20 articles from each publisher

In [4]:
import numpy as np
import pandas as pd

# (index, filename, publisher) triples. Not used further in this cell; kept
# because other cells may rely on it.
data = list(zip(range(len(files)), files, publishers))

# Split the filenames by publisher. Every file whose publisher is not exactly
# 'The Globe and Mail' is counted as Toronto Star.
# NOTE(review): this assumes the dataset contains only these two publishers.
toronto_star = []
globe_and_mail = []
for file, publisher in zip(files, publishers):
    if publisher == 'The Globe and Mail':
        globe_and_mail.append(file)
    else:
        toronto_star.append(file)
print(f'Toronto Star    count: {len(toronto_star):6,}')
print(f'Globe and Mail  count: {len(globe_and_mail):6,}')


# FIXING SEED FOR REPRODUCIBILITY
# os.listdir() returns entries in arbitrary order, so a fixed seed alone does
# not pin the sample: the candidates are sorted first so that the same seed
# always selects the same files, whatever order the filesystem reports.
SAMPLE_SIZE = 20
rand = np.random.default_rng(42)
examples_GM = rand.choice(sorted(globe_and_mail), size=SAMPLE_SIZE, replace=False)
examples_TS = rand.choice(sorted(toronto_star), size=SAMPLE_SIZE, replace=False)

# One row per sampled file: Globe and Mail rows first, then Toronto Star.
df = pd.DataFrame(np.concatenate([examples_GM, examples_TS]), columns=['file_id'])

Toronto Star    count: 97,336
Globe and Mail  count: 108,202


In [9]:
from lxml import etree


# Title text of every Toronto Star file, in the same order as `toronto_star`.
# Each file is parsed and the text of its first <Title> element (searched at
# any depth below the root) is collected.
toronto_star_titles = [
    etree.parse(data_path + filename).getroot().find('.//Title').text
    for filename in toronto_star
]


In [15]:
# Tally the Toronto Star titles that contain the substring "Page".
count = sum(1 for title in toronto_star_titles if "Page" in title)

# The second line prints the share as a fraction between 0 and 1.
print(f'Number of Full Pages in Toronto Star {count}')        
print(f'Percentage of Full Pages in Toronto Star {count/len(toronto_star_titles)}')

Number of Full Pages in Toronto Star 97336
Percentage of Full Pages in Toronto Star 1.0


In [16]:
# Number of Toronto Star titles collected; this is the denominator of the
# share printed in the previous cell.
len(toronto_star_titles)

97336

### Reading XML file (retrieving title, date, text, publisher)

In [65]:

from lxml import etree
from bs4 import BeautifulSoup

# We define a function to get the text content that we need from the XML articles available in our dataset
def getxmlcontent(root):
    """Return the article body stored in a parsed XML record.

    The first <HiddenText> element below ``root`` takes precedence; when no
    such element exists, the first <Text> element is used instead.

    Parameters
    ----------
    root : element
        Root element of the parsed article; must provide ``find``.

    Returns
    -------
    str or None
        The ``.text`` of the chosen element (which can itself be ``None`` for
        an empty element), or ``None`` when neither element is present.
    """
    for xpath in ('.//HiddenText', './/Text'):
        node = root.find(xpath)
        if node is not None:
            # The first element found wins, even if its text is empty.
            return node.text
    return None
    
# Lists collecting one value per sampled article: filename, full text, date,
# title, publisher and ProQuest URL.
# In TDM studio - the article ID is the same as the filename
filename_list = []
text_list = []
date_list = []
title_list = []
publisher_list = []
url_list = []

# Parse files and add data to lists
for file in df['file_id']:
    tree = etree.parse(os.path.join(data_path, file))
    root = tree.getroot()

    # Look the body up once. It is run through BeautifulSoup so that only the
    # text remains. The parser is named explicitly: without it bs4 picks the
    # "best" installed parser (lxml when available, as it is here) and emits a
    # GuessedAtParserWarning, so results could differ between environments.
    content = getxmlcontent(root)
    if content is not None:
        text = BeautifulSoup(content, 'lxml').get_text()
    else:
        text = 'Error in processing document'

    date = root.find('.//NumericDate').text
    title = root.find('.//Title').text
    publisher = root.find('.//PublisherName').text

    filename_list.append(file)
    text_list.append(text)
    date_list.append(date)
    publisher_list.append(publisher)
    title_list.append(title)
    # The document id is the filename without its extension; splitext strips
    # the extension whatever its length instead of assuming four characters.
    url_list.append('https://www.proquest.com/docview/' + os.path.splitext(file)[0])

# Attach the collected fields to the sample frame (rows are in the same order
# as df['file_id'], which drove the loop above).
df['Title'] = title_list
df['Date'] = date_list
df['Publisher'] = publisher_list
# Newlines become <br> so paragraphs survive in the HTML export.
df['Text'] =  [text.replace('\n', '<br>') for text in text_list]
df['URL'] = [f'<a href={url}>{url}</a>' for url in url_list]

# escape=False keeps the <br> and <a> markup as real HTML in the output.
# The first file holds all columns; the second only file id and publisher.
df.to_html('for analyzing_locally.html',escape=False)
df[["file_id", "Publisher"]].to_html("for_export.html",escape=False)