### Reading all file names

In [None]:
import os
# Directory holding the dataset files. The trailing '/' matters: a later cell
# builds file paths by plain string concatenation (DATA_PATH + file name).
DATA_PATH = '/home/ec2-user/SageMaker/data/refugee_dataset_v1/'

# Names (not full paths) of every entry found directly inside DATA_PATH.
files = os.listdir(path=DATA_PATH)
print(f'Number of files recovered: {len(files)}')

### Reading file IDs from Serperi examples

In [14]:
import re

SERPERI_EXAMPLE_FILE = 'refugee_DP_serperi_examples.csv'

# Read the CSV through a context manager so the handle is closed even if reading fails.
with open(SERPERI_EXAMPLE_FILE, 'r') as serperi_csv:
    lines = serperi_csv.read().splitlines()

# Drop rows that are just a comma and rows equal to 'folder,URL' (the column header).
lines = [line for line in lines if line != ',' and line != 'folder,URL']

# The file ID is the run of digits between 'docview/' and the next '/'.
# Rows without such a segment (e.g. blank lines) are skipped instead of raising
# IndexError, and '+' (rather than '*') prevents an empty string being taken as an ID.
docview_id_pattern = re.compile('docview/([0-9]+)/')
serperi_files_id = set()
for line in lines:
    id_match = docview_id_pattern.search(line)
    if id_match is not None:
        serperi_files_id.add(id_match.group(1))
print(f'Number of examples provided by Serperi: {len(serperi_files_id)}')


Number of examples provided by Serperi: 72


### Removing Serperi examples from all the files

In [16]:
# Keep every file whose ID -- the file name minus its last four characters
# (the extension, e.g. '.xml') -- is not one of the Serperi-provided examples.
remaining_files = []
for file_name in files:
    if file_name[:-4] in serperi_files_id:
        continue
    remaining_files.append(file_name)
print(f'Number of files after removing the ones provided by Serperi: {len(remaining_files)}')


Number of files after removing the ones provided by Serperi: 205466


### Randomly choosing K = 150 examples for labeling

In [67]:
import numpy as np

# FIXING SEED FOR REPRODUCIBILITY
# NOTE(review): remaining_files is derived from os.listdir(), whose ordering is
# arbitrary; the seed reproduces the same sample only while that order is stable.
rand = np.random.default_rng(42)
examples = rand.choice(remaining_files, size=150, replace=False)

#GENERATING FILE FOR EXPORTING
EXAMPLES_FOR_LABELING_FILE = '150docs_for_labeling.csv'
# One document ID per line: the sampled file name without its last four characters.
# The context manager closes (and flushes) the file even if the write raises.
with open(EXAMPLES_FOR_LABELING_FILE, 'w') as writer:
    writer.write('\n'.join([example[:-4] for example in examples]) + '\n')


#CONTENT AND SIZE OF THE FILE
# Show the first and last four sampled names, trimming the inner brackets of the
# two array reprs so they read as one list joined by ' ... '.
print(f'{str(examples[:4])[:-1]} ... {str(examples[-4:])[1:]}')

# Preview the first and last four lines of the exported file, separated by '---'.
!head -4 150docs_for_labeling.csv
!echo ---
!tail -4 150docs_for_labeling.csv
# Print the file size twice: the field after 'ec2-user ' in `ls -l` (bytes) and in `ls -hl` (human-readable).
# NOTE(review): the sed patterns anchor on the literal 'ec2-user' in the ls output, and the file name is
# hard-coded here instead of being taken from EXAMPLES_FOR_LABELING_FILE -- confirm before reusing elsewhere.
!echo Size of file to export: $(ls -l 150docs_for_labeling.csv | sed 's/.*ec2-user\ \([0-9KMB]*\).*/\1/g') Bytes \($(ls -hl 150docs_for_labeling.csv | sed 's/.*ec2-user\ \([0-9KMB\.]*\).*/\1/g')\)

['1348934186.xml' '1399153841.xml' '1432628450.xml' '1325661554.xml' ... '1519667988.xml' '1240415588.xml' '1137039205.xml' '1151743100.xml']
1348934186
1399153841
1432628450
1325661554
---
1519667988
1240415588
1137039205
1151743100
Size of file to export: 1650 Bytes (1.7K)


In [68]:
# Upload the labeling file to the results location on S3.
data_to_export = EXAMPLES_FOR_LABELING_FILE # The file and path you want to export
# IPython substitutes the Python variable for $data_to_export before running the shell command.
!aws s3 cp $data_to_export s3://pq-tdm-studio-results/tdm-ale-data/623/results/

Completed 1.6 KiB/1.6 KiB (14.9 KiB/s) with 1 file(s) remainingupload: ./150docs_for_labeling.csv to s3://pq-tdm-studio-results/tdm-ale-data/623/results/150docs_for_labeling.csv


In [70]:
import pandas as pd
from lxml import etree
from bs4 import BeautifulSoup

# Keyword list (single words and two-word phrases).
# Presumably the terms used to select or search articles -- TODO confirm;
# no cell visible in this part of the notebook reads it.
keywords = [
    'refugees', 'negros', 'southerners',
    'fugitives', 'fugitive', 'slaves',
    'displaced people', 'displaced persons',
    'stateless', 'boat people', 'asylum seekers',
]
# We define a function to get the text content that we need from the XML articles available in our dataset
def getxmlcontent(root):
    """Return the raw text content of an article, or None if there is none.

    The first ``HiddenText`` descendant is preferred. The first ``Text``
    descendant is used as a fallback when ``HiddenText`` is absent or when it
    is present but carries no text (an empty element).

    Args:
        root: A parsed XML element exposing ``find`` (such as the root
            returned by ``tree.getroot()``).

    Returns:
        The ``.text`` of the chosen element, or None when neither tag
        provides any text.
    """
    for tag in ('HiddenText', 'Text'):
        node = root.find('.//' + tag)  # one lookup per tag
        if node is not None and node.text is not None:
            return node.text
    return None
# Creating three lists to store filename, fulltext, and date
# In TDM studio - the article ID is the same as the filename
filename_list = []
text_list = []
date_list = []

# Parse files and add data to lists
for file in examples:
    tree = etree.parse(DATA_PATH + file)
    root = tree.getroot()

    # Extract the raw content once and reuse it for both the check and the parse.
    raw_content = getxmlcontent(root)
    if raw_content is not None:
        # Name the parser explicitly so the extracted text does not depend on which
        # parsers happen to be installed (lxml is already a dependency of this cell).
        soup = BeautifulSoup(raw_content, 'lxml')
        text = soup.get_text()
    else:
        # Placeholder kept so the three lists stay aligned row by row.
        text = 'Error in processing document'

    # A document without a NumericDate element gets a missing date (None)
    # instead of aborting the whole loop with AttributeError.
    date_node = root.find('.//NumericDate')
    date = date_node.text if date_node is not None else None

    filename_list.append(file)
    text_list.append(text)
    date_list.append(date)
# Assemble the three parallel lists into one table: one row per parsed article,
# one column per list. The bare `df` on the last line renders it as the cell output.
df = pd.DataFrame(
    data={
        'Article ID': filename_list,
        'Text': text_list,
        'Date': date_list,
    }
)
df

Unnamed: 0,Article ID,Text,Date
0,1348934186.xml,"\n\n\n \n\n\n\n1HE 10R0NT0 SIAII Wednesday, De...",1999-12-08
1,1399153841.xml,"\n\n\n \n\n\n\nTORONTO STAR. TUESDAY, MAY 25, ...",1982-05-25
2,1432628450.xml,\n\n\n \n\n\n\n1 26 Classified Want Ad Headqua...,1943-09-03
3,1325661554.xml,\n\n\n \n\n\n\nSTATE ECONOMY MUST BE AIMED AT ...,1943-05-12
4,1356320334.xml,\n\n\n \n\n\n\nNOTES AND COMMENTS\n\n\nJudging...,1903-11-20
...,...,...,...
145,2032180012.xml,NEWSBest of today? INTERACTIVEAdopting Toronto...,2015-09-22
146,1519667988.xml,\n\n\n \n\n\n\nTHE SIAMESE COURT\n\n\nTHE ENGL...,1870-12-29
147,1240415588.xml,\n\n\n \n\n\n\nMan of the year\n\n\nHow you co...,1972-01-07
148,1137039205.xml,\n\n\n \n\n\n\nBooksellers' Selection Advertis...,1996-10-26


In [3]:
# Update the conda package itself, targeting the environment named 'base' (-n base)
# and answering yes to all prompts (-y).
!conda update -y -n base conda

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/ec2-user/anaconda3/envs/JupyterSystemEnv

  added / updated specs:
    - conda


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    conda-4.10.3               |   py36h5fab9bb_2         3.1 MB  conda-forge
    cryptography-35.0.0        |   py36hb60f036_0         1.5 MB  conda-forge
    glib-2.70.0                |       h780b84a_1         430 KB  conda-forge
    glib-tools-2.70.0          |       h780b84a_1         107 KB  conda-forge
    gst-plugins-base-1.18.5    |       hf529b03_1         2.6 MB  conda-forge
    gstreamer-1.18.5           |       h9f60fe5_1         2.0 MB  conda-forge
    icu-68.2                   |       h9c3ff4c_0        13.1 MB  conda-forge
    jinja2-3.0.2               |     pyhd8ed1ab_0          99 KB  conda-forge
    lerc-3.0    

In [5]:
import spacy
# Load the spaCy pipeline named 'en_core_web_sm'.
# NOTE(review): the recorded output of this cell is ModuleNotFoundError (spaCy was not
# importable in this kernel), and the only model download visible in this notebook is for
# 'en_core_web_lg', not 'en_core_web_sm' -- confirm which model is intended.
nlp = spacy.load('en_core_web_sm')

ModuleNotFoundError: No module named 'spacy'

In [None]:
# Download the 'en_core_web_lg' model through the spaCy command-line interface.
# NOTE(review): an earlier cell loads 'en_core_web_sm', which this command does not install,
# and `python3` on the shell PATH may not be the interpreter backing this kernel -- confirm both.
!python3 -m spacy download en_core_web_lg