In [1]:
import os
import numpy as np

files = os.listdir('predictions/')
ids = np.array([int(id_[:-2]) for id_ in files])
files = [f'predictions/{file_}' for file_ in files]
print(files[:3])
print(ids[:3])

['predictions/1287607959.p', 'predictions/1287343100.p', 'predictions/1288797774.p']
[1287607959 1287343100 1288797774]


In [2]:
import pandas as pd
import numpy as np
import pickle

predictions = np.zeros(shape=(len(files),3))

for idx,file_ in enumerate(files):
    predictions[idx,:]=pickle.load(open(file_,'rb'))
predictions

array([[2.03386001e-01, 7.14730674e-03, 2.72858794e-01],
       [6.10242114e-01, 3.58896992e-01, 4.46094272e-01],
       [1.70181899e-01, 1.76092343e-02, 2.67719784e-02],
       ...,
       [1.46156590e-01, 1.05522829e-02, 3.63079231e-02],
       [3.39778976e-01, 6.99874202e-02, 7.55072712e-03],
       [2.19300533e-03, 2.53532929e-04, 1.83536192e-03]])

In [3]:
mask1 = predictions[:,0]>0.5
mask2 = predictions[:,1]>0.5
mask3 = predictions[:,2]>0.5

predictions_rel = predictions[mask1 & mask2 & mask3,:]
ids_rel = ids[mask1&mask2&mask3]
predictions_rel.shape

assert ids_rel.shape[0]==predictions_rel.shape[0], '{} != {}'.format(ids_rel.shape,predictions_rel.shape[0])

In [4]:
average_values = np.average(predictions_rel,axis=1)
average_values

array([0.78219333, 0.90152124, 0.71454707, ..., 0.87624631, 0.88951657,
       0.60048552])

In [5]:
df = pd.DataFrame(
                  np.hstack([ids_rel[:,np.newaxis], predictions_rel,average_values[:,np.newaxis]]),
                  columns = ['id', 'yhat_1', 'yhat_2', 'yhat_3', 'average']
                 )
df

Unnamed: 0,id,yhat_1,yhat_2,yhat_3,average
0,1.288526e+09,0.516935,0.979761,0.849884,0.782193
1,1.270251e+09,0.791592,0.982341,0.930631,0.901521
2,1.290632e+09,0.513273,0.693918,0.936450,0.714547
3,1.287219e+09,0.644008,0.977683,0.979506,0.867066
4,1.325800e+09,0.508568,0.887852,0.944473,0.780298
...,...,...,...,...,...
66231,1.288858e+09,0.650216,0.982210,0.985291,0.872572
66232,1.287548e+09,0.759739,0.704049,0.968284,0.810691
66233,1.282773e+09,0.632152,1.000000,0.996587,0.876246
66234,1.291218e+09,0.806665,0.984518,0.877366,0.889517


In [9]:
df = df.sort_values(by=['average'],ascending=False)
df

Unnamed: 0,id,yhat_1,yhat_2,yhat_3,average,source
42312,1.291354e+09,0.999994,0.999999,0.999992,0.999995,GM1
54005,1.325811e+09,0.999999,1.000000,0.997352,0.999117,GM1
39434,1.323413e+09,0.996536,0.999997,1.000000,0.998844,GM1
47401,1.291626e+09,0.996256,0.999999,0.999985,0.998747,GM1
63418,1.287325e+09,0.993114,0.999990,1.000000,0.997701,GM1
...,...,...,...,...,...,...
53227,1.287809e+09,0.522059,0.524707,0.505530,0.517432,GM1
47762,1.270467e+09,0.512565,0.523660,0.512991,0.516405,GM2
34657,1.287362e+09,0.506678,0.529410,0.509246,0.515111,GM1
34890,1.289132e+09,0.516377,0.509698,0.515960,0.514012,GM1


In [7]:
source = np.array(['GM1' if os.path.isfile(f'/home/ec2-user/SageMaker/data/GM_all_1945_1956/{int(id_)}.xml') else 'GM2' for id_ in df['id'] ])
df['source'] = source
df

Unnamed: 0,id,yhat_1,yhat_2,yhat_3,average,source
42312,1.291354e+09,0.999994,0.999999,0.999992,0.999995,GM1
54005,1.325811e+09,0.999999,1.000000,0.997352,0.999117,GM1
39434,1.323413e+09,0.996536,0.999997,1.000000,0.998844,GM1
47401,1.291626e+09,0.996256,0.999999,0.999985,0.998747,GM1
63418,1.287325e+09,0.993114,0.999990,1.000000,0.997701,GM1
...,...,...,...,...,...,...
53227,1.287809e+09,0.522059,0.524707,0.505530,0.517432,GM1
47762,1.270467e+09,0.512565,0.523660,0.512991,0.516405,GM2
34657,1.287362e+09,0.506678,0.529410,0.509246,0.515111,GM1
34890,1.289132e+09,0.516377,0.509698,0.515960,0.514012,GM1


In [10]:
from bs4 import BeautifulSoup
from lxml import etree

def get_title_and_text(filename):
    tree = etree.parse(filename)
    root = tree.getroot()
    if root.find('.//HiddenText') is not None:
        text = (root.find('.//HiddenText').text)

    elif root.find('.//Text') is not None:
        text = (root.find('.//Text').text)

    else:
        text = None
                       
    title = root.find('.//Title')
    if title is not None:
        title = title.text
    if not text is None:
        text = BeautifulSoup(text, parser='html.parser').get_text()

    return title,text

In [None]:
import spacy
import string
nlp = spacy.load('en_core_web_sm', disable=['textcat','lemmatizer', 'parser', 'tagger','ner'])

def remove_punctuation(word):
    return ''.join([char for char in word if not char in string.punctuation+' '])

def tokenize(str_):
    tokens = [word.text for word in nlp(str_) if not word.is_stop]
    tokens = [word.replace('\n', '') for word in tokens if not word.isnumeric() and len(remove_punctuation(word))!=0]
    return tokens

from tqdm import tqdm
GM1 = '/home/ec2-user/SageMaker/data/GM_all_1945_1956/'
GM2 = '/home/ec2-user/SageMaker/data/GM_all_1957-1967/'

locations = open('cities.txt','r').read().splitlines()
locations += open('provinces.txt','r').read().splitlines()
locations = [loc for loc in locations]


titles = []
inCanada = []
for idx in tqdm(range(df.shape[0])):
    id_ = df.iloc[idx,:]['id']
    source = df.iloc[idx,:]['source']
    file_ = str(int(id_))+'.xml'
    file_ = GM1+file_ if source=='GM1' else GM2+file_
    assert os.path.isfile(file_), f'{file_}, {source}'
    title, text = get_title_and_text(file_)
    tokens = set(tokenize(text))
    inCanada.append(any([loc in tokens for loc in locations]))
    
    titles.append(title)
    
    
df['title'] = titles
df['inCanada'] = inCanada
df.to_csv('cache/df.csv')

# titles = np.array(titles)
print(f'{np.sum(np.array(inCanada))} of {len(inCanada)} ')

  2%|▏         | 1439/66236 [00:34<19:35, 55.12it/s]

In [14]:
import pandas as pd
df = pd.read_csv('cache/df.csv')
print(df.shape)
df

(66236, 9)


Unnamed: 0.1,Unnamed: 0,id,yhat_1,yhat_2,yhat_3,average,source,title,inCanada
0,42312,1.291354e+09,0.999994,0.999999,0.999992,0.999995,GM1,Missing Persons,True
1,54005,1.325811e+09,0.999999,1.000000,0.997352,0.999117,GM1,McNarney Urges Jews Resettled in Palestine,False
2,39434,1.323413e+09,0.996536,0.999997,1.000000,0.998844,GM1,Refugees,True
3,47401,1.291626e+09,0.996256,0.999999,0.999985,0.998747,GM1,"Searching for Freedom, 55 DP's Reach Toronto",True
4,63418,1.287325e+09,0.993114,0.999990,1.000000,0.997701,GM1,Mother Guilty,True
...,...,...,...,...,...,...,...,...,...
66231,53227,1.287809e+09,0.522059,0.524707,0.505530,0.517432,GM1,Jehovah Witnesses To Hold Assembly,True
66232,47762,1.270467e+09,0.512565,0.523660,0.512991,0.516405,GM2,Mental care advised,True
66233,34657,1.287362e+09,0.506678,0.529410,0.509246,0.515111,GM1,One Hundred Years Ago From The Globe Files,True
66234,34890,1.289132e+09,0.516377,0.509698,0.515960,0.514012,GM1,Retread,False


In [31]:
# df['title'] = titles
# df['inCanada'] = inCanada
# df.to_csv('cache/df.csv')

In [15]:
df['url'] = np.array([f'https://proquest.com/docview/{id_}' for id_ in df['id']])

df.loc[df['inCanada']==True,:].shape

(37693, 10)

In [18]:
df_to_export = df.loc[df['inCanada']==True,:]

In [51]:
import re
relevant_ids = set()
for dirpath, dirnames, filenames in os.walk('DP-relevant articles/'):
    for filename in filenames:
        file_ = os.path.join(dirpath,filename)
        if file_.endswith('.bib'):
            content = open(file_,'r').read()
            relevant_ids = relevant_ids.union(set(re.findall('/docview/([0-9]*)/',content)))

'1291446397' in (relevant_ids)

True

In [55]:
df_to_export

Unnamed: 0.1,Unnamed: 0,id,yhat_1,yhat_2,yhat_3,average,source,title,inCanada,url
0,42312,1291354226,0.999994,0.999999,0.999992,0.999995,GM1,Missing Persons,True,https://proquest.com/docview/1291354226.0
2,39434,1323412512,0.996536,0.999997,1.000000,0.998844,GM1,Refugees,True,https://proquest.com/docview/1323412512.0
3,47401,1291626059,0.996256,0.999999,0.999985,0.998747,GM1,"Searching for Freedom, 55 DP's Reach Toronto",True,https://proquest.com/docview/1291626059.0
4,63418,1287325254,0.993114,0.999990,1.000000,0.997701,GM1,Mother Guilty,True,https://proquest.com/docview/1287325254.0
7,44043,1325917865,0.999997,0.991729,0.999998,0.997241,GM1,Displaced Persons,True,https://proquest.com/docview/1325917865.0
...,...,...,...,...,...,...,...,...,...,...
66228,35851,1325875743,0.517504,0.521094,0.520300,0.519633,GM1,Christmas Carollers Entertain Patients,True,https://proquest.com/docview/1325875743.0
66230,58933,1316351702,0.510385,0.512833,0.531182,0.518134,GM2,Workers may quit Metro bridges to get improved...,True,https://proquest.com/docview/1316351702.0
66231,53227,1287808967,0.522059,0.524707,0.505530,0.517432,GM1,Jehovah Witnesses To Hold Assembly,True,https://proquest.com/docview/1287808967.0
66232,47762,1270466754,0.512565,0.523660,0.512991,0.516405,GM2,Mental care advised,True,https://proquest.com/docview/1270466754.0


In [59]:
writer = open('to_export.csv', 'w')
for idx in range(df_to_export.shape[0]):
    _,id_,yhat1,yhat2,yhat3,average,_,title,_,_ = df_to_export.iloc[idx,:]
    if not str(int(id_)) in relevant_ids:
        str_ = f'{id_},{title}\n'
        writer.write(str_)
writer.close()
!cat to_export.csv | wc -l

37690


In [50]:
!cat to_export.csv | wc -l

37693


In [60]:
!head to_export.csv
!ls -lh to_export.csv 

1291354226,Missing Persons
1323412512,Refugees
1291626059,Searching for Freedom, 55 DP's Reach Toronto
1287325254,Mother Guilty
1325917865,Displaced Persons
1314013378,Winnipeg Evacuation Scenes
1323636046,Displaced Jews Are Returned to Camps
1270411098,Foster mother fights for sisters
1284671825,Selective Immigration
1291335626,Aliens Seized in Toronto On Illegal Entry Count Lose Deportation Plea
-rw-rw-r-- 1 ec2-user ec2-user 1.7M Mar  1 20:44 to_export.csv


In [47]:
set(df_to_export['id'].astype('U')).intersection(relevant_ids)

{'1287189092', '1291295108', '1325698070'}

In [49]:
[ (idx,title) for idx,title in enumerate(df_to_export['title']) if 'alien' in title.lower()]

[(9, 'Aliens Seized in Toronto On Illegal Entry Count Lose Deportation Plea'),
 (93, 'Tried Suicide 3 Times, Alienation Jury Told'),
 (855, 'False Passport Aliens Freed on $1,000 Bail Each'),
 (928, 'Urge Conference Not to Alienate Enslaved Peoples'),
 (1027, 'Doctors, Painters Among 800 Aliens Through Falls'),
 (1087, 'U.S. Clamps Down on Alien Visas'),
 (3327, 'Alienation Suit'),
 (4426, 'Alienation Action'),
 (4616, 'Woman Ruled Hostile, Detained During Recess At Alienation Hearing'),
 (5440, 'Dionne Blames Intruders For Alienating Quintuplets'),
 (5709, '3 Officials Fired After 75 Aliens Enter Illegally'),
 (5930, 'Alienation Case Postponed As Ukrainian Doctor Missing'),
 (7652, 'Alien Smuggler, Toronto Citizen Given 10 Months'),
 (8098, 'Alienation Defendant Denies Improper Acts'),
 (9846, 'States Ontario Centre For Smuggling Aliens'),
 (10848, 'Canada Will Bar Alien Travellers Unles Visas OK'),
 (12604, 'Charge Customs Man Smuggled Seven Aliens Into U.S. for $100 Each'),
 (14762,