In [2]:
import os
import numpy as np

files = os.listdir('predictions/')
ids = np.array([int(id_[:-2]) for id_ in files])
files = [f'predictions/{file_}' for file_ in files]
print(files[:3])
print(ids[:3])

['predictions/1287607959.p', 'predictions/1287343100.p', 'predictions/1288797774.p']
[1287607959 1287343100 1288797774]


### Run only onces (takes a lot of time)

In [3]:
import pandas as pd
import numpy as np
import pickle

predictions = np.zeros(shape=(len(files),3))

for idx,file_ in enumerate(files):
    predictions[idx,:]=pickle.load(open(file_,'rb'))
predictions

array([[2.03386001e-01, 7.14730674e-03, 2.72858794e-01],
       [6.10242114e-01, 3.58896992e-01, 4.46094272e-01],
       [1.70181899e-01, 1.76092343e-02, 2.67719784e-02],
       ...,
       [1.46156590e-01, 1.05522829e-02, 3.63079231e-02],
       [3.39778976e-01, 6.99874202e-02, 7.55072712e-03],
       [2.19300533e-03, 2.53532929e-04, 1.83536192e-03]])

In [4]:
mask1 = predictions[:,0]>0.5
mask2 = predictions[:,1]>0.5
mask3 = predictions[:,2]>0.5

predictions_rel = predictions[mask1 & mask2 & mask3,:]
ids_rel = ids[mask1&mask2&mask3]
predictions_rel.shape

assert ids_rel.shape[0]==predictions_rel.shape[0], '{} != {}'.format(ids_rel.shape,predictions_rel.shape[0])

In [5]:
average_values = np.average(predictions_rel,axis=1)
average_values

array([0.78219333, 0.90152124, 0.71454707, ..., 0.87624631, 0.88951657,
       0.60048552])

In [6]:
df = pd.DataFrame(
                  np.hstack([ids_rel[:,np.newaxis], predictions_rel,average_values[:,np.newaxis]]),
                  columns = ['id', 'yhat_1', 'yhat_2', 'yhat_3', 'average']
                 )
df

Unnamed: 0,id,yhat_1,yhat_2,yhat_3,average
0,1.288526e+09,0.516935,0.979761,0.849884,0.782193
1,1.270251e+09,0.791592,0.982341,0.930631,0.901521
2,1.290632e+09,0.513273,0.693918,0.936450,0.714547
3,1.287219e+09,0.644008,0.977683,0.979506,0.867066
4,1.325800e+09,0.508568,0.887852,0.944473,0.780298
...,...,...,...,...,...
66231,1.288858e+09,0.650216,0.982210,0.985291,0.872572
66232,1.287548e+09,0.759739,0.704049,0.968284,0.810691
66233,1.282773e+09,0.632152,1.000000,0.996587,0.876246
66234,1.291218e+09,0.806665,0.984518,0.877366,0.889517


In [7]:
df = df.sort_values(by=['average'],ascending=False)
df

Unnamed: 0,id,yhat_1,yhat_2,yhat_3,average
42312,1.291354e+09,0.999994,0.999999,0.999992,0.999995
54005,1.325811e+09,0.999999,1.000000,0.997352,0.999117
39434,1.323413e+09,0.996536,0.999997,1.000000,0.998844
47401,1.291626e+09,0.996256,0.999999,0.999985,0.998747
63418,1.287325e+09,0.993114,0.999990,1.000000,0.997701
...,...,...,...,...,...
53227,1.287809e+09,0.522059,0.524707,0.505530,0.517432
47762,1.270467e+09,0.512565,0.523660,0.512991,0.516405
34657,1.287362e+09,0.506678,0.529410,0.509246,0.515111
34890,1.289132e+09,0.516377,0.509698,0.515960,0.514012


In [8]:
source = np.array(['GM1' if os.path.isfile(f'/home/ec2-user/SageMaker/data/GM_all_1945_1956/{int(id_)}.xml') else 'GM2' for id_ in df['id'] ])
df['source'] = source
df

Unnamed: 0,id,yhat_1,yhat_2,yhat_3,average,source
42312,1.291354e+09,0.999994,0.999999,0.999992,0.999995,GM1
54005,1.325811e+09,0.999999,1.000000,0.997352,0.999117,GM1
39434,1.323413e+09,0.996536,0.999997,1.000000,0.998844,GM1
47401,1.291626e+09,0.996256,0.999999,0.999985,0.998747,GM1
63418,1.287325e+09,0.993114,0.999990,1.000000,0.997701,GM1
...,...,...,...,...,...,...
53227,1.287809e+09,0.522059,0.524707,0.505530,0.517432,GM1
47762,1.270467e+09,0.512565,0.523660,0.512991,0.516405,GM2
34657,1.287362e+09,0.506678,0.529410,0.509246,0.515111,GM1
34890,1.289132e+09,0.516377,0.509698,0.515960,0.514012,GM1


In [10]:
from bs4 import BeautifulSoup
from lxml import etree

def get_title_and_text(filename):
    tree = etree.parse(filename)
    root = tree.getroot()
    if root.find('.//HiddenText') is not None:
        text = (root.find('.//HiddenText').text)

    elif root.find('.//Text') is not None:
        text = (root.find('.//Text').text)

    else:
        text = None
                       
    title = root.find('.//Title')
    if title is not None:
        title = title.text
    if not text is None:
        text = BeautifulSoup(text, parser='html.parser').get_text()

    return title,text

In [20]:
[loc for loc in locations if 'toronto' in loc]

['îles de toronto',
 'lac toronto',
 'new toronto',
 'north toronto',
 'toronto',
 'toronto bay',
 'toronto creek',
 'toronto gore',
 'toronto harbour',
 'toronto island',
 'toronto islands',
 'toronto islets',
 'toronto lake']

In [None]:
GM1 = '/home/ec2-user/SageMaker/data/GM_all_1945_1956/'
GM2 = '/home/ec2-user/SageMaker/data/GM_all_1957-1967/'

locations = open('loc2.csv','r').read().splitlines()
locations = [loc.lower() for loc in locations]

titles = []
inCanada = []
for idx in range(df.shape[0]):
    id_ = df.iloc[idx,0]
    source = df.iloc[idx,-1]
    file_ = str(int(id_))+'.xml'
    file_ = GM1+file_ if source=='GM1' else GM2+file_
    assert os.path.isfile(file_), f'{file_}, {source}'
    title, text = get_title_and_text(file_)
    inCanada.append(any([loc in text for loc in locations]))
    
    titles.append(title)
titles = np.array(titles)
titles

In [17]:
id_in_canada_list = list(zip(df['id'],inCanada))
pickle.dump(id_in_canada_list, open('cache/id_in_canada_list.p', 'wb'))
id_in_canada_list[:3]

[(1291354226.0, True), (1325811105.0, True), (1323412512.0, True)]

In [19]:
np.sum(np.array(inCanada))

66098

In [18]:
df['title'] = titles
df['inCanada'] = inCanada
df

Unnamed: 0,id,yhat_1,yhat_2,yhat_3,average,source,title,inCanada
42312,1.291354e+09,0.999994,0.999999,0.999992,0.999995,GM1,Missing Persons,True
54005,1.325811e+09,0.999999,1.000000,0.997352,0.999117,GM1,McNarney Urges Jews Resettled in Palestine,True
39434,1.323413e+09,0.996536,0.999997,1.000000,0.998844,GM1,Refugees,True
47401,1.291626e+09,0.996256,0.999999,0.999985,0.998747,GM1,"Searching for Freedom, 55 DP's Reach Toronto",True
63418,1.287325e+09,0.993114,0.999990,1.000000,0.997701,GM1,Mother Guilty,True
...,...,...,...,...,...,...,...,...
53227,1.287809e+09,0.522059,0.524707,0.505530,0.517432,GM1,Jehovah Witnesses To Hold Assembly,True
47762,1.270467e+09,0.512565,0.523660,0.512991,0.516405,GM2,Mental care advised,True
34657,1.287362e+09,0.506678,0.529410,0.509246,0.515111,GM1,One Hundred Years Ago From The Globe Files,True
34890,1.289132e+09,0.516377,0.509698,0.515960,0.514012,GM1,Retread,True


In [105]:
df['id'] = df['id'].astype('int')
df

Unnamed: 0,id,yhat_1,yhat_2,yhat_3,average,source,title
42312,1291354226,0.999994,0.999999,0.999992,0.999995,GM1,Missing Persons
54005,1325811105,0.999999,1.000000,0.997352,0.999117,GM1,McNarney Urges Jews Resettled in Palestine
39434,1323412512,0.996536,0.999997,1.000000,0.998844,GM1,Refugees
47401,1291626059,0.996256,0.999999,0.999985,0.998747,GM1,"Searching for Freedom, 55 DP's Reach Toronto"
63418,1287325254,0.993114,0.999990,1.000000,0.997701,GM1,Mother Guilty
...,...,...,...,...,...,...,...
53227,1287808967,0.522059,0.524707,0.505530,0.517432,GM1,Jehovah Witnesses To Hold Assembly
47762,1270466754,0.512565,0.523660,0.512991,0.516405,GM2,Mental care advised
34657,1287362299,0.506678,0.529410,0.509246,0.515111,GM1,One Hundred Years Ago From The Globe Files
34890,1289131751,0.516377,0.509698,0.515960,0.514012,GM1,Retread


In [107]:
df['url'] = np.array([f'https://proquest.com/docview/{id_}' for id_ in df['id']])
df

Unnamed: 0,id,yhat_1,yhat_2,yhat_3,average,source,title,url
42312,1291354226,0.999994,0.999999,0.999992,0.999995,GM1,Missing Persons,https://proquest.com/docview/1291354226
54005,1325811105,0.999999,1.000000,0.997352,0.999117,GM1,McNarney Urges Jews Resettled in Palestine,https://proquest.com/docview/1325811105
39434,1323412512,0.996536,0.999997,1.000000,0.998844,GM1,Refugees,https://proquest.com/docview/1323412512
47401,1291626059,0.996256,0.999999,0.999985,0.998747,GM1,"Searching for Freedom, 55 DP's Reach Toronto",https://proquest.com/docview/1291626059
63418,1287325254,0.993114,0.999990,1.000000,0.997701,GM1,Mother Guilty,https://proquest.com/docview/1287325254
...,...,...,...,...,...,...,...,...
53227,1287808967,0.522059,0.524707,0.505530,0.517432,GM1,Jehovah Witnesses To Hold Assembly,https://proquest.com/docview/1287808967
47762,1270466754,0.512565,0.523660,0.512991,0.516405,GM2,Mental care advised,https://proquest.com/docview/1270466754
34657,1287362299,0.506678,0.529410,0.509246,0.515111,GM1,One Hundred Years Ago From The Globe Files,https://proquest.com/docview/1287362299
34890,1289131751,0.516377,0.509698,0.515960,0.514012,GM1,Retread,https://proquest.com/docview/1289131751


In [109]:
df[['id', 'yhat_1', 'yhat_2', 'yhat_3', 'average', 'url']].to_csv('sorted_list.csv')

In [110]:
!head sorted_list.csv

,id,yhat_1,yhat_2,yhat_3,average,url
42312,1291354226,0.9999943943443248,0.9999985711204106,0.9999924378133236,0.9999951344260197,https://proquest.com/docview/1291354226
54005,1325811105,0.9999993687950708,0.9999999883554902,0.9973516852054988,0.9991170141186866,https://proquest.com/docview/1325811105
39434,1323412512,0.9965357970652695,0.9999971270208927,0.9999999788854788,0.9988443009905469,https://proquest.com/docview/1323412512
47401,1291626059,0.9962558483823435,0.9999993369046168,0.9999846961970227,0.9987466271613276,https://proquest.com/docview/1291626059
63418,1287325254,0.9931136914080131,0.9999903696695294,0.9999999507254229,0.9977013372676552,https://proquest.com/docview/1287325254
53115,1282696478,0.993088079142227,0.9999988650745111,0.9999983495142334,0.9976950979103237,https://proquest.com/docview/1282696478
20192,1325855906,0.9962782822618911,0.9999957505736143,0.9962210620750738,0.9974983649701931,https://proquest.com/docview/1325855906
44043,1325917865,0.99999670169437

In [62]:
print(f'{average_values[42312]:4.3f}')

1.000


In [63]:
ids[53341]

1351150227

In [67]:
get_title_and_text('/home/ec2-user/SageMaker/data/GM_all_1945_1956/1351150227.xml')[0]

'Other 15 -- No Title'

In [64]:
ids[42312]

1287662836

In [68]:
get_title_and_text('/home/ec2-user/SageMaker/data/GM_all_1945_1956/1287662836.xml')[0]

'Auto Economy'

In [65]:
!ls /home/ec2-user/SageMaker/data/GM_all_1945_1956/1287662836.xml

/home/ec2-user/SageMaker/data/GM_all_1945_1956/1287662836.xml


In [66]:
!ls /home/ec2-user/SageMaker/data/GM_all_1945_1956/1351150227.xml

/home/ec2-user/SageMaker/data/GM_all_1945_1956/1351150227.xml


In [45]:
!find /home/ec2-user/SageMaker/data/GM_DP_and_Canada1945_1967/ -name 13236583.xml