### Label data

In [1]:
import os
import re
from utils.general import id2file
from utils.general import info,ok,warning


# SOURCE #1 (second round labeling, relevant and irrelevant data)
new_data = [line.split(';') for line in open('new_data.csv').read().splitlines()]
labeled_data = set([id_ for id_,_ in new_data])

# SOURCE #2 (first round labeling, only relevant)
DP_examples_dirpath = '/home/ec2-user/SageMaker/mariano/notebooks/04. Model of DP/DP-relevant articles/'
first_data = []
for dirpath, dirnames, filenames in os.walk(DP_examples_dirpath):
    for filename in filenames:
        content = open(os.path.join(dirpath,filename),'r').read()
        ids = re.findall('/docview/([^/]*)/',content)
        labeled_data = labeled_data.union(set(ids))
    
# SOURCE #3 (irrelevant data from first round labeling)
GM_dp_dirpath = '/home/ec2-user/SageMaker/data/GM_DP_and_Canada1945_1967/'
files = os.listdir(GM_dp_dirpath)
labeled_data = labeled_data.union([file_[:-4] for file_ in files if file_[:-4] not in labeled_data and file_.endswith('.xml')])

# Removing non existing files
not_found=[]
for id_ in list(labeled_data):
    if id2file(id_) is None:
        not_found.append(id_)
print(f'Not found: {not_found}')
for id_ in not_found:
    labeled_data = labeled_data.difference(set(not_found))
    
info(f'len(labeled_data)   = {len(labeled_data)}')


Not found: ['1242257052', '1222379804', '1238204962', '1143160388', '2459666609', '2122279956', '1239753620', '1238440920', '1411697642', '1151348424', '1136691129', '2122281371', '2459964104']
2022-03-11 12:45:21.110699 [ [1;94mINFO[0m  ] len(labeled_data)   = 7020


In [2]:
from utils.general import info,ok,warning
import os

predictions_dirpath = './predictions/'

files = os.listdir(predictions_dirpath)
total_predictions = len([file_ for file_ in files if file_.endswith('_v2.p')])

info(f"Total of files with predictions:                     {total_predictions:,}")
files = [file_ for file_ in files if file_.endswith('_v2.p') if not file_[:-5] in labeled_data]
info(f"Number of prediction ignoring already labeled data = {len(files):,}  (-{total_predictions-len(files):,})")

2022-03-11 12:45:23.752500 [ [1;94mINFO[0m  ] Total of files with predictions:                     2,051,869
2022-03-11 12:45:24.444366 [ [1;94mINFO[0m  ] Number of prediction ignoring already labeled data = 2,044,861  (-7,008)


In [3]:
import pandas as pd
import pickle
import numpy as np
from utils.tdmstudio import TDMStudio
from utils.general import id2file
data = {}
data['ID']=[]
data['yhat1']=[]
data['yhat2']=[]
data['yhat3']=[]
data['average']=[]
# data['URL']=[]
# data['title']=[]
for file_ in files:
    yhat1,yhat2,yhat3 = pickle.load(open(predictions_dirpath+file_,'rb'))
    if yhat1>0.5 and yhat2>0.5 and yhat3>0.5:
        data['ID'].append(file_[:-5])
        data['yhat1'].append(yhat1)
        data['yhat2'].append(yhat2)
        data['yhat3'].append(yhat3)
        data['average'].append(np.average([yhat1,yhat2,yhat3]))
    #     data['title'].append(TDMStudio.get_title(id2file(file_[:-5])))
#         data['URL'].append(f'https://proquest.com/docview/{file_[:-5]}')
    
#     data.append((file_,pickle.load(open(predictions_dirpath+file_,'rb'))))

info(f'Number of irrelevant (<0.5) articles discarded: {len(files)-len(data):,}')
df = pd.DataFrame(data).sort_values(by=['average'],ascending=False)
df

2022-03-11 13:21:46.591666 [ [1;94mINFO[0m  ] Number of irrelevant (<0.5) articles discarded: 2,044,856


Unnamed: 0,ID,yhat1,yhat2,yhat3,average
1465,1325811105,0.990952,0.997326,0.988864,0.992381
10562,1325855906,0.962391,0.990835,0.999985,0.984404
20771,1323412960,0.975553,0.999996,0.966560,0.980703
2694,1291263794,0.948477,1.000000,0.991822,0.980100
7643,1291456757,0.958632,0.996535,0.984521,0.979896
...,...,...,...,...,...
17362,1289761581,0.506423,0.509098,0.530303,0.515275
15566,1314012344,0.528989,0.509048,0.507524,0.515187
5793,1289194852,0.505084,0.510030,0.529077,0.514730
3680,1313994980,0.505016,0.524752,0.509527,0.513098


In [7]:
from utils.tdmstudio import TDMStudio
from utils.general import id2file
import spacy
nlp = spacy.load('en_core_web_sm', disable=['textcat','ner','parser','tagger','lemmatizer'])
tokenizer = nlp.tokenizer
def tokenize(text):
    return [token.text for token in tokenizer(text)]
canada_locs = set(open('cities.txt','r').read().splitlines() + open('provinces.txt','r').read().splitlines()+['Canada'])

def about_canada(text):
    tokens = tokenize(text)
    return any([token in canada_locs for token in tokens])

writer = open('to_export_second_round.csv', 'w')
count=0
cap=1000
idx=0
while count<cap:
    id_,y1,y2,y3,avg = df.iloc[idx,:]
    title,text = TDMStudio.get_title_and_text(id2file(id_))
    title=title.replace('\n','')
    if about_canada(f'{title}. {text}'):
#         str_ = f'{count+1:>3}: {id_}\t{y1:4.3f}\t{y2:4.3f}\t{y3:4.3f}\t{avg:4.3f}\thttps://proquest.com/docview/{id_}\t{title}\n'
        str_ = f'{id_}\t{y1:4.3f}\t{y2:4.3f}\t{y3:4.3f}\t{title}\n'
        writer.write(str_)
        count+=1
    idx+=1
writer.close()
info(f'Number of discarded for not mentioning Canada: {idx-cap}')
!head to_export_second_round.csv

2022-03-11 13:26:01.834448 [ [1;94mINFO[0m  ] Number of discarded for not mentioning Canada: 660
1323412960	0.976	1.000	0.967	Jews in U.S. Zone Insist on Going On to Palestine
1291263794	0.948	1.000	0.992	Apparel Made by Skilful DP's Impresses Senate Committee
1287505669	0.959	0.994	0.974	Textile Workers
1287587801	0.983	0.988	0.956	Polish Writers Under 'Red' Pressure
1412095279	0.990	0.971	0.956	Welcome Mat
1325795691	0.982	0.997	0.927	OP's Influx Here Hits New Record Of 100 Each Day
1284729689	0.950	0.982	0.971	314 Skilled Migrants Signed in West Indies
1325698778	0.991	0.996	0.911	Canada Shortly Will Admit 6,000 Disked Persons
1287413237	0.929	1.000	0.964	Ottawa Slows Down Flow of Immigrants
1270515457	0.976	0.949	0.966	Immigration


In [8]:
!ls -lh to_export_second_round.csv

-rw-rw-r-- 1 ec2-user ec2-user 61K Mar 11 13:26 to_export_second_round.csv


In [30]:
#proportion
mask1 = (df['yhat1']>0.5).values
mask2 = (df['yhat2']>0.5).values
mask3 = (df['yhat3']>0.5).values

meeting_requirment = df.iloc[mask1 & mask2 & mask3,:]

info(f'{(meeting_requirment.shape[0]/df.shape[0])*100:04.2f} %')

2022-03-09 14:44:33.392658 [ [1;94mINFO[0m  ] 01.23 %
