In [1]:
from utils.general import info, ok, warning, id2file

### Label data

In [2]:
import os
import re
relevant_set = set()
irrelevant_set = set()

# Loading new_data
new_data = [line.split(';') for line in open('new_data.csv').read().splitlines()]
relevant_set = relevant_set.union(set([id_ for id_,label in new_data if label.strip()=='R']))
irrelevant_set = irrelevant_set.union(set([id_ for id_,label in new_data if label.strip()=='I']))

# Loading original data
DP_examples_dirpath = '/home/ec2-user/SageMaker/mariano/notebooks/04. Model of DP/DP-relevant articles/'

first_data = []
for dirpath, dirnames, filenames in os.walk(DP_examples_dirpath):
    for filename in filenames:
        content = open(os.path.join(dirpath,filename),'r').read()
        ids = re.findall('/docview/([^/]*)/',content)
        relevant_set = relevant_set.union(set(ids))
    
# articles containg DP and Canada from that period, that were not deteted by Serperi
GM_dp_dirpath = '/home/ec2-user/SageMaker/data/GM_DP_and_Canada1945_1967/'

files = os.listdir(GM_dp_dirpath)

irrelevant_set = irrelevant_set.union([file_[:-4] for file_ in files if file_[:-4] not in relevant_set and file_.endswith('.xml')])

not_found=[]
for id_ in list(relevant_set)+list(irrelevant_set):
    if id2file(id_) is None:
        not_found.append(id_)
print(f'Not found: {not_found}')
for id_ in not_found:
    relevant_set = relevant_set.difference(set(not_found))
    irrelevant_set = irrelevant_set.difference(set(not_found))
    
info(f'len(relevant_set)   = {len(relevant_set)}')
info(f'len(irrelevant_set) = {len(irrelevant_set)}')


Not found: ['2122279956', '1151348424', '1242257052', '1136691129', '1411697642', '1238204962', '1143160388', '2459666609', '1222379804', '1239753620', '1238440920', '2122281371', '2459964104']
2022-03-10 00:39:57.013015 [ [1;94mINFO[0m  ] len(relevant_set)   = 542
2022-03-10 00:39:57.013161 [ [1;94mINFO[0m  ] len(irrelevant_set) = 6478


In [3]:
from tqdm import tqdm
from sklearn.svm import SVC
from utils.models import get_glove600,get_glove300, get_bow
from utils.general import id2file 
import numpy as np
from utils.tdmstudio import TDMStudio

import pickle

info('Creating models...')
classifiers = [
               SVC(kernel='linear', probability=True),              # BOW
               SVC(C=15, kernel='linear', probability=True),        # GloVe 300
               SVC(C=4, degree=1, kernel='poly', probability=True) # GloVe 600
              ]
files = [id2file(id_) for id_ in list(relevant_set)+list(irrelevant_set)]


texts_and_titles = [TDMStudio.get_title_and_text(file_) for file_ in files]
info('Loading data')
if os.path.isfile('cache/data.p'):
    info('Data loaded from disk.')
    data = pickle.load(open('cache/data.p','rb'))
else:
    warning('Data needs to be rebuild.')
    data = [
            np.vstack([get_bow(title, text) for title, text in tqdm(texts_and_titles)]),
            np.vstack([get_glove300(title, text) for title, text in tqdm(texts_and_titles)]),
            np.vstack([get_glove600(title, text) for title, text in tqdm(texts_and_titles)]),
#             np.vstack = stack([get_glove600(id_) for id_ in tqdm(files)]),
           ]

y = np.zeros(shape=(len(files)))
y[:len(relevant_set)]=1

info('Training models')
for model,X in zip(classifiers,data):
    model.fit(X,y)
ok('Done!')

2022-03-10 00:40:12.362435 [ [1;94mINFO[0m  ] Creating models...
2022-03-10 00:40:29.889101 [ [1;94mINFO[0m  ] Loading data
2022-03-10 00:40:29.890839 [ [1;94mINFO[0m  ] Data loaded from disk.
2022-03-10 00:40:31.175349 [ [1;94mINFO[0m  ] Training models
2022-03-10 00:43:11.307201 [  [1;92mOK[0m   ] Done!


In [None]:
from sklearn.model_selection import cross_validate
rtas = [
       cross_validate(classifiers[0],X,y,scoring=['accuracy','precision','recall','f1']),
       cross_validate(classifiers[1],X,y,scoring=['accuracy','precision','recall','f1']),
       cross_validate(classifiers[2],X,y,scoring=['accuracy','precision','recall','f1']),
       ]

In [57]:
import pandas as pd
for rta in rtas:
#     print(pd.DataFrame(rta))
    print("Averages")
    print(pd.DataFrame(np.average(pd.DataFrame(rta).values,axis=0).reshape(1,6),columns=pd.DataFrame(rta).columns))
    print()

Averages
   fit_time  score_time  test_accuracy  test_precision  test_recall   test_f1
0  5.427517    0.128385       0.966952        0.786546      0.78595  0.785293

Averages
   fit_time  score_time  test_accuracy  test_precision  test_recall   test_f1
0  5.512406     0.10344       0.962393         0.75481     0.761961  0.756796

Averages
   fit_time  score_time  test_accuracy  test_precision  test_recall   test_f1
0  6.063117    0.149544       0.967664        0.812515     0.758274  0.783145



In [4]:
GM_all_part1 = '/home/ec2-user/SageMaker/data/GM_all_1945_1956/'
GM_all_part2 = '/home/ec2-user/SageMaker/data/GM_all_1957-1967/'

files = [GM_all_part1+filename for filename in os.listdir(GM_all_part1)]
files += [GM_all_part2+filename for filename in os.listdir(GM_all_part2)]

predictions_dirpath = './predictions/'


def process_file(file_):
    file_id = file_.split('/')[-1][:-4]
    output_file = predictions_dirpath+file_id+'_v2.p'
    if not os.path.isfile(output_file):
        yhat = np.zeros(shape=(3,))
        title, text = TDMStudio.get_title_and_text(file_)
        x_bow = get_bow(title, text)
        x_glove300 = get_glove300(title, text)  
        x_glove600 = get_glove600(title, text)
        for idx,(model,x) in enumerate(zip(classifiers, [x_bow,x_glove300, x_glove600])):
            yhat[idx]=model.predict_proba([x])[0,1]
        pickle.dump(yhat,open(output_file, 'wb'))
        del(x_bow,x_glove300,x_glove600,title,text,yhat,file_id,output_file)
ok('done')

2022-03-10 00:43:20.792650 [  [1;92mOK[0m   ] done


In [5]:
del(data)

In [None]:
%%time

import datetime
import concurrent.futures

writer = open('done.txt', 'w')
writer.write(f'{datetime.datetime.now()} Starting...\n')

info('Starting...')

files = files[1067422:]
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    executor.map(process_file, files, chunksize=1000)

writer.write(f'{datetime.datetime.now()} Done!')
writer.close()
ok('Done!')

2022-03-10 00:51:49.987474 [ [1;94mINFO[0m  ] Starting...


In [9]:
print('hello worlds')

hello worlds
