## Aux functions

In [1]:
from IPython.core.display import display, HTML
import datetime
def info(str_):
    print(f'{datetime.datetime.now()} [ \033[1;94mINFO\x1b[0m  ] {str_}')
def ok(str_):
    print(f'{datetime.datetime.now()} [  \033[1;92mOK\x1b[0m   ] {str_}')
def warning(str_):
    print(f'{datetime.datetime.now()} [\x1b[1;31mWARNING\x1b[0m] {str_}')
def html(str_=''):
    display(HTML(str_))

In [1]:
print('')




In [7]:
import pandas as pd
from lxml import etree
from bs4 import BeautifulSoup

def get_date(filename):
    tree = etree.parse(filename)
    root = tree.getroot()
#     title = root.find('.//Title').text
    date = root.find('.//NumericDate').text
#     publisher = root.find('.//PublisherName').text
    assert date is not None
    
    return date

def get_title_and_text(filename):
    tree = etree.parse(filename)
    root = tree.getroot()
    if root.find('.//HiddenText') is not None:
        text = (root.find('.//HiddenText').text)

    elif root.find('.//Text') is not None:
        text = (root.find('.//Text').text)

    else:
        text = None
                       
    title = root.find('.//Title')
    if title is not None:
        title = title.text
    if not text is None:
        text = BeautifulSoup(text, parser='html.parser').get_text()

    return title,text

## Retrieving Label data

In [2]:
import os
DP_examples_dirpath = '/home/ec2-user/SageMaker/mariano/notebooks/04. Model of DP/DP-relevant articles/'
assert os.path.exists(DP_examples_dirpath)
files = []
for dirpath, dirnames, filenames in os.walk(DP_examples_dirpath):
    for filename in filenames:
        filepath = os.path.join(dirpath,filename)
        assert os.path.isfile(filepath)
        files.append(filepath)
info(f'Number of files retrieved: {len(files)}')

2022-03-08 16:50:49.855616 [ [1;94mINFO[0m  ] Number of files retrieved: 11


In [3]:
import re
total_count=0
urls = []
ids = []
for file_ in files:
    content = open(file_, 'r').read()
    urls += re.findall('url = {(.*)}',content)
    ids += re.findall('/docview/([^/]*)/',content)
    total_count += len(urls)
    assert len(ids)==len(urls)

relevant_ids = set(ids)
# print(len(urls))
info(f'Relevant article count: {len(relevant_ids)}')

2022-03-08 16:50:50.182696 [ [1;94mINFO[0m  ] Relevant article count: 535


## Reading files from disk

In [4]:
import os
GM_all_part1 = '/home/ec2-user/SageMaker/data/GM_all_1945_1956/'
GM_all_part2 = '/home/ec2-user/SageMaker/data/GM_all_1957-1967/'
GM_dp_dirpath = '/home/ec2-user/SageMaker/data/GM_DP_and_Canada1945_1967/'

all_files = [GM_all_part1+file_ for file_ in os.listdir(GM_all_part1)]
all_files += [GM_all_part2+file_ for file_ in os.listdir(GM_all_part2)]

dp_files = [GM_dp_dirpath+file_ for file_ in os.listdir(GM_dp_dirpath)]

# GM_dirpath = '/home/ec2-user/SageMaker/data/The_Globe_and_Mail_with_DP_filter_by_article_type/'
# all_files = [TS_dirpath+file_id for file_id in os.listdir(TS_dirpath)]
# all_files += [GM_dirpath+file_id for file_id in os.listdir(GM_dirpath)]

info(f'len(all_files):       {len(all_files):10,}')
info(f'len(dp_files):        {len(dp_files):10,}')

2022-03-08 16:50:55.886680 [ [1;94mINFO[0m  ] len(all_files):        2,057,868
2022-03-08 16:50:55.886856 [ [1;94mINFO[0m  ] len(dp_files):             6,938


#### Creating dataframe with ids + 'unknown' label


In [5]:
import pandas as pd
dp_articles_df = pd.DataFrame(
                  ['unknown']*len(dp_files), 
                  columns=['label']
                 )
dp_articles_df['id'] = [file_id.split('/')[-1][:-4] for file_id in dp_files]
# df['source']='GM'

# df.iloc[:len(os.listdir(TS_dirpath)),-1]='TS'
dp_articles_df

Unnamed: 0,label,id
0,unknown,1287338646
1,unknown,1270339579
2,unknown,1287501005
3,unknown,1289163109
4,unknown,1289176186
...,...,...
6933,unknown,1282757714
6934,unknown,1270441158
6935,unknown,1288866985
6936,unknown,1356078942


#### Adding correct label + date to DataFrame


In [8]:
import numpy as np

relevants=0
dates = []

for idx,file_ in enumerate(dp_files):
    id_ = file_.split('/')[-1][:-4]
    date = get_date(file_)
    year = int(date[:4])
    if id_ in relevant_ids:
        dp_articles_df.iloc[idx,0]='relevant'
        relevants+=1
    else:
        dp_articles_df.iloc[idx,0]='irrelevant'
        
    dates.append(date)
dp_articles_df['date']=dates
info(f"Number of relevants:   {np.sum(dp_articles_df['label']=='relevant'):,}")
info(f"Number of irrelevants: {np.sum(dp_articles_df['label']=='irrelevant'):,}")
info(f"'Number of unknown:    {np.sum(dp_articles_df['label']=='unknown'):,}")
dp_articles_df

2022-03-08 16:51:38.270132 [ [1;94mINFO[0m  ] Number of relevants:   514
2022-03-08 16:51:38.271392 [ [1;94mINFO[0m  ] Number of irrelevants: 6,424
2022-03-08 16:51:38.272610 [ [1;94mINFO[0m  ] 'Number of unknown:    0


Unnamed: 0,label,id,date
0,irrelevant,1287338646,1951-03-15
1,irrelevant,1270339579,1965-02-17
2,irrelevant,1287501005,1958-02-18
3,irrelevant,1289163109,1956-10-13
4,irrelevant,1289176186,1954-03-27
...,...,...,...
6933,irrelevant,1282757714,1963-04-19
6934,irrelevant,1270441158,1965-05-14
6935,irrelevant,1288866985,1957-11-19
6936,irrelevant,1356078942,1963-10-12


### SVM with average GloVe 

#### Generating X, y for training model (involves look-up for GloVe vectors (spacy nlp))


In [9]:
import spacy
import pickle

if os.path.isfile('cache/X_Glove_300.p') and os.path.isfile('cache/y_Glove_300.p'):
    X = pickle.load(open('cache/X_Glove_300.p','rb'))
    y = pickle.load(open('cache/y_Glove_300.p', 'rb'))
else:
    info('Building X,y')
    nlp = spacy.load('en_core_web_lg', disable=['textcat','lemmatizer', 'parser', 'tagger','ner'])

    example_no = np.sum(dp_articles_df["label"]!='unknown')

    X = np.zeros(shape=(example_no, 300), dtype='float32')
    y = np.zeros(shape=(example_no,), dtype='int32')

    inst_no=0
    for idx in range(len(dp_articles_df)):
        label, id_, date = dp_articles_df.iloc[idx,:]
        file_ = GM_dp_dirpath+id_+'.xml'

        assert label!='unknown'
        title, text = get_title_and_text(file_)
        
        X[inst_no,:] = nlp(title+' '+text).vector
#         X[inst_no,] = nlp(text).vector
        y[inst_no] = 1 if label=='relevant' else 0
        inst_no+=1

    pickle.dump(X, open('cache/X_Glove_300.p','wb'))
    pickle.dump(y, open('cache/y_Glove_300.p','wb'))
ok('Done!')


2022-02-27 14:40:08.273658 [  [1;92mOK[0m   ] Done!


#### Grid Search

In [12]:
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
parameters = {'kernel': ['linear','rbf','poly'], 'C':[0.5,2,4, 6, 9, 15, 17, 30]}
svc = SVC()

# parameters = {'C':[1,2,3,4,5,6], 'degree':[1,2,3,4,5,6,7]}
# svc = SVC(kernel='poly')
clf = GridSearchCV(svc, parameters,scoring='f1', cv=5,verbose=4)

results = clf.fit(X,y)
pd.DataFrame(results.cv_results_)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5] END ..............C=0.5, kernel=linear;, score=0.864 total time=   1.6s
[CV 2/5] END ..............C=0.5, kernel=linear;, score=0.888 total time=   1.6s
[CV 3/5] END ..............C=0.5, kernel=linear;, score=0.874 total time=   1.2s
[CV 4/5] END ..............C=0.5, kernel=linear;, score=0.860 total time=   1.0s
[CV 5/5] END ..............C=0.5, kernel=linear;, score=0.828 total time=   1.3s
[CV 1/5] END .................C=0.5, kernel=rbf;, score=0.872 total time=   2.2s
[CV 2/5] END .................C=0.5, kernel=rbf;, score=0.880 total time=   2.1s
[CV 3/5] END .................C=0.5, kernel=rbf;, score=0.896 total time=   2.1s
[CV 4/5] END .................C=0.5, kernel=rbf;, score=0.876 total time=   2.0s
[CV 5/5] END .................C=0.5, kernel=rbf;, score=0.828 total time=   2.0s
[CV 1/5] END ................C=0.5, kernel=poly;, score=0.889 total time=   1.5s
[CV 2/5] END ................C=0.5, kernel=poly

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.184384,0.239823,0.145935,0.04482,0.5,linear,"{'C': 0.5, 'kernel': 'linear'}",0.864078,0.887805,0.874372,0.86,0.827586,0.862768,0.020043,24
1,1.586756,0.12577,0.484924,0.09577,0.5,rbf,"{'C': 0.5, 'kernel': 'rbf'}",0.872038,0.880383,0.895522,0.875622,0.827586,0.87023,0.022776,22
2,1.191227,0.236933,0.094742,0.028219,0.5,poly,"{'C': 0.5, 'kernel': 'poly'}",0.888889,0.892157,0.877551,0.865672,0.83,0.870854,0.022449,21
3,0.795942,0.185508,0.097113,0.021056,2.0,linear,"{'C': 2, 'kernel': 'linear'}",0.888889,0.872549,0.878049,0.861386,0.835821,0.867339,0.018084,23
4,1.054911,0.18756,0.362859,0.060301,2.0,rbf,"{'C': 2, 'kernel': 'rbf'}",0.881517,0.892157,0.884422,0.86,0.837438,0.871107,0.019928,20
5,0.828593,0.128283,0.081603,0.007452,2.0,poly,"{'C': 2, 'kernel': 'poly'}",0.909091,0.896552,0.892157,0.881188,0.866995,0.889197,0.014253,3
6,0.873133,0.039624,0.069751,0.009363,4.0,linear,"{'C': 4, 'kernel': 'linear'}",0.903846,0.897561,0.887805,0.872549,0.845771,0.881506,0.020754,18
7,0.957636,0.145527,0.251073,0.059475,4.0,rbf,"{'C': 4, 'kernel': 'rbf'}",0.889952,0.882353,0.891089,0.855721,0.841584,0.87214,0.019919,19
8,0.914976,0.158046,0.075906,0.009978,4.0,poly,"{'C': 4, 'kernel': 'poly'}",0.899522,0.901961,0.8867,0.873786,0.855721,0.883538,0.017168,13
9,0.823661,0.073789,0.071749,0.011738,6.0,linear,"{'C': 6, 'kernel': 'linear'}",0.902913,0.908213,0.887805,0.871287,0.845771,0.883198,0.022708,15


In [13]:
df_results = pd.DataFrame(results.cv_results_)
df_results.to_csv('gridsearch8.csv')

In [12]:
!export_out_of_tdmstudio gridsearch4.csv

/bin/sh: export_out_of_tdmstudio: command not found


### Split train-test confusion matrix

In [29]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix



# rng = np.random.default_rng(2022)

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)

clf = SVC(C=15, kernel='linear')
clf.fit(X,y)
yhat = clf.predict(X)

m = confusion_matrix(y, yhat, labels=[0, 1])
pd.DataFrame(m, index=['true irrelevant','true relevant'], columns=['pred irrelevant', 'pred revant'])

Unnamed: 0,pred irrelevant,pred revant
true irrelevant,6397,27
true relevant,34,480


In [15]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25)

clf = SVC(C=15, kernel='linear')
clf.fit(X_train,y_train)

yhat=clf.predict(X_test)

m = confusion_matrix(y_test, yhat, labels=[0, 1])
pd.DataFrame(m, index=['true irrelevant','true relevant'], columns=['pred irrelevant', 'pred revant'])

Unnamed: 0,pred irrelevant,pred revant
true irrelevant,1595,9
true relevant,19,112


In [62]:
from sklearn.model_selection import cross_validate

cv_results = cross_validate(SVC(C=15,kernel='linear'),X,y,cv=5,scoring=['f1','precision','recall','accuracy'])
for metric in cv_results:
    print(f'{metric:10}: {np.average(cv_results[metric]):5.4f}')

fit_time  : 0.5585
score_time: 0.0417
test_f1   : 0.8912
test_precision: 0.8973
test_recall: 0.8852
test_accuracy: 0.9840


### BOW MULTINOMIAL

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END ..........................alpha=20;, score=0.766 total time=   0.8s
[CV 2/5] END ..........................alpha=20;, score=0.711 total time=   0.8s
[CV 3/5] END ..........................alpha=20;, score=0.749 total time=   0.8s
[CV 4/5] END ..........................alpha=20;, score=0.723 total time=   0.8s
[CV 5/5] END ..........................alpha=20;, score=0.770 total time=   1.1s
[CV 1/5] END ..........................alpha=30;, score=0.596 total time=   0.8s
[CV 2/5] END ..........................alpha=30;, score=0.651 total time=   0.8s
[CV 3/5] END ..........................alpha=30;, score=0.568 total time=   0.8s
[CV 4/5] END ..........................alpha=30;, score=0.550 total time=   1.0s
[CV 5/5] END ..........................alpha=30;, score=0.604 total time=   0.8s
[CV 1/5] END ..........................alpha=40;, score=0.430 total time=   0.7s
[CV 2/5] END ..........................alpha=40;,

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.709151,0.106459,0.143899,0.002851,20,{'alpha': 20},0.76555,0.710526,0.748815,0.722772,0.769953,0.743523,0.023371,1
1,0.709441,0.096096,0.14526,0.014288,30,{'alpha': 30},0.596273,0.650888,0.567742,0.55,0.603774,0.593735,0.034531,2
2,0.682118,0.069091,0.145716,0.031429,40,{'alpha': 40},0.42963,0.5,0.412214,0.375,0.387597,0.420888,0.043873,3
3,0.791761,0.116053,0.150932,0.005597,50,{'alpha': 50},0.252101,0.322581,0.254237,0.241379,0.268908,0.267841,0.028742,4
4,0.704712,0.070194,0.147872,0.014318,100,{'alpha': 100},0.019231,0.019231,0.019231,0.019417,0.0,0.015422,0.007711,5


In [21]:
df_results = pd.DataFrame(results.cv_results_)
df_results.to_csv('gridsearch6.csv')

fit_time  : 0.7229
score_time: 0.1489
test_f1   : 0.7435
test_precision: 0.7219
test_recall: 0.7684
test_accuracy: 0.9607


## Working with non-dp articles

In [30]:
import pandas as pd


import pandas as pd
all_articles_df = pd.DataFrame(
                  ['unknown']*len(all_files),
                  columns=['label']
                 )
all_articles_df['source']='GM1'
all_articles_df.iloc[len(os.listdir(GM_all_part1)):, 1] = 'GM2'

all_articles_df['id'] = [file_id.split('/')[-1][:-4] for file_id in all_files]

# df['source']='GM'
# df.iloc[:len(os.listdir(TS_dirpath)),-1]='TS'
all_articles_df





Unnamed: 0,label,source,id
0,unknown,GM1,1323614655
1,unknown,GM1,1287437740
2,unknown,GM1,1289475925
3,unknown,GM1,1287717370
4,unknown,GM1,1295978754
...,...,...,...
2057863,unknown,GM2,1284766080
2057864,unknown,GM2,1288512672
2057865,unknown,GM2,1313948084
2057866,unknown,GM2,1283055041


#### Filtering out news articles that contain DP (already in the other DataFrame ((`dp_articles_df`))

In [31]:
dp_ids = set([file_.split('/')[-1][:-4] for file_ in dp_files])
to_keep=[]
for idx,id_ in enumerate(all_articles_df['id']):
    if not id_ in dp_ids:
        to_keep.append(idx)

all_articles_df["prediction"]=-1

all_articles_df.iloc[to_keep,:]

Unnamed: 0,label,source,id,prediction
0,unknown,GM1,1323614655,-1
1,unknown,GM1,1287437740,-1
2,unknown,GM1,1289475925,-1
3,unknown,GM1,1287717370,-1
4,unknown,GM1,1295978754,-1
...,...,...,...,...
2057863,unknown,GM2,1284766080,-1
2057864,unknown,GM2,1288512672,-1
2057865,unknown,GM2,1313948084,-1
2057866,unknown,GM2,1283055041,-1


In [16]:
all_articles_df['prediction']=-1
all_articles_df

Unnamed: 0,label,source,id,prediction
0,unknown,GM1,1323614655,-1
1,unknown,GM1,1287437740,-1
2,unknown,GM1,1289475925,-1
3,unknown,GM1,1287717370,-1
4,unknown,GM1,1295978754,-1
...,...,...,...,...
2057863,unknown,GM2,1284766080,-1
2057864,unknown,GM2,1288512672,-1
2057865,unknown,GM2,1313948084,-1
2057866,unknown,GM2,1283055041,-1


## Computing and saving predictions


In [32]:
from sklearn.svm import SVC
clf = SVC(C=15, kernel='linear')
clf.fit(X,y)

SVC(C=15, kernel='linear')

In [43]:
# non_dp_files = [file_ for file_ in all_files if ]
def process_file(file_):
    title, text = get_title_and_text(file_)
    if not title is None and not text is None:
        x = nlp(title+' '+text).vector
        yhat = clf.predict([x])[0]
        all_articles_df.iloc[idx,3] = yhat
    return yhat,title

count=0
for file_ in all_files[:1000]:
    yhat,title=process_file(file_)
    if yhat>0.5:
        count+=1
        print(title)

THIEVES GET $1,186
Harry A. Weingarten
Four Arrested In Narcotic Raids
Boy Drowns Despite Efforts of His Dog To Drag Him Out
Married Girl, 16, Dies From Polio; Toronto Total 67
Gifts to Hospital From All Ontario Bring Goal Near
Johnson-- Browning
Doctor Sentenced To 7-Year Term For Abortion
1 Dies, 2 Hurt, Trucks Crash In Cornwall
Lightning Kills Mother and Son
Red Urges Recognition
Hadassah Plans Rehab Centre
Plan Vote On Widening Of Yonge St.
Fire Leaps Track, Mill Destroyed
Photo Standalone 1 -- No Title
Pretty but Tough
Darts After 'Old Rag,' Rescues Toronto Boy
Resort Man, War Bride Found Frozen at Lake
Beauties From Britain
The Homemaker
Healing a Lost Community
China President May Fly to Reds in Bid for Peace
Race at 95 MPH Is Ended by Tree; 1 Killed, 1 Injured
Photo Standalone 3 -- No Title
Hungarian Reds Reinstate Nagy To Party Circles
Actress in Coma
Malan Calls Vote Whites' Chance To Retain Control
Other 3 -- No Title
Thousands in U.K. Hit by Influenza; Liverpool Crippled
Ma

UnboundLocalError: local variable 'yhat' referenced before assignment

In [44]:
count

99

In [None]:
import spacy
import pickle
from tqdm import tqdm


nlp = spacy.load('en_core_web_lg', disable=['textcat','lemmatizer', 'parser', 'tagger','ner'])
for idx in tqdm(range(len(all_articles_df))):        
    _, source, id_, _ = all_articles_df.iloc[idx,:]
    if source=='GM1':
        file_ = GM_all_part1+id_+'.xml'
    else:
        file_ = GM_all_part2+id_+'.xml'

    title, text = get_title_and_text(file_)
    if not title is None and not text is None:
        x = nlp(title+' '+text).vector
        yhat = clf.predict([x])[0]
        all_articles_df.iloc[idx,3] = yhat
all_articles_df.to_csv('all_articles_df_glove_300_predictions(best).csv')

 11%|█▏        | 234241/2057868 [4:58:30<65:02:10,  7.79it/s] 

# BEST MODELS

In [9]:
import spacy
import pickle
import string
from sklearn.svm import SVC


nlp = spacy.load('en_core_web_lg', disable=['textcat','lemmatizer', 'parser', 'tagger','ner'])

def remove_punctuation(word):
    return ''.join([char for char in word if not char in string.punctuation+' '])

def tokenize(str_):
    tokens = [word.text.lower() for word in nlp(str_) if not word.is_stop]
    tokens = [word.replace('\n', '') for word in tokens if not word.isnumeric() and len(remove_punctuation(word))!=0]
    return tokens


bow = SVC(kernel='linear', probability=True)
glove300 = SVC(C=15, kernel='linear', probability=True)
glove600 = SVC(C=4, degree=1, kernel='poly', probability=True)

bow.fit(
        pickle.load(open('cache/X_bow.p','rb')),
        pickle.load(open('cache/y_bow.p','rb')),
       )

glove300.fit(
             pickle.load(open('cache/X_Glove_300.p','rb')),
             pickle.load(open('cache/y_Glove_300.p','rb'))
            )

glove600.fit(
             pickle.load(open('cache/X.p','rb')),
             pickle.load(open('cache/y.p','rb'))
            )



SVC(C=4, degree=1, kernel='poly', probability=True)

In [11]:
vocab = pickle.load(open('cache/vocab.p', 'rb'))
word2idx = dict([(word,idx) for idx, word in enumerate(vocab)])
from threading import Lock


def get_bow(file_): 
    title, text = get_title_and_text(file_)
    x = np.zeros(shape=(len(vocab)+1,), dtype='float32')
    if not title is None and not text is None:
        tokens = tokenize(title+' '+text)
        for token in tokens:
            if token in word2idx:
                x[word2idx[token]]+=1
            else:
                x[-1]+=1
    return x
def get_glove300(file_):
    title, text = get_title_and_text(file_)
    x = np.zeros(shape=(300,))
    if not title is None and not text is None:
        x = nlp(title+' '+text).vector
    return x

def get_glove600(file_):
    title, text = get_title_and_text(file_)
    x = np.zeros(shape=(600,))
    if not title is None and not text is None:
        dtitle, dtext = nlp.pipe([title,text])
        x = np.zeros(shape=(600,), dtype='float32')
        x[:300] = dtitle.vector
        x[300:] = dtext.vector
    return x

# count =0
# for file_ in all_files[:100]:
#     yhat_1 = bow.predict_proba([get_bow(file_)])[0,1]
#     yhat_2 = glove300.predict_proba([get_glove300(file_)])[0,1]
#     yhat_3 = glove600.predict_proba([get_glove600(file_)])[0,1]
#     print(f'{yhat_1:4.3f}    {yhat_2:4.3f}    {yhat_3:4.3f}')
#     if yhat_1>0.5 and yhat_2 > 0.5 and yhat_3 > 0.5:
#         count+=1
#         title,text = get_title_and_text(file_)
#         print(title)
        

In [None]:
def process_file(file_):
    file_id = file_.split('/')[-1][:-4]                      #ESTO LO AGREGUÉ DESPUES! ESTÁ TODO MAL
    output_file = predictions_dirpath+file_id+'v1.1.p'
    if not os.path.isfile(output_file):
        writer = open(output_file, 'w')
        writer.close()

In [14]:

import os
os.path.isfile('p')

False

In [16]:
import os
predictions_dirpath = './predictions/'



def process_file(file_):
    file_id = file_.split('/')[-1][:-4]                      #ESTO LO AGREGUÉ DESPUES! ESTÁ TODO MAL
    output_file = predictions_dirpath+file_id+'.p'
    if not os.path.isfile(output_file):
        yhat = np.zeros(shape=(3,))

        x_bow = get_bow(file_)
        yhat[0] = bow.predict_proba([x_bow])[0,1]

        x_glove300 = get_glove300(file_)        
        yhat[1] = glove300.predict_proba([x_glove300])[0,1]

        x_glove600 = get_glove600(file_)        
        yhat[2] = glove600.predict_proba([x_glove600])[0,1]


        file_id = file_.split('/')[-1][:-4]
        pickle.dump(yhat,open(output_file, 'wb'))


info('finished')

2022-02-27 14:44:45.318851 [ [1;94mINFO[0m  ] finished


In [10]:
dp_ids = set([file_.split('/')[-1][:-4] for file_ in dp_files])

all_files_minus_dp = [file_ for file_ in all_files if not file_.split('/')[-1][:-4] in dp_ids]
info(f'{len(all_files_minus_dp):,}')

2022-03-08 16:55:17.587104 [ [1;94mINFO[0m  ] 2,050,930


In [12]:
len(set(all_files_minus_dp))

2050930

In [18]:
import concurrent.futures

writer = open('done.txt', 'w')
writer.write(f'{datetime.datetime.now()} Starting...\n')

info('Starting...')

with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    executor.map(process_file, all_files_minus_dp)
    
    
writer.write(f'{datetime.datetime.now()} Done!')
writer.close()
ok('Done!')

2022-02-27 14:44:53.466675 [ [1;94mINFO[0m  ] Starting...
2022-02-27 14:46:26.913358 [  [1;92mOK[0m   ] Done!


### Finaly running  ^^^^ (SIGUE)

In [77]:
writer = open('done.txt', 'w')
writer.write(f'{datetime.datetime.now()} Done!')
writer.close()

In [63]:
# %%time
# for f in all_files_minus_dp[:10000]:
#     process_file(f)

CPU times: user 43min 2s, sys: 21min 52s, total: 1h 4min 55s
Wall time: 19min 36s


In [13]:
# from threading import Lock
# lock_glove300 = Lock()
# lock_glove600 = Lock()
# lock_bow = Lock()
# lock_nlp = Lock()


# def check_status():
#     print(lock1.locked())
# check_status()
# with lock1:
#     print('hello world')
#     check_status()
# check_status()

False
hello world
True
False


In [11]:
# lock1.locked()

False

In [20]:
# bow.predict_proba([get_bow(all_files[0])])

array([[0.30085098, 0.69914902]])

In [21]:
# bow.predict([get_bow(all_files[0])])

array([1], dtype=int32)

In [12]:
len(all_files)-len(dp_files)

2050930

In [20]:
print('hello world')

hello world


#### Generating X_all

In [None]:


import spacy
import pickle
from tqdm import tqdm

to_do = []
if os.path.isfile('cache/X_all.p'):
    X_all = pickle.load(open('cache/X_all.p','rb'))
else:
    nlp = spacy.load('en_core_web_lg', disable=['textcat','lemmatizer', 'parser', 'tagger','ner'])

    example_no = len(all_articles_df)

    X_all = np.zeros(shape=(example_no, 600), dtype='float32')
#     y = np.zeros(shape=(example_no,), dtype='int32')

    inst_no=0
    for idx in tqdm(range(len(all_articles_df))):        
        label, source, id_ = all_articles_df.iloc[idx,:]
        if source=='GM1':
            file_ = GM_all_part1+id_+'.xml'
        else:
            file_ = GM_all_part2+id_+'.xml'

        title, text = get_title_and_text(file_)
        if not title  is None and not text  is None:
            to_do.append((inst_no, title,text))
        
        
        if len(to_do)==10000:
            ##################
            # NLP processing #
            ##################
            texts = [text for _,_, text in to_do]
            titles = [title for _,title, _ in to_do]
            docs = list(nlp.pipe([elem for elem in texts+titles]))
            #########
            # texts #
            #########
            for idx,text_doc in enumerate(docs[:len(texts)]):
                inst_no = to_do[idx][0]
                X_all[inst_no,300:] = text_doc.vector
            ##########
            # titles #
            ##########
            for idx,title_doc in enumerate(docs[len(texts):]):
                inst_no = to_do[idx][0]
                X_all[inst_no,:300] = title_doc.vector
            ###############
            # reset to-do #
            ###############
            
            del(to_do, docs, texts, titles)
            del(docs)
            del(texts)
            del(titles)
            to_do=[]
        del(label, source, id_, file_, title, text)
        inst_no+=1

    pickle.dump(X_all, open('cache/X_all.p','wb'))

 31%|███       | 642388/2057868 [3:58:12<39:02, 604.27it/s]   

In [62]:
assert all(type(elem)==str for elem in texts+titles)

In [53]:
(texts+titles)[:2]

["\n\n\n\xa0\n\n\n\nToday and Tomorrow\n\n\nWalter Lippmann's\n\n\nThe Voice of America\n\n\nThough money for radio broadcasts and printing is needed so that the Voice of America may be heard in foreign lands our greatest need is to have something definite clear and convincing for that voice to say There will he little opposition in Congress to an appropriation if\n\n\nit were not for the feeling that the men who conduct our propaganda have little to do with the making of our policy and that the sales department of the Government so lo speak writing about goods for which the produc tion engineers have just begun to make the first blueprints As for the customers abroad they are undoubtedly confused and suspicious partly no doubt because the rival firm misrepresents us but chiefly because we sound hot and bothered when as Great Power they expect us to be cool and definite Mr Benton's difficul ties with Congress and with the opposition abroad will diminish when his chief Secretary Marshal

In [47]:
if os.path.isfile('cache/X_all.p'):
    print('here')
else:
    print('there')

there


In [40]:
if os.path.isfile('cache/X_all.p'):
    print('here')
else:
    print('there')

there


In [12]:

all_files[0]

'/home/ec2-user/SageMaker/data/GM_all_1945_1956/1323614655.xml'

In [41]:
np.random.choice?