## Aux functions

In [1]:
from IPython.core.display import display, HTML
import datetime
def info(str_):
    print(f'{datetime.datetime.now()} [ \033[1;94mINFO\x1b[0m  ] {str_}')
def ok(str_):
    print(f'{datetime.datetime.now()} [  \033[1;92mOK\x1b[0m   ] {str_}')
def warning(str_):
    print(f'{datetime.datetime.now()} [\x1b[1;31mWARNING\x1b[0m] {str_}')
def html(str_=''):
    display(HTML(str_))

In [2]:
import pandas as pd
from lxml import etree
from bs4 import BeautifulSoup

def get_date(filename):
    tree = etree.parse(filename)
    root = tree.getroot()
#     title = root.find('.//Title').text
    date = root.find('.//NumericDate').text
#     publisher = root.find('.//PublisherName').text
    assert date is not None
    
    return date

def get_title_and_text(filename):
    tree = etree.parse(filename)
    root = tree.getroot()
    if root.find('.//HiddenText') is not None:
        text = (root.find('.//HiddenText').text)

    elif root.find('.//Text') is not None:
        text = (root.find('.//Text').text)

    else:
        text = None
                       
    title = root.find('.//Title')
    if title is not None:
        title = title.text
    if not text is None:
        text = BeautifulSoup(text, parser='html.parser').get_text()

    return title,text

## Retrieving Label data

In [3]:
import os
DP_examples_dirpath = '/home/ec2-user/SageMaker/mariano/notebooks/04. Model of DP/DP-relevant articles/'
assert os.path.exists(DP_examples_dirpath)
files = []
for dirpath, dirnames, filenames in os.walk(DP_examples_dirpath):
    for filename in filenames:
        filepath = os.path.join(dirpath,filename)
        assert os.path.isfile(filepath)
        files.append(filepath)
info(f'Number of files retrieved: {len(files)}')

2022-02-25 20:29:03.505673 [ [1;94mINFO[0m  ] Number of files retrieved: 11


In [4]:
import re
total_count=0
urls = []
ids = []
for file_ in files:
    content = open(file_, 'r').read()
    urls += re.findall('url = {(.*)}',content)
    ids += re.findall('/docview/([^/]*)/',content)
    total_count += len(urls)
    assert len(ids)==len(urls)

relevant_ids = set(ids)
# print(len(urls))
info(f'Relevant article count: {len(relevant_ids)}')

2022-02-25 20:29:03.628619 [ [1;94mINFO[0m  ] Relevant article count: 535


## Reading files from disk

In [5]:
import os
GM_all_part1 = '/home/ec2-user/SageMaker/data/GM_all_1945_1956/'
GM_all_part2 = '/home/ec2-user/SageMaker/data/GM_all_1957-1967/'
GM_dp_dirpath = '/home/ec2-user/SageMaker/data/GM_DP_and_Canada1945_1967/'

all_files = [GM_all_part1+file_ for file_ in os.listdir(GM_all_part1)]
all_files += [GM_all_part2+file_ for file_ in os.listdir(GM_all_part2)]

dp_files = [GM_dp_dirpath+file_ for file_ in os.listdir(GM_dp_dirpath)]

# GM_dirpath = '/home/ec2-user/SageMaker/data/The_Globe_and_Mail_with_DP_filter_by_article_type/'
# all_files = [TS_dirpath+file_id for file_id in os.listdir(TS_dirpath)]
# all_files += [GM_dirpath+file_id for file_id in os.listdir(GM_dirpath)]

info(f'len(all_files):       {len(all_files):10,}')
info(f'len(dp_files):        {len(dp_files):10,}')

2022-02-25 20:29:08.999802 [ [1;94mINFO[0m  ] len(all_files):        2,057,868
2022-02-25 20:29:09.000527 [ [1;94mINFO[0m  ] len(dp_files):             6,938


#### Creating dataframe with ids + 'unknown' label


In [6]:
import pandas as pd
dp_articles_df = pd.DataFrame(
                  ['unknown']*len(dp_files), 
                  columns=['label']
                 )
dp_articles_df['id'] = [file_id.split('/')[-1][:-4] for file_id in dp_files]
# df['source']='GM'
# df.iloc[:len(os.listdir(TS_dirpath)),-1]='TS'
dp_articles_df

Unnamed: 0,label,id
0,unknown,1287338646
1,unknown,1270339579
2,unknown,1287501005
3,unknown,1289163109
4,unknown,1289176186
...,...,...
6933,unknown,1282757714
6934,unknown,1270441158
6935,unknown,1288866985
6936,unknown,1356078942


#### Adding correct label + date to DataFrame


In [7]:
import numpy as np

relevants=0
dates = []

for idx,file_ in enumerate(dp_files):
    id_ = file_.split('/')[-1][:-4]
    date = get_date(file_)
    year = int(date[:4])
    if id_ in relevant_ids:
        dp_articles_df.iloc[idx,0]='relevant'
        relevants+=1
    else:
        dp_articles_df.iloc[idx,0]='irrelevant'
        
    dates.append(date)
dp_articles_df['date']=dates
info(f"Number of relevants:   {np.sum(dp_articles_df['label']=='relevant'):,}")
info(f"Number of irrelevants: {np.sum(dp_articles_df['label']=='irrelevant'):,}")
info(f"'Number of unknown:    {np.sum(dp_articles_df['label']=='unknown'):,}")
dp_articles_df

2022-02-25 20:29:18.120299 [ [1;94mINFO[0m  ] Number of relevants:   514
2022-02-25 20:29:18.122520 [ [1;94mINFO[0m  ] Number of irrelevants: 6,424
2022-02-25 20:29:18.124522 [ [1;94mINFO[0m  ] 'Number of unknown:    0


Unnamed: 0,label,id,date
0,irrelevant,1287338646,1951-03-15
1,irrelevant,1270339579,1965-02-17
2,irrelevant,1287501005,1958-02-18
3,irrelevant,1289163109,1956-10-13
4,irrelevant,1289176186,1954-03-27
...,...,...,...
6933,irrelevant,1282757714,1963-04-19
6934,irrelevant,1270441158,1965-05-14
6935,irrelevant,1288866985,1957-11-19
6936,irrelevant,1356078942,1963-10-12


### SVM with average GloVe 

#### Generating X, y for training model (involves look-up for GloVe vectors (spacy nlp))


In [8]:
import spacy
import pickle

if os.path.isfile('cache/X.p') and os.path.isfile('cache/y.p'):
    X = pickle.load(open('cache/X.p','rb'))
    y = pickle.load(open('cache/y.p', 'rb'))
else:
    info('Building X,y')
    nlp = spacy.load('en_core_web_lg', disable=['textcat','lemmatizer', 'parser', 'tagger','ner'])

    example_no = np.sum(dp_articles_df["label"]!='unknown')

    X = np.zeros(shape=(example_no, 600), dtype='float32')
    y = np.zeros(shape=(example_no,), dtype='int32')

    inst_no=0
    for idx in range(len(dp_articles_df)):
        label, id_, date = dp_articles_df.iloc[idx,:]
        file_ = GM_dp_dirpath+id_+'.xml'

        assert label!='unknown'
        title, text = get_title_and_text(file_)
        X[inst_no,:300] = nlp(title).vector
        X[inst_no,300:] = nlp(text).vector
        y[inst_no] = 1 if label=='relevant' else 0
        inst_no+=1

    pickle.dump(X, open('cache/X.p','wb'))
    pickle.dump(y, open('cache/y.p','wb'))


#### Grid Search

In [13]:
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
parameters = {'C':[1,2,3,4,5,6], 'degree':[1,2,3,4,5,6,7]}
svc = SVC(kernel='poly')
clf = GridSearchCV(svc, parameters,scoring='f1', cv=5,verbose=4)

results = clf.fit(X,y)
pd.DataFrame(results.cv_results_)

Fitting 5 folds for each of 42 candidates, totalling 210 fits
[CV 1/5] END .....................C=1, degree=1;, score=0.854 total time=   2.3s
[CV 2/5] END .....................C=1, degree=1;, score=0.806 total time=   1.6s
[CV 3/5] END .....................C=1, degree=1;, score=0.826 total time=   1.4s
[CV 4/5] END .....................C=1, degree=1;, score=0.829 total time=   1.4s
[CV 5/5] END .....................C=1, degree=1;, score=0.854 total time=   1.5s
[CV 1/5] END .....................C=1, degree=2;, score=0.871 total time=   1.5s
[CV 2/5] END .....................C=1, degree=2;, score=0.810 total time=   1.3s
[CV 3/5] END .....................C=1, degree=2;, score=0.839 total time=   1.4s
[CV 4/5] END .....................C=1, degree=2;, score=0.831 total time=   1.3s
[CV 5/5] END .....................C=1, degree=2;, score=0.853 total time=   1.3s
[CV 1/5] END .....................C=1, degree=3;, score=0.868 total time=   1.4s
[CV 2/5] END .....................C=1, degree=3

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_degree,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,1.477457,0.320471,0.163215,0.042858,1,1,"{'C': 1, 'degree': 1}",0.854271,0.80597,0.825871,0.829268,0.854369,0.83395,0.018439,25
1,1.219171,0.073838,0.137136,0.002662,1,2,"{'C': 1, 'degree': 2}",0.871287,0.81,0.839024,0.830918,0.852941,0.840834,0.020622,12
2,1.221361,0.025169,0.142528,0.002583,1,3,"{'C': 1, 'degree': 3}",0.868293,0.80402,0.857143,0.846154,0.862745,0.847671,0.023021,2
3,1.237076,0.023905,0.147837,0.002134,1,4,"{'C': 1, 'degree': 4}",0.866995,0.808081,0.85,0.835821,0.84,0.840179,0.019313,13
4,1.287794,0.029677,0.153271,0.001513,1,5,"{'C': 1, 'degree': 5}",0.847291,0.808081,0.845771,0.817734,0.838384,0.831452,0.015738,28
5,1.358822,0.038506,0.158377,0.000855,1,6,"{'C': 1, 'degree': 6}",0.851485,0.812183,0.836735,0.79397,0.830769,0.825028,0.019998,36
6,1.384066,0.029219,0.162035,0.001362,1,7,"{'C': 1, 'degree': 7}",0.851485,0.812183,0.802083,0.773869,0.826531,0.81323,0.025749,42
7,1.131001,0.022602,0.126129,0.002296,2,1,"{'C': 2, 'degree': 1}",0.88,0.805825,0.825243,0.825243,0.847291,0.83672,0.025309,23
8,1.120177,0.010001,0.129171,0.001207,2,2,"{'C': 2, 'degree': 2}",0.878049,0.807882,0.839024,0.836538,0.835821,0.839463,0.022398,16
9,1.195868,0.020059,0.136586,0.002292,2,3,"{'C': 2, 'degree': 3}",0.884615,0.8,0.845771,0.847291,0.84,0.843535,0.026887,5


In [14]:
df_results = pd.DataFrame(results.cv_results_)
df_results.to_csv('gridsearch5.csv')

In [12]:
!export_out_of_tdmstudio gridsearch4.csv

/bin/sh: export_out_of_tdmstudio: command not found


### Split train-test confusion matrix

In [10]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix



# rng = np.random.default_rng(2022)

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)

clf = SVC(C=4, kernel='poly',degree=1)
clf.fit(X,y)
yhat = clf.predict(X)

m = confusion_matrix(y, yhat, labels=[0, 1])
pd.DataFrame(m, index=['true irrelevant','true relevant'], columns=['pred irrelevant', 'pred revant'])

Unnamed: 0,pred irrelevant,pred revant
true irrelevant,6387,37
true relevant,47,467


In [25]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25)

clf = SVC(C=4, kernel='poly',degree=1)
clf.fit(X_train,y_train)

yhat=clf.predict(X_test)

m = confusion_matrix(y_test, yhat, labels=[0, 1])
pd.DataFrame(m, index=['true irrelevant','true relevant'], columns=['pred irrelevant', 'pred revant'])

Unnamed: 0,pred irrelevant,pred revant
true irrelevant,1605,12
true relevant,21,97


In [10]:
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC
cv_results = cross_validate(SVC(C=4,kernel='poly',degree=1),X,y,cv=5,scoring=['f1','precision','recall','accuracy'])
for metric in cv_results:
    print(f'{metric:10}: {np.average(cv_results[metric]):5.4f}')

fit_time  : 1.5873
score_time: 0.1655
test_f1   : 0.8484
test_precision: 0.8488
test_recall: 0.8482
test_accuracy: 0.9775


### BOW MULTINOMIAL

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END ..........................alpha=20;, score=0.766 total time=   0.8s
[CV 2/5] END ..........................alpha=20;, score=0.711 total time=   0.8s
[CV 3/5] END ..........................alpha=20;, score=0.749 total time=   0.8s
[CV 4/5] END ..........................alpha=20;, score=0.723 total time=   0.8s
[CV 5/5] END ..........................alpha=20;, score=0.770 total time=   1.1s
[CV 1/5] END ..........................alpha=30;, score=0.596 total time=   0.8s
[CV 2/5] END ..........................alpha=30;, score=0.651 total time=   0.8s
[CV 3/5] END ..........................alpha=30;, score=0.568 total time=   0.8s
[CV 4/5] END ..........................alpha=30;, score=0.550 total time=   1.0s
[CV 5/5] END ..........................alpha=30;, score=0.604 total time=   0.8s
[CV 1/5] END ..........................alpha=40;, score=0.430 total time=   0.7s
[CV 2/5] END ..........................alpha=40;,

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.709151,0.106459,0.143899,0.002851,20,{'alpha': 20},0.76555,0.710526,0.748815,0.722772,0.769953,0.743523,0.023371,1
1,0.709441,0.096096,0.14526,0.014288,30,{'alpha': 30},0.596273,0.650888,0.567742,0.55,0.603774,0.593735,0.034531,2
2,0.682118,0.069091,0.145716,0.031429,40,{'alpha': 40},0.42963,0.5,0.412214,0.375,0.387597,0.420888,0.043873,3
3,0.791761,0.116053,0.150932,0.005597,50,{'alpha': 50},0.252101,0.322581,0.254237,0.241379,0.268908,0.267841,0.028742,4
4,0.704712,0.070194,0.147872,0.014318,100,{'alpha': 100},0.019231,0.019231,0.019231,0.019417,0.0,0.015422,0.007711,5


In [21]:
df_results = pd.DataFrame(results.cv_results_)
df_results.to_csv('gridsearch6.csv')

fit_time  : 0.7229
score_time: 0.1489
test_f1   : 0.7435
test_precision: 0.7219
test_recall: 0.7684
test_accuracy: 0.9607


## Working with non-dp articles

In [11]:
import pandas as pd


import pandas as pd
all_articles_df = pd.DataFrame(
                  ['unknown']*len(all_files),
                  columns=['label']
                 )
all_articles_df['source']='GM1'
all_articles_df.iloc[len(os.listdir(GM_all_part1)):, 1] = 'GM2'

all_articles_df['id'] = [file_id.split('/')[-1][:-4] for file_id in all_files]

# df['source']='GM'
# df.iloc[:len(os.listdir(TS_dirpath)),-1]='TS'
all_articles_df





Unnamed: 0,label,source,id
0,unknown,GM1,1323614655
1,unknown,GM1,1287437740
2,unknown,GM1,1289475925
3,unknown,GM1,1287717370
4,unknown,GM1,1295978754
...,...,...,...
2057863,unknown,GM2,1284766080
2057864,unknown,GM2,1288512672
2057865,unknown,GM2,1313948084
2057866,unknown,GM2,1283055041


#### Filtering out news articles that contain DP (already in the other DataFrame ((`dp_articles_df`))

In [13]:
dp_ids = set([file_.split('/')[-1][:-4] for file_ in dp_files])
to_keep=[]
for idx,id_ in enumerate(all_articles_df['id']):
    if not id_ in dp_ids:
        to_keep.append(idx)

all_articles_df.iloc[to_keep,:]

Unnamed: 0,label,source,id
0,unknown,GM1,1323614655
1,unknown,GM1,1287437740
2,unknown,GM1,1289475925
3,unknown,GM1,1287717370
4,unknown,GM1,1295978754
...,...,...,...
2057863,unknown,GM2,1284766080
2057864,unknown,GM2,1288512672
2057865,unknown,GM2,1313948084
2057866,unknown,GM2,1283055041


In [16]:
all_articles_df['prediction']=-1
all_articles_df

Unnamed: 0,label,source,id,prediction
0,unknown,GM1,1323614655,-1
1,unknown,GM1,1287437740,-1
2,unknown,GM1,1289475925,-1
3,unknown,GM1,1287717370,-1
4,unknown,GM1,1295978754,-1
...,...,...,...,...
2057863,unknown,GM2,1284766080,-1
2057864,unknown,GM2,1288512672,-1
2057865,unknown,GM2,1313948084,-1
2057866,unknown,GM2,1283055041,-1


## Computing and saving predictions


In [14]:
from sklearn.svm import SVC
clf = SVC(C=4, degree=1, kernel='poly')
clf.fit(X,y)

SVC(C=4, degree=1, kernel='poly')

In [None]:
def process_file(file_):
    title, text = get_title_and_text(file_)
    if not title is None and not text is None:
        dtitle, dtext = nlp.pipe([title,text])
        x = np.zeros(shape=(600,), dtype='float32')
        x[:300] = dtitle.vector
        x[300:] = dtext.vector
        yhat = clf.predict([x])[0]
        all_articles_df.iloc[idx,3] = yhat

In [None]:
import spacy
import pickle
from tqdm import tqdm


nlp = spacy.load('en_core_web_lg', disable=['textcat','lemmatizer', 'parser', 'tagger','ner'])
for idx in tqdm(range(len(all_articles_df))):        
    _, source, id_, _ = all_articles_df.iloc[idx,:]
    if source=='GM1':
        file_ = GM_all_part1+id_+'.xml'
    else:
        file_ = GM_all_part2+id_+'.xml'

    title, text = get_title_and_text(file_)
    if not title is None and not text is None:
        dtitle, dtext = nlp.pipe([title,text])
        x = np.zeros(shape=(600,), dtype='float32')
        x[:300] = dtitle.vector
        x[300:] = dtext.vector
        yhat = clf.predict([x])[0]
        all_articles_df.iloc[idx,3] = yhat
all_articles_df.to_csv('all_articles_df_glove_600_predictions.csv')

 12%|█▏        | 240730/2057868 [4:34:07<44:44:13, 11.28it/s] 

In [1]:
print('hello')

hello


#### Generating X_all

In [None]:


import spacy
import pickle
from tqdm import tqdm

to_do = []
if os.path.isfile('cache/X_all.p'):
    X_all = pickle.load(open('cache/X_all.p','rb'))
else:
    nlp = spacy.load('en_core_web_lg', disable=['textcat','lemmatizer', 'parser', 'tagger','ner'])

    example_no = len(all_articles_df)

    X_all = np.zeros(shape=(example_no, 600), dtype='float32')
#     y = np.zeros(shape=(example_no,), dtype='int32')

    inst_no=0
    for idx in tqdm(range(len(all_articles_df))):        
        label, source, id_ = all_articles_df.iloc[idx,:]
        if source=='GM1':
            file_ = GM_all_part1+id_+'.xml'
        else:
            file_ = GM_all_part2+id_+'.xml'

        title, text = get_title_and_text(file_)
        if not title  is None and not text  is None:
            to_do.append((inst_no, title,text))
        
        
        if len(to_do)==10000:
            ##################
            # NLP processing #
            ##################
            texts = [text for _,_, text in to_do]
            titles = [title for _,title, _ in to_do]
            docs = list(nlp.pipe([elem for elem in texts+titles]))
            #########
            # texts #
            #########
            for idx,text_doc in enumerate(docs[:len(texts)]):
                inst_no = to_do[idx][0]
                X_all[inst_no,300:] = text_doc.vector
            ##########
            # titles #
            ##########
            for idx,title_doc in enumerate(docs[len(texts):]):
                inst_no = to_do[idx][0]
                X_all[inst_no,:300] = title_doc.vector
            ###############
            # reset to-do #
            ###############
            
            del(to_do, docs, texts, titles)
            del(docs)
            del(texts)
            del(titles)
            to_do=[]
        del(label, source, id_, file_, title, text)
        inst_no+=1

    pickle.dump(X_all, open('cache/X_all.p','wb'))

 31%|███       | 642388/2057868 [3:58:12<39:02, 604.27it/s]   

In [62]:
assert all(type(elem)==str for elem in texts+titles)

In [53]:
(texts+titles)[:2]

["\n\n\n\xa0\n\n\n\nToday and Tomorrow\n\n\nWalter Lippmann's\n\n\nThe Voice of America\n\n\nThough money for radio broadcasts and printing is needed so that the Voice of America may be heard in foreign lands our greatest need is to have something definite clear and convincing for that voice to say There will he little opposition in Congress to an appropriation if\n\n\nit were not for the feeling that the men who conduct our propaganda have little to do with the making of our policy and that the sales department of the Government so lo speak writing about goods for which the produc tion engineers have just begun to make the first blueprints As for the customers abroad they are undoubtedly confused and suspicious partly no doubt because the rival firm misrepresents us but chiefly because we sound hot and bothered when as Great Power they expect us to be cool and definite Mr Benton's difficul ties with Congress and with the opposition abroad will diminish when his chief Secretary Marshal

In [47]:
if os.path.isfile('cache/X_all.p'):
    print('here')
else:
    print('there')

there


In [40]:
if os.path.isfile('cache/X_all.p'):
    print('here')
else:
    print('there')

there


In [12]:

all_files[0]

'/home/ec2-user/SageMaker/data/GM_all_1945_1956/1323614655.xml'

In [41]:
np.random.choice?