## Aux functions

In [1]:
from IPython.core.display import display, HTML
import datetime
def info(str_):
    print(f'{datetime.datetime.now()} [ \033[1;94mINFO\x1b[0m  ] {str_}')
def ok(str_):
    print(f'{datetime.datetime.now()} [  \033[1;92mOK\x1b[0m   ] {str_}')
def warning(str_):
    print(f'{datetime.datetime.now()} [\x1b[1;31mWARNING\x1b[0m] {str_}')
def html(str_=''):
    display(HTML(str_))

In [2]:
import pandas as pd
from lxml import etree
from bs4 import BeautifulSoup

def get_date(filename):
    tree = etree.parse(filename)
    root = tree.getroot()
#     title = root.find('.//Title').text
    date = root.find('.//NumericDate').text
#     publisher = root.find('.//PublisherName').text
    assert date is not None
    
    return date

def get_title_and_text(filename):
    tree = etree.parse(filename)
    root = tree.getroot()
    if root.find('.//HiddenText') is not None:
        text = (root.find('.//HiddenText').text)

    elif root.find('.//Text') is not None:
        text = (root.find('.//Text').text)

    else:
        text = None
                       
    title = root.find('.//Title')
    if title is not None:
        title = title.text
    if not text is None:
        text = BeautifulSoup(text, parser='html.parser').get_text()

    return title,text

## Retrieving Label data

In [3]:
import os
DP_examples_dirpath = '/home/ec2-user/SageMaker/mariano/notebooks/04. Model of DP/DP-relevant articles/'
assert os.path.exists(DP_examples_dirpath)
files = []
for dirpath, dirnames, filenames in os.walk(DP_examples_dirpath):
    for filename in filenames:
        filepath = os.path.join(dirpath,filename)
        assert os.path.isfile(filepath)
        files.append(filepath)
info(f'Number of files retrieved: {len(files)}')

2022-02-25 20:31:23.670807 [ [1;94mINFO[0m  ] Number of files retrieved: 11


In [4]:
import re
total_count=0
urls = []
ids = []
for file_ in files:
    content = open(file_, 'r').read()
    urls += re.findall('url = {(.*)}',content)
    ids += re.findall('/docview/([^/]*)/',content)
    total_count += len(urls)
    assert len(ids)==len(urls)

relevant_ids = set(ids)
# print(len(urls))
info(f'Relevant article count: {len(relevant_ids)}')

2022-02-25 20:31:24.167651 [ [1;94mINFO[0m  ] Relevant article count: 535


## Reading files from disk

In [5]:
import os
GM_all_part1 = '/home/ec2-user/SageMaker/data/GM_all_1945_1956/'
GM_all_part2 = '/home/ec2-user/SageMaker/data/GM_all_1957-1967/'
GM_dp_dirpath = '/home/ec2-user/SageMaker/data/GM_DP_and_Canada1945_1967/'

all_files = [GM_all_part1+file_ for file_ in os.listdir(GM_all_part1)]
all_files += [GM_all_part2+file_ for file_ in os.listdir(GM_all_part2)]

dp_files = [GM_dp_dirpath+file_ for file_ in os.listdir(GM_dp_dirpath)]

# GM_dirpath = '/home/ec2-user/SageMaker/data/The_Globe_and_Mail_with_DP_filter_by_article_type/'
# all_files = [TS_dirpath+file_id for file_id in os.listdir(TS_dirpath)]
# all_files += [GM_dirpath+file_id for file_id in os.listdir(GM_dirpath)]

info(f'len(all_files):       {len(all_files):10,}')
info(f'len(dp_files):        {len(dp_files):10,}')

2022-02-25 20:31:29.504543 [ [1;94mINFO[0m  ] len(all_files):        2,057,868
2022-02-25 20:31:29.504718 [ [1;94mINFO[0m  ] len(dp_files):             6,938


#### Creating dataframe with ids + 'unknown' label


In [6]:
import pandas as pd
dp_articles_df = pd.DataFrame(
                  ['unknown']*len(dp_files), 
                  columns=['label']
                 )
dp_articles_df['id'] = [file_id.split('/')[-1][:-4] for file_id in dp_files]
# df['source']='GM'
# df.iloc[:len(os.listdir(TS_dirpath)),-1]='TS'
dp_articles_df

Unnamed: 0,label,id
0,unknown,1287338646
1,unknown,1270339579
2,unknown,1287501005
3,unknown,1289163109
4,unknown,1289176186
...,...,...
6933,unknown,1282757714
6934,unknown,1270441158
6935,unknown,1288866985
6936,unknown,1356078942


#### Adding correct label + date to DataFrame


In [7]:
import numpy as np

relevants=0
dates = []

for idx,file_ in enumerate(dp_files):
    id_ = file_.split('/')[-1][:-4]
    date = get_date(file_)
    year = int(date[:4])
    if id_ in relevant_ids:
        dp_articles_df.iloc[idx,0]='relevant'
        relevants+=1
    else:
        dp_articles_df.iloc[idx,0]='irrelevant'
        
    dates.append(date)
dp_articles_df['date']=dates
info(f"Number of relevants:   {np.sum(dp_articles_df['label']=='relevant'):,}")
info(f"Number of irrelevants: {np.sum(dp_articles_df['label']=='irrelevant'):,}")
info(f"'Number of unknown:    {np.sum(dp_articles_df['label']=='unknown'):,}")
dp_articles_df

2022-02-25 20:31:38.596265 [ [1;94mINFO[0m  ] Number of relevants:   514
2022-02-25 20:31:38.603277 [ [1;94mINFO[0m  ] Number of irrelevants: 6,424
2022-02-25 20:31:38.607630 [ [1;94mINFO[0m  ] 'Number of unknown:    0


Unnamed: 0,label,id,date
0,irrelevant,1287338646,1951-03-15
1,irrelevant,1270339579,1965-02-17
2,irrelevant,1287501005,1958-02-18
3,irrelevant,1289163109,1956-10-13
4,irrelevant,1289176186,1954-03-27
...,...,...,...
6933,irrelevant,1282757714,1963-04-19
6934,irrelevant,1270441158,1965-05-14
6935,irrelevant,1288866985,1957-11-19
6936,irrelevant,1356078942,1963-10-12


### SVM with average GloVe 

#### Generating VOCAB


In [8]:
import spacy
import pickle
import string
from tqdm import tqdm
nlp = spacy.load('en_core_web_lg', disable=['textcat','lemmatizer', 'parser', 'tagger','ner'])

def remove_punctuation(word):
    return ''.join([char for char in word if not char in string.punctuation+' '])

def tokenize(str_):
    tokens = [word.text.lower() for word in nlp(str_) if not word.is_stop]
    tokens = [word.replace('\n', '') for word in tokens if not word.isnumeric() and len(remove_punctuation(word))!=0]
    return tokens



def build_vocab(file_list, umbral=None,threshold=10000):
    if os.path.isfile('cache/vocab.p'):
        vocab = pickle.load(open('cache/vocab.p', 'rb'))
    else:
        visited = set()
        freq = {}
        for file_ in tqdm(file_list):
            title, text = get_title_and_text(file_)
            tokens = tokenize(title+' '+text)
            for token in tokens:
                if token in visited:
                    if not token in freq:
                        freq[token]=2
                    else:
                        freq[token]+=1
                else:
                    visited.add(token)
        word_frequency_list = [(word, freq[word]) for word in freq if not word.strip()=='' and len(word)>=3]
        word_frequency_list = sorted(word_frequency_list, key=lambda x: x[1],reverse=True)
        vocab = [word for word,_ in word_frequency_list[:threshold]]

        pickle.dump(vocab,open('cache/vocab.p', 'wb'))        
        del(freq)
        del(word_frequency_list)
        del(visited)
    return vocab
vocab = build_vocab(dp_files)
len(vocab)

10000

#### Generating X, y for training model (involves look-up for GloVe vectors (spacy nlp))

In [9]:
   
if os.path.isfile('cache/X_bow.p') and os.path.isfile('cache/y_bow.p'):
    info('Retrieving from disk')
    X = pickle.load(open('cache/X_bow.p','rb'))
    y = pickle.load(open('cache/y_bow.p', 'rb'))
else:
    info('Creating X and y')
    vocab = build_vocab(dp_files)
    word2idx = dict([(word,idx) for idx, word in enumerate(vocab)])
    example_no = np.sum(dp_articles_df["label"]!='unknown')

    X = np.zeros(shape=(example_no, len(vocab)+1), dtype='float32')
    y = np.zeros(shape=(example_no,), dtype='int32')

    inst_no=0
    for idx in tqdm(list(range(len(dp_articles_df)))):
        label, id_, date = dp_articles_df.iloc[idx,:]
        file_ = GM_dp_dirpath+id_+'.xml'

        assert label!='unknown'
        title, text = get_title_and_text(file_)
        tokens = tokenize(title+' '+text)
        for token in tokens:
            if token in word2idx:
                X[inst_no, word2idx[token]]+=1
            else:
                X[inst_no,-1]+=1
#         X[inst_no,:300] = nlp(title).vector
#         X[inst_no,300:] = nlp(text).vector
        y[inst_no] = 1 if label=='relevant' else 0
        inst_no+=1

    pickle.dump(X, open('cache/X_bow.p','wb'))
    pickle.dump(y, open('cache/y_bow.p','wb'))


2022-02-25 20:31:46.135771 [ [1;94mINFO[0m  ] Retrieving from disk


In [13]:
!rm cache/X_bow.p cache/y_bow.p

#### Grid Search SVM with BOW

In [17]:
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
parameters = {'kernel':['linear', 'rbf', 'poly'],'C':[1,2,3,4,5,6]} #'degree':[1,2,3,4,5,6,7]}
svc = SVC()
clf = GridSearchCV(svc, parameters,scoring='f1', cv=5,verbose=4)

results = clf.fit(X,y)
pd.DataFrame(results.cv_results_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END ................C=1, kernel=linear;, score=0.890 total time=  21.0s
[CV 2/5] END ................C=1, kernel=linear;, score=0.861 total time=  21.5s
[CV 3/5] END ................C=1, kernel=linear;, score=0.833 total time=  20.4s
[CV 4/5] END ................C=1, kernel=linear;, score=0.865 total time=  22.5s
[CV 5/5] END ................C=1, kernel=linear;, score=0.834 total time=  21.2s
[CV 1/5] END ...................C=1, kernel=rbf;, score=0.000 total time=  54.9s
[CV 2/5] END ...................C=1, kernel=rbf;, score=0.000 total time=  55.0s
[CV 3/5] END ...................C=1, kernel=rbf;, score=0.000 total time=  55.1s
[CV 4/5] END ...................C=1, kernel=rbf;, score=0.000 total time=  55.1s
[CV 5/5] END ...................C=1, kernel=rbf;, score=0.000 total time=  55.8s
[CV 1/5] END ..................C=1, kernel=poly;, score=0.000 total time=  35.0s
[CV 2/5] END ..................C=1, kernel=poly;

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,17.754612,0.70013,3.557973,0.083008,1,linear,"{'C': 1, 'kernel': 'linear'}",0.889952,0.861386,0.833333,0.865385,0.834123,0.856836,0.021254,1
1,30.091866,0.287813,25.092628,0.42395,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.0,0.0,0.0,0.0,0.0,0.0,0.0,7
2,28.234112,0.118272,6.860139,0.054602,1,poly,"{'C': 1, 'kernel': 'poly'}",0.0,0.0,0.0,0.0,0.0,0.0,0.0,7
3,17.15246,0.416282,3.482776,0.071671,2,linear,"{'C': 2, 'kernel': 'linear'}",0.889952,0.861386,0.833333,0.865385,0.834123,0.856836,0.021254,1
4,30.250049,0.328056,25.080581,0.577529,2,rbf,"{'C': 2, 'kernel': 'rbf'}",0.0,0.0,0.0,0.0,0.0,0.0,0.0,7
5,28.88058,0.362523,7.115195,0.259857,2,poly,"{'C': 2, 'kernel': 'poly'}",0.0,0.0,0.0,0.0,0.0,0.0,0.0,7
6,17.403991,0.542065,3.500034,0.075561,3,linear,"{'C': 3, 'kernel': 'linear'}",0.889952,0.861386,0.833333,0.865385,0.834123,0.856836,0.021254,1
7,30.122188,0.427892,25.004455,0.492593,3,rbf,"{'C': 3, 'kernel': 'rbf'}",0.0,0.0,0.0,0.0,0.0,0.0,0.0,7
8,29.211161,0.305336,7.389111,0.626353,3,poly,"{'C': 3, 'kernel': 'poly'}",0.0,0.0,0.0,0.0,0.0,0.0,0.0,7
9,17.55002,0.566705,3.550072,0.081094,4,linear,"{'C': 4, 'kernel': 'linear'}",0.889952,0.861386,0.833333,0.865385,0.834123,0.856836,0.021254,1


In [12]:
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC

cv_results = cross_validate(SVC(C=1, kernel='linear'),X,y,cv=5,scoring=['f1','precision','recall','accuracy'])
for metric in cv_results:
    print(f'{metric:10}: {np.average(cv_results[metric]):5.4f}')

fit_time  : 38.2240
score_time: 8.4459
test_f1   : 0.8568
test_precision: 0.8523
test_recall: 0.8619
test_accuracy: 0.9787


In [25]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix



# rng = np.random.default_rng(2022)

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)

clf = SVC(kernel='linear')
clf.fit(X,y)
yhat = clf.predict(X)

m = confusion_matrix(y, yhat, labels=[0, 1])
pd.DataFrame(m, index=['true irrelevant','true relevant'], columns=['pred irrelevant', 'pred revant'])

Unnamed: 0,pred irrelevant,pred revant
true irrelevant,6424,0
true relevant,0,514


In [18]:
df_results = pd.DataFrame(results.cv_results_)
df_results.to_csv('gridsearch7.csv')

In [35]:
!export_out_of_tdmstudio gridsearch3.csv

/bin/sh: export_out_of_tdmstudio: command not found


## MultinomialNB with BOW (grid search)

In [None]:
from sklearn.model_selection import cross_validate
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
X_bow = pickle.load(open('cache/X_bow.p','rb'))
y_bow = pickle.load(open('cache/y_bow.p','rb'))
parameters = {'alpha':[20,30,40,50,100]}
nb = MultinomialNB()
clf = GridSearchCV(nb, parameters,scoring='f1', cv=5,verbose=4)

results = clf.fit(X_bow,y_bow)
pd.DataFrame(results.cv_results_)

In [20]:
from sklearn.naive_bayes import MultinomialNB

from sklearn.model_selection import cross_validate

cv_results = cross_validate(MultinomialNB(alpha=20),X,y,cv=5,scoring=['f1','precision','recall','accuracy'])
for metric in cv_results:
    print(f'{metric:10}: {np.average(cv_results[metric]):5.4f}')

fit_time  : 0.4311
score_time: 0.1133
test_f1   : 0.7435
test_precision: 0.7219
test_recall: 0.7684
test_accuracy: 0.9607


In [26]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix



# rng = np.random.default_rng(2022)

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)

clf = MultinomialNB(alpha=20)
clf.fit(X,y)
yhat = clf.predict(X)

m = confusion_matrix(y, yhat, labels=[0, 1])
pd.DataFrame(m, index=['true irrelevant','true relevant'], columns=['pred irrelevant', 'pred revant'])

Unnamed: 0,pred irrelevant,pred revant
true irrelevant,6196,228
true relevant,65,449


### Split train-test confusion matrix

In [9]:
# import numpy as np
# from sklearn.model_selection import train_test_split
# from sklearn.svm import SVC
# from sklearn.metrics import confusion_matrix



# # rng = np.random.default_rng(2022)

# # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)

# clf = SVC(C=3, kernel='poly',degree=2)
# clf.fit(X,y)
# yhat = clf.predict(X)

# m = confusion_matrix(y, yhat, labels=[0, 1])
# pd.DataFrame(m, index=['true irrelevant','true relevant'], columns=['pred irrelevant', 'pred revant'])

Unnamed: 0,pred irrelevant,pred revant
true irrelevant,11939,51
true relevant,108,406


## Working with non-dp articles

In [11]:
len(all_files)

2057868

In [12]:
import pandas as pd


import pandas as pd
all_articles_df = pd.DataFrame(
                  ['unknown']*len(all_files),
                  columns=['label']
                 )
all_articles_df['source']='GM1'
all_articles_df.iloc[len(os.listdir(GM_all_part1)):, 1] = 'GM2'

all_articles_df['id'] = [file_id.split('/')[-1][:-4] for file_id in all_files]

# df['source']='GM'
# df.iloc[:len(os.listdir(TS_dirpath)),-1]='TS'
all_articles_df


Unnamed: 0,label,source,id
0,unknown,GM1,1323614655
1,unknown,GM1,1287437740
2,unknown,GM1,1289475925
3,unknown,GM1,1287717370
4,unknown,GM1,1295978754
...,...,...,...
2057863,unknown,GM2,1284766080
2057864,unknown,GM2,1288512672
2057865,unknown,GM2,1313948084
2057866,unknown,GM2,1283055041


In [13]:
dp_ids = set([file_.split('/')[-1][:-4] for file_ in dp_files])
to_keep=[]
for idx,id_ in enumerate(all_articles_df['id']):
    if not id_ in dp_ids:
        to_keep.append(idx)

all_articles_df['prediction'] = -1
all_articles_df.iloc[to_keep,:]

Unnamed: 0,label,source,id,prediction
0,unknown,GM1,1323614655,-1
1,unknown,GM1,1287437740,-1
2,unknown,GM1,1289475925,-1
3,unknown,GM1,1287717370,-1
4,unknown,GM1,1295978754,-1
...,...,...,...,...
2057863,unknown,GM2,1284766080,-1
2057864,unknown,GM2,1288512672,-1
2057865,unknown,GM2,1313948084,-1
2057866,unknown,GM2,1283055041,-1



## Compute and store predictions


In [16]:
from sklearn.svm import SVC
clf = SVC(kernel='linear')
clf.fit(X,y)

SVC(kernel='linear')

In [17]:
vocab = build_vocab(dp_files)
word2idx = dict([(word,idx) for idx, word in enumerate(vocab)])


def process_file(file_):        
    title, text = get_title_and_text(file_)
    x = np.zeros(shape=(len(vocab)+1,), dtype='float32')
    if not title is None and not text is None:
        tokens = tokenize(title+' '+text)
        for token in tokens:
            if token in word2idx:
                x[word2idx[token]]+=1
            else:
                x[-1]+=1
        yhat = clf.predict([x])[0]
    return yhat,title
count=0
for file_ in all_files[:1000]:
    yhat,title=process_file(file_)
    if yhat>0.5:
        count+=1
        print(title)

Today and Tomorrow
New Alloy Used In Gas Regulators
THIEVES GET $1,186
Harry A. Weingarten
Four Arrested In Narcotic Raids
Boy Drowns Despite Efforts of His Dog To Drag Him Out
Married Girl, 16, Dies From Polio; Toronto Total 67
Gifts to Hospital From All Ontario Bring Goal Near
Western Grocert Profits Increase
Secret Business
Metro to Cut Police Role in Issuing Taxi Licenses
Rigid Restrictions Curb Swiss Power
Doctor Sentenced To 7-Year Term For Abortion
1 Dies, 2 Hurt, Trucks Crash In Cornwall
Lightning Kills Mother and Son
Red Urges Recognition
Hadassah Plans Rehab Centre
3 Notes to Soviet Aim at Showdown On Berlin Rights
Fire Leaps Track, Mill Destroyed
Health Walkers in Canadian Debut
Problem of Aged Social Challenge, Dr. Phillips Says
Old Country Soccer
Albert E. Brewer
BETTER
Darts After 'Old Rag,' Rescues Toronto Boy
Roach Will Play For Providence
Fun with Figures
EASTER MESSAGE
Resort Man, War Bride Found Frozen at Lake
The Homemaker
Quebec Planning $25 Million Issue For N.Y. 

UnboundLocalError: local variable 'yhat' referenced before assignment

In [None]:
import spacy
import pickle
from tqdm import tqdm


vocab = build_vocab(dp_files)
word2idx = dict([(word,idx) for idx, word in enumerate(vocab)])

nlp = spacy.load('en_core_web_lg', disable=['textcat','lemmatizer', 'parser', 'tagger','ner'])
for idx in tqdm(range(len(all_articles_df))):        
    _, source, id_, _ = all_articles_df.iloc[idx,:]
    if source=='GM1':
        file_ = GM_all_part1+id_+'.xml'
    else:
        file_ = GM_all_part2+id_+'.xml'

    title, text = get_title_and_text(file_)
    x = np.zeros(shape=(len(vocab)+1,), dtype='float32')
    if not title is None and not text is None:
        tokens = tokenize(title+' '+text)
        for token in tokens:
            if token in word2idx:
                x[word2idx[token]]+=1
            else:
                x[-1]+=1
        yhat = clf.predict([x])[0]
        all_articles_df.iloc[idx,3] = yhat
all_articles_df.to_csv('all_articles_df_bow_predictions.csv')

 11%|█▏        | 234398/2057868 [5:33:17<42:14:13, 11.99it/s] 

In [1]:
print('hello')

hello


In [16]:
len(non_dp_articles_df)

2045364

In [29]:
import spacy
import pickle
from tqdm import tqdm

if os.path.isfile('cache/X_all_bow.p'):
    info('Retrieving from disk')
    X = pickle.load(open('cache/X_all_bow.p','rb'))
else:
    info('Creating X')
    vocab = build_vocab(dp_files)
    word2idx = dict([(word,idx) for idx, word in enumerate(vocab)])
    example_no = np.sum(all_articles_df["label"]=='unknown')

    X = np.zeros(shape=(example_no, len(vocab)+1), dtype='float32')

    inst_no=0
    for idx in tqdm(list(range(len(all_articles_df)))):      
        label, source, id_ = all_articles_df.iloc[idx,:]
        if source=='GM1':
            file_ = GM_all_part1+id_+'.xml'
        else:
            file_ = GM_all_part2+id_+'.xml'

        title, text = get_title_and_text(file_)
        tokens = tokenize(title+' '+text)
        for token in tokens:
            if token in word2idx:
                X[inst_no, word2idx[token]]+=1
            else:
                X[inst_no,-1]+=1
#         X[inst_no,:300] = nlp(title).vector
#         X[inst_no,300:] = nlp(text).vector
        inst_no+=1

    pickle.dump(X, open('cache/X_all_bow.p','wb'))


2022-02-22 18:27:32.597858 [ [1;94mINFO[0m  ] Creating X


MemoryError: Unable to allocate 76.7 GiB for an array with shape (2057868, 10001) and data type float32

In [28]:
X.shape

(0, 10001)

In [62]:
assert all(type(elem)==str for elem in texts+titles)

In [53]:
(texts+titles)[:2]

["\n\n\n\xa0\n\n\n\nToday and Tomorrow\n\n\nWalter Lippmann's\n\n\nThe Voice of America\n\n\nThough money for radio broadcasts and printing is needed so that the Voice of America may be heard in foreign lands our greatest need is to have something definite clear and convincing for that voice to say There will he little opposition in Congress to an appropriation if\n\n\nit were not for the feeling that the men who conduct our propaganda have little to do with the making of our policy and that the sales department of the Government so lo speak writing about goods for which the produc tion engineers have just begun to make the first blueprints As for the customers abroad they are undoubtedly confused and suspicious partly no doubt because the rival firm misrepresents us but chiefly because we sound hot and bothered when as Great Power they expect us to be cool and definite Mr Benton's difficul ties with Congress and with the opposition abroad will diminish when his chief Secretary Marshal

In [47]:
if os.path.isfile('cache/X_all.p'):
    print('here')
else:
    print('there')

there


In [40]:
if os.path.isfile('cache/X_all.p'):
    print('here')
else:
    print('there')

there


In [12]:

all_files[0]

'/home/ec2-user/SageMaker/data/GM_all_1945_1956/1323614655.xml'

In [41]:
np.random.choice?