In [1]:
%env TOKENIZERS_PARALLELISM=false
!pip install --quiet keybert
!pip install --quiet PyPDF2
print('pip install keybert and PDF 2 complete.')

env: TOKENIZERS_PARALLELISM=false
pip install keybert and PDF 2 complete.


In [2]:
# https://stackoverflow.com/a/59348028
import re
import time

import requests
from arrow import now
from bs4 import BeautifulSoup

URL = 'https://www.archives.gov/research/pentagon-papers'

def remove_duplicates(l): # remove duplicates and unURL string
    for item in l:
        match = re.search("(?P<url>https?://[^\s]+)", item)
        if match is not None:
            links.append((match.group("url")))

SKIP = False

time_start = now()
if not SKIP:
    source_code = requests.get(url=URL)
    soup = BeautifulSoup(source_code.content, 'lxml')
    data = []
    links = []


    for link in soup.find_all('a', href=True):
        data.append(str(link.get('href')))
    flag = True
    remove_duplicates(data)
    while flag:
        try:
            for link in links:
                for j in soup.find_all('a', href=True):
                    temp = []
                    source_code = requests.get(link)
                    soup = BeautifulSoup(source_code.content, 'lxml')
                    temp.append(str(j.get('href')))
                    remove_duplicates(temp)

                    if len(links) > 162: # set limitation to number of URLs
                        break
                if len(links) > 162:
                    break
            if len(links) > 162:
                break
        except Exception as e:
            print(e)
            if len(links) > 162:
                break

    FILES = [item for item in links if item.endswith('.pdf')]

print('got {} file links in {}'.format(len(FILES), now() - time_start))

got 71 file links in 0:00:04.779472


In [3]:
from urllib.request import urlretrieve
from os.path import basename
from os.path import exists
from arrow import now

SKIP = False
TARGET = '/kaggle/working/'

time_start = now()
for url in FILES:
    short_name = basename(url)
    target = TARGET + short_name
    if not exists(path=target):
        print('downloading {} to {}'.format(url, target))
        urlretrieve(url, target)
print('done in {}'.format(now() - time_start))

downloading https://nara-media-001.s3.amazonaws.com/arcmedia/research/pentagon-papers/Pentagon-Papers-Index.pdf to /kaggle/working/Pentagon-Papers-Index.pdf
downloading https://nara-media-001.s3.amazonaws.com/arcmedia/research/pentagon-papers/Pentagon-Papers-Part-I.pdf to /kaggle/working/Pentagon-Papers-Part-I.pdf
downloading https://nara-media-001.s3.amazonaws.com/arcmedia/research/pentagon-papers/Pentagon-Papers-Part-II.pdf to /kaggle/working/Pentagon-Papers-Part-II.pdf
downloading https://nara-media-001.s3.amazonaws.com/arcmedia/research/pentagon-papers/Pentagon-Papers-Part-III.pdf to /kaggle/working/Pentagon-Papers-Part-III.pdf
downloading https://nara-media-001.s3.amazonaws.com/arcmedia/research/pentagon-papers/Pentagon-Papers-Part-IV-A-1.pdf to /kaggle/working/Pentagon-Papers-Part-IV-A-1.pdf
downloading https://nara-media-001.s3.amazonaws.com/arcmedia/research/pentagon-papers/Pentagon-Papers-Part-IV-A-2.pdf to /kaggle/working/Pentagon-Papers-Part-IV-A-2.pdf
downloading https://na

In [4]:
import PyPDF2
import pandas as pd
from glob import glob
from os.path import basename
from arrow import now

GLOB = '/kaggle/working/*.pdf'
time_start = now()

result = []
for input_file in glob(pathname=GLOB):
    short_name = basename(input_file)
    print('{} : processing : {}'.format(now().time(), input_file))

    with open(file=input_file, mode='rb') as file_object:
        reader = PyPDF2.PdfReader(file_object)

        for number, page in enumerate(reader.pages):
            series = pd.Series(data=[short_name, number, page.extract_text()], index=['file', 'page', 'text'])
            result.append(series)


df = pd.DataFrame(data=result)
print('done in {}'.format(now() - time_start))

df.shape

20:54:46.895108 : processing : /kaggle/working/Pentagon-Papers-Part-V-B-4-Book-II.pdf
20:55:27.553046 : processing : /kaggle/working/Pentagon-Papers-Part-I.pdf
20:56:07.666271 : processing : /kaggle/working/Pentagon-Papers-Part-V-B-3a.pdf
20:57:01.785387 : processing : /kaggle/working/Pentagon-Papers-Part-V-B-4-Book-I.pdf
20:58:30.500528 : processing : /kaggle/working/Pentagon-Papers-Part-VI-C-3.pdf
20:58:53.485583 : processing : /kaggle/working/Pentagon-Papers-Part-VI-A.pdf
20:59:08.064706 : processing : /kaggle/working/Pentagon-Papers-Part-IV-C-5.pdf
20:59:34.950332 : processing : /kaggle/working/Pentagon-Papers-Part-IV-A-3.pdf
20:59:47.323044 : processing : /kaggle/working/Pentagon-Papers-Part-VI-C-2.pdf
21:00:15.560866 : processing : /kaggle/working/Pentagon-Papers-Part-IV-C-8.pdf
21:00:48.997472 : processing : /kaggle/working/Pentagon-Papers-Part-IV-C-7-b.pdf
21:01:29.319743 : processing : /kaggle/working/Pentagon-Papers-Part-V-B-3c.pdf
21:02:54.134657 : processing : /kaggle/worki

(8292, 3)

In [5]:
df = df.sort_values(ascending=True, by=['file', 'page']).reset_index(drop=True)
df.head()

Unnamed: 0,file,page,text
0,Pentagon-Papers-Index.pdf,0,".. Declassified per Executive Order 13526, Sec..."
1,Pentagon-Papers-Index.pdf,1,TLE \nF<HM \nRTC \nJC \n[J \n[J Declassified p...
2,Pentagon-Papers-Index.pdf,2,"Declassified per Executive Order 13526, Sectio..."
3,Pentagon-Papers-Index.pdf,3,"Declassified per Executive Order 13526, Sectio..."
4,Pentagon-Papers-Index.pdf,4,"Declassified per Executive Order 13526, Sectio..."


In [6]:
from nltk.tokenize import sent_tokenize

df['text'] = df['text'].apply(func=sent_tokenize)
df = df.explode(column='text', ignore_index=True).dropna()
df['text'] = df['text'].str.replace('\n', ' ')
df['token count'] = df['text'].str.split().str.len()

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 215535 entries, 0 to 215536
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   file         215535 non-null  object
 1   page         215535 non-null  int64 
 2   text         215535 non-null  object
 3   token count  215535 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 8.2+ MB


In [7]:
df.to_csv(path_or_buf='/kaggle/working/pentagon_papers.csv', index=False)

In [8]:
from plotly.express import histogram
histogram(data_frame=df, x='token count', log_y=True)

In [9]:
# https://stackoverflow.com/a/3939381
from collections import Counter

def clean(arg:str) -> str:
    return arg.translate(str.maketrans('', '', '©±®¢@«]*§£=¥%°+«€_>»·\xad•#~-'))

df['clean'] = df['text'].apply(clean)
count = Counter(''.join(df['clean'].values.tolist()))
count

Counter({' ': 3221450,
         'e': 1527018,
         't': 1204335,
         'i': 997587,
         'o': 935421,
         'n': 923882,
         'a': 892331,
         'r': 783822,
         's': 717101,
         'l': 554843,
         '.': 539321,
         'h': 505407,
         'c': 457100,
         'd': 432588,
         'u': 315443,
         'm': 290924,
         'f': 285046,
         'p': 261997,
         ',': 199966,
         'g': 195354,
         ':': 177707,
         'y': 173587,
         'b': 166914,
         '1': 164399,
         "'": 159597,
         'v': 156758,
         'S': 140246,
         'I': 110123,
         'T': 109654,
         'N': 109530,
         'w': 102312,
         'C': 94338,
         'D': 91623,
         'E': 87163,
         'O': 80859,
         '"': 78452,
         'A': 71904,
         '3': 68623,
         '2': 59983,
         ';': 54195,
         '0': 53723,
         'V': 53270,
         'P': 53187,
         '6': 52885,
         'R': 49453,
         'J': 47492,


In [10]:
''.join(list(dict(count).keys()))

'.Declasifd prExutvO13526,SonNPjmb:ByW0FIALRTVk&<HMCJ[XUK/8Y\'!4g7G9{h()"\\wzqQ;}?Z$'

In [11]:
clean_df = df.copy()
clean_df = clean_df[~clean_df['clean'].str.contains('Declassified per')]
clean_df = clean_df[~clean_df['clean'].str.contains('Ibid')]
clean_df = clean_df[~clean_df['clean'].str.contains('NND')]
clean_df = clean_df[~clean_df['clean'].str.contains('NWD')]
clean_df = clean_df[~clean_df['clean'].str.contains('SECRET')]
clean_df = clean_df[clean_df['token count'] > 3]
sample_df = clean_df.sample(n=10000, random_state=2024)

In [12]:
from arrow import now
from keybert import KeyBERT
from sklearn.feature_extraction.text import TfidfVectorizer

MAX_DF = 1.0
MIN_DF = 10 # we can tune this somewhat depending on the size of the corpus
MODEL = 'all-MiniLM-L12-v2'
STOP_WORDS = 'english'
# we use the clean text for keywords even though we show a truncated original message
DOCS = sample_df['clean'].values.tolist()

model_start = now()
model = KeyBERT(model=MODEL,)
# we probably don't need to increase the max sequence length from the default of 128
vectorizer = TfidfVectorizer(ngram_range=(1, 1), stop_words=STOP_WORDS, min_df=MIN_DF, max_df=MAX_DF, )
document_embeddings, word_embeddings = model.extract_embeddings(docs=DOCS, vectorizer=vectorizer, )
print('embedding time: {}'.format(now() - model_start))
print('we have {} documents and {} words.'.format(len(document_embeddings), len(word_embeddings)))
keywords = model.extract_keywords(docs=DOCS, top_n=1, stop_words=STOP_WORDS, vectorizer=vectorizer,
                                  doc_embeddings=document_embeddings, word_embeddings=word_embeddings, min_df=MIN_DF, )
print('model time: {}'.format(now() - model_start))
sample_df['keyword'] = [keyword[0][0] if len(keyword) else '-none-' for keyword in keywords]


.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

embedding time: 0:03:49.441456
we have 10000 documents and 1906 words.
model time: 0:03:55.697274


In [13]:
sample_df[sample_df['keyword'] != '-none-']['keyword'].value_counts(normalize=True).head(n=10)

keyword
vietnam       0.061813
hanoi         0.022661
indochina     0.022437
saigon        0.016715
vietnamese    0.016042
viet          0.013013
french        0.008414
laos          0.008189
asia          0.007628
military      0.007516
Name: proportion, dtype: float64

In [14]:
histogram(data_frame=sample_df[sample_df['keyword'] != '-none-']['keyword'].value_counts().to_frame().reset_index().head(n=40), x='keyword', y='count', marginal='box')

In [15]:
import pandas as pd
from umap import UMAP

umap_start = now()
umap_model = UMAP(n_components=2, random_state=2024, verbose=False, n_jobs=1)
plot_df = sample_df.copy()
plot_df[['u0', 'u1']] = umap_model.fit_transform(X=document_embeddings)
plot_df['short text'] = plot_df['clean'].apply(func=lambda x: ' '.join(x.split()[:20]))
plot_df = plot_df[plot_df['keyword'] != '-none-']
print('UMAP time: {}'.format(now() - umap_start))

UMAP time: 0:00:33.115441


In [16]:
from plotly.colors import qualitative
from plotly.express import scatter

top_keywords = plot_df['keyword'].value_counts().head(n=52).index.tolist()
scatter(data_frame=plot_df[plot_df['keyword'].isin(top_keywords)], x='u0', y='u1', hover_name='short text', color='keyword',
        color_discrete_sequence=qualitative.Alphabet, height=900
       ).update_xaxes(showticklabels=False).update_yaxes(showticklabels=False).update_xaxes(visible=False).update_yaxes(visible=False)