In [81]:
from pypdf import PdfReader
from nltk.corpus import words
import chromadb
import requests
from io import BytesIO
import random
import numpy as np
from nltk.tokenize import word_tokenize

In [53]:
random.seed(312412)
vocabulary = set((word.lower() for word in words.words('en')))

In [6]:
chroma_client = chromadb.PersistentClient('../data')
books = chroma_client.get_collection('books')
data = books.get()
data.keys()

In [28]:
links = [m.get('link') for m in data['metadatas'] if m.get('link', '').endswith('pdf')]
len(links)

310

In [34]:
def download_pdf(url: str):
    r = requests.get(url)
    if r.ok and r.headers.get('content-type') == 'application/pdf':
        return BytesIO(r.content)
    else:
        return

In [35]:
selected_links = random.choices(links, k = 10)

In [36]:
pdfs = [download_pdf(link) for link in selected_links ] # this could fail

In [40]:
pdf_objs = [ PdfReader(pdf) for pdf in pdfs if pdf ]

In [130]:
text_arr = [ ' '.join(page.extract_text() for page in pdf_obj.pages[20:-20]) for pdf_obj in pdf_objs ] # we ommit the first and last pages 
text_arr = [text for text in text_arr if len(text) > 0]
[len(text) for text in text_arr]

[700750, 254368, 0, 1824352, 320005, 244297, 75685, 1002106]

In [133]:
def split_text(text: str, chunk_size: int = 256, chunk_overlap: int = 128) -> list[str]:
    current, next, split = '', '', []
    m, n = 0, chunk_overlap - chunk_size
    for word in text.split():
        k = len(word) + 1
        if m < chunk_size:
            current += f'{word} '
            m += k
        else:
            split.append(current.strip())
            current = f'{word} '
            m = k
        if n < chunk_size:
            n += k
            if n > 0:
                next += f'{word} '
        else:
            split.append(next.strip())
            next = f'{word} '
            n = k
    if split[-1] != current:
        split.append(current.strip())
    return split

In [134]:
chunks = [split_text(text, 1024, 512) for text in text_arr ]

In [135]:
[len(chunk) for chunk in chunks]

[1363, 495, 3536, 623, 474, 147, 1863]

In [136]:
def words_in_vocabulary_ratio(text: str) -> float:
    counter, length = 0, 0  # pretty naive function
    for word in text.split():
        if word.isalpha():
            if word.lower() in vocabulary:
                counter += 1
            length += 1
    return counter / max(length, 1)  # prevents division by zero

In [137]:
wiv_ratio_arr = [ words_in_vocabulary_ratio(chunk) for split in chunks for chunk in split ]

In [138]:
len(wiv_ratio_arr) == sum(len(chunk) for chunk in chunks)

True

In [139]:
q = np.arange(0, 1.05, 0.05)
print(' quantile | value ')
for r, q in zip(np.quantile(wiv_ratio_arr, q), q):
    print(f'   {q:.2f}   |  {r:.2f}')

 quantile | value 
   0.00   |  0.28
   0.05   |  0.40
   0.10   |  0.44
   0.15   |  0.50
   0.20   |  0.75
   0.25   |  0.78
   0.30   |  0.80
   0.35   |  0.81
   0.40   |  0.82
   0.45   |  0.83
   0.50   |  0.84
   0.55   |  0.85
   0.60   |  0.86
   0.65   |  0.87
   0.70   |  0.88
   0.75   |  0.89
   0.80   |  0.89
   0.85   |  0.91
   0.90   |  0.92
   0.95   |  0.94
   1.00   |  1.00


In [106]:
def get_word_clusters(text: str) -> list[list[str]]:
    clusters = []
    curr_cluster = []
    prev_token = None
    for i, token in enumerate(word_tokenize(text)):
        if token.isalpha():
            if token.lower() not in vocabulary:
                if prev_token:
                    curr_cluster.append(prev_token)
                    prev_token = None
                curr_cluster.append(token)
            else:
                if not prev_token and curr_cluster:
                    curr_cluster.append(token)
                    clusters.append(curr_cluster)
                    curr_cluster = []
                prev_token = token
        else:
            if curr_cluster:
                clusters.append(curr_cluster)
                curr_cluster = []
            prev_token = None
    if curr_cluster:
        clusters.append(curr_cluster)
    return clusters

In [118]:
min(wiv_ratio_arr, key=wiv_ratio_arr.index)

0.4

In [119]:
min(wiv_ratio_arr)

0.0

In [120]:
wiv_ratio_arr.index(0.4)

0

In [121]:
wiv_ratio_arr

[0.4,
 0.37593984962406013,
 0.3939393939393939,
 0.43661971830985913,
 0.4513888888888889,
 0.417910447761194,
 0.42424242424242425,
 0.4492753623188406,
 0.4788732394366197,
 0.45774647887323944,
 0.35555555555555557,
 0.29927007299270075,
 0.39416058394160586,
 0.5,
 0.5,
 0.4827586206896552,
 0.4405594405594406,
 0.3925925925925926,
 0.40425531914893614,
 0.4375,
 0.4172661870503597,
 0.3880597014925373,
 0.36363636363636365,
 0.43356643356643354,
 0.4589041095890411,
 0.4166666666666667,
 0.4315068493150685,
 0.44525547445255476,
 0.44680851063829785,
 0.4316546762589928,
 0.3769230769230769,
 0.3795620437956204,
 0.42105263157894735,
 0.38345864661654133,
 0.3829787234042553,
 0.4397163120567376,
 0.43283582089552236,
 0.35036496350364965,
 0.3237410071942446,
 0.3014705882352941,
 0.3053435114503817,
 0.41353383458646614,
 0.4198473282442748,
 0.4057971014492754,
 0.45806451612903226,
 0.4657534246575342,
 0.4076923076923077,
 0.36,
 0.4108527131782946,
 0.44274809160305345,
 0.

In [107]:
s = 'hi, thereee is a cattt and a doogggg. takeee care.'
get_word_clusters(s)

[['thereee', 'is'],
 ['a', 'cattt', 'and'],
 ['a', 'doogggg'],
 ['takeee', 'care']]

In [116]:
chunks[7][-13]

'agencies conduct studies to generate evi- dence about HIV education and prevention interventions? Should agencies focus on the delivery of interventions based on the existing evidence? 3. How is this black MSM population vulnerable, and how should this vulnerability be addressed in research and nonresearch interventions? 4. Do Dr. Albert and Dr. Baines have ethical obligations to other community popu- lations? On what basis is the public health agency justiﬁ ed in advancing inter- ventions that target only a subgroup of the community? 5. How should research studies on Internet-based interventions be conducted to ensure scientiﬁ c validity, given the difﬁ culties of knowing, for example, whether the participant meets the study’s inclusion criteria? Which measures should be taken to protect the privacy and conﬁ dentiality of participants? 6. How should you decide what level and type of evidence you need to back a pub- lic health educational intervention? Should public health professiona

In [109]:
get_word_clusters(chunks[7][-10])

[['users', 'will'],
 ['References', 'Bull'],
 ['Thousand', 'Oaks'],
 ['Sage', 'Publications'],
 ['Phibbs'],
 ['Watson'],
 ['McFarlane'],
 ['young', 'adults', 'expect'],
 ['go', 'online'],
 ['Lessons', 'for'],
 ['prevention', 'website'],
 ['Medical', 'Systems'],
 ['Pratte'],
 ['Whitesell'],
 ['Rietmeijer'],
 ['McFarlane'],
 ['an', 'Internet', 'based'],
 ['for', 'HIV', 'prevention'],
 ['The', 'Youthnet', 'trials'],
 ['AIDS', 'and'],
 ['doi'],
 ['Centers', 'for'],
 ['HIV', 'Among'],
 ['African', 'Americans'],
 ['http'],
 ['Accessed'],
 ['Chiasson'],
 ['Humberstone'],
 ['Hirshﬁ', 'eld'],
 ['Hartel'],
 ['Increased', 'HIV', 'disclosure'],
 ['three', 'months', 'after'],
 ['an', 'online', 'video']]

In [112]:
'will' in vocabulary

True

In [114]:
'dogs' in vocabulary

True

In [None]:
def join_words(text: str) -> str:
    l = word_tokenize(text)
    i, m = 0, len(l)
    output = ''
    while i < m - 1:
        w = l[i]
        if w not in vocabulary:
            w1 = l[i + 1]
            w_ = w + w1
            if w_ in vocabulary:
                output += f'{w_} '
                i += 2
            else:
                output += f'{w} '
                i += 1
        else:
            output += f'{w} '
            i += 1
    return output.strip()
