In [1]:
from os import path

# Parent of the directory stored in `_dh[0]`.
# NOTE(review): `_dh` is not defined in this notebook; it looks like IPython's
# directory-history list (first entry = directory the kernel started in), so this
# line presumably only works inside an IPython/Jupyter kernel - confirm.
PROJECT_ROOT = path.abspath(path.join(globals()['_dh'][0], '..'))
# Two levels above PROJECT_ROOT, then `datalake/txtproj`; abspath collapses the `..` segments.
DATALAKE_PATH = path.abspath(path.join(PROJECT_ROOT, '..', '..', 'datalake', 'txtproj'))


In [2]:
import base64
import os

import pandas as pd


def encode_text_body(text_body: str | bytes) -> str:
    """
    Returns Base64-encoded string version of a UTF-8 string or bytes representing a UTF-8 string
    """
    if isinstance(text_body, bytes):
        return base64.b64encode(text_body).decode('utf-8')
    else:
        return base64.b64encode(text_body.encode('utf-8')).decode('utf-8')


def text_as_base64(fqfn: str) -> tuple[str, str]:
    """
    Reads a text file and returns its content together with its Base64 encoding.

    The file is always decoded as UTF-8. Without an explicit encoding, ``open``
    in text mode uses the platform's locale encoding, so the decoded text (and
    therefore the Base64 string, which is always built from UTF-8 bytes) could
    differ between machines.

    The file is still opened in text mode, so Python's newline translation
    applies before the content is encoded.

    :param fqfn: path of the text file to read
    :return: ``(text, b64str)`` - the decoded text and the Base64 string of its UTF-8 bytes
    :raises OSError: if the file cannot be opened
    :raises UnicodeDecodeError: if the file content is not valid UTF-8
    """
    with open(fqfn, 'r', encoding='utf-8') as f:
        text = f.read()
    return text, encode_text_body(text)


def read(folder_path: str, labels: list[str] = ['0', '1'], k: int = 3) -> pd.DataFrame:
    """
    Loads up to ``k`` ``.txt`` files per label from ``<folder_path>/<label>/`` into a DataFrame.

    Files inside each label folder are taken in sorted name order, so the same
    ``k`` files are selected on every run (``os.listdir`` alone returns entries
    in arbitrary order).

    :param folder_path: directory that contains one sub-folder per label
    :param labels: names of the label sub-folders to scan; folders that do not exist are
        skipped. The default list is only read, never mutated.
    :param k: maximum number of ``.txt`` files to load per label; values <= 0 load nothing
    :return: DataFrame with one row per loaded file and the columns
        ``file_name``, ``fqfn``, ``text``, ``text_size`` (``len`` of the decoded text),
        ``text_body_b64`` and ``label`` (the folder name converted with ``int``)
    :raises ValueError: if a label folder with at least one selected file has a name
        that ``int`` cannot parse
    """
    columns = ['file_name', 'fqfn', 'text', 'text_size', 'text_body_b64', 'label']
    rows: list[dict] = []

    for label in labels:
        label_folder = os.path.join(folder_path, label)
        if not os.path.isdir(label_folder):
            continue  # Skip if the folder does not exist

        # Sort before truncating: listdir order is arbitrary, which would make the sample irreproducible.
        txt_names = sorted(name for name in os.listdir(label_folder) if name.endswith('.txt'))

        # max(k, 0) keeps "k <= 0 loads nothing"; a bare negative slice would drop from the end instead.
        for file_name in txt_names[:max(k, 0)]:
            text_fqfn = os.path.join(label_folder, file_name)
            text, b64str = text_as_base64(text_fqfn)
            rows.append({
                'file_name': file_name,
                'fqfn': text_fqfn,
                'text': text,
                'text_size': len(text),
                'text_body_b64': b64str,
                'label': int(label),
            })

    # Explicit columns keep the schema stable even when no file was loaded.
    return pd.DataFrame(rows, columns=columns)


In [3]:
# Load at most three .txt files from each of the two label folders under the datalake path.
df = read(folder_path=DATALAKE_PATH, labels=['0', '1'], k=3)
# Display the content columns only; the file-name and path columns are left out.
df.loc[:, ['text', 'text_size', 'text_body_b64', 'label']]

Unnamed: 0,text,text_size,text_body_b64,label
0,"<html lang=""en""><head>\n <meta charset=""UTF...",16401,PGh0bWwgbGFuZz0iZW4iPjxoZWFkPgogICAgPG1ldGEgY2...,0
1,"<html class=""desktop portrait""><head><meta htt...",22303,PGh0bWwgY2xhc3M9ImRlc2t0b3AgcG9ydHJhaXQiPjxoZW...,0
2,<html><head>\n <title>Index of /demonServer/n...,1206,PGh0bWw+PGhlYWQ+CiAgPHRpdGxlPkluZGV4IG9mIC9kZW...,0
3,<html><head>\n<title>404 Not Found</title>\n</...,295,PGh0bWw+PGhlYWQ+Cjx0aXRsZT40MDQgTm90IEZvdW5kPC...,1
4,"<html><head><meta name=""color-scheme"" content=...",171,PGh0bWw+PGhlYWQ+PG1ldGEgbmFtZT0iY29sb3Itc2NoZW...,1
5,<html><head>\n<title>404 Not Found</title>\n</...,295,PGh0bWw+PGhlYWQ+Cjx0aXRsZT40MDQgTm90IEZvdW5kPC...,1


In [5]:
from txtproj.classifier.inf_server_driver import InferenceServerDriver

# Instantiate the project's inference driver with its default arguments.
model = InferenceServerDriver()
# The whole frame is passed to predict and whatever it returns is stored as the 'outcome' column.
# NOTE(review): this mutates `df` in place, so re-running the cell hands predict a frame
# that already contains 'outcome' - confirm the driver ignores extra columns.
df['outcome'] = model.predict(inputs=df)
# Plain-text dump of the inputs next to the label and the value returned by predict.
print(df[['text', 'text_size', 'text_body_b64', 'label', 'outcome']])


Label=0
Label=0
Label=0
Label=1
Label=1
Label=1
                                                text  text_size  \
0  <html lang="en"><head>\n    <meta charset="UTF...      16401   
1  <html class="desktop portrait"><head><meta htt...      22303   
2  <html><head>\n  <title>Index of /demonServer/n...       1206   
3  <html><head>\n<title>404 Not Found</title>\n</...        295   
4  <html><head><meta name="color-scheme" content=...        171   
5  <html><head>\n<title>404 Not Found</title>\n</...        295   

                                       text_body_b64  label  outcome  
0  PGh0bWwgbGFuZz0iZW4iPjxoZWFkPgogICAgPG1ldGEgY2...      0        0  
1  PGh0bWwgY2xhc3M9ImRlc2t0b3AgcG9ydHJhaXQiPjxoZW...      0        0  
2  PGh0bWw+PGhlYWQ+CiAgPHRpdGxlPkluZGV4IG9mIC9kZW...      0        0  
3  PGh0bWw+PGhlYWQ+Cjx0aXRsZT40MDQgTm90IEZvdW5kPC...      1        1  
4  PGh0bWw+PGhlYWQ+PG1ldGEgbmFtZT0iY29sb3Itc2NoZW...      1        1  
5  PGh0bWw+PGhlYWQ+Cjx0aXRsZT40MDQgTm90IEZvdW5kPC...    