# Simple Classifier for LLM Prompts

### 🔨 **Setup**

In [1]:
# Compile the vendored tree-sitter grammar into a shared library that is usable from Python.
# Language.build_library returns immediately when the library has already been compiled since
# its source code was last modified, so the old library is deleted first to force a rebuild.

from tree_sitter import Language, Parser
import os

# Single place for the output path (it was previously repeated three times in this cell).
LIBRARY_PATH = "build/my-languages.so"

# Make sure the output directory exists before compiling into it.
os.makedirs(os.path.dirname(LIBRARY_PATH), exist_ok=True)

# Ensuring that the library is compiled each time this cell is run.
if os.path.exists(LIBRARY_PATH):
    os.remove(LIBRARY_PATH)

Language.build_library(
    # Store the library in the `build` directory
    LIBRARY_PATH,
    # Include one or more languages
    ["vendor/tree-sitter-python"],
)

True

### 🔍 **Modified Parser for Training**

Assumption: String literals that are assigned to a variable and contain newline characters are prompts.

In [2]:
def parse_strings(filename):
    """Return the source text of every string node found in a Python file.

    The file is parsed with the tree-sitter Python grammar loaded from
    ``./build/my-languages.so`` and the syntax tree is visited in pre-order.

    Args:
        filename: Path of the Python source file to parse.

    Returns:
        A list with the UTF-8 decoded text of each node whose type is
        ``"string"``, in pre-order.
    """
    PY_LANGUAGE = Language('./build/my-languages.so', 'python')
    parser = Parser()
    parser.set_language(PY_LANGUAGE)
    result = []

    with open(filename, "rb") as f:
        tree = parser.parse(f.read())

    # Walk the tree with an explicit stack instead of recursion so that deeply
    # nested syntax trees cannot exceed Python's recursion limit.
    pending = [tree.root_node]
    while pending:
        node = pending.pop()
        if node.type == "string":
            # convert bytes to string, and add to list
            string = node.text.decode("utf-8")
            if len(string) > 0:
                result.append(string)
        # Children are pushed reversed so they are popped left to right,
        # which keeps the same pre-order as the recursive version.
        pending.extend(reversed(node.children))

    return result

def parse_prompts(filename):
    """Collect multi-line string literals that are directly assigned to a variable.

    Heuristic: a string assigned to an identifier that contains a newline is
    probably a prompt.

    Args:
        filename: Path of the Python source file to parse.

    Returns:
        A list with the decoded text of every ``@var.value`` capture that
        contains a newline character.
    """
    language = Language('./build/my-languages.so', 'python')
    parser = Parser()
    parser.set_language(language)

    with open(filename, "rb") as source_file:
        tree = parser.parse(source_file.read())

    # Matches `identifier = "string"` statements and captures both sides.
    assignment_query = language.query("""
        (expression_statement
            (assignment
                left: (identifier) @var.name
                right: (string) @var.value
            )
        )
    """)

    # Only the right-hand side captures are of interest.
    assigned_texts = (
        node.text.decode("utf-8")
        for node, capture_name in assignment_query.captures(tree.root_node)
        if capture_name == "var.value"
    )

    return [text for text in assigned_texts if "\n" in text]

Preparing Training Data...

In [3]:
# Load Prompts.json
import json
with open('prompts.json') as f:
    prompts = json.load(f)["prompts"]

# Hash-based lookup: testing every parsed string against the raw collection is a
# linear scan per string. Only str entries can ever equal a parsed string, so
# keeping just those leaves the resulting labels unchanged.
known_prompts = {p for p in prompts if isinstance(p, str)}

# Using the parser to generate training data for the prompt classifier
root_dir = "repos"
all_prompt_classifications = []  # rows of [string text, 1 if known prompt else 0]
for repo in os.listdir(root_dir):
    repo_path = os.path.join(root_dir, repo)
    # Skip stray files sitting next to the repo folders; listing them would raise.
    if not os.path.isdir(repo_path):
        continue
    for file in os.listdir(repo_path):
        file_path = os.path.join(repo_path, file)
        try:
            strings = parse_strings(file_path)
        except Exception as e:
            # Best effort: report the file that failed and keep going.
            print(e)
            print("Error: ", repo_path, file_path)
            continue
        for string in strings:
            all_prompt_classifications.append([string, int(string in known_prompts)])

print(len(all_prompt_classifications))

84260


In [4]:
import pandas as pd


def _print_class_counts(frame):
    """Print the total row count and the prompt / non-prompt row counts of `frame`."""
    print(f"df_size: {len(frame)} ; Prompt Count: {frame['is_prompt'].sum()} ; Non-Prompt Count: {len(frame) - frame['is_prompt'].sum()}")


# load all_prompt_classifications to a dataframe
df = pd.DataFrame(all_prompt_classifications, columns=["text", "is_prompt"])
_print_class_counts(df)

# Downsample the dataframe to have equal number of prompts and non-prompts
print("Downsampling")
prompt_count = int(df["is_prompt"].sum())
# Cap at the smaller class: sampling without replacement cannot return more rows
# than a class contains.
rows_per_class = min(prompt_count, len(df) - prompt_count)
# GroupBy.sample keeps the grouping column, and the fixed seed makes the saved
# CSV identical across reruns of this cell.
df = (
    df.groupby("is_prompt")
    .sample(n=rows_per_class, random_state=42)
    .reset_index(drop=True)
)

# Save the prompt classifications as a csv file
df.to_csv('prompt_classifications.csv', index=False)

_print_class_counts(df)

df_size: 84260 ; Prompt Count: 5198 ; Non-Prompt Count: 79062
Downsampling
df_size: 10396 ; Prompt Count: 5198 ; Non-Prompt Count: 5198


### **Approach 1**: Binary Classification using Logistic Regression 🪵

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Reload the dataset from the CSV file written by the downsampling cell.
df = pd.read_csv("prompt_classifications.csv")
# Texts and their is_prompt labels, extracted as arrays for scikit-learn.
data = df["text"].values
labels = df["is_prompt"].values

def train_llm_prompt_classifier(data, labels):
    """Fit a TF-IDF + logistic-regression pipeline and print hold-out metrics.

    Args:
        data: Sequence of texts to classify.
        labels: Sequence of class labels aligned with `data`.

    Returns:
        The fitted scikit-learn ``Pipeline`` (steps ``'tfidf'`` and
        ``'classifier'``).
    """
    # 70/30 train/test split with a fixed seed.
    X_train, X_test, y_train, y_test = train_test_split(
        data,
        labels,
        test_size=0.3,
        random_state=42,
        shuffle=True,
    )

    # Text -> TF-IDF features -> logistic regression.
    # May not be the best idea for detecting LLM prompts 😬. Let's see how it goes.
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('classifier', LogisticRegression()),
    ])
    pipeline.fit(X_train, y_train)

    # Report performance on the held-out split.
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    print("\n\nClassifier Performance\n")
    print(f"Accuracy: {accuracy:.2f}\n")
    print(report)

    return pipeline

log_classifier = train_llm_prompt_classifier(data, labels)

# Save the model
import pickle
# The context manager closes (and flushes) the file even if pickling fails.
with open("log_classifier.pkl", "wb") as model_file:
    pickle.dump(log_classifier, model_file)



Classifier Performance

Accuracy: 0.68

              precision    recall  f1-score   support

           0       0.64      0.86      0.73      1588
           1       0.78      0.49      0.60      1531

    accuracy                           0.68      3119
   macro avg       0.71      0.68      0.67      3119
weighted avg       0.71      0.68      0.67      3119



In [6]:
def is_llm_prompt(text, classifier):
    """Tell whether `classifier` labels `text` as class 1.

    Args:
        text: The string to classify.
        classifier: Object exposing ``predict(list_of_texts)``.

    Returns:
        The result of comparing the first prediction with ``1``.
    """
    # The classifier works on batches, so wrap the text in a one-item list.
    return classifier.predict([text])[0] == 1

# Sample template text with {placeholder} fields, used as a quick manual check
# of the logistic-regression classifier trained above.
example_text = """\
<< Example {i}. >>
Data Source:
{data_source}

User Query:
{user_query}

Structured Request:
{structured_request}
"""
print(is_llm_prompt(example_text, log_classifier))

True


**Saving parsing results for log classifier (for later comparison)**

In [7]:
def parse_log_classifier(filename):
    """Return the strings in a Python file that the logistic-regression model flags as prompts.

    Every node of type ``"string"`` in the tree-sitter syntax tree is decoded
    and passed to ``is_llm_prompt`` together with the module-level
    ``log_classifier``.

    Args:
        filename: Path of the Python source file to parse.

    Returns:
        A list with the text of each string classified as a prompt, in
        pre-order.
    """
    PY_LANGUAGE = Language('./build/my-languages.so', 'python')
    parser = Parser()
    parser.set_language(PY_LANGUAGE)
    result = []

    with open(filename, "rb") as f:
        tree = parser.parse(f.read())

    # Explicit stack instead of recursion so that deeply nested syntax trees
    # cannot exceed Python's recursion limit.
    pending = [tree.root_node]
    while pending:
        node = pending.pop()
        if node.type == "string":
            # convert bytes to string once and reuse it
            string = node.text.decode("utf-8")
            if len(string) > 0 and is_llm_prompt(string, log_classifier):
                result.append(string)
        # Reversed push keeps the left-to-right pre-order of the recursive walk.
        pending.extend(reversed(node.children))

    return result


root_dir = "repos"
repo_to_prompts = {}  # repo name -> list of strings flagged as prompts
count = 0             # files that yielded at least one prompt
file_count = 0        # files visited (replaces the hard-coded total)
repo_count = 0
for repo in os.listdir(root_dir):
    repo_path = os.path.join(root_dir, repo)
    # Skip stray files sitting next to the repo folders; listing them would raise.
    if not os.path.isdir(repo_path):
        continue
    for file in os.listdir(repo_path):
        file_path = os.path.join(repo_path, file)
        file_count += 1
        try:
            prompt = parse_log_classifier(file_path)
        except Exception as e:
            # Best effort: report the file that failed and keep going.
            print(e)
            print("Error: ", repo_path, file_path)
            continue
        if len(prompt) > 0:
            count += 1
            repo_to_prompts.setdefault(repo, []).extend(prompt)
    repo_count += 1
    if repo_count % 10 == 0:
        print(f"Finished {repo_count} repos")

# Save repo_to_prompts (according to the logistic-regression classifier) as a json file
import json
with open('repo_to_prompts_logClassifier.json', 'w') as f:
    json.dump(repo_to_prompts, f)


print(repo_to_prompts)
print(f"Parser Returns result for {count} files out of {file_count} files")

Finished 10 repos
Finished 20 repos
Finished 30 repos
Finished 40 repos
Finished 50 repos
Finished 60 repos
Finished 70 repos
Finished 80 repos
Finished 90 repos
Finished 100 repos
Finished 110 repos
Finished 120 repos
Finished 130 repos
Finished 140 repos
Finished 150 repos
Finished 160 repos
Finished 170 repos
Finished 180 repos
Finished 190 repos
Finished 200 repos
Finished 210 repos
Finished 220 repos
Finished 230 repos
Finished 240 repos
Finished 250 repos
Finished 260 repos
Finished 270 repos
Finished 280 repos
Finished 290 repos
Finished 300 repos
Finished 310 repos
Finished 320 repos
Finished 330 repos
Finished 340 repos
Finished 350 repos
Finished 360 repos
Finished 370 repos
Finished 380 repos
Finished 390 repos
Finished 400 repos
Finished 410 repos
Finished 420 repos
Finished 430 repos
Finished 440 repos
Finished 450 repos
Finished 460 repos
Finished 470 repos
Finished 480 repos
Finished 490 repos
Finished 500 repos
Finished 510 repos
Finished 520 repos
Finished 530 repos
Fi

### **Approach 2**: Text Classification with Flair 🤖

In [5]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("prompt_classifications.csv")

# Create 60-20-20 train-dev-test split:
# first hold out 20% for test, then 25% of the remaining 80% (= 20% overall) for dev.
train, test = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)
train, dev = train_test_split(train, test_size=0.25, random_state=42, shuffle=True)

# to_csv does not create missing directories, so make sure the corpus folder exists.
os.makedirs('flair_corpus', exist_ok=True)

# Save the train, dev, test splits as csv files
train.to_csv('flair_corpus/train.csv', index=False)
dev.to_csv('flair_corpus/dev.csv', index=False)
test.to_csv('flair_corpus/test.csv', index=False)

In [6]:
from flair.data import Corpus
from flair.datasets import CSVClassificationCorpus
from flair.embeddings import TransformerDocumentEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer

def create_classifier(
    data_folder="./flair_corpus",
    model_name='distilbert-base-uncased',
    output_dir='resources/classifiers/dj_classifier',
    learning_rate=5.0e-5,
    mini_batch_size=4,
    max_epochs=10,
):
    """Fine-tune a transformer-based Flair text classifier on the CSV corpus.

    All arguments default to the values that were previously hard-coded, so
    calling ``create_classifier()`` behaves as before.

    Args:
        data_folder: Folder containing ``train.csv``, ``dev.csv`` and ``test.csv``.
        model_name: Name of the transformer model used for document embeddings.
        output_dir: Directory passed to the trainer for its output.
        learning_rate: Learning rate used for fine-tuning.
        mini_batch_size: Mini-batch size used for fine-tuning.
        max_epochs: Maximum number of training epochs.

    Returns:
        None. The result of the training run is not returned.
    """
    # 1. Prepare your CSV data (train.csv, dev.csv, test.csv)
    # Column 0 holds the text, column 1 the label.
    column_name_map = {0: "text", 1: "label"}

    # 2. Load the corpus using your CSV dataset
    corpus = CSVClassificationCorpus(data_folder, column_name_map, delimiter=",",
                                     train_file="train.csv", dev_file="dev.csv", test_file="test.csv",
                                     label_type="class", skip_header=True)

    # 3. Create the label dictionary
    label_dict = corpus.make_label_dictionary("class")

    # 3.5. Print corpus statistics (For debugging purposes)
    print(corpus.obtain_statistics())
    print(label_dict)

    # 4. Initialize transformer document embeddings (many models are available)
    # Refer to this for other models: https://huggingface.co/transformers/v2.3.0/pretrained_models.html
    document_embeddings = TransformerDocumentEmbeddings(model_name, fine_tune=True)

    # 5. Create the text classifier
    classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, label_type="class")

    # 6. Initialize the trainer
    trainer = ModelTrainer(classifier, corpus)

    # 7. Run training with fine-tuning
    trainer.fine_tune(output_dir,
                    learning_rate=learning_rate,
                    mini_batch_size=mini_batch_size,
                    max_epochs=max_epochs,
                    )

create_classifier()  # Trains the classifier; comment this line out to skip training and reuse a saved model

  from .autonotebook import tqdm as notebook_tqdm


2023-12-01 23:08:45,433 Reading data from flair_corpus
2023-12-01 23:08:45,434 Train: flair_corpus/train.csv
2023-12-01 23:08:45,434 Dev: flair_corpus/dev.csv
2023-12-01 23:08:45,435 Test: flair_corpus/test.csv
2023-12-01 23:08:45,448 Computing label dictionary. Progress:


0it [00:00, ?it/s]
6237it [00:01, 3905.55it/s]

2023-12-01 23:08:47,062 Dictionary created for label 'class' with 2 values: 1 (seen 3134 times), 0 (seen 3103 times)





{
    "TRAIN": {
        "dataset": "TRAIN",
        "total_number_of_documents": 6237,
        "number_of_documents_per_class": {
            "0": 3103,
            "1": 3134
        },
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "total": 173593,
            "min": 1,
            "max": 2107,
            "avg": 27.8327721661055
        }
    },
    "TEST": {
        "dataset": "TEST",
        "total_number_of_documents": 2080,
        "number_of_documents_per_class": {
            "0": 1052,
            "1": 1028
        },
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "total": 57158,
            "min": 1,
            "max": 1314,
            "avg": 27.47980769230769
        }
    },
    "DEV": {
        "dataset": "DEV",
        "total_number_of_documents": 2079,
        "number_of_documents_per_class": {
            "1": 1036,
            "0": 1043
        },
        "number_of_tokens_per_tag": {},
        "nu

100%|██████████| 130/130 [00:07<00:00, 16.43it/s]

2023-12-01 23:10:08,419 DEV : loss 0.32253116369247437 - f1-score (micro avg)  0.9312





2023-12-01 23:10:09,124 ----------------------------------------------------------------------------------------------------
2023-12-01 23:10:14,967 epoch 2 - iter 156/1560 - loss 0.24015913 - time (sec): 5.84 - samples/sec: 106.81 - lr: 0.000049 - momentum: 0.000000
2023-12-01 23:10:21,146 epoch 2 - iter 312/1560 - loss 0.19827982 - time (sec): 12.02 - samples/sec: 103.82 - lr: 0.000049 - momentum: 0.000000
2023-12-01 23:10:27,069 epoch 2 - iter 468/1560 - loss 0.18887304 - time (sec): 17.94 - samples/sec: 104.33 - lr: 0.000048 - momentum: 0.000000
2023-12-01 23:10:33,060 epoch 2 - iter 624/1560 - loss 0.18495171 - time (sec): 23.94 - samples/sec: 104.28 - lr: 0.000048 - momentum: 0.000000
2023-12-01 23:10:39,617 epoch 2 - iter 780/1560 - loss 0.16338223 - time (sec): 30.49 - samples/sec: 102.32 - lr: 0.000047 - momentum: 0.000000
2023-12-01 23:10:46,089 epoch 2 - iter 936/1560 - loss 0.16536068 - time (sec): 36.96 - samples/sec: 101.29 - lr: 0.000047 - momentum: 0.000000
2023-12-01 2

100%|██████████| 130/130 [00:08<00:00, 15.85it/s]

2023-12-01 23:11:19,526 DEV : loss 0.18964089453220367 - f1-score (micro avg)  0.9668





2023-12-01 23:11:20,317 ----------------------------------------------------------------------------------------------------
2023-12-01 23:11:26,522 epoch 3 - iter 156/1560 - loss 0.11978290 - time (sec): 6.20 - samples/sec: 100.60 - lr: 0.000044 - momentum: 0.000000
2023-12-01 23:11:33,073 epoch 3 - iter 312/1560 - loss 0.13653620 - time (sec): 12.75 - samples/sec: 97.85 - lr: 0.000043 - momentum: 0.000000
2023-12-01 23:11:39,601 epoch 3 - iter 468/1560 - loss 0.12463499 - time (sec): 19.28 - samples/sec: 97.09 - lr: 0.000043 - momentum: 0.000000
2023-12-01 23:11:46,170 epoch 3 - iter 624/1560 - loss 0.11369756 - time (sec): 25.85 - samples/sec: 96.55 - lr: 0.000042 - momentum: 0.000000
2023-12-01 23:11:52,755 epoch 3 - iter 780/1560 - loss 0.10932496 - time (sec): 32.44 - samples/sec: 96.19 - lr: 0.000042 - momentum: 0.000000
2023-12-01 23:11:59,265 epoch 3 - iter 936/1560 - loss 0.10408641 - time (sec): 38.95 - samples/sec: 96.13 - lr: 0.000041 - momentum: 0.000000
2023-12-01 23:12:

100%|██████████| 130/130 [00:08<00:00, 15.84it/s]

2023-12-01 23:12:32,375 DEV : loss 0.2323296219110489 - f1-score (micro avg)  0.9514





2023-12-01 23:12:33,216 ----------------------------------------------------------------------------------------------------
2023-12-01 23:12:39,305 epoch 4 - iter 156/1560 - loss 0.07291503 - time (sec): 6.09 - samples/sec: 102.51 - lr: 0.000038 - momentum: 0.000000
2023-12-01 23:12:46,819 epoch 4 - iter 312/1560 - loss 0.07241866 - time (sec): 13.60 - samples/sec: 91.76 - lr: 0.000038 - momentum: 0.000000
2023-12-01 23:12:52,906 epoch 4 - iter 468/1560 - loss 0.07438704 - time (sec): 19.69 - samples/sec: 95.08 - lr: 0.000037 - momentum: 0.000000
2023-12-01 23:12:59,102 epoch 4 - iter 624/1560 - loss 0.07190154 - time (sec): 25.88 - samples/sec: 96.43 - lr: 0.000037 - momentum: 0.000000
2023-12-01 23:13:05,621 epoch 4 - iter 780/1560 - loss 0.07240413 - time (sec): 32.40 - samples/sec: 96.29 - lr: 0.000036 - momentum: 0.000000
2023-12-01 23:13:11,838 epoch 4 - iter 936/1560 - loss 0.06908710 - time (sec): 38.62 - samples/sec: 96.94 - lr: 0.000036 - momentum: 0.000000
2023-12-01 23:13:

100%|██████████| 130/130 [00:08<00:00, 15.29it/s]

2023-12-01 23:13:45,887 DEV : loss 0.2531017065048218 - f1-score (micro avg)  0.9625





2023-12-01 23:13:46,690 ----------------------------------------------------------------------------------------------------
2023-12-01 23:13:53,352 epoch 5 - iter 156/1560 - loss 0.05363817 - time (sec): 6.66 - samples/sec: 93.69 - lr: 0.000033 - momentum: 0.000000
2023-12-01 23:13:59,703 epoch 5 - iter 312/1560 - loss 0.05013372 - time (sec): 13.01 - samples/sec: 95.91 - lr: 0.000032 - momentum: 0.000000
2023-12-01 23:14:06,061 epoch 5 - iter 468/1560 - loss 0.05060938 - time (sec): 19.37 - samples/sec: 96.64 - lr: 0.000032 - momentum: 0.000000
2023-12-01 23:14:12,717 epoch 5 - iter 624/1560 - loss 0.05066536 - time (sec): 26.03 - samples/sec: 95.90 - lr: 0.000031 - momentum: 0.000000
2023-12-01 23:14:19,867 epoch 5 - iter 780/1560 - loss 0.04939224 - time (sec): 33.18 - samples/sec: 94.04 - lr: 0.000031 - momentum: 0.000000
2023-12-01 23:14:26,733 epoch 5 - iter 936/1560 - loss 0.04493995 - time (sec): 40.04 - samples/sec: 93.50 - lr: 0.000030 - momentum: 0.000000
2023-12-01 23:14:3

100%|██████████| 130/130 [00:08<00:00, 14.92it/s]

2023-12-01 23:15:01,848 DEV : loss 0.24707025289535522 - f1-score (micro avg)  0.9644





2023-12-01 23:15:02,872 ----------------------------------------------------------------------------------------------------
2023-12-01 23:15:09,169 epoch 6 - iter 156/1560 - loss 0.05932158 - time (sec): 6.29 - samples/sec: 99.13 - lr: 0.000027 - momentum: 0.000000
2023-12-01 23:15:15,105 epoch 6 - iter 312/1560 - loss 0.03567263 - time (sec): 12.23 - samples/sec: 102.04 - lr: 0.000027 - momentum: 0.000000
2023-12-01 23:15:22,071 epoch 6 - iter 468/1560 - loss 0.03747287 - time (sec): 19.20 - samples/sec: 97.51 - lr: 0.000026 - momentum: 0.000000
2023-12-01 23:15:29,027 epoch 6 - iter 624/1560 - loss 0.03806343 - time (sec): 26.15 - samples/sec: 95.44 - lr: 0.000026 - momentum: 0.000000
2023-12-01 23:15:36,102 epoch 6 - iter 780/1560 - loss 0.03333893 - time (sec): 33.23 - samples/sec: 93.90 - lr: 0.000025 - momentum: 0.000000
2023-12-01 23:15:43,618 epoch 6 - iter 936/1560 - loss 0.03425252 - time (sec): 40.74 - samples/sec: 91.89 - lr: 0.000024 - momentum: 0.000000
2023-12-01 23:15:

100%|██████████| 130/130 [00:09<00:00, 13.95it/s]

2023-12-01 23:16:21,639 DEV : loss 0.2291167825460434 - f1-score (micro avg)  0.9673





2023-12-01 23:16:22,664 ----------------------------------------------------------------------------------------------------
2023-12-01 23:16:30,230 epoch 7 - iter 156/1560 - loss 0.03183106 - time (sec): 7.56 - samples/sec: 82.49 - lr: 0.000022 - momentum: 0.000000
2023-12-01 23:16:37,660 epoch 7 - iter 312/1560 - loss 0.01883320 - time (sec): 14.99 - samples/sec: 83.23 - lr: 0.000021 - momentum: 0.000000
2023-12-01 23:16:45,513 epoch 7 - iter 468/1560 - loss 0.03181428 - time (sec): 22.85 - samples/sec: 81.94 - lr: 0.000021 - momentum: 0.000000
2023-12-01 23:16:53,353 epoch 7 - iter 624/1560 - loss 0.02638600 - time (sec): 30.69 - samples/sec: 81.34 - lr: 0.000020 - momentum: 0.000000
2023-12-01 23:17:00,773 epoch 7 - iter 780/1560 - loss 0.02497582 - time (sec): 38.11 - samples/sec: 81.87 - lr: 0.000019 - momentum: 0.000000
2023-12-01 23:17:07,938 epoch 7 - iter 936/1560 - loss 0.02534541 - time (sec): 45.27 - samples/sec: 82.70 - lr: 0.000019 - momentum: 0.000000
2023-12-01 23:17:1

100%|██████████| 130/130 [00:08<00:00, 14.63it/s]

2023-12-01 23:17:44,497 DEV : loss 0.26881763339042664 - f1-score (micro avg)  0.9716





2023-12-01 23:17:45,324 ----------------------------------------------------------------------------------------------------
2023-12-01 23:17:52,464 epoch 8 - iter 156/1560 - loss 0.03167180 - time (sec): 7.14 - samples/sec: 87.42 - lr: 0.000016 - momentum: 0.000000
2023-12-01 23:17:58,610 epoch 8 - iter 312/1560 - loss 0.03420513 - time (sec): 13.28 - samples/sec: 93.94 - lr: 0.000016 - momentum: 0.000000
2023-12-01 23:18:05,007 epoch 8 - iter 468/1560 - loss 0.03464628 - time (sec): 19.68 - samples/sec: 95.11 - lr: 0.000015 - momentum: 0.000000
2023-12-01 23:18:11,531 epoch 8 - iter 624/1560 - loss 0.02860863 - time (sec): 26.21 - samples/sec: 95.25 - lr: 0.000014 - momentum: 0.000000
2023-12-01 23:18:18,204 epoch 8 - iter 780/1560 - loss 0.02756507 - time (sec): 32.88 - samples/sec: 94.90 - lr: 0.000014 - momentum: 0.000000
2023-12-01 23:18:25,044 epoch 8 - iter 936/1560 - loss 0.02480234 - time (sec): 39.72 - samples/sec: 94.26 - lr: 0.000013 - momentum: 0.000000
2023-12-01 23:18:3

100%|██████████| 130/130 [00:08<00:00, 14.73it/s]

2023-12-01 23:19:01,010 DEV : loss 0.30896320939064026 - f1-score (micro avg)  0.9726





2023-12-01 23:19:01,942 ----------------------------------------------------------------------------------------------------
2023-12-01 23:19:08,423 epoch 9 - iter 156/1560 - loss 0.02712325 - time (sec): 6.48 - samples/sec: 96.30 - lr: 0.000011 - momentum: 0.000000
2023-12-01 23:19:15,434 epoch 9 - iter 312/1560 - loss 0.01380857 - time (sec): 13.49 - samples/sec: 92.51 - lr: 0.000010 - momentum: 0.000000
2023-12-01 23:19:21,842 epoch 9 - iter 468/1560 - loss 0.01935870 - time (sec): 19.90 - samples/sec: 94.08 - lr: 0.000009 - momentum: 0.000000
2023-12-01 23:19:28,334 epoch 9 - iter 624/1560 - loss 0.02006353 - time (sec): 26.39 - samples/sec: 94.58 - lr: 0.000009 - momentum: 0.000000
2023-12-01 23:19:35,202 epoch 9 - iter 780/1560 - loss 0.02184870 - time (sec): 33.26 - samples/sec: 93.81 - lr: 0.000008 - momentum: 0.000000
2023-12-01 23:19:41,983 epoch 9 - iter 936/1560 - loss 0.02004683 - time (sec): 40.04 - samples/sec: 93.51 - lr: 0.000008 - momentum: 0.000000
2023-12-01 23:19:4

100%|██████████| 130/130 [00:08<00:00, 15.06it/s]

2023-12-01 23:20:17,172 DEV : loss 0.3203325867652893 - f1-score (micro avg)  0.9745





2023-12-01 23:20:18,004 ----------------------------------------------------------------------------------------------------
2023-12-01 23:20:24,561 epoch 10 - iter 156/1560 - loss 0.01994562 - time (sec): 6.56 - samples/sec: 95.19 - lr: 0.000005 - momentum: 0.000000
2023-12-01 23:20:31,599 epoch 10 - iter 312/1560 - loss 0.01567863 - time (sec): 13.59 - samples/sec: 91.81 - lr: 0.000004 - momentum: 0.000000
2023-12-01 23:20:38,145 epoch 10 - iter 468/1560 - loss 0.01438760 - time (sec): 20.14 - samples/sec: 92.95 - lr: 0.000004 - momentum: 0.000000
2023-12-01 23:20:44,790 epoch 10 - iter 624/1560 - loss 0.01578571 - time (sec): 26.78 - samples/sec: 93.19 - lr: 0.000003 - momentum: 0.000000
2023-12-01 23:20:51,420 epoch 10 - iter 780/1560 - loss 0.01844993 - time (sec): 33.41 - samples/sec: 93.37 - lr: 0.000003 - momentum: 0.000000
2023-12-01 23:20:58,364 epoch 10 - iter 936/1560 - loss 0.01705476 - time (sec): 40.36 - samples/sec: 92.77 - lr: 0.000002 - momentum: 0.000000
2023-12-01 2

100%|██████████| 130/130 [00:08<00:00, 15.03it/s]

2023-12-01 23:21:33,422 DEV : loss 0.3206486105918884 - f1-score (micro avg)  0.9731





2023-12-01 23:21:34,765 ----------------------------------------------------------------------------------------------------
2023-12-01 23:21:34,767 Testing using last state of model ...


100%|██████████| 130/130 [00:09<00:00, 13.93it/s]

2023-12-01 23:21:44,126 
Results:
- F-score (micro) 0.9769
- F-score (macro) 0.9769
- Accuracy 0.9769

By class:
              precision    recall  f1-score   support

           0     0.9781    0.9762    0.9772      1052
           1     0.9757    0.9776    0.9767      1028

    accuracy                         0.9769      2080
   macro avg     0.9769    0.9769    0.9769      2080
weighted avg     0.9769    0.9769    0.9769      2080

2023-12-01 23:21:44,127 ----------------------------------------------------------------------------------------------------





✨ Trying out the new Classifier! :-D

In [7]:
from flair.data import Sentence

# Load the model file from the directory the training cell wrote its output to.
classifier = TextClassifier.load('resources/classifiers/dj_classifier/final-model.pt')

# create example sentence
sentence = Sentence("""You're an elite algorithm, answering queries based solely on given context. If the context lacks the answer, state ignorance. If you are not 100% sure tell the user.

        Context:
        {context}""")

# predict class and print
# predict() attaches its labels to the sentence object, which is read below.
classifier.predict(sentence)

print(sentence.labels)

# Each label carries the predicted class value and its score.
for label in sentence.labels:
    print(label.value, label.score)

# Number of labels attached to the sentence (shown as the cell's output).
len(sentence.labels)

['Sentence[40]: "You're an elite algorithm, answering queries based solely on given context. If the context lacks the answer, state ignorance. If you are not 100% sure tell the user.          Context:         {context}"'/'1' (1.0)]
1 0.9999994039535522


1

### **Conclusion**: Flair is better than Logistic Regression. I wonder if the results would be similar if we used the flair embeddings with Logistic Regression. 🤷

### **NLP Powered Parser!** 🤖💪

In [8]:
classifier = TextClassifier.load('resources/classifiers/dj_classifier/final-model.pt')

def parse_flair(filename, classifier, threshold=0.95):
    """Return the strings in a Python file that the Flair classifier flags as prompts.

    Every node of type ``"string"`` in the tree-sitter syntax tree is decoded,
    wrapped in a Flair ``Sentence`` and classified. A string is kept when its
    single label has value ``'1'`` and a score above `threshold`.

    Args:
        filename: Path of the Python source file to parse.
        classifier: Model exposing ``predict(sentence)`` that attaches labels
            to the sentence.
        threshold: Minimum score (exclusive) for a ``'1'`` label to count as
            a prompt. Defaults to the previously hard-coded 0.95.

    Returns:
        A list with the text of each string classified as a prompt, in
        pre-order.

    Raises:
        Exception: If a sentence ends up with more than one label.
    """
    PY_LANGUAGE = Language('./build/my-languages.so', 'python')
    parser = Parser()
    parser.set_language(PY_LANGUAGE)
    result = []

    with open(filename, "rb") as f:
        tree = parser.parse(f.read())

    # Explicit stack instead of recursion so that deeply nested syntax trees
    # cannot exceed Python's recursion limit.
    pending = [tree.root_node]
    while pending:
        node = pending.pop()
        if node.type == "string":
            # convert bytes to string once and reuse it
            string = node.text.decode("utf-8")
            if len(string) > 0:
                # create sentence object and predict
                sentence = Sentence(string)
                classifier.predict(sentence)
                labels = sentence.labels

                # check if sentence is a prompt
                if len(labels) > 1:
                    raise Exception("More than one label")
                if len(labels) > 0 and labels[0].value == '1' and labels[0].score > threshold:
                    result.append(string)
        # Reversed push keeps the left-to-right pre-order of the recursive walk.
        pending.extend(reversed(node.children))

    return result

# Test the parser on two sample files: print how many prompts were found,
# followed by the prompts themselves.
for sample_path in (
    "repos/0ptim~JellyChat/backend~tools~defichainpython_qa.py",
    "repos/su77ungr~CASALIOY/casalioy~CustomChains.py",
):
    res = parse_flair(sample_path, classifier)
    print(len(res))

    for prompt in res:
        print(prompt)

5
"gpt-3.5-turbo-16k"
"""You're an elite algorithm, answering queries based solely on given context. If the context lacks the answer, state ignorance. If you are not 100% sure tell the user.

        Context:
        {context}"""
"{question}"
" "
"""Use this if you need to answer any question reguarding python and coding in general. Keywords: python, script, coding, connection to a defichain node, connection to ocean API, creating a wallet, create custom transactions. Make sure to include the source of the answer in your response."""
5
"""HUMAN:
Answer the question using ONLY the given extracts from (possibly unrelated and irrelevant) documents, not your own knowledge.
If you are unsure of the answer or if it isn't provided in the extracts, answer "Unknown[STOP]".
Conclude your answer with "[STOP]" when you're finished.

Question: {question}

--------------
Here are the extracts:
{context}

--------------
Remark: do not repeat the question !

ASSISTANT:
"""
""
f"""HUMAN:
Answer the que

**Saving parsing results for Flair classifier (for later comparison)**

In [9]:
root_dir = "repos"

repo_to_prompts = {}  # repo name -> list of strings flagged as prompts
count = 0             # files that yielded at least one prompt
file_count = 0        # files visited (replaces the hard-coded total)
repo_count = 0
for repo in os.listdir(root_dir):
    repo_path = os.path.join(root_dir, repo)
    # Skip stray files sitting next to the repo folders; listing them would raise.
    if not os.path.isdir(repo_path):
        continue
    for file in os.listdir(repo_path):
        file_path = os.path.join(repo_path, file)
        file_count += 1
        try:
            prompt = parse_flair(file_path, classifier)
        except Exception as e:
            # Best effort: report the file that failed and keep going.
            print(e)
            print("Error: ", repo_path, file_path)
            continue
        if len(prompt) > 0:
            count += 1
            repo_to_prompts.setdefault(repo, []).extend(prompt)
    repo_count += 1
    if repo_count % 10 == 0:
        print(f"Finished {repo_count} repos")

# Save repo_to_prompts (according to flair) as a json file
import json
with open('repo_to_prompts_FLAIR.json', 'w') as f:
    json.dump(repo_to_prompts, f)


print(repo_to_prompts)
print(f"Parser Returns result for {count} files out of {file_count} files")

Finished 10 repos
Finished 20 repos
Finished 30 repos
Finished 40 repos
Finished 50 repos
Finished 60 repos
Finished 70 repos
Finished 80 repos
Finished 90 repos
Finished 100 repos
Finished 110 repos
Finished 120 repos
Finished 130 repos
Finished 140 repos
Finished 150 repos
Finished 160 repos
Finished 170 repos
Finished 180 repos
Finished 190 repos
Finished 200 repos
Finished 210 repos
Finished 220 repos
Finished 230 repos
Finished 240 repos
Finished 250 repos
Finished 260 repos
Finished 270 repos
Finished 280 repos
Finished 290 repos
Finished 300 repos
Finished 310 repos
Finished 320 repos
Finished 330 repos
Finished 340 repos
Finished 350 repos
Finished 360 repos
Finished 370 repos
Finished 380 repos
Finished 390 repos
Finished 400 repos
Finished 410 repos
Finished 420 repos
Finished 430 repos
Finished 440 repos
Finished 450 repos
Finished 460 repos
Finished 470 repos
Finished 480 repos
Finished 490 repos
Finished 500 repos
Finished 510 repos
Finished 520 repos
Finished 530 repos
Fi