In [1]:
import os
import time
import openai
from tqdm import tqdm
import configparser
import pickle
import queue

import sys
sys.path.insert(0, "../Helper Stuff/")
import tree

API Set Up

In [2]:
# Read the OpenAI API key from the [openai] section of ../config.ini so the key
# itself is not written in the notebook.
config_path = "../config.ini"
config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open, so a missing config.ini only
# shows up as an error from the config.get() call below.
config.read(config_path)
openai_api_key = config.get('openai', 'api_key')
# Module-level key used by the openai.ChatCompletion calls made later on.
openai.api_key = openai_api_key

Tree Construction

In [3]:
# Path of the embeddings JSON handed to the local `tree` helper module
# (importable because "../Helper Stuff/" was added to sys.path above).
embeddings = "../Helper Stuff/embeddings.json"
# NOTE(review): build_tree is not shown in this notebook. The cells below rely
# on the returned nodes exposing is_leaf(), .left, .right, .files and a
# writable .label -- confirm against tree.py.
my_tree = tree.build_tree(embeddings)

Load the one-word summaries created by ChatGPT.

In [4]:
# `summaries` is indexed with entries of a node's .files list further down
# (summaries[tree.files[0]]); presumably a dict of file name -> summary.
# NOTE: pickle.load can execute arbitrary code while unpickling -- only load
# files that were produced by a trusted run of this project.
with open("../Data/openai_summaries_2", "rb") as fp:
    summaries = pickle.load(fp)

Create Labels

In [5]:
def determine_label(topics, model_engine="gpt-3.5-turbo", prompt = "In a minimum of 1 word and a maximum of 3 words find the most specific commonality between the following topics:", max_retries=3, retry_wait=60):
    """Ask the chat model for a short label describing what ``topics`` share.

    The topics are joined with single spaces, appended to ``prompt`` and sent
    as one "system" message to ``openai.ChatCompletion.create``.

    Args:
        topics: Iterable of topic strings.
        model_engine: Name of the chat model to query.
        prompt: Instruction text placed in front of the joined topics.
        max_retries: How many times a failed request is retried before the
            error is re-raised.
        retry_wait: Seconds to sleep before each retry.

    Returns:
        The message content of the first choice in the API response.

    Raises:
        Exception: Whatever the last attempt raised, once ``max_retries``
            retries have been used up.
    """
    main_text = "{} {}".format(prompt, " ".join(topics))
    message = {"role": "system", "content": main_text}
    failures = 0
    while True:
        try:
            response = openai.ChatCompletion.create(model=model_engine, messages=[message])
            return response["choices"][0]["message"]["content"]
        except Exception as err:
            # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
            # through, so a long labeling run can still be interrupted.
            if failures >= max_retries:
                raise
            failures += 1
            # Report the real error; not every failure is a rate limit.
            print("Error calling OpenAI ({}: {}), waiting {} seconds".format(type(err).__name__, err, retry_wait))
            time.sleep(retry_wait)
            print("Done Sleeping")

Traverses the tree bottom-up, creating labels. The children's labels are used to create their parent's label.

In [6]:
def populate_labels(tree, summaries, prompt = None):
    """Label every node of ``tree`` in place, working bottom-up.

    A leaf takes the summary of its first file. An internal node is labeled
    by asking ``determine_label`` for the commonality of its two children's
    labels, so both children are labeled before their parent.

    Args:
        tree: Root node of the (sub)tree to label. Note that this parameter
            name shadows the imported ``tree`` module inside the function.
        summaries: Mapping indexed by the entries of a node's ``files`` list.
        prompt: Optional instruction text forwarded to ``determine_label``;
            when ``None`` the default prompt of ``determine_label`` is used.

    Returns:
        None. The result is stored in each node's ``label`` attribute.
    """
    if tree.is_leaf():
        tree.label = summaries[tree.files[0]]
        return

    populate_labels(tree.left, summaries, prompt)
    populate_labels(tree.right, summaries, prompt)

    child_labels = [tree.left.label, tree.right.label]
    if prompt is None:
        tree.label = determine_label(child_labels)
    else:
        # Forward the custom prompt; previously it was accepted but ignored.
        tree.label = determine_label(child_labels, prompt=prompt)
    return

In [36]:
# Attempt 1: label every node of my_tree in place. determine_label is called
# for each non-leaf node, so this cell issues at least one chat-completion
# request per internal node and can take a while when requests are throttled.
populate_labels(my_tree, summaries)

Error with rate limit waiting 60 seconds
Done Sleeping
Error with rate limit waiting 60 seconds
Done Sleeping
Error with rate limit waiting 60 seconds
Done Sleeping
Error with rate limit waiting 60 seconds
Done Sleeping


Level Order traversal of the Tree to get label and files for each cluster

In [7]:
def level_order_labels(the_tree):
    """Collect ``(label, files)`` for every node, grouped by tree depth.

    Performs a breadth-first walk starting at ``the_tree``. The result holds
    one list per level (root level first); inside a level the nodes appear
    left to right.
    """
    pending = queue.Queue()
    pending.put(the_tree)
    levels = []
    while not pending.empty():
        # Everything queued right now belongs to the same depth.
        width = pending.qsize()
        current_level = []
        for _ in range(width):
            node = pending.get()
            current_level.append((node.label, node.files))
            for child in (node.left, node.right):
                if child is not None:
                    pending.put(child)
        levels.append(current_level)
    return levels

Write the current cluster labels and files to a file

In [9]:
def write_output(levels, path = "../Data/ClusterLabels.txt"):
    """Write the per-level cluster labels and their files to a text file.

    ``levels`` is a list of levels, each a list of clusters whose first two
    items are the label and the list of file names. Levels are numbered from
    1, and an existing file at ``path`` is overwritten.
    """
    with open(path, "w+") as out:
        for depth, clusters in enumerate(levels, start=1):
            out.write("Level: {}\n".format(depth))
            for entry in clusters:
                joined_files = ", ".join(entry[1])
                # Label is left-aligned and padded to 140 columns.
                out.write("    Cluster: {:<140} Files: {:<}\n".format(entry[0], joined_files))

In [8]:
# Export the labels currently stored on my_tree: one list of (label, files)
# tuples per tree level, written to ../Data/ClusterLabels.txt.
# level_order_labels and write_output must already be defined when this runs.
path = "../Data/ClusterLabels.txt"
levels = level_order_labels(my_tree)
write_output(levels, path)

NameError: name 'levels' is not defined

Attempt 2 at Generating Labels: Use the one-word summaries of all files in the cluster to generate the label

In [10]:
def populate_labels2(tree, summaries, prompt = None):
    """Label every node of ``tree`` in place from the summaries of its files.

    A leaf takes the summary of its first file. An internal node is labeled
    by asking ``determine_label`` for the commonality of the summaries of all
    files under its left and right children. Unlike ``populate_labels``, the
    children's own labels are not used as input.

    Args:
        tree: Root node of the (sub)tree to label. Note that this parameter
            name shadows the imported ``tree`` module inside the function.
        summaries: Mapping indexed by the entries of a node's ``files`` list.
        prompt: Optional instruction text forwarded to ``determine_label``;
            when ``None`` the default prompt of ``determine_label`` is used.

    Returns:
        None. The result is stored in each node's ``label`` attribute.
    """
    if tree.is_leaf():
        tree.label = summaries[tree.files[0]]
        return

    # Recurse with this same strategy. Recursing into populate_labels would
    # label every descendant from its children's labels instead.
    populate_labels2(tree.left, summaries, prompt)
    populate_labels2(tree.right, summaries, prompt)

    left_labels = [summaries[x] for x in tree.left.files]
    right_labels = [summaries[x] for x in tree.right.files]
    file_summaries = left_labels + right_labels
    if prompt is None:
        tree.label = determine_label(file_summaries)
    else:
        # Forward the custom prompt; previously it was accepted but ignored.
        tree.label = determine_label(file_summaries, prompt=prompt)
    return

In [11]:
# Attempt 2: relabel my_tree in place. This overwrites the .label values that
# the earlier populate_labels run stored on the same tree object.
populate_labels2(my_tree, summaries)

Error with rate limit waiting 60 seconds
Done Sleeping


In [12]:
# Export the attempt-2 labels to a separate file so the attempt-1 output in
# ClusterLabels.txt is kept. Rebinds the notebook-level `path` and `levels`.
path = "../Data/ClusterLabels2.txt"
levels = level_order_labels(my_tree)
write_output(levels, path)