In [None]:
import json
import os
import re
from collections import defaultdict

import openai
from fuzzywuzzy import fuzz
from fuzzywuzzy import process


# Take the OpenAI key from the OPENAI_API_KEY environment variable so that no
# credential has to be typed into (and saved with) the notebook. When the
# variable is unset the key falls back to the empty string used before.
api_key = os.environ.get("OPENAI_API_KEY", "")
openai.api_key = api_key
# Text file that is read line by line to obtain the class names.
classnames_file = "data/RWD/ImageSets/Aerial/known_classnames.txt"

In [None]:
# Attribute categories used to build the prompts. Each entry has the form
# "<Name>: <description>"; the part before the first colon is later used as the
# dictionary key for that category.
attribute_categories = [
    "Shape: This includes the overall form or outline of objects in the image.",
    "Color: The predominant colors present in the image and how they contribute to the class identity.",
    "Texture: The surface qualities of objects, such as smoothness, roughness, or patterns.",
    "Size: The dimensions and proportions of objects in the image.",
    "Context: The surroundings or background of objects, which can provide contextual information.",
    "Features: Specific visual characteristics unique to certain classes, like wings for birds or fins for fish.",
    "Appearance: The overall visual appearance, including the presence of specific structures like wheels on vehicles or petals on flowers.",
    "Behavior: Attributes related to how objects in the image interact or move.",
    "Environment: The type of setting or habitat depicted in the image.",
    "Material: The substances or materials objects are made of, which can affect their appearance.",
]

# Free-text description of the image domain, inserted into every prompt.
# NOTE(review): this mentions hand x-rays while `classnames_file` points at an
# "Aerial" image set -- confirm the two are meant to go together.
domain = "xray images of the bones in the hands"

# Read the class names directly instead of calling `read_txt_file`: that helper
# is only defined in a later cell, so calling it here raises NameError on a
# fresh kernel, and it reports I/O failures as a plain string that the query
# loop would then iterate character by character. Opening the file here lets a
# missing/unreadable file raise immediately. Blank lines are skipped so that no
# empty class name is sent to the model.
with open(classnames_file, 'r') as classnames_handle:
    classnames = [
        line for line in classnames_handle.read().splitlines() if line.strip()
    ]


In [None]:
def read_txt_file(filename):
    """Read a text file and return its lines without line terminators.

    Args:
        filename: Path of the file to open in text mode.

    Returns:
        A list with one string per line on success. On failure no exception
        is raised; instead a message string is returned: "File not found"
        when the file does not exist, or "An error occurred: <details>" for
        any other exception. Callers must therefore check the return type.
    """
    try:
        with open(filename, 'r') as handle:
            lines = handle.read().splitlines()
    except FileNotFoundError:
        return "File not found"
    except Exception as exc:
        return "An error occurred: " + str(exc)
    return lines


def merge_attributes(data_dict):
    """Pool the attribute values of every class under their attribute name.

    Args:
        data_dict: Mapping whose values are themselves mappings of
            attribute name -> iterable of values. The outer keys are ignored.

    Returns:
        A dict mapping each attribute name to a single list that holds the
        values contributed by all inner mappings, in iteration order.
        Duplicates are kept.
    """
    pooled = {}
    for per_class in data_dict.values():
        for name, values in per_class.items():
            # setdefault creates the list the first time a name is seen.
            pooled.setdefault(name, []).extend(values)
    return pooled


def remove_duplicates(merged_attributes):
    """Drop exact duplicate values from every attribute list.

    The first occurrence of each value is kept and the original relative
    order is preserved. (Going through ``set`` would return the values in an
    arbitrary order that changes between interpreter runs, which makes any
    order-sensitive processing of the result non-reproducible.)

    Args:
        merged_attributes: Mapping of attribute name -> iterable of hashable
            values.

    Returns:
        A new dict mapping each attribute name to a list of its distinct
        values in first-seen order.
    """
    return {
        attr_name: list(dict.fromkeys(attr_values))
        for attr_name, attr_values in merged_attributes.items()
    }
    

In [None]:
def extract_attributes_from_text(text):
    """Heuristically pull a list of attribute strings out of a model reply.

    The text is first trimmed (fenced ``python`` list body, a single
    "label: content" prefix, a "list format:" lead-in, a trailing
    "Please note"/"please note" remark). Every pattern below is then applied
    with ``re.findall`` and afterwards with ``re.split``; pieces that contain
    a newline or no alphabetic character are discarded, and the candidate
    list with the most surviving pieces is returned. On ties the earliest
    candidate wins. Pieces are returned as matched, without stripping.

    Args:
        text: Raw reply text.

    Returns:
        list[str]: The extracted pieces, or an empty list if nothing matched.
    """
    # Body of a ```python ... ``` fenced list, if the reply contains one.
    fenced = re.search(r'```python\n([\s\S]+?)\n\]\n```', text)
    if fenced:
        text = fenced.group(1).strip()

    # Exactly one colon: keep only what follows it.
    colon_parts = text.split(":")
    if len(colon_parts) == 2:
        text = colon_parts[-1]

    if "list format:" in text:
        text = text.split("list format:")[-1]

    # Cut a trailing remark; the capitalised spelling takes precedence.
    for remark in ("Please note", "please note"):
        if remark in text:
            text = text.split(remark)[0]
            break

    # Candidate patterns. The two '\n\d+\.' patterns have no capture group,
    # so their findall results contain a newline and are filtered out; they
    # can only contribute through re.split.
    patterns = [
        r'\d+\.\s*([\w\s\-\'()]+)',
        r'\d+\.\s*([\w\s\-,\'()]+)',
        r'\[\s*"([\w\s\-\'(),.]+)"\s*\]',
        r'\n\d+\.\s',
        r'\n\d+\.\s+',
        r'"([^"]+)"',
        r"([\w\s\-\'(),.]+)",
        r"'(.*?)'"
    ]

    best = []
    # All findall candidates are evaluated before any split candidate.
    for regex_call in (re.findall, re.split):
        for candidate_pattern in patterns:
            pieces = [
                piece
                for piece in regex_call(candidate_pattern, text)
                if '\n' not in piece and any(ch.isalpha() for ch in piece)
            ]
            # Strictly longer only, so earlier candidates win ties.
            if len(pieces) > len(best):
                best = pieces

    return best

In [None]:
def text_prompt(class_name, description, context):
    """Build the attribute-elicitation prompt sent to the chat model.

    Args:
        class_name: Text inserted after "attributes related to".
        description: Text inserted after "based on".
        context: Text inserted after "classification within".

    Returns:
        str: The three-sentence prompt asking for a Python list of attributes.
    """
    intro = (
        "I am conducting research in zero-shot image classification and seek your assistance "
        "in compiling valuable visual/functional attributes for various classes.\n"
        "These attributes will aid in identifying features within images, "
        "enabling image classification into target classes.\n"
    )
    request = (
        f"Kindly provide the attributes related to {class_name} based on {description} "
        f"that would aid in its classification within {context}. "
        "Format the attributes as a Python list (list ONLY):"
    )
    return intro + request


In [None]:
# Query the chat model once per (class, attribute category) pair and keep the
# raw reply text in dataset[class][category name].
dataset = {}
for c in classnames:
    dataset[c] = {}
    for a in attribute_categories:
        # text_prompt takes (class_name, description, context): the class name
        # goes first and the category text is the description. The call used
        # to pass them the other way round, which produced a prompt asking for
        # attributes "related to <category> based on <class>".
        prompt = text_prompt(c, a, domain)
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo-16k",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.5,
            max_tokens=2789,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        # Key by the short category name (text before the first colon).
        dataset[c][a.split(":")[0]] = response['choices'][0]['message']['content']

In [None]:
# Parse every stored reply into a list of attribute strings:
# attributes[class][category name] -> list returned by the extractor.
attributes = {}
for class_name in classnames:
    per_category = {}
    attributes[class_name] = per_category
    for category in attribute_categories:
        # Short category name = text before the first colon.
        category_name = category.split(":")[0]
        raw_reply = dataset[class_name][category_name]
        parsed = extract_attributes_from_text(raw_reply)
        per_category[category_name] = parsed
        if not parsed:
            # Nothing could be extracted: show the reply and where it came from.
            print(raw_reply)
            print(class_name)
            print(category_name)

In [None]:

def remove_similar_duplicates(attributes, threshold=90):
    """Drop attribute values that are near-duplicates of an already kept one.

    Values are compared in lower case with ``fuzz.token_set_ratio`` (through
    ``is_valid_attribute``) against every value kept so far. The set of kept
    values is shared by all categories, so a value that is similar to one
    kept under an earlier category is dropped as well. Iteration order
    therefore decides which of two similar values survives.

    Args:
        attributes: Mapping of category -> iterable of attribute strings.
        threshold: Similarity score at or above which a value counts as a
            duplicate.

    Returns:
        collections.defaultdict(list) mapping each category that kept at
        least one value to the list of kept values (original casing).
    """
    unique_attributes = defaultdict(list)
    seen_values = set()

    for category, attr_values in attributes.items():
        for value in attr_values:
            value_lower = value.lower()
            # is_valid_attribute already scans every kept value with the same
            # scorer and threshold, so a second identical scan here could
            # never find a duplicate and has been removed.
            if is_valid_attribute(category, value_lower, seen_values, threshold):
                seen_values.add(value_lower)
                unique_attributes[category].append(value)

    return unique_attributes

def is_valid_attribute(category, value, existing_values, threshold):
    """Tell whether ``value`` is sufficiently different from all existing values.

    Args:
        category: Not used by the check.
        value: Candidate string.
        existing_values: Collection of strings to compare against.
        threshold: ``fuzz.token_set_ratio`` score at or above which the
            candidate is considered too similar.

    Returns:
        bool: True when ``existing_values`` is empty or no existing value
        reaches the threshold, False otherwise.
    """
    if len(existing_values) == 0 if hasattr(existing_values, "__len__") else not existing_values:
        return True

    too_similar = any(
        fuzz.token_set_ratio(value, known) >= threshold
        for known in existing_values
    )
    return not too_similar




In [None]:
# Pool the attributes of all classes per category, drop exact repeats, then
# drop near-duplicates.
merged_attributes = remove_duplicates(merge_attributes(attributes))
unique_attributes = remove_similar_duplicates(merged_attributes)
# Nothing has been written to disk at this point (the JSON dump happens in a
# later cell), so report what was computed instead of claiming a file was saved.
print(f"Computed unique attributes for {len(unique_attributes)} categories; not yet written to 'unique_attributes.json'")

In [None]:
# Write the de-duplicated attributes to disk as indented JSON.
output_path = 'unique_attributes.json'
with open(output_path, 'w') as out_handle:
    json.dump(unique_attributes, fp=out_handle, indent=4)