In [1]:
import re
import os
import sys
import time
import openai
import warnings
import pandas as pd
from openai import OpenAI

parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(parent_dir)
sys.path.append('/home/or/dev/latin_music_statistics/crawler/data_dir')
sys.path.append(os.path.abspath(os.path.join(parent_dir, 'statistics_dir')))

from config import *

client = OpenAI(api_key = OPENAI_KEY)

In [2]:
df=pd.read_csv('final_data.csv')

In [18]:
MAX_THEME_CLUSTER_SIZE = 20
THEME_BATCH_SIZE = 100

SONGS_CLUSTERING_PROMPT = "Cluster the following song themes into brief general themes:\n\n{batch_text}\n\nPlease provide the clusters in a clean Python list format without any extra text or formatting. The list should be written exactly as a Python list, like this: [\"...\",\"...\",\"...\"].\n\nEnsure there are no additional markers or explanations—just the list."
CLUSTERS_PROMPT = f'These song theme clusters\nGroup these song themes into broader, more concise categories, combining similar themes and removing redundancy.\nCombine them into less than {MAX_THEME_CLUSTER_SIZE} ' + 'categories\nThe output should look like: ["...","...","..."].\n{batch_text}\nEnsure all clusters are combined into a single, flat Python list with no nested lists.'

OPENAI_MODEL = "gpt-4o"
import ast

def chatgpt_request_list_extraction(msgs, model_name):
    response = client.chat.completions.create(model=model_name, messages=msgs)
    return response
    
def extract_valid_list(response_text, model_name, max_attempts=5):
    attempt = 0
    chatgpt_messages = []

    while attempt < max_attempts:

        try:
            extracted_list = ast.literal_eval(response_text)
            if isinstance(extracted_list, list):
                return extracted_list
                
        except (SyntaxError, ValueError):
            print(f"Attempt: {attempt + 1} to exrect the python list form the text")
            attempt += 1
            prev_content = f"The previous {attempt + 1} attempt to extract of the valid python list from the text failed.\n Wxtract valid Python list with no additional text, formatting, or code. The output should look exactly like this: [\"...\"].\nProvide only the list, nothing else."
            chatgpt_messages = prepare_chatgpt_msg(response_text, prev_content, chatgpt_messages)
            extraction_response = chatgpt_request_list_extraction(chatgpt_messages, model_name)
            response_text = extraction_response.choices[0].message.content
    
    raise ValueError("Failed to extract a valid Python list after multiple attempts.")

def prepare_chatgpt_msg(curr_text, prev_text, chatgpt_messages):
    chatgpt_messages.append({"role": "user", "content": prev_text})
    chatgpt_messages.append({"role": "user", "content": curr_text})
    return chatgpt_messages

def python_list_to_batch_text_promt(base_prompt, a_list, prev_text, batch_size, row_i, max_len):
    batch_text = "\n".join(a_list[row_i: min(row_i + batch_size, max_len)])
    prompt = base_prompt.format(batch_text=batch_text)
    chatgpt_msg_request =  prepare_chatgpt_msg(prompt, prev_text, [])
    return chatgpt_msg_request

def check_max_clusters(failed_extraction, max_theme_clusters_size ,len_clusters_before):
    return (not failed_extraction) and (max_theme_clusters_size < len_clusters_before)


def batch_process_themes(list_to_cluster, prev_text, base_prompt, theme_batch_size, model):
    row_i = 0
    total_clusters = []
    total_len =len(list_to_cluster)
    
    while row_i <= total_len:
        chatgpt_msg_request = python_list_to_batch_text_promt(base_prompt, list_to_cluster, prev_text, theme_batch_size, row_i, total_len)
        response = chatgpt_request_list_extraction(chatgpt_msg_request, model)
        curr_batch_size = theme_batch_size

        # Check if the response is not finished properly
        while response.choices[0].finish_reason != "stop":
            print(f"Batch {row_i} to {row_i + curr_batch_size} didn't complete. Reducing batch size.")
            curr_batch_size = max(1, curr_batch_size // 2)  # Reduce batch size by half, but not below 1
            chatgpt_msg_request = python_list_to_batch_text_promt(base_prompt, batch_list, prev_text, theme_batch_size, row_i, total_len)
            response = chatgpt_request_list_extraction(chatgpt_msg_request, model)

        response_text = response.choices[0].message.content

        try:
            response_as_list = extract_valid_list(response_text, OPENAI_MODEL, max_attempts=5)
        except ValueError as e:
            print(f"Failed to extract valid list from CHATGPT prompt")
            return(total_clusters) ,1
            
        total_clusters.extend(response_as_list)
        print(f"Processed batch {row_i} to {row_i + THEME_BATCH_SIZE}")

        row_i += curr_batch_size
        
    return total_clusters, 0 

prev_text = ''
song_list = df['theme'].tolist()

clustered_themes, failed_extraction = batch_process_themes(song_list, prev_text, SONGS_CLUSTERING_PROMPT, THEME_BATCH_SIZE, OPENAI_MODEL)

len_clusters_before = len(clustered_themes)
continue_clustring = check_max_clusters(failed_extraction, MAX_THEME_CLUSTER_SIZE, len_clusters_before)


decreasing_clusters_attempt = 1
print(f"Initial number of clusters: {len_clusters_before}")

while continue_clustring:
    curr_clustered_themes, failed_extraction = batch_process_themes(clustered_themes, prev_text, CLUSTERS_PROMPT, THEME_BATCH_SIZE, OPENAI_MODEL)
    prev_clusters_text = '\n'.join(clustered_themes)
    curr_clusters_text = '\n'.join(curr_clustered_themes)
    len_clusters_after = len(curr_clustered_themes)
    
    print(f"The number of clusters: {len_clusters_after}")

    if len_clusters_before == len_clusters_after:
        decreasing_clusters_attempt += 1
        prev_text =  f'This is the {decreasing_clusters_attempt} attempt for clustering. ' +\
                     f'There are {len_clusters_after} many clusters - which are too many. ' +\
                     f'The previous clustered topic attempt was:\n{prev_clusters_text}\n\n\nTry again with those topics:\n' + curr_clusters_text
    else:
        decreasing_clusters_attempt = 1
        prev_text = ''

    continue_clustring = check_max_clusters(failed_extraction, MAX_THEME_CLUSTER_SIZE, len_clusters_after)

    clustered_themes = curr_clustered_themes
    len_clusters_before = len_clusters_after

print(f"The final number of clusters: {len(clustered_themes)}")

Attempt: 1 to exrect the python list form the text
Processed batch 0 to 100
Attempt: 1 to exrect the python list form the text
Processed batch 100 to 200
Processed batch 200 to 300
Processed batch 300 to 400
Processed batch 400 to 500
Attempt: 1 to exrect the python list form the text
Processed batch 500 to 600
Attempt: 1 to exrect the python list form the text
Processed batch 600 to 700
Attempt: 1 to exrect the python list form the text
Processed batch 700 to 800
Processed batch 800 to 900
Initial number of clusters: 585
Processed batch 0 to 100
Processed batch 100 to 200
Attempt: 1 to exrect the python list form the text
Processed batch 200 to 300
Attempt: 1 to exrect the python list form the text
Processed batch 300 to 400
Processed batch 400 to 500
Processed batch 500 to 600
The number of clusters: 247
Processed batch 0 to 100
Attempt: 1 to exrect the python list form the text
Processed batch 100 to 200
Processed batch 200 to 300
The number of clusters: 152
Processed batch 0 to 100

# create specific song themes embeddings and match the closest general theme embeddings

In [8]:
general_themes_df = pd.read_csv(GENERAL_SONGS_THEMS)

In [10]:
from sentence_transformers import SentenceTransformer
from tqdm.autonotebook import tqdm
import time

# Initialize the Hugging Face model
MODEL_NAME = 'sentence-transformers/all-roberta-large-v1'

emb_model = SentenceTransformer(MODEL_NAME)

def create_embeddings(df, model, src_col, tgt_col):
    df[tgt_col] = None
    
    tic = time.time()
    
    # Use tqdm to add a progress bar to the loop
    for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows"):
        response_emb = model.encode(row[src_col])    
        df.at[index, tgt_col] = response_emb
    
    # Track the end time
    toc = time.time()
    
    # Print the time taken in minutes
    print(f"Time taken: {(toc - tic) / 60:.2f} minutes")

create_embeddings(df, emb_model, 'theme', 'theme_emb')
create_embeddings(general_themes_df,  emb_model, 'general_theme', 'general_theme_emb')

  from tqdm.autonotebook import tqdm, trange


Processing rows:   0%|          | 0/843 [00:00<?, ?it/s]

Time taken: 10.84 minutes


Processing rows:   0%|          | 0/19 [00:00<?, ?it/s]

Time taken: 0.09 minutes


In [12]:
import pandas as pd
from scipy.spatial.distance import cosine
import numpy as np

def find_closest_embeddings(df, general_themes_df):
    # Assuming embeddings are stored as lists or numpy arrays in the DataFrame columns
    df['closest_general_theme'] = None

    for idx, theme_emb in df.iterrows():
        current_embedding = theme_emb['theme_emb']
        
        # Initialize variables to find the closest match
        min_dist = float('inf')
        closest_theme = None
        
        # Iterate over general_themes_df to find the closest match
        for _, general_row in general_themes_df.iterrows():
            general_embedding = general_row['general_theme_emb']
            general_theme = general_row['general_theme']
            
            # Compute cosine similarity (distance) between the current embedding and the general embeddings
            dist = cosine(current_embedding, general_embedding)

            # Update closest match if a smaller distance is found
            if dist < min_dist:
                min_dist = dist
                closest_theme = general_theme
        
        # Store the closest theme in the df
        df.at[idx, 'general_theme'] = closest_theme

    return df

result_df = find_closest_embeddings(df, general_themes_df)

In [23]:
unnamed_columns = [col for col in result_df.columns if 'Unnamed' in col]
result_df.drop(unnamed_columns, axis=1, inplace=True)

In [34]:
result_df.to_csv('final_data.csv')

# Match the general categry to each theme:

In [137]:
# from tqdm.notebook import tqdm
# tqdm.pandas()

# MATCH_PROMPT = 'To which of the following general categories:\n{general_themes}\nThis theme fits best:\n{cur_theme}\nwrite only the catagory name:'
# WRONG_GENERAL_THEME =  'This is the {attempt_num} attempt.\n' +\
#              'You failed to extract the best fitting general theme out of the theme list.\n' +\
#              'You wrote name {name} which is not in the list.' + curr_clusters_text

# df_general_themes = pd.read_csv(GENERAL_SONGS_THEMS)

# unnamed_columns = [col for col in df.columns if 'Unnamed' in col]
# df.drop(unnamed_columns, axis=1, inplace=True)

# def prepare_chatgpt_msg(curr_text, prev_text, chatgpt_messages):
#     chatgpt_messages.append({"role": "user", "content": prev_text})
#     chatgpt_messages.append({"role": "user", "content": curr_text})
#     return chatgpt_messages

# def match_generic_theme(theme_txt, general_themes):
#     attempt_num = 1
#     curr_text = MATCH_PROMPT.format(general_themes=','.join(general_themes), cur_theme=theme_txt)
#     chatgpt_msg_request = prepare_chatgpt_msg(curr_text, '', [])
#     response = chatgpt_request_list_extraction(chatgpt_msg_request, OPENAI_MODEL)

#     response_txt = response.choices[0].message.content
#     while response_txt not in general_themes:
#         attempt_num += 1
#         prev_text = WRONG_GENERAL_THEME.format(attempt_num=attempt_num, name=response_txt)

#         chatgpt_msg_request = prepare_chatgpt_msg(curr_text, prev_text, [])
#         response = chatgpt_request_list_extraction(chatgpt_msg_request, OPENAI_MODEL)
#         response_txt = response.choices[0].message.content

#         print(f'{attempt_num}:\n\nprev_text {prev_text}')
#         print('-'*100)
#     return response  

# df['general_theme'] = ""

# # Use tqdm to add a progress bar to the loop
# for index, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows"):
#     print(index)
#     result = match_generic_theme(row['theme'], general_themes)
    
#     # Add or set the result to the 'general_theme' column
#     df.at[index, 'general_theme'] = result

df['general_theme'] = ''
general_themes = df_general_themes['general_theme'].tolist()
df['general_theme'] = df['theme'].apply(lambda txt: match_generic_theme(txt, general_themes))