In [None]:
import altair as alt
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech
from collections import defaultdict, Counter
from dotenv import load_dotenv
from hdbscan import HDBSCAN
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import openai
import os
import pandas as pd
from pathlib import Path
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy
from umap import UMAP

from nesta_ds_utils.viz.altair import saving as viz_save

from dsp_ai_eval import PROJECT_DIR, logging
from dsp_ai_eval.utils import utils
from dsp_ai_eval.utils.gpt_summary_utils import extract_conclusion, extract_theme_headings
from dsp_ai_eval.utils.clustering_utils import create_new_topic_model, create_df_for_viz

# Sentence-embedding model used further down to encode the answers.
model = SentenceTransformer('all-miniLM-L6-v2')

# Widen the display width pandas uses when printing frames.
pd.options.display.width = 1000

# Load variables from a local .env file into the process environment.
load_dotenv()

# Evaluates to None when the variable is not set.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [None]:
# Read all response records from the JSONL file, then show the first one
# to inspect which fields a record carries.
themes_path = PROJECT_DIR / 'inputs/data/gpt/gpt_themes_repeats.jsonl'
answers_data = utils.load_jsonl(themes_path)
answers_data[0]

In [None]:
# Keep only the records produced at temperature 0.5, joining the parts of
# each record's 'answer' with single spaces.
answers = []
for record in answers_data:
    if record['temperature'] == 0.5:
        answers.append(' '.join(record['answer']))

In [None]:
# Number of records left out of `answers` by the temperature filter.
len(answers_data) - len(answers)

In [None]:
# One Counter of theme headings per temperature value.
headings_by_temperature = defaultdict(Counter)

for record in answers_data:
    # NOTE(review): the helper receives the raw record['answer'] here, while
    # later cells pass it the ' '.join(...)-ed text — confirm it accepts both.
    found = extract_theme_headings(record['answer'])
    headings_by_temperature[record['temperature']].update(found)

# Print how often each heading occurs, one section per temperature.
for temperature, tally in headings_by_temperature.items():
    print(f'Temperature: {temperature}')
    for heading, count in tally.items():
        print(f'  "{heading}": {count}')
    print()  # empty line between temperature sections

In [None]:
# Encode every entry of `answers` with the embedding model, showing a progress bar.
embeddings = model.encode(answers, show_progress_bar=True)

In [None]:
# Build a new topic model through the project helper imported from clustering_utils.
topic_model = create_new_topic_model()

In [None]:
# Fit the topic model on the answers, handing it the embeddings computed earlier.
topics, probs = topic_model.fit_transform(answers, embeddings=embeddings)

# Display the model's topic summary table.
topic_model.get_topic_info()

In [None]:
# Render the topic model's bar-chart view of the fitted topics.
topic_model.visualize_barchart()

In [None]:
# Render the topic model's overview plot of the fitted topics.
topic_model.visualize_topics()

In [None]:
# Build the frame that the plotting cell consumes, passing a fixed seed
# to the project helper.
df_vis = create_df_for_viz(embeddings, topic_model, topics, answers, seed=42)

In [None]:
# Interactive scatter plot of df_vis: positions come from the `x`/`y` columns,
# colour from the `Name` column, and the tooltip shows `Name` and `doc`.
plot = (
    alt.Chart(df_vis)
    .mark_circle(size=30, opacity=0.75)
    .encode(
        x='x',
        y='y',
        color='Name:N',
        tooltip=['Name:N', 'doc:N'],
    )
    .properties(width=800, height=600)
    .interactive()
)

# Create the output folder up front so saving does not fail on a fresh
# checkout where outputs/figures does not exist yet.
figures_dir = PROJECT_DIR / 'outputs/figures'
figures_dir.mkdir(parents=True, exist_ok=True)

# Save directly as HTML, then through the shared saving helper (PNG enabled).
plot.save(PROJECT_DIR / 'outputs/figures/gpt_temp5_themes.html')
viz_save.save(plot, 'gpt_temp5_themes', PROJECT_DIR / 'outputs/figures', save_png=True)

plot.display()

In [None]:
# Representative documents as returned by the topic model; the following cells
# iterate it as a mapping of key -> collection of documents.
representative_docs = topic_model.get_representative_docs()

In [None]:
# Log each key of representative_docs, followed by its documents indented by two spaces.
for topic_key, topic_docs in representative_docs.items():
    logging.info(f"{topic_key}")
    for doc_text in topic_docs:
        logging.info(f"  {doc_text}")

In [None]:
# For every key in representative_docs, print the distinct lower-cased theme
# headings extracted from its documents, sorted alphabetically.
for k, docs in representative_docs.items():
    lowered = {
        heading.lower()
        for doc in docs
        for heading in extract_theme_headings(doc)
    }
    print(f"{k}: {sorted(lowered)}")

In [None]:
# Extract the theme headings of every answer: one result per answer, in the same order.
headings = list(map(extract_theme_headings, answers))

headings


In [None]:
# Attach the per-answer headings as a new column of df_vis (modifies df_vis in place).
# NOTE(review): assumes df_vis has one row per entry of `answers`, in the same
# order — confirm against create_df_for_viz.
df_vis['headings'] = headings

In [None]:
# Expand the `headings` column so that every heading gets a row of its own.
df_long = df_vis.explode(column='headings')

In [None]:
# Count the rows under each (topic, heading) combination; the count goes into a `size` column.
counts = df_long.groupby(['topic', 'headings']).size().reset_index(name='size')

In [None]:
# Order by topic (ascending) and, within a topic, by count (largest first),
# renumbering the rows from zero.
sorted_counts = counts.sort_values(
    by=['topic', 'size'],
    ascending=[True, False],
    ignore_index=True,
)
sorted_counts.head()

In [None]:
# Keep only the (topic, heading) rows whose count is greater than 2.
is_frequent = sorted_counts['size'] > 2
sorted_counts = sorted_counts.loc[is_frequent]

In [None]:
# Write the counts to CSV; the relative path resolves against the current working directory.
sorted_counts.to_csv("sorted_counts.csv")

In [None]:
# Collect, for every topic, the distinct headings that occur under it.
unique_headings_per_topic = (
    df_long
    .groupby('topic')['headings']
    .unique()
)

# The same data as a DataFrame, with `topic` as a regular column.
unique_headings_df = unique_headings_per_topic.reset_index()

# Determine the headings unique to each topic
def find_unique_headings(row, all_headings):
    """Return the headings found under ``row``'s topic and under no other topic.

    Args:
        row: Record (e.g. a DataFrame row or a dict) exposing ``row['topic']``
            and ``row['headings']``, the latter an iterable of headings.
        all_headings: Mapping of topic -> iterable of headings, covering
            every topic to compare against.

    Returns:
        list: The headings exclusive to the row's topic, ordered by their
        string form so the result is the same on every run.
    """
    current_headings = set(row['headings'])

    # Pool the headings of every topic except the row's own.
    other_headings = set()
    for topic, headings in all_headings.items():
        if topic != row['topic']:
            other_headings.update(headings)

    # Set iteration order for strings changes between interpreter runs (hash
    # randomisation), so sort to keep downstream output reproducible.
    # key=str keeps the sort from failing if non-string values are present.
    return sorted(current_headings - other_headings, key=str)

# For each topic row, store the headings that no other topic shares.
headings_by_topic = unique_headings_per_topic.to_dict()
unique_headings_df['unique_headings'] = unique_headings_df.apply(
    find_unique_headings,
    axis=1,
    all_headings=headings_by_topic,
)

In [None]:
# Write the per-topic headings table to CSV; the relative path resolves against
# the current working directory.
unique_headings_df.to_csv("unique_headings.csv")

In [None]:
# Frequency of each heading across all rows of df_long, regardless of topic.
df_long['headings'].value_counts()