In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path

from dsp_ai_eval import PROJECT_DIR

# Sentence-embedding model used for all answer embeddings below.
# The identifier is spelled with the checkpoint's published casing
# ('all-MiniLM-L6-v2') so that case-sensitive lookups (e.g. a local model
# cache) resolve to the same entry as other code using the canonical name.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Let pandas use up to 1000 characters per line when printing frames.
pd.set_option('display.width', 1000)

In [None]:
# Read the repeated-prompt answers from the project's input data folder and
# preview the first rows.
answers_path = PROJECT_DIR / 'inputs/data/repeated_prompts.csv'
answers_data = pd.read_csv(answers_path)
answers_data.head()

In [None]:
# Answers to embed, in row order.
texts = answers_data['Answer'].to_list()

# Each row's 'Attempt' value shifted down by one (presumably turning a 1-based
# attempt number into a 0-based index -- confirm against the CSV).
indices = (answers_data['Attempt'] - 1).tolist()

# One embedding per answer, in the same order as `texts`.
embeddings = model.encode(texts)

In [None]:
# Cosine similarity between every pair of answer embeddings, displayed as a
# DataFrame so the notebook renders it as a table.
similarity_matrix = cosine_similarity(embeddings)
pd.DataFrame(data=similarity_matrix)

In [None]:
# Build a similarity graph over the answers and draw it.
# Each answer is a node keyed by its row position; two answers are linked when
# their cosine similarity is strictly greater than `threshold`.
threshold = 0.75
layout_seed = 42  # fixed seed so the spring layout is the same on every run

G = nx.Graph()
for i, text in enumerate(texts):
    G.add_node(i, label=text)

# Scan only the pairs with j > i: the graph is undirected and self-pairs
# are skipped.
for i in range(len(texts)):
    for j in range(i + 1, len(texts)):
        similarity = similarity_matrix[i][j]
        if similarity > threshold:
            G.add_edge(i, j, weight=similarity)

# Map attempt indices to colours. The palette is sized from the largest index
# in use rather than from the number of rows: an index >= len(indices) can no
# longer run past the end of the palette, and repeated attempt numbers still
# spread over the whole colormap. When the indices are exactly 0..len-1 the
# colours are unchanged.
n_colors = max(indices, default=-1) + 1
colors = plt.cm.plasma(np.linspace(0, 1, n_colors))
color_map = [colors[i] for i in indices]

# Node positions for the drawing; seeded for reproducibility.
pos = nx.spring_layout(G, seed=layout_seed)

# nodes, coloured by attempt index
nx.draw_networkx_nodes(G, pos, node_size=700, node_color=color_map)

# edges, with line width proportional to the similarity stored on the edge
weights = nx.get_edge_attributes(G, 'weight')
nx.draw_networkx_edges(G, pos, width=[v*1.5 for v in weights.values()])

# labels: each node shows its attempt index, not the answer text
labels = {i: str(indices[i]) for i in range(len(texts))}
nx.draw_networkx_labels(G, pos, labels=labels, font_size=12)

plt.axis('off')
plt.show()

# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk

# Download the NLTK data used by the preprocessing below: tokenizer models for
# word_tokenize and the stop-word lists. NLTK 3.9+ loads the tokenizer data
# from 'punkt_tab' rather than the older 'punkt' package, so both are
# requested to keep the notebook working across NLTK versions.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

In [None]:
# Preprocessing helpers.
# The stop-word set and the stemmer do not depend on the text being processed,
# so they are created on first use and then reused for every later call.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the shared (stop-word set, stemmer) pair, building it on first use."""
    if 'resources' not in _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['resources'] = (
            frozenset(stopwords.words('english')),
            PorterStemmer(),
        )
    return _PREPROCESS_CACHE['resources']


def preprocess_text(text):
    """Normalise a text: lowercase, tokenize, drop English stop words, stem.

    Args:
        text: The raw string to preprocess.

    Returns:
        The stemmed tokens joined by single spaces. Stop words are the only
        tokens removed; every other token from the tokenizer is kept.
    """
    stop_words, stemmer = _preprocess_resources()
    # Lowercase before tokenizing so the stop-word check sees lowercase tokens.
    tokens = word_tokenize(text.lower())
    return " ".join(
        stemmer.stem(token) for token in tokens if token not in stop_words
    )

In [None]:
# Preview the first few answers instead of dumping the whole list into the
# notebook output.
texts[:5]

In [None]:
# Clean every answer with preprocess_text, then fit a TfidfVectorizer (default
# settings) on the cleaned strings to obtain the TF-IDF matrix.
preprocessed_texts = list(map(preprocess_text, texts))

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(preprocessed_texts)

In [None]:
# Dense array version of the TF-IDF matrix.
tfidf_array = tfidf_matrix.toarray()

# Terms learned by the vectorizer; used as the column labels of the scores.
feature_names = vectorizer.get_feature_names_out()

In [None]:
# Wrap the TF-IDF scores in a DataFrame whose columns are the terms, so each
# row can be read as one document's scores.
df = pd.DataFrame(data=tfidf_array, columns=feature_names)
df

In [None]:
def top_n_words_per_document(df, n=10):
    """Return the n highest-scoring column labels for each row of `df`.

    Args:
        df: DataFrame with one row per document and one column per word,
            holding numeric scores. The row index must support ``index + 1``
            (it is used to build the "Document k" keys).
        n: Number of words to keep per document (default 10).

    Returns:
        A dict mapping ``"Document {index+1}"`` to the list of the row's top
        ``n`` column labels, best score first. Words with equal scores keep
        their column order. No score threshold is applied, so a row with fewer
        than ``n`` non-zero scores is padded with zero-score words.
    """
    top_words = {}
    for index, row in df.iterrows():
        # A stable sort makes the order of tied scores deterministic (ties are
        # very common in sparse TF-IDF rows); the default quicksort may return
        # tied words in arbitrary order.
        ranked = row.sort_values(ascending=False, kind="stable")
        top_words[f"Document {index+1}"] = list(ranked.index[:n])
    return top_words

# Collect the ten best-scoring words of every document.
top_10_words = top_n_words_per_document(df, n=10)

# Print one line per document: its label followed by its word list.
for doc in top_10_words:
    words = top_10_words[doc]
    print(f"{doc}: {words}")

In [None]:
# Destination CSV for the per-document top words (note: despite its name,
# `output_dir` is the file path, and the file holds the top words rather than
# the TF-IDF matrix). Create the containing folder if it does not exist yet.
output_dir = PROJECT_DIR / 'outputs/data/initial_repeat_prompting/tfidf_matrix.csv'
Path(output_dir).parent.mkdir(exist_ok=True, parents=True)

# Each document becomes one column of words; the row index is not written.
top_words_frame = pd.DataFrame(top_10_words)
top_words_frame.to_csv(output_dir, index=False)