# Data analysis for the jeopardy dataset

## Read and process the data

### Import necessary modules

In [91]:
import json
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import plotly.express as px

In [2]:
path = r"data/JEOPARDY_QUESTIONS.json"
jeopardy_questions = json.load(open(path, "r"))

In [6]:
f"Number of questions: {len(jeopardy_questions)}"

'Number of questions: 216930'

In [37]:
categories = list(set(question.get("category", "No category") for question in jeopardy_questions))
sample_categories = ', \n'.join(random.choices(categories, k=10))
print(f"Number of unique categories: {len(categories)}. \nFor example: \n{sample_categories}")

Number of unique categories: 27995. 
For example: 
1812, 
NICE WORK IF YOU CAN GET IT, 
ON A SWISS ARMY KNIFE, 
OSCAR-WINNING SONGS, 
WINGS, 
"DEC" ME, 
WE WANT PISA!, 
TV & MOVIE ACTORS, 
OBJETS D'ART, 
THE 2011 EMMYS


In [113]:
samples_per_category = {c: [] for c in categories}
for question in jeopardy_questions: samples_per_category[question.get("category", "No category")].append(question["question"])
nSamples_per_category = {c: len(s) for c, s in samples_per_category.items()}
nSamples_per_category = sorted(nSamples_per_category.items(), key=lambda item: item[1], reverse=True)
nMostProminentClasses = 10
mostProminentClasses = nSamples_per_category[:nMostProminentClasses]
mostProminentClasses

[('BEFORE & AFTER', 547),
 ('SCIENCE', 519),
 ('LITERATURE', 496),
 ('AMERICAN HISTORY', 418),
 ('POTPOURRI', 401),
 ('WORLD HISTORY', 377),
 ('WORD ORIGINS', 371),
 ('COLLEGES & UNIVERSITIES', 351),
 ('HISTORY', 349),
 ('SPORTS', 342)]

In [114]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


data = {topic: samples_per_category[topic] for topic, count in mostProminentClasses}
# Flatten data and labels for encoding
texts = [text for texts_list in data.values() for text in texts_list]
labels = [key for key, texts_list in data.items() for _ in texts_list]

# Step 1: Vectorize texts using TF-IDF (or embeddings if available)
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectors = tfidf_vectorizer.fit_transform(texts)

# Step 2: Dimensionality Reduction
# First reduce with PCA for better t-SNE results
pca = PCA(n_components=30)
pca_result = pca.fit_transform(tfidf_vectors.toarray())

# Step 2: Dimensionality Reduction with t-SNE
tsne = TSNE(n_components=3, perplexity=5, n_iter=300)  # Use 3 components for 3D
tsne_result = tsne.fit_transform(pca_result)



'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.



In [115]:
import plotly.io as pio
%matplotlib qt

# Identify prominent feature (highest-weighted term in TF-IDF) for each text
feature_names = tfidf_vectorizer.get_feature_names_out()
prominent_features = [
    feature_names[np.argmax(vec.toarray())] for vec in tfidf_vectors
]

# Step 3: Create interactive plot with Plotly
fig = px.scatter(
    x=tsne_result[:, 0], y=tsne_result[:, 1],
    color=labels,
    hover_name=texts,  # Text displayed on hover
    hover_data={'Prominent Feature': prominent_features}
)

fig.update_layout(
    title="Interactive Text Sample Clustering by Category",
    xaxis_title="TSNE Dimension 1",
    yaxis_title="TSNE Dimension 2"
)

fig.show()

In [116]:
from sentence_transformers import SentenceTransformer
# Step 1: Encode text samples using Sentence-BERT
model = SentenceTransformer('all-MiniLM-L6-v2')  # Use a lightweight model
sentence_embeddings = model.encode(texts)

# Step 2: Dimensionality Reduction with PCA and t-SNE
pca = PCA(n_components=5)
pca_result = pca.fit_transform(sentence_embeddings)

tsne = TSNE(n_components=2, perplexity=5, n_iter=300)
tsne_result = tsne.fit_transform(pca_result)

# Step 3: Create interactive plot with Plotly
fig = px.scatter(
    x=tsne_result[:, 0], y=tsne_result[:, 1],
    color=labels,
    hover_name=texts  # Text displayed on hover
)

fig.update_layout(
    title="Text Sample Clustering by Category with Sentence-BERT",
    xaxis_title="TSNE Dimension 1",
    yaxis_title="TSNE Dimension 2"
)

fig.show()


'n_iter' was renamed to 'max_iter' in version 1.5 and will be removed in 1.7.

