In [7]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px

# Load the Jeopardy! question archive from the Kaggle input directory
# and preview the first few rows.
csv_path = "/kaggle/input/200000-jeopardy-questions/JEOPARDY_CSV.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [8]:
# Print each column's dtype and non-null count to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216930 entries, 0 to 216929
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   Show Number  216930 non-null  int64 
 1    Air Date    216930 non-null  object
 2    Round       216930 non-null  object
 3    Category    216930 non-null  object
 4    Value       213296 non-null  object
 5    Question    216930 non-null  object
 6    Answer      216927 non-null  object
dtypes: int64(1), object(6)
memory usage: 11.6+ MB


In [9]:
# Summary statistics; describe() only covers numeric columns by default,
# so string-typed columns (including the still-unparsed Value) are left out.
df.describe()

Unnamed: 0,Show Number
count,216930.0
mean,4264.238519
std,1386.296335
min,1.0
25%,3349.0
50%,4490.0
75%,5393.0
max,6300.0


In [10]:
# The raw CSV header pads most column names with a leading space
# (e.g. " Air Date"); trim them so columns can be addressed by clean names.
df.columns = df.columns.map(str.strip)

In [12]:
# Removes the NaNs and duplicates
df = df.dropna().drop_duplicates()

# Change Value to number, e.g. "$2,000" -> 2000.
# - The pattern is an explicit regex character class, so the result does not
#   depend on the pandas-version-specific default of `regex` (a bare "$" is
#   an end-of-string anchor when interpreted as a regular expression).
# - astype(str) keeps this cell re-runnable: on a second run Value is already
#   an integer column, which has no `.str` accessor.
# - assign() builds a new frame instead of writing into a derived one.
df = df.assign(
    Value=df['Value'].astype(str).str.replace(r'[$,]', '', regex=True).astype(int)
)

df

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,200,"Signer of the Dec. of Indep., framer of the Co...",John Adams
...,...,...,...,...,...,...,...
216924,4999,2006-05-11,Double Jeopardy!,OFF-BROADWAY,2000,In 2006 the cast of this long-running hit emba...,Stomp
216925,4999,2006-05-11,Double Jeopardy!,RIDDLE ME THIS,2000,This Puccini opera turns on the solution to 3 ...,Turandot
216926,4999,2006-05-11,Double Jeopardy!,"""T"" BIRDS",2000,In North America this term is properly applied...,a titmouse
216927,4999,2006-05-11,Double Jeopardy!,AUTHORS IN THEIR YOUTH,2000,"In Penny Lane, where this ""Hellraiser"" grew up...",Clive Barker


In [13]:
# List the distinct show numbers present in the cleaned data.
df['Show Number'].unique()

array([4680, 5957, 3751, ..., 5070, 5195, 4999])

In [14]:
# Count how many distinct shows the cleaned data covers.
df['Show Number'].nunique()

3640

In [15]:
# The dataset is still too large, so make a new smaller one

# We reduce it to a single show and sum the clue values per (Category, Answer).
# 5070 is one of the show numbers listed above; we could instead randomize it
# OR create a treemap for all 3640 shows to show the whole dataset.
SHOW_NUMBER = 5070

# Filter to the chosen show *before* grouping: grouping the full frame by
# show/category/answer and then discarding every other show does the same
# work for all shows only to throw it away.
is_selected_show = df['Show Number'] == SHOW_NUMBER
if not is_selected_show.any():
    # Same failure mode as looking up a missing show in the grouped result.
    raise KeyError(SHOW_NUMBER)

p = (
    df.loc[is_selected_show]
    .groupby(['Category', 'Answer'])['Value']
    .sum()
    .reset_index()
)

In [16]:
# Check the row count and dtypes of the reduced single-show frame.
p.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  60 non-null     object
 1   Answer    60 non-null     object
 2   Value     60 non-null     int64 
dtypes: int64(1), object(2)
memory usage: 1.5+ KB


In [17]:
# Ideally this would cover the entire dataset, but a treemap with that many
# leaves may not even render, so only the reduced frame `p` is plotted.
# Hierarchy: a constant "all" root -> Category -> Answer, sized by Value.
fig = px.treemap(
    p,
    path=[px.Constant("all"), 'Category', 'Answer'],
    values='Value',
    # A title lets the figure stand on its own when the notebook is skimmed.
    title="Jeopardy! clue values by category and answer (single show)",
)
fig.update_traces(root_color="lightgrey")
fig.show()