In [18]:
#import libraries 
import os
import string 
import re
import json
import pandas as pd
from nltk import word_tokenize, sent_tokenize, pos_tag, download
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from bs4 import BeautifulSoup
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [19]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nosiz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
#load datasset  
df = pd.read_json("jeopardy.json")
df.head()

Unnamed: 0,category,air_date,question,value,answer,round,show_number
0,HISTORY,2004-12-31,"'For the last 8 years of his life, Galileo was...",$200,Copernicus,Jeopardy!,4680
1,ESPN's TOP 10 ALL-TIME ATHLETES,2004-12-31,'No. 2: 1912 Olympian; football star at Carlis...,$200,Jim Thorpe,Jeopardy!,4680
2,EVERYBODY TALKS ABOUT IT...,2004-12-31,'The city of Yuma in this state has a record a...,$200,Arizona,Jeopardy!,4680
3,THE COMPANY LINE,2004-12-31,"'In 1963, live on ""The Art Linkletter Show"", t...",$200,McDonald\'s,Jeopardy!,4680
4,EPITAPHS & TRIBUTES,2004-12-31,"'Signer of the Dec. of Indep., framer of the C...",$200,John Adams,Jeopardy!,4680


In [21]:
#we only need the value and question columns
#remove other columns
df = df[["question", "value"]].dropna()
df.head()

Unnamed: 0,question,value
0,"'For the last 8 years of his life, Galileo was...",$200
1,'No. 2: 1912 Olympian; football star at Carlis...,$200
2,'The city of Yuma in this state has a record a...,$200
3,"'In 1963, live on ""The Art Linkletter Show"", t...",$200
4,"'Signer of the Dec. of Indep., framer of the C...",$200


In [22]:
#we would like to know how many unique values are in the value
#assumption: values > median value = hard questions, theirfore are high value questions 
unique_values = df["value"].unique()
unique_values

array(['$200', '$400', '$600', '$800', '$2,000', '$1000', '$1200',
       '$1600', '$2000', '$3,200', '$5,000', '$100', '$300', '$500',
       '$1,000', '$1,500', '$1,200', '$4,800', '$1,800', '$1,100',
       '$2,200', '$3,400', '$3,000', '$4,000', '$1,600', '$6,800',
       '$1,900', '$3,100', '$700', '$1,400', '$2,800', '$8,000', '$6,000',
       '$2,400', '$12,000', '$3,800', '$2,500', '$6,200', '$10,000',
       '$7,000', '$1,492', '$7,400', '$1,300', '$7,200', '$2,600',
       '$3,300', '$5,400', '$4,500', '$2,100', '$900', '$3,600', '$2,127',
       '$367', '$4,400', '$3,500', '$2,900', '$3,900', '$4,100', '$4,600',
       '$10,800', '$2,300', '$5,600', '$1,111', '$8,200', '$5,800',
       '$750', '$7,500', '$1,700', '$9,000', '$6,100', '$1,020', '$4,700',
       '$2,021', '$5,200', '$3,389', '$4,200', '$5', '$2,001', '$1,263',
       '$4,637', '$3,201', '$6,600', '$3,700', '$2,990', '$5,500',
       '$14,000', '$2,700', '$6,400', '$350', '$8,600', '$6,300', '$250',
       '$3,9

In [23]:
#to set median value we need to remove $ and , signs from the values 
#create new column called clean value 

def clean_value(v):
    v = str(v).replace("$", "").replace(",", "")
    return int(v) if v.isdigit() else None

df["CleanValue"] = df["value"].apply(clean_value)
df = df.dropna(subset=["CleanValue"])

median_val = df["CleanValue"].median()
df["HighValue"] = (df["CleanValue"] >= median_val).astype(int)


In [24]:
#we want to remove punctuations in the questions 
english_stopwords = set(stopwords.words('english') + list(punctuation))
english_stopwords

{'!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'might

In [25]:
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nosiz\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\nosiz\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [26]:
#set up lemmatizer 
#define function for cleaning the text


lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    cleaned = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in english_stopwords
    ]
    return " ".join(cleaned)

In [27]:
df["CleanQuestion"] = df["question"].apply(clean_text)
df.head()

Unnamed: 0,question,value,CleanValue,HighValue,CleanQuestion
0,"'For the last 8 years of his life, Galileo was...",$200,200,0,'for last 8 year life galileo house arrest esp...
1,'No. 2: 1912 Olympian; football star at Carlis...,$200,200,0,'no 2 1912 olympian football star carlisle ind...
2,'The city of Yuma in this state has a record a...,$200,200,0,"'the city yuma state record average 4,055 hour..."
3,"'In 1963, live on ""The Art Linkletter Show"", t...",$200,200,0,'in 1963 live `` art linkletter show '' compan...
4,"'Signer of the Dec. of Indep., framer of the C...",$200,200,0,'signer dec. indep. framer constitution mass. ...


In [28]:
#split and train 
X_train, X_test, y_train, y_test = train_test_split(
    df["CleanQuestion"], df["HighValue"], random_state=1
)


In [29]:
#vectorize 
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_tf = tfidf_vectorizer.fit_transform(X_train)
X_test_tf = tfidf_vectorizer.transform(X_test)

In [30]:
#Naive Bayes Classifier 
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train_tf, y_train)
predictions = naive_bayes.predict(X_test_tf)

In [31]:
#print accuracy metric 
print("Accuracy:", accuracy_score(y_test, predictions))

Accuracy: 0.5681119195859275
