# STP 510 Module 5 Basics
## By Marisa Boyd

In [1]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import pandas as pd
import json
import re
import os
from string import punctuation
from datetime import datetime

In [2]:
# Load the Jeopardy archive from the JSON file in the working directory.
# The context manager closes the file handle as soon as parsing finishes,
# so no open handle lingers in the kernel for the rest of the session.
with open('jeopardy.json') as jeopardy_json_file:
    # json.load parses the whole document; here it yields a list of dictionaries
    jeopardy_data = json.load(jeopardy_json_file)

# displays the first dictionary in the list
jeopardy_data[:1]

[{'category': 'HISTORY',
  'air_date': '2004-12-31',
  'question': "'For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory'",
  'value': '$200',
  'answer': 'Copernicus',
  'round': 'Jeopardy!',
  'show_number': '4680'}]

In [3]:
# display the keys of the first record's dictionary (i.e. the available field names)
jeopardy_data[0].keys()

dict_keys(['category', 'air_date', 'question', 'value', 'answer', 'round', 'show_number'])

In [4]:
# Pull every field out of the list of dictionaries into its own list.
# Each key is looked up on every record, so a record missing a key raises KeyError.
# The 'round' field is stored as `jround` to avoid shadowing the built-in round().
category, air_date, question, value, answer, jround, show_number = (
    [record[field] for record in jeopardy_data]
    for field in ('category', 'air_date', 'question', 'value',
                  'answer', 'round', 'show_number')
)

In [5]:
# check the Python type of the first entry in the value list
type(value[0])

str

In [6]:
# Normalise the value list to strings: any falsy entry (e.g. None) becomes the
# empty string '', every other entry is passed through str().
# Note: a float NaN is truthy in Python, so it would become 'nan' rather than ''.
new_value = [str(entry) if entry else '' for entry in value]

In [7]:
# Binary target derived from the award string: 0 = low value, 1 = high value.
# A string ending in '$' followed by exactly three digits (e.g. '$200' .. '$800')
# is low value; everything else is high value, which covers four-digit amounts
# such as '$1000' and comma-formatted ones such as '$2,000'.
# NOTE(review): empty strings (missing values) and amounts with fewer than three
# digits also fail the pattern and are therefore labelled 1 - confirm this is intended.
low_value_pattern = re.compile(r'\$\d{3}$')
easy_hard = [0 if low_value_pattern.search(amount) else 1 for amount in new_value]

In [8]:
# pair each value string with its 0/1 label and show the first 50 pairs as a spot check
list(zip(new_value, easy_hard))[:50]

[('$200', 0),
 ('$200', 0),
 ('$200', 0),
 ('$200', 0),
 ('$200', 0),
 ('$200', 0),
 ('$400', 0),
 ('$400', 0),
 ('$400', 0),
 ('$400', 0),
 ('$400', 0),
 ('$400', 0),
 ('$600', 0),
 ('$600', 0),
 ('$600', 0),
 ('$600', 0),
 ('$600', 0),
 ('$600', 0),
 ('$800', 0),
 ('$800', 0),
 ('$800', 0),
 ('$800', 0),
 ('$2,000', 1),
 ('$800', 0),
 ('$1000', 1),
 ('$1000', 1),
 ('$1000', 1),
 ('$1000', 1),
 ('$1000', 1),
 ('$400', 0),
 ('$400', 0),
 ('$400', 0),
 ('$400', 0),
 ('$400', 0),
 ('$400', 0),
 ('$800', 0),
 ('$800', 0),
 ('$800', 0),
 ('$800', 0),
 ('$800', 0),
 ('$1200', 1),
 ('$2,000', 1),
 ('$1200', 1),
 ('$1200', 1),
 ('$1200', 1),
 ('$1600', 1),
 ('$1600', 1),
 ('$1600', 1),
 ('$1600', 1),
 ('$1600', 1)]

In [9]:
# Assemble the extracted lists into a single DataFrame, one column per field,
# plus the 0/1 target in the 'prediction' column.
# NOTE(review): the column name 'show numberr' is misspelled; it is kept as-is
# here because renaming it would change the frame's schema for any later cells.
df = pd.DataFrame(
    data={
        'category': category,
        'air date': air_date,
        'question': question,
        'value': new_value,
        'answer': answer,
        'round': jround,
        'show numberr': show_number,
        'prediction': easy_hard,
    }
)

In [10]:
# display the first five rows of the dataframe to check that the columns were assembled as expected
df.head()

Unnamed: 0,category,air date,question,value,answer,round,show numberr,prediction
0,HISTORY,2004-12-31,"'For the last 8 years of his life, Galileo was...",$200,Copernicus,Jeopardy!,4680,0
1,ESPN's TOP 10 ALL-TIME ATHLETES,2004-12-31,'No. 2: 1912 Olympian; football star at Carlis...,$200,Jim Thorpe,Jeopardy!,4680,0
2,EVERYBODY TALKS ABOUT IT...,2004-12-31,'The city of Yuma in this state has a record a...,$200,Arizona,Jeopardy!,4680,0
3,THE COMPANY LINE,2004-12-31,"'In 1963, live on ""The Art Linkletter Show"", t...",$200,McDonald\'s,Jeopardy!,4680,0
4,EPITAPHS & TRIBUTES,2004-12-31,"'Signer of the Dec. of Indep., framer of the C...",$200,John Adams,Jeopardy!,4680,0


In [11]:
# Split into training and test sets: the category text is the feature (X) and
# the 0/1 'prediction' column is the target (y).
# No test_size is given, so scikit-learn's default hold-out fraction applies;
# random_state is fixed so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['category'],
    df['prediction'],
    random_state=1,
)

In [12]:
# Convert the text in the training and test splits into TF-IDF feature matrices.
# The vectorizer is fitted on the training split only; the test split is then
# transformed with that already-fitted vectorizer so no test data informs the fit.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_tf = tfidf_vectorizer.fit_transform(X_train)
X_test_tf = tfidf_vectorizer.transform(X_test)

In [13]:
# Train a multinomial Naive Bayes classifier on the TF-IDF training features
# (fit returns the fitted estimator, so construction and fitting are chained),
# then predict the 0/1 label for every row of the test features.
naive_bayes = MultinomialNB().fit(X_train_tf, y_train)
predictions = naive_bayes.predict(X_test_tf)

In [14]:
# Compare the predicted labels with the true test labels and report the
# fraction classified correctly.
test_accuracy = accuracy_score(y_test, predictions)
print('Accuracy: ', test_accuracy)

Accuracy:  0.6869433739605038
