# STP 510 Module 5 Basics
## By Marisa Boyd

In [1]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import pandas as pd
import string
import json
import re
import os
from string import punctuation
from datetime import datetime

In [2]:
# Load the Jeopardy! question archive from disk.
# The context manager guarantees the file handle is closed as soon as
# parsing finishes (or fails), and the encoding is pinned so decoding does
# not depend on the platform's default locale.
with open('jeopardy.json', encoding='utf-8') as jeopardy_json_file:
    # json.load parses the whole document; here it yields a list of
    # dictionaries, one per question
    jeopardy_data = json.load(jeopardy_json_file)

#displays the first dictionary in the list
jeopardy_data[:1]

[{'category': 'HISTORY',
  'air_date': '2004-12-31',
  'question': "'For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory'",
  'value': '$200',
  'answer': 'Copernicus',
  'round': 'Jeopardy!',
  'show_number': '4680'}]

In [3]:
#display the keys (field names) of the first dictionary in the list
jeopardy_data[0].keys()

dict_keys(['category', 'air_date', 'question', 'value', 'answer', 'round', 'show_number'])

In [4]:
# Pivot the list of records into one list per field: each name on the left
# receives the values stored under the matching key, in record order.
(category, air_date, question, value,
 answer, jround, show_number) = (
    [record[field] for record in jeopardy_data]
    for field in ('category', 'air_date', 'question', 'value',
                  'answer', 'round', 'show_number')
)

In [5]:
#display the type of the object stored in the variable value
type(value)

list

In [6]:
#display the type of the first item stored in the value list
type(value[0])

str

In [7]:
# Normalise the value list to strings: any falsy entry (e.g. None) becomes
# the empty string '', every other entry is passed through str().
new_value = [str(entry) if entry else '' for entry in value]

In [8]:
# Label each question by its dollar value: 0 = low value, 1 = high value.
# A value is low when it ends in '$' followed by exactly three digits
# (e.g. '$200', '$800'). Every other string is labelled high: '$1000' and
# above, comma-formatted amounts such as '$2,000', and empty values.
easy_hard = [int(re.search(r'\$\d{3}$', amount) is None) for amount in new_value]

In [9]:
# spot-check the labelling: display the first 30 (value, label) pairs
list(zip(new_value, easy_hard))[:30]

[('$200', 0),
 ('$200', 0),
 ('$200', 0),
 ('$200', 0),
 ('$200', 0),
 ('$200', 0),
 ('$400', 0),
 ('$400', 0),
 ('$400', 0),
 ('$400', 0),
 ('$400', 0),
 ('$400', 0),
 ('$600', 0),
 ('$600', 0),
 ('$600', 0),
 ('$600', 0),
 ('$600', 0),
 ('$600', 0),
 ('$800', 0),
 ('$800', 0),
 ('$800', 0),
 ('$800', 0),
 ('$2,000', 1),
 ('$800', 0),
 ('$1000', 1),
 ('$1000', 1),
 ('$1000', 1),
 ('$1000', 1),
 ('$1000', 1),
 ('$400', 0)]

In [10]:
# Assemble the DataFrame: one column per extracted field (with the
# normalised value strings) plus the 0/1 label as 'prediction'.
# Column names and their source lists are paired positionally.
df = pd.DataFrame(
    data=dict(
        zip(
            ('category', 'air date', 'question', 'value',
             'answer', 'round', 'show number', 'prediction'),
            (category, air_date, question, new_value,
             answer, jround, show_number, easy_hard),
        )
    )
)

In [11]:
#display the first five rows of the dataframe (head() default)
df.head()

Unnamed: 0,category,air date,question,value,answer,round,show number,prediction
0,HISTORY,2004-12-31,"'For the last 8 years of his life, Galileo was...",$200,Copernicus,Jeopardy!,4680,0
1,ESPN's TOP 10 ALL-TIME ATHLETES,2004-12-31,'No. 2: 1912 Olympian; football star at Carlis...,$200,Jim Thorpe,Jeopardy!,4680,0
2,EVERYBODY TALKS ABOUT IT...,2004-12-31,'The city of Yuma in this state has a record a...,$200,Arizona,Jeopardy!,4680,0
3,THE COMPANY LINE,2004-12-31,"'In 1963, live on ""The Art Linkletter Show"", t...",$200,McDonald\'s,Jeopardy!,4680,0
4,EPITAPHS & TRIBUTES,2004-12-31,"'Signer of the Dec. of Indep., framer of the C...",$200,John Adams,Jeopardy!,4680,0


### Basic Attempt:
Used the category feature to predict the high/low value.

In [12]:
# Split into training and test partitions: the category text is the
# feature and the 0/1 prediction column is the target. The fixed
# random_state makes the shuffle reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['category'],
    df['prediction'],
    random_state=1,
)

In [14]:
# Fit the TF-IDF vectorizer on the training text only, then reuse the
# fitted vectorizer to transform the test text into the same feature space.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_tf = tfidf_vectorizer.fit_transform(X_train)
X_test_tf = tfidf_vectorizer.transform(X_test)

In [15]:
# Train a multinomial Naive Bayes classifier on the TF-IDF training
# features (fit returns the estimator itself), then label the test set.
naive_bayes = MultinomialNB().fit(X_train_tf, y_train)
predictions = naive_bayes.predict(X_test_tf)

In [16]:
#display the accuracy score for the model:
#compares the true test labels against the model's predictions
print('Accuracy: ', accuracy_score(y_test, predictions))

Accuracy:  0.6869433739605038


### New Attempt 1:
Used the question feature to predict the high/low value.

In [18]:
# Split into training and test partitions: the raw question text is the
# feature and the 0/1 prediction column is the target. The fixed
# random_state makes the shuffle reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['question'],
    df['prediction'],
    random_state=1,
)

In [19]:
# Fit a fresh TF-IDF vectorizer on the training questions only, then reuse
# the fitted vectorizer to transform the test questions.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_tf = tfidf_vectorizer.fit_transform(X_train)
X_test_tf = tfidf_vectorizer.transform(X_test)

In [20]:
# Train a new multinomial Naive Bayes classifier on the question TF-IDF
# features (fit returns the estimator itself), then label the test set.
naive_bayes = MultinomialNB().fit(X_train_tf, y_train)
predictions = naive_bayes.predict(X_test_tf)

In [21]:
#display the accuracy score for the model:
#compares the true test labels against the model's predictions
print('Accuracy: ', accuracy_score(y_test, predictions))

Accuracy:  0.7011229325318533


### New Attempt 2:
Used the lemmatized version of the question feature to predict the high/low value.

In [22]:
# Tokens to drop before lemmatizing: NLTK's English stop words, every
# character in string.punctuation, and a few multi-character tokens
# (runs of dots and paired quote marks).
# NOTE(review): '//n' looks like it may have been meant as a newline
# escape -- confirm; it is kept verbatim here.
english_stopwords = {
    *stopwords.words('english'),
    *punctuation,
    '..', '...', '....', '``', "''", '//n',
}

In [23]:
#defined a function to lemmatize the text in the question feature
def clean_wordlist(questionlist):
    """Tokenize, filter and lemmatize a collection of question strings.

    Each string is split with ``word_tokenize``; tokens present in the
    module-level ``english_stopwords`` set are dropped and the remaining
    tokens are lemmatized with ``WordNetLemmatizer``.

    Args:
        questionlist: iterable of question strings.

    Returns:
        One flat list holding the cleaned tokens of every question, in
        input order. An empty input yields an empty list.
    """
    lemmatizer = WordNetLemmatizer()

    # Accumulate into a list created up front: rebinding the result inside
    # the loop would keep only the last question's tokens and would leave
    # the name undefined (UnboundLocalError) for an empty input.
    clean_question = []
    for eachquestion in questionlist:
        clean_question.extend(
            lemmatizer.lemmatize(word)
            for word in word_tokenize(eachquestion)
            # NOTE(review): this membership test is case-sensitive, so
            # capitalised tokens such as 'The' are kept -- confirm intended.
            if word not in english_stopwords
        )
    return clean_question

In [24]:
#applied the defined function to the question list
#each question is passed as a one-element list and its cleaned tokens are
#re-joined into a single space-separated string
#the two prints bracket the run with ':MM:SS' timestamps (no hour component)
print(datetime.now().strftime(':%M:%S'))
allquestions = [' '.join(clean_wordlist([x])) for x in question]
print(datetime.now().strftime(':%M:%S'))

:35:11
:35:56


In [25]:
# Rebuild the DataFrame with the same columns as before, except that the
# 'question' column now holds the cleaned, lemmatized question strings.
# Column names and their source lists are paired positionally.
df = pd.DataFrame(
    data=dict(
        zip(
            ('category', 'air date', 'question', 'value',
             'answer', 'round', 'show number', 'prediction'),
            (category, air_date, allquestions, new_value,
             answer, jround, show_number, easy_hard),
        )
    )
)

In [26]:
#display the first five rows of the rebuilt dataframe (head() default)
df.head()

Unnamed: 0,category,air date,question,value,answer,round,show number,prediction
0,HISTORY,2004-12-31,'For last 8 year life Galileo house arrest esp...,$200,Copernicus,Jeopardy!,4680,0
1,ESPN's TOP 10 ALL-TIME ATHLETES,2004-12-31,'No 2 1912 Olympian football star Carlisle Ind...,$200,Jim Thorpe,Jeopardy!,4680,0
2,EVERYBODY TALKS ABOUT IT...,2004-12-31,"'The city Yuma state record average 4,055 hour...",$200,Arizona,Jeopardy!,4680,0
3,THE COMPANY LINE,2004-12-31,'In 1963 live The Art Linkletter Show company ...,$200,McDonald\'s,Jeopardy!,4680,0
4,EPITAPHS & TRIBUTES,2004-12-31,'Signer Dec. Indep. framer Constitution Mass. ...,$200,John Adams,Jeopardy!,4680,0


In [27]:
# Split into training and test partitions: the lemmatized question text is
# the feature and the 0/1 prediction column is the target. The fixed
# random_state makes the shuffle reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['question'],
    df['prediction'],
    random_state=1,
)

In [28]:
# Fit a fresh TF-IDF vectorizer on the lemmatized training questions only,
# then reuse the fitted vectorizer to transform the test questions.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_tf = tfidf_vectorizer.fit_transform(X_train)
X_test_tf = tfidf_vectorizer.transform(X_test)

In [29]:
# Train a new multinomial Naive Bayes classifier on the lemmatized-question
# TF-IDF features (fit returns the estimator itself), then label the test set.
naive_bayes = MultinomialNB().fit(X_train_tf, y_train)
predictions = naive_bayes.predict(X_test_tf)

In [30]:
#display the accuracy score for the model:
#compares the true test labels against the model's predictions
print('Accuracy: ', accuracy_score(y_test, predictions))

Accuracy:  0.7015839064776059
