In [1]:
import pandas as pd
import spacy
import numpy as np
from collections import defaultdict
import re
import string
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
# Load the raw question dump; the path is relative, so JEOPARDY_CSV.csv must
# sit in the notebook's working directory.
data = pd.read_csv('JEOPARDY_CSV.csv')

In [3]:
len(data)

216930

In [4]:
data.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [5]:
data.columns

Index(['Show Number', ' Air Date', ' Round', ' Category', ' Value',
       ' Question', ' Answer'],
      dtype='object')

In [6]:
# Rename columns to snake_case identifiers. The raw header carries a leading
# space on every name after the first (' Air Date', ' Round', ...), which makes
# the original labels awkward to type. Order must match the 7 raw columns.
data.columns = ['show_id', 'date', 'round', 'category', 'value', 'question', 'answer'] 

In [7]:
data['round'].unique()

array(['Jeopardy!', 'Double Jeopardy!', 'Final Jeopardy!', 'Tiebreaker'],
      dtype=object)

In [8]:
# Parse prize strings such as "$2,000" into numbers: strip '$' and ',' and map
# the literal string 'None' to NaN (so the column ends up float when any row
# has no amount).
#
# Going through astype(str) keeps the cell idempotent: on a second execution
# the column already holds floats/NaN, which round-trip through "200.0"/"nan"
# instead of failing on string slicing. Anything that is not a number after
# stripping still raises, exactly as int() did before.
data['value'] = pd.to_numeric(
    data['value']
    .astype(str)
    .str.replace(r'[$,]', '', regex=True)
    .replace('None', np.nan)
)

In [9]:
# Year = first four characters of the date string (dates look like 2004-12-31).
data['year'] = data['date'].str[:4].astype('int64')

In [10]:
min(data['year']), max(data['year'])

(1984, 2012)

In [11]:
len(data['year'].unique())

29

In [12]:
data.head()

Unnamed: 0,show_id,date,round,category,value,question,answer,year
0,4680,2004-12-31,Jeopardy!,HISTORY,200.0,"For the last 8 years of his life, Galileo was ...",Copernicus,2004
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,200.0,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe,2004
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,200.0,The city of Yuma in this state has a record av...,Arizona,2004
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,200.0,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's,2004
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,200.0,"Signer of the Dec. of Indep., framer of the Co...",John Adams,2004


In [13]:
data['round'].unique()

array(['Jeopardy!', 'Double Jeopardy!', 'Final Jeopardy!', 'Tiebreaker'],
      dtype=object)

In [14]:
len(data['category'].unique())

27995

In [15]:
def get_vocab(content):
    """Compute the document frequency of every word in ``content``.

    Each question is split on whitespace and every distinct word is counted
    once per question, so the value stored for a word is the number of
    questions containing it (not its total number of occurrences).

    Parameters
    ----------
    content : iterable of str
        One string per question.

    Returns
    -------
    collections.defaultdict
        Maps word -> number of questions containing it (stored as float).
        Keys appear in first-seen order, so ``list(result)`` is reproducible
        from one kernel to the next.
    """
    vocab = defaultdict(float)
    for question in content:
        # dict.fromkeys de-duplicates like set() but keeps first-seen order;
        # iterating a set of strings would make the key order depend on hash
        # randomization.
        for word in dict.fromkeys(question.split()):
            vocab[word] += 1
    return vocab

In [16]:
# One sub-frame per year, ordered by the first appearance of each year in `data`.
dfs = [data.loc[data['year'] == yr] for yr in data['year'].unique()]

In [17]:
#https://towardsdatascience.com/topic-modeling-quora-questions-with-lda-nmf-aff8dce5e1dd
def clean_text(text):
    """Normalise a raw question string.

    Applied in this order: lowercase the text, delete ``[...]`` spans, delete
    every ``string.punctuation`` character, then delete any run of word
    characters that contains a digit. Whitespace is left untouched, so removed
    pieces can leave consecutive spaces behind.
    """
    patterns = (
        r'\[.*?\]',                                    # bracketed spans
        r'[%s]' % re.escape(string.punctuation),       # punctuation characters
        r'\w*\d\w*',                                   # tokens containing a digit
    )
    cleaned = text.lower()
    for pattern in patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned

# Single-column frame ('question') holding the cleaned text of every question.
df_clean = data['question'].apply(clean_text).to_frame()

# Load the spaCy pipeline once; lemmatizer() reads this module-level `nlp`.
# NOTE(review): 'en' is a shortcut name rather than a full model package name.
# Newer spaCy releases reportedly no longer resolve shortcuts and need e.g.
# 'en_core_web_sm' — confirm against the installed spaCy version.
nlp = spacy.load('en')
def lemmatizer(text):
    """Return ``text`` with each token replaced by its lemma, joined by single spaces.

    Tokenisation and lemmas come from the module-level ``nlp`` pipeline.
    """
    return " ".join(token.lemma_ for token in nlp(text))
    

In [18]:
# Lemmatise every cleaned question (one nlp() call per row, so this cell is slow).
df_clean['question_lemmatize'] = df_clean['question'].apply(lemmatizer)
# Drop the literal '-PRON-' marker from the lemmatised text
# (presumably the placeholder lemma the model emits for pronouns — confirm).
df_clean['question_lemmatize_clean'] = df_clean['question_lemmatize'].str.replace('-PRON-', '')

In [19]:
# Vocabulary as a list: position j in this list is column j of the term matrix.
vocab = list(get_vocab(df_clean['question_lemmatize_clean']))

In [None]:
# Binary document-term matrix: d[i, j] == 1 iff question i contains vocab[j].
# Allocated as int up front so building the DataFrame does not need to cast a
# float64 array into a second full-size int copy.
# NOTE(review): this is a dense len(data) x len(vocab) array and may not fit
# in memory for the full dataset — consider a sparse representation.
d = np.zeros((len(data), len(vocab)), dtype=int)

# Constant-time word -> column lookup; vocab.index(w) would rescan the whole
# vocabulary list for every token of every question.
word_to_col = {w: j for j, w in enumerate(vocab)}

for i, row in enumerate(df_clean['question_lemmatize_clean']):
    for w in row.split():
        d[i, word_to_col[w]] = 1
df = pd.DataFrame(d, columns=vocab, dtype=int)

In [None]:
# Append the round label as the last column. `.values` hands over a plain
# array, so rows are matched by position rather than by index label.
# The '#' prefix presumably keeps the name from colliding with a vocabulary
# word, since clean_text strips punctuation — confirm.
df['#round'] = data['round'].values

In [None]:
df

## LDA

In [None]:
# Work on a copy so columns added to `final` later do not touch `df`.
# The commented-out line is an alternative that keeps only rows whose round
# is 'Jeopardy!'.
# final = df[df['#round']=='Jeopardy!'].copy()
final = df.copy()

In [None]:
# Fit a 10-topic LDA on the binary term columns and get, for every question,
# its distribution over topics (one row per question, one column per topic).
# random_state pins the random initialisation so topic ids and assignments
# are the same on every Restart & Run All.
lda = LatentDirichletAllocation(n_components=10, random_state=42)
# Select the feature columns by name instead of "all but the last": once
# '#topic' has been appended to `final`, iloc[:, :-1] would keep the string
# column '#round' in the matrix and fit_transform would fail on re-run.
probs = lda.fit_transform(final.drop(columns=['#round', '#topic'], errors='ignore'))

In [None]:
# Index of the highest-probability topic for each question (row-wise argmax).
dominant_topic = probs.argmax(axis=1)

In [None]:
# Store each question's dominant topic id as a new column on `final`.
final['#topic'] = dominant_topic

In [None]:
final[final['#topic']==3]

In [None]:
# NOTE(review): dfs[0] is the sub-frame for the first year that appears in
# `data`, and iloc is positional within that sub-frame, so 13182 is neither a
# row label of `data` nor a row position in `final`. Confirm this is the row
# that was meant to be inspected.
dfs[0].iloc[13182]['question']

In [None]:
#https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#17howtofindtheoptimalnumberoftopicsforlda
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    One LdaMallet model is trained for every topic count in
    ``range(start, limit, step)`` and scored with a ``CoherenceModel``.

    Relies on names that are not defined in the visible cells of this
    notebook and must exist in the namespace before calling: ``gensim``,
    ``CoherenceModel`` and ``mallet_path``.

    Parameters:
    ----------
    dictionary : Gensim dictionary (used both as the model's id2word mapping
        and as the coherence dictionary)
    corpus : Gensim corpus
    texts : List of input texts
    limit : Upper bound on the number of topics (exclusive, as in ``range``)
    start : First number of topics to try
    step : Increment between successive numbers of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        # Use the `dictionary` argument as id2word; the previous code read an
        # unrelated global named `id2word`, ignoring the caller's dictionary.
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values