In [1]:
import numpy
import pandas

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve

from sklearn import svm

In [2]:
HEADERS = ["Show Number","Air Date","Round","Category","Value","Question","Answer"]

dataset = pandas.read_csv("JEOPARDY_CSV.csv", names=HEADERS, low_memory=False, nrows=10000)

## Import data

In [3]:
dataset.head()

Unnamed: 0,Show Number,Air Date,Round,Category,Value,Question,Answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was ...",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,No. 2: 1912 Olympian; football star at Carlisl...,Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,The city of Yuma in this state has a record av...,Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", th...",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Co...",John Adams


In [4]:
dataset.shape

(10000, 7)

In [5]:
dataset.describe()

Unnamed: 0,Show Number
count,10000.0
mean,4267.6037
std,1426.27971
min,62.0
25%,3403.0
50%,4506.0
75%,5419.0
max,6294.0


## Split data into InputSet and OutputSet

In [6]:
xset = dataset.iloc[:,:5].values
yset = dataset.iloc[:,5:].values

## Compile Redondances

In [7]:
inputSet = dataset.iloc[:,[0,2,3]]
inputSet[:5]

Unnamed: 0,Show Number,Round,Category
0,4680,Jeopardy!,HISTORY
1,4680,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES
2,4680,Jeopardy!,EVERYBODY TALKS ABOUT IT...
3,4680,Jeopardy!,THE COMPANY LINE
4,4680,Jeopardy!,EPITAPHS & TRIBUTES


## Categories weighting

In [8]:
def categories_weighting(inputSet, column):
    catList = list(set(inputSet[column]))
    categories = inputSet[column].copy()
    for ind in range(len(categories)):
        categories[ind] = catList.index(categories[ind])
    inputSet = inputSet.drop(columns=[column])
    inputSet[column] = categories
    return (inputSet, catList)


In [9]:
outputSet, categorieTable = categories_weighting(inputSet, 'Category')
outputSet, roundTable     = categories_weighting(outputSet, 'Round')
outputSet.head()

Unnamed: 0,Show Number,Category,Round
0,4680,920,2
1,4680,735,2
2,4680,581,2
3,4680,1674,2
4,4680,1235,2


## Split Data into training set and test set

In [10]:
yIndSet = range(len(yset))
xtrain, xtest, ytrain, ytest = train_test_split(outputSet, yIndSet, test_size=0.1, random_state=2)

## Classification and regression trees

In [16]:
c = DecisionTreeClassifier().fit(outputSet,yIndSet)

In [17]:
from sklearn import tree
tree.export_graphviz(c, out_file='tree.dot') 


## Clusterisation

In [13]:
cluster_inputSet = outputSet.drop(columns=['Round'])

In [14]:
xxtrain, xxtest, yytrain, yytest = train_test_split(cluster_inputSet, yIndSet, test_size=0.1, random_state=2)

from sklearn.neighbors import KNeighborsClassifier as kn
c = kn(n_neighbors=2).fit(xxtrain, yytrain)