In [20]:
import sys
import csv
import os.path
import numpy as np

def load_file(csvfile):
	"""Read a comma-separated file and split it into a header row and data rows.

	Args:
		csvfile: Path of the CSV file to read.

	Returns:
		A ``(headers, data)`` tuple. ``headers`` is the first row of the file
		and ``data`` is the list of all remaining rows; every row is a list of
		strings exactly as produced by ``csv.reader``.

	Errors are reported through ``exit_error`` when the path is not a regular
	file, or when the file holds fewer than two rows (i.e. no data row after
	the header).
	NOTE(review): ``exit_error`` is not defined in this block; presumably it
	is provided elsewhere and terminates execution -- confirm.
	"""
	if not os.path.isfile(csvfile):
		exit_error('can\'t find the file ' + csvfile)
	# newline='' hands line-ending handling to the csv module, so line breaks
	# embedded inside quoted fields are preserved instead of being translated.
	with open(csvfile, newline='') as csv_iterator:
		rows = list(csv.reader(csv_iterator, delimiter=','))
	# The with-block has already closed the file; no explicit close() needed.
	if len(rows) < 2:
		exit_error('file ' + csvfile + ' is empty')
	return rows[0], rows[1:]

def is_number(value):
	"""Tell whether ``value`` can be converted with ``float()``.

	Args:
		value: Any object; typically a CSV cell (str).

	Returns:
		True when ``float(value)`` succeeds, False otherwise. Values that
		``float()`` rejects with a TypeError (e.g. ``None`` or a list) are
		reported as non-numeric instead of propagating the exception.
	"""
	try:
		float(value)
	except (TypeError, ValueError):
		# ValueError: unparsable string; TypeError: unsupported type.
		return False
	return True

def all_numeric(data, index):
	"""Check that column ``index`` contains only numeric or empty cells.

	Args:
		data: Iterable of rows, each indexable by ``index``.
		index: Position of the column to inspect in every row.

	Returns:
		False as soon as a cell is neither the empty string nor accepted by
		``is_number``; True otherwise (including when ``data`` is empty).
	"""
	return all(
		row[index] == '' or is_number(row[index])
		for row in data
	)

# Read the test set: headers is the first CSV row, data holds the remaining rows.
headers, data = load_file('./data/dataset_test.csv')

In [21]:
def prep_data(headers, data, medians):
	"""Build min-max normalised feature columns from raw CSV rows.

	Columns whose header appears in the reject list are dropped. For each
	remaining column, 'Best Hand' is encoded as 0 for 'Left' and 1 for any
	other value (an empty cell included), empty cells of the other columns
	are replaced by the matching entry of ``medians``, every value is
	converted to float, and the column is rescaled to [0, 1] using the
	minimum and maximum of the values passed in.

	Args:
		headers: Column names, in the same order as the cells of each row.
		data: List of rows (lists of cell values). It is left unmodified, so
			calling the function again on the same rows gives the same result.
		medians: Indexable of fill values, one per kept column, in
			kept-column order.

	Returns:
		A list with one list of floats per kept column. A column whose
		values are all identical is returned as all 0.0 (there is no range
		to scale by); an empty ``data`` yields empty columns.

	Raises:
		ValueError: If a non-empty cell of a kept column cannot be converted
			by ``float()``.
	"""
	featuresToReject = ['Index','Hogwarts House','First Name','Last Name','Arithmancy','Care of Magical Creatures', \
						'Defense Against the Dark Arts','Birthday']
	featuresToKeep = [index for index, column in enumerate(headers)
						if column not in featuresToReject]

	features = []
	for index, numCol in enumerate(featuresToKeep):
		median = medians[index]
		isBestHand = headers[numCol] == 'Best Hand'

		# Build the numeric column without writing back into the caller's rows.
		values = []
		for row in data:
			cell = row[numCol]
			if isBestHand:
				# Replace categorical value with numeric
				cell = 0 if cell == 'Left' else 1
			elif cell == '':
				# Replace missing data with the median
				cell = median
			values.append(float(cell))

		# Normalize data; min/max are computed once per column.
		if values:
			lowest = min(values)
			spread = max(values) - lowest
			if spread == 0:
				# Constant column: avoid dividing by zero.
				values = [0.0 for _ in values]
			else:
				values = [(x - lowest) / spread for x in values]
		features.append(values)

	return features

# Load the arrays stored under ./data (presumably written by a training step -- verify).
medians = np.load('./data/medians.npy')
schools = np.load('./data/schools.npy')
weights = np.load('./data/weights.npy')
# prep_data returns one list per kept feature; transposing gives one row per sample.
features = prep_data(headers, data, medians)
test = np.array(features).T
# Accumulator that the prediction loop in a later cell appends to.
probas = []

In [23]:
def sigmoid(z):
	"""Logistic function 1 / (1 + e^-z), applied element-wise via numpy.

	Args:
		z: Scalar or numpy array.

	Returns:
		The logistic of ``z``, with the same shape as ``z``.
	"""
	denominator = 1 + np.exp(-z)
	return 1 / denominator

# Start from an empty list every time this cell runs: appending to a list
# created in another cell would duplicate the entries on a re-run.
probas = []
# One probability vector per school: sigmoid of the samples times that school's weights.
for index, school in enumerate(schools):
    theta = weights[index]
    temp = np.dot(test, theta)
    yhat = sigmoid(temp)
    probas.append(yhat)

In [40]:
# Stack the per-school vectors and transpose: one row per sample, one column per school.
yhats = np.array(probas).T

# newline='' is what the csv module expects for files it writes to; without it
# platforms that translate '\n' end up with an extra '\r' (blank rows on Windows).
with open('./data/houses.csv', 'w', newline='') as outputfile:
    writer = csv.writer(outputfile, delimiter=',')
    writer.writerow(["Index", "Hogwarts House"])
    for index, row in enumerate(yhats):
        # Keep the school whose probability is the highest for this sample.
        writer.writerow((index, schools[row.argmax()]))
print('done')

done


In [41]:
# Show the loaded weight matrix (the prediction loop reads one row of it per school).
print(weights)

[[ 0.22923183  1.61887089 -2.89793269  1.32758465 -0.46311819  2.82159318
  -3.36259254 -3.75225688 -1.20082167 -1.86829443  3.56678892]
 [-0.16596614  6.04380175  6.03501805  2.75102749 -4.79976776 -7.76362684
   0.99240981  0.15407797 -4.28457719 -0.76809627 -4.76819701]
 [-0.35149453 -6.69104875  1.89879002  0.37291816  4.52149754  2.94899019
  -2.11279563 -2.32344026 -2.72170322  5.32953697 -3.06904866]
 [ 0.22273446 -3.37758548 -3.62339024 -5.82398656 -1.04250226 -1.63990046
   2.64047762  3.3822523   4.80749685 -2.34771938 -1.06001926]]
