In [145]:
import sys
import csv
import os.path
import math

def load_file(csvfile):
	#open file / create headers(column name) and data arrays
	if not os.path.isfile(csvfile):
		exit_error('can\'t find the file ' + csvfile)
	data = []
	with open(csvfile) as csv_iterator:
		data_reader = csv.reader(csv_iterator, delimiter=',')
		for row in data_reader:
			data.append(row)
	csv_iterator.close()
	if len(data) < 2:
		exit_error('file ' + csvfile + ' is empty')
	headers = data[0]
	del data[0]
	return headers, data

def is_number(value):
	#surprisingly hard to do in python
	try:
		float(value)
		return True
	except ValueError:
		return False

def all_numeric(data, index):
	for row in data:
		if row[index] != '' and is_number(row[index]) == False:
			return False
	return True

headers, data = load_file('./data/dataset_train.csv')

# get schools
array = []
for index, lib in enumerate(headers):
    if lib == 'Hogwarts House':
        school_col = index
for row in data:
    if row[school_col] != '':
        array.append(row[school_col])
uniquearray = list(sorted(set(array)))
schools = [x for x in set(uniquearray)]
schools.sort()

In [146]:
featuresToReject = ['Index','Hogwarts House','First Name','Last Name','Arithmancy','Care of Magical Creatures', \
                    'Defense Against the Dark Arts','Birthday']
featuresToKeep = []
features = []
y = []

def get_median(data, index):
    array = []
    for row in data:
        if row[index] != '':
            array.append(float(row[index]))
    count = len(array)
    if count == 0:
        return 0
    array.sort()
    return array[max(int(count * 0.50) - 1, 0)]

# Separate y = Hogwarts House
for index, column in enumerate(headers):
    if column == 'Hogwarts House':
        for row in data:
            y.append(row[index])
                    
# Which features do we keep?
for index, column in enumerate(headers):
    if column in featuresToReject:
        continue
    featuresToKeep.append(index)
    features.append([])
        
for index, numCol in enumerate(featuresToKeep):
    # Replace categorical value with numeric
    if headers[numCol] == 'Best Hand':
        for row in data:
            if row[numCol] == 'Left':
                row[numCol] = 0
            else:
                row[numCol] = 1
    # Replace NaN data with median
    median = get_median(data, numCol)
    for row in data:
        if row[numCol] == '':
            row[numCol] = median

# Create a marix for the data
for index, numCol in enumerate(featuresToKeep):
    for row in data:
        features[index].append(float(row[numCol]))

# Normalize data
for index, feature in enumerate(features):
    features[index] = [(x - min(feature)) / (max(feature) - min(feature)) for x in feature]
    
yraw = y

In [154]:
import numpy as np

def sigmoid(z):
	return 1 / (1 + np.exp(-z))

def loss(yhat, y):
    return sum(y * np.log(yhat) + (1 - y) * np.log(1 - yhat)) / (-(y.shape[0]))

def f(i, school):
	if i == school:
		return 1
	else:
		return 0

def train_lr():
    # logistic regression params
    learning_rate = 0.1
    param_stop = 0.000001

    # init. Add numpy lib to ease coding. checked with ademenet (only basic numpy functions)
    import numpy as np
    weights = []
    train = np.array(features).T
    for school in schools:
        y = np.array([f(i,school) for i in yraw])
        theta = np.zeros(train.shape[1])

        cost = 1
        for i in range(100000):
            temp = np.dot(train, theta)
            yhat = sigmoid(temp)
            prev_cost = cost
            cost = loss(yhat, y)
            if prev_cost - cost < param_stop:
                break
            gradient = np.dot((yhat - y), train) / y.shape[0]
            theta -= gradient * learning_rate
        print(school, theta)
        weights.append(theta)
    print(weights)
    np.save('./data/weights',weights)

train_lr()

Gryffindor [ 0.22923183  1.61887089 -2.89793269  1.32758465 -0.46311819  2.82159318
 -3.36259254 -3.75225688 -1.20082167 -1.86829443  3.56678892]
Hufflepuff [-0.16596614  6.04380175  6.03501805  2.75102749 -4.79976776 -7.76362684
  0.99240981  0.15407797 -4.28457719 -0.76809627 -4.76819701]
Ravenclaw [-0.35149453 -6.69104875  1.89879002  0.37291816  4.52149754  2.94899019
 -2.11279563 -2.32344026 -2.72170322  5.32953697 -3.06904866]
Slytherin [ 0.22273446 -3.37758548 -3.62339024 -5.82398656 -1.04250226 -1.63990046
  2.64047762  3.3822523   4.80749685 -2.34771938 -1.06001926]
[array([ 0.22923183,  1.61887089, -2.89793269,  1.32758465, -0.46311819,
        2.82159318, -3.36259254, -3.75225688, -1.20082167, -1.86829443,
        3.56678892]), array([-0.16596614,  6.04380175,  6.03501805,  2.75102749, -4.79976776,
       -7.76362684,  0.99240981,  0.15407797, -4.28457719, -0.76809627,
       -4.76819701]), array([-0.35149453, -6.69104875,  1.89879002,  0.37291816,  4.52149754,
        2.948

In [157]:
test = np.load('./data/weights.npy')
test[0]

array([ 0.22923183,  1.61887089, -2.89793269,  1.32758465, -0.46311819,
        2.82159318, -3.36259254, -3.75225688, -1.20082167, -1.86829443,
        3.56678892])