## Imports

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
import numpy as np
import csv
from sklearn import preprocessing

## Reading data

In [None]:
# Load every row of the CSV (header row included) as a list of strings.
filename = 'sdss.csv'
# newline='' leaves newline handling to the csv module, as its documentation
# requires for file objects handed to csv.reader.
with open(filename, newline='') as f:
    raw_data = list(csv.reader(f))

In [None]:
# Skip the header row. Column 13 is kept as the (string) label; every other
# column is converted to float and becomes a feature.
data_rows = raw_data[1:]
y_label = [record[13] for record in data_rows]
x_data = np.array(
    [[float(value) for value in record[:13] + record[14:]] for record in data_rows]
)
print(len(x_data))
print(len(y_label))

In [None]:
# Binarize the string labels; the fitted encoder is kept so predictions can be
# mapped back to class names later.
lb_encoding = preprocessing.LabelBinarizer()
y_data = lb_encoding.fit_transform(y_label)

In [None]:
# Preview the first seven encoded label rows.
y_data[:7]

In [None]:
# Class labels found by the binarizer during fit.
lb_encoding.classes_

In [None]:
# Decode one indicator row per column back to its label (assumes the data has
# exactly three classes).
lb_encoding.inverse_transform(np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]))

## Checking for useless data

In [None]:
# Standard deviation of every feature column; a value of exactly zero means the
# column is constant across all samples.
n_features = x_data.shape[1]
std_list = [np.std(x_data[:, col]) for col in range(n_features)]

constant_columns = [col for col, spread in enumerate(std_list) if spread == 0]
for col in constant_columns:
    print("Useless feature in index ", col)

<p> Two features have exactly the same value for every sample (zero standard deviation), so they carry no information and are removed. Their column indices are 0 and 9. </p>

In [None]:
# Keep every column except the two at indices 0 and 9.
dropped_columns = [0, 9]
valid_features = [
    col for col in range(x_data.shape[1]) if col not in dropped_columns
]
x_data = x_data[:, valid_features]

In [None]:
# Shape of the feature matrix after the column selection.
x_data.shape

## Normalizing data

In [None]:
## Subtracting the mean and dividing by the standard deviation
def normalize_data(x_data):
    """Return a standardized copy of ``x_data`` (zero mean, unit std per column).

    Args:
        x_data: Array-like of samples in rows and features in columns.

    Returns:
        A new floating-point array of the same shape. The input is left
        untouched, so integer inputs are not truncated by in-place assignment.
        Columns with zero standard deviation are only centered (they come out
        as all zeros) instead of being turned into NaN by a 0/0 division.
    """
    data = np.asarray(x_data)
    if not np.issubdtype(data.dtype, np.floating):
        data = data.astype(float)

    # The subtraction allocates a new array, so the caller's data is not modified.
    centered = data - data.mean(axis=0)
    spread = centered.std(axis=0)
    # Dividing by 1 leaves constant (already zeroed) columns unchanged.
    safe_spread = np.where(spread == 0, 1.0, spread)
    return centered / safe_spread

# Replace the raw features with their standardized version.
x_data = normalize_data(x_data)

In [None]:
# Sanity check: print the mean and standard deviation of every feature column.
for column_index in range(x_data.shape[1]):
    column = x_data[:, column_index]
    print(np.mean(column), np.std(column))

## Split into Train, Validation and Test

In [None]:
## Shuffle data
def shuffle_lists_the_same_way(x_data, y_data, seed=None):
    """Shuffle two equally long sequences with one shared random permutation.

    Args:
        x_data: Sequence or array of samples.
        y_data: Sequence or array of labels, aligned with ``x_data``.
        seed: Optional seed for a reproducible shuffle. When ``None`` (the
            default) NumPy's global random state is used.

    Returns:
        Tuple ``(x_shuffled, y_shuffled)`` of NumPy arrays in which element
        ``i`` of both arrays comes from the same original position.

    Raises:
        ValueError: If the two inputs differ in length.
    """
    # Explicit check rather than ``assert`` so it still runs under ``python -O``.
    if len(x_data) != len(y_data):
        raise ValueError(
            "x_data and y_data must have the same length, got "
            "{} and {}".format(len(x_data), len(y_data))
        )

    if seed is None:
        p = np.random.permutation(len(x_data))
    else:
        # A private generator keeps the global random state untouched.
        p = np.random.RandomState(seed).permutation(len(x_data))
    return np.array(x_data)[p], np.array(y_data)[p]

# Shuffle features and labels together so rows stay aligned.
x_data, y_data = shuffle_lists_the_same_way(x_data, y_data)

In [None]:
## Separating 60% train, 20% validation and 20% test
train_percentage = 0.6
validation_percentage = 0.2
test_percentage = 0.2

def separare_train_validation_test(x_data, y_data, train_percentage=0.6, validation_percentage=0.2, test_percentage=0.2):
    """Cut two aligned sequences into consecutive train/validation/test parts.

    The data is sliced in order (no shuffling is done here).

    Args:
        x_data: Sliceable sequence of samples.
        y_data: Sliceable sequence of labels, aligned with ``x_data``.
        train_percentage: Fraction of the samples used for training.
        validation_percentage: Fraction of the samples used for validation.
        test_percentage: Fraction of the samples used for testing.

    Returns:
        ``(x_train, y_train, x_validation, y_validation, x_test, y_test)``.
    """
    total = len(x_data)
    train_slice = int(train_percentage * total)
    validation_slice = train_slice + int(validation_percentage * total)

    fraction_sum = train_percentage + validation_percentage + test_percentage
    if np.isclose(fraction_sum, 1.0):
        # Each int() above truncates, so the three parts can add up to fewer
        # than ``total`` samples. When the fractions cover the whole dataset the
        # test part takes the remainder so no trailing sample is dropped.
        test_slice = total
    else:
        test_slice = validation_slice + int(test_percentage * total)

    x_train, y_train = x_data[:train_slice], y_data[:train_slice]
    x_validation, y_validation = x_data[train_slice:validation_slice], y_data[train_slice:validation_slice]
    x_test, y_test = x_data[validation_slice:test_slice], y_data[validation_slice:test_slice]
    return x_train, y_train, x_validation, y_validation, x_test, y_test

# Apply the split using the fractions configured for this notebook.
(
    x_train, y_train,
    x_validation, y_validation,
    x_test, y_test,
) = separare_train_validation_test(
    x_data,
    y_data,
    train_percentage=train_percentage,
    validation_percentage=validation_percentage,
    test_percentage=test_percentage,
)

In [None]:
# Show the feature and label shapes of each split: train, validation, test.
for features, labels in (
    (x_train, y_train),
    (x_validation, y_validation),
    (x_test, y_test),
):
    print(features.shape, labels.shape)

<p> The data were split randomly, so we check that the number of samples of each class is reasonably balanced across the three splits. </p>

In [None]:
# Column sums of the binarized labels give the sample count per class for
# each split: train, validation, test.
for split_labels in (y_train, y_validation, y_test):
    print(np.sum(split_labels, axis=0))

## Building Network

In [None]:
# Fully connected classifier: two hidden ReLU layers of 128 units and a
# three-unit output layer.
model = Sequential()
# Only the first layer needs the input shape; it is taken from the training
# data so it always matches the number of feature columns.
model.add(Dense(128, activation='relu', input_shape=(x_train.shape[1],)))
model.add(Dense(128, activation='relu'))
# softmax makes the outputs a probability distribution over the classes, which
# is what the categorical_crossentropy loss expects; relu outputs are unbounded
# and can be exactly zero for every class.
model.add(Dense(3, activation='softmax'))

In [None]:
# Print the layer-by-layer description of the network.
model.summary()

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss, and
# accuracy reported as an extra metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train on the training split for 20 epochs with one sample per batch.
# NOTE(review): batch_size=1 means one weight update per sample — confirm this
# is intentional.
response = model.fit(
    x_train,
    y_train,
    epochs=20,
    batch_size=1,
    verbose=1,
)

In [None]:
# Evaluate the trained model on the validation split.
score = model.evaluate(x_validation, y_validation, verbose=1)

In [None]:
# Show the validation result returned by model.evaluate.
print(score)

In [None]:
# Evaluate the trained model on the test split (overwrites the validation score).
score = model.evaluate(x_test, y_test, verbose=1)

In [None]:
# Show the test result returned by model.evaluate.
print(score)