## Imports

In [19]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
import numpy as np
import csv
from sklearn import preprocessing
import matplotlib.pyplot as plt

## Reading data

In [2]:
filename = 'sdss.csv'
raw_data = []
with open(filename) as f:
    reader = csv.reader(f)
    for row in reader:
        #print(row.split(','))
        raw_data.append(row)

In [3]:
x_data = []
y_label = []
for row in raw_data[1:]:
    y_label.append(row[13])   # Append label
    x_data.append(list(map(float, (row[0:13] + row[14:])))) # Convert to list of float
x_data = np.array(x_data)
print(len(x_data))
print(len(y_label))

5000
5000


## One Hot Encode

In [4]:
lb_encoding = preprocessing.LabelBinarizer()
lb_encoding.fit(y_label)
y_data = lb_encoding.transform(y_label)

In [5]:
y_data[:7]

array([[0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1]])

In [6]:
lb_encoding.classes_

array(['GALAXY', 'QSO', 'STAR'], dtype='<U6')

In [7]:
lb_encoding.inverse_transform(np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]))

array(['GALAXY', 'QSO', 'STAR'], dtype='<U6')

## Checking for useless data

In [8]:
std_list = [np.std(x_data[:,i]) for i in range(x_data.shape[1])]

for i,std in enumerate(std_list):
    if std == 0:
        print("Useless feature in index ",i)

Useless feature in index  0
Useless feature in index  9


<p> Here we found two features that are exactly the same for all samples. Therefore, they are useless and are going to be removed. Their indices are 0 and 9 </p>

In [9]:
valid_features = [i for i in range(x_data.shape[1]) if i not in [0,9]]
x_data = x_data[:,valid_features]

In [10]:
x_data.shape

(5000, 15)

## Normalizing data

In [11]:
## Subtracting by mean and dividing by standand deviation
def normalize_data(x_data):
    for i in range(x_data.shape[1]):
        x_data[:, i] = x_data[:, i] - np.mean(x_data[:, i])
        x_data[:, i] = x_data[:, i]/np.std(x_data[:, i])
    return x_data

x_data = normalize_data(x_data)

In [12]:
for i in range(x_data.shape[1]):
    print(np.mean(x_data[:,i]), np.std(x_data[:,i]))

5.684341886080802e-16 1.0
-6.892264536872972e-17 0.9999999999999999
-1.2509104863056565e-15 1.0
8.874678769643651e-16 1.0
-3.588240815588506e-17 0.9999999999999998
-3.872457909892546e-16 1.0
1.6470380614919123e-15 0.9999999999999999
-5.4001247917767614e-17 1.0
-3.552713678800501e-17 0.9999999999999999
6.536993168992922e-17 1.0
1.1795009413617663e-16 0.9999999999999999
6.536993168992922e-17 1.0
-4.9737991503207014e-17 1.0
-2.1593393739749445e-15 0.9999999999999998
4.263256414560601e-17 1.0


## Split  in Train, Validation and Test

In [13]:
## Shuffle data 
def shuffle_lists_the_same_way(x_data, y_data):
    assert len(x_data) == len(y_data)
    p = np.random.permutation(len(x_data))
    return np.array(x_data)[p], np.array(y_data)[p]

x_data, y_data = shuffle_lists_the_same_way(x_data, y_data)

In [14]:
## Separating 70% train and 30% test
train_percentage = 0.7
validation_percentage = 0
test_percentage = 0.3

def separare_train_validation_test(x_data, y_data, train_percentage=0.6, validation_percentage=0.2, test_percentage=0.2):
    total = len(x_data)
    train_slice = int(train_percentage*total)
    validation_slice = train_slice + int(validation_percentage*total)
    test_slice = validation_slice + int(test_percentage*total)
    
    x_train, y_train = x_data[:train_slice], y_data[:train_slice]
    x_validation, y_validation = x_data[train_slice:validation_slice], y_data[train_slice:validation_slice]
    x_test, y_test = x_data[validation_slice:test_slice], y_data[validation_slice:test_slice]
    return x_train, y_train, x_validation, y_validation, x_test, y_test

x_train, y_train, x_validation, y_validation, x_test, y_test = separare_train_validation_test(x_data, y_data, train_percentage, validation_percentage, test_percentage)

In [15]:
print(x_train.shape, y_train.shape)
print(x_validation.shape, y_validation.shape)
print(x_test.shape, y_test.shape)

(3500, 15) (3500, 3)
(0, 15) (0, 3)
(1500, 15) (1500, 3)


<p> Data were split randomly, checking with number of samples for each class are somewhat distributed </p>

In [16]:
print(np.sum(y_train, axis=0))
print(np.sum(y_test, axis=0))

[1742  299 1459]
[759 113 628]


## Building Network

# 3.1

In [43]:
train_acc = []
test_acc = []
response_list = []
hidden_nodes_list = [16, 32, 64, 128, 256, 512, 1024, 2048]

for hidden_nodes in hidden_nodes_list :
    model = Sequential()
    model.add(Dense(hidden_nodes, activation='relu', input_shape=([15])))
    model.add(Dense(3, activation='relu'))
    
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    
    response = model.fit(x_train, y_train, 
              batch_size=128, epochs=20, verbose=0)
    score = model.evaluate(x_test, y_test, verbose=0)
    
    train_acc.append(response.history['acc'][-1])
    test_acc.append(score[1])
    response_list.append(response.history['acc'])
    keras.backend.clear_session()

print("Train acc: ", train_acc)
print("Test acc: ", test_acc)

Train acc:  [0.781999999659402, 0.878857142993382, 0.8882857141494751, 0.9065714289120266, 0.8762857142175947, 0.8994285712923322, 0.8597142853736878, 0.944000000340598]
Test acc:  [0.7633333328564962, 0.8573333330154419, 0.8659999996821086, 0.8819999996821085, 0.844, 0.8686666666666667, 0.8580000004768371, 0.914666666507721]


# 3.2

In [44]:
train_acc = []
test_acc = []
response_list = []
hidden_nodes_list = [16, 32, 64, 128, 256, 512, 1024, 2048]

for hidden_nodes in hidden_nodes_list :
    model = Sequential()
    model.add(Dense(512, activation='relu', input_shape=([15])))
    model.add(Dense(hidden_nodes, activation='relu'))
    model.add(Dense(3, activation='relu'))
    
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    
    response = model.fit(x_train, y_train, 
              batch_size=128, epochs=20, verbose=0)
    score = model.evaluate(x_test, y_test, verbose=0)
    
    train_acc.append(response.history['acc'][-1])
    test_acc.append(score[1])
    response_list.append(response.history['acc'])
    keras.backend.clear_session()

print("Train acc: ", train_acc)
print("Test acc: ", test_acc)

Train acc:  [0.880857142652784, 0.6757142856461661, 0.8920000000681196, 0.909714285509927, 0.9271428570066179, 0.9042857143538339, 0.9597142857142857, 0.5700000001362392]
Test acc:  [0.8559999996821086, 0.6606666661898295, 0.87, 0.9093333338101705, 0.9053333334922791, 0.8973333330154419, 0.9399999998410543, 0.5693333338101705]


# 3.3

In [48]:
train_acc = []
test_acc = []
response_list = []
learning_rates = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05]

for learning_rate in learning_rates :
    model = Sequential()
    model.add(Dense(512, activation='relu', input_shape=([15])))
    model.add(Dense(1024, activation='relu'))
    model.add(Dense(3, activation='relu'))
    
    optimizer = keras.optimizers.Adam(lr=learning_rate)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    
    response = model.fit(x_train, y_train, 
              batch_size=128, epochs=20, verbose=0)
    score = model.evaluate(x_test, y_test, verbose=0)
    
    train_acc.append(response.history['acc'][-1])
    test_acc.append(score[1])
    response_list.append(response.history['acc'])
    keras.backend.clear_session()

print("Train acc: ", train_acc)
print("Test acc: ", test_acc)

Train acc:  [0.9540000000681196, 0.9225714286395482, 0.8771428574834551, 0.8791428572790964, 0.8891428572109767, 0.7348571425846645]
Test acc:  [0.9199999998410543, 0.9120000001589457, 0.8640000004768371, 0.889333333492279, 0.8873333338101705, 0.7493333328564962]


In [68]:
model = Sequential()
model.add(Dense(512, activation='relu', input_shape=([15])))
model.add(Dense(1024, activation='relu'))
model.add(Dense(3, activation='relu'))

optimizer = keras.optimizers.Adam(lr=0.001, decay=1.5)
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

response = model.fit(x_train, y_train, 
          batch_size=128, epochs=20, verbose=0)
score = model.evaluate(x_test, y_test, verbose=0)

train_acc = response.history['acc'][-1]
test_acc = score[1]
response_list.append(response.history['acc'])
keras.backend.clear_session()

print("Train acc: ", train_acc)
print("Test acc: ", test_acc)

Train acc:  0.5694285716329303
Test acc:  0.5680000004768372


# 3.4

In [72]:
train_acc = []
test_acc = []
response_list = []
batch_sizes = [1, 8, 32, 64, 128, 256, 512]

for batch_size in batch_sizes:
    model = Sequential()
    model.add(Dense(512, activation='relu', input_shape=([15])))
    model.add(Dense(1024, activation='relu'))
    model.add(Dense(3, activation='relu'))
    
    optimizer = keras.optimizers.Adam(lr=0.0001)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    
    response = model.fit(x_train, y_train, 
              batch_size=batch_size, epochs=20, verbose=0)
    score = model.evaluate(x_test, y_test, verbose=0)
    
    train_acc.append(response.history['acc'][-1])
    test_acc.append(score[1])
    response_list.append(response.history['acc'])
    keras.backend.clear_session()
    
print("Train acc: ", train_acc)
print("Test acc: ", test_acc)
keras.backend.clear_session()

Train acc:  [0.4605714285714286, 0.9431428571428572, 0.9205714285714286, 0.95, 0.9240000000681196, 0.7928571439470563, 0.9219999998637608]
Test acc:  [0.4533333334128062, 0.916666666507721, 0.8793333330154419, 0.9126666668256124, 0.8406666671435038, 0.7913333328564962, 0.8840000001589458]


## Final results using oversampling and K-fold