## Imports

In [1]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
import numpy as np
import csv
from sklearn import preprocessing
import matplotlib.pyplot as plt

Using TensorFlow backend.


## Reading data

In [2]:
filename = 'sdss.csv'
raw_data = []
with open(filename) as f:
    reader = csv.reader(f)
    for row in reader:
        #print(row.split(','))
        raw_data.append(row)

In [3]:
x_data = []
y_label = []
for row in raw_data[1:]:
    y_label.append(row[13])   # Append label
    x_data.append(list(map(float, (row[0:13] + row[14:])))) # Convert to list of float
x_data = np.array(x_data)
print(len(x_data))
print(len(y_label))

5000
5000


## One Hot Encode

In [4]:
lb_encoding = preprocessing.LabelBinarizer()
lb_encoding.fit(y_label)
y_data = lb_encoding.transform(y_label)

In [5]:
y_data[:7]

array([[0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 0, 1]])

In [6]:
lb_encoding.classes_

array(['GALAXY', 'QSO', 'STAR'], dtype='<U6')

In [7]:
lb_encoding.inverse_transform(np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]))

array(['GALAXY', 'QSO', 'STAR'], dtype='<U6')

## Checking for useless data

In [8]:
std_list = [np.std(x_data[:,i]) for i in range(x_data.shape[1])]

for i,std in enumerate(std_list):
    if std == 0:
        print("Useless feature in index ",i)

Useless feature in index  0
Useless feature in index  9


<p> Here we found two features that are exactly the same for all samples. Therefore, they are useless and are going to be removed. Their indices are 0 and 9 </p>

In [9]:
valid_features = [i for i in range(x_data.shape[1]) if i not in [0,9]]
x_data = x_data[:,valid_features]

In [10]:
x_data.shape

(5000, 15)

## Normalizing data

In [11]:
## Subtracting by mean and dividing by standand deviation
def normalize_data(x_data):
    for i in range(x_data.shape[1]):
        x_data[:, i] = x_data[:, i] - np.mean(x_data[:, i])
        x_data[:, i] = x_data[:, i]/np.std(x_data[:, i])
    return x_data

x_data = normalize_data(x_data)

In [12]:
for i in range(x_data.shape[1]):
    print(np.mean(x_data[:,i]), np.std(x_data[:,i]))

5.684341886080802e-16 1.0
-6.892264536872972e-17 0.9999999999999999
-1.2509104863056565e-15 1.0
8.874678769643651e-16 1.0
-3.588240815588506e-17 0.9999999999999998
-3.872457909892546e-16 1.0
1.6470380614919123e-15 0.9999999999999999
-5.4001247917767614e-17 1.0
-3.552713678800501e-17 0.9999999999999999
6.536993168992922e-17 1.0
1.1795009413617663e-16 0.9999999999999999
6.536993168992922e-17 1.0
-4.9737991503207014e-17 1.0
-2.1593393739749445e-15 0.9999999999999998
4.263256414560601e-17 1.0


## Split  in Train, Validation and Test

In [13]:
## Shuffle data 
def shuffle_lists_the_same_way(x_data, y_data):
    assert len(x_data) == len(y_data)
    p = np.random.permutation(len(x_data))
    return np.array(x_data)[p], np.array(y_data)[p]

x_data, y_data = shuffle_lists_the_same_way(x_data, y_data)

In [14]:
## Separating 70% train and 30% test
train_percentage = 0.7
validation_percentage = 0
test_percentage = 0.3

def separare_train_validation_test(x_data, y_data, train_percentage=0.6, validation_percentage=0.2, test_percentage=0.2):
    total = len(x_data)
    train_slice = int(train_percentage*total)
    validation_slice = train_slice + int(validation_percentage*total)
    test_slice = validation_slice + int(test_percentage*total)
    
    x_train, y_train = x_data[:train_slice], y_data[:train_slice]
    x_validation, y_validation = x_data[train_slice:validation_slice], y_data[train_slice:validation_slice]
    x_test, y_test = x_data[validation_slice:test_slice], y_data[validation_slice:test_slice]
    return x_train, y_train, x_validation, y_validation, x_test, y_test

x_train, y_train, x_validation, y_validation, x_test, y_test = separare_train_validation_test(x_data, y_data, train_percentage, validation_percentage, test_percentage)

In [15]:
print(x_train.shape, y_train.shape)
print(x_validation.shape, y_validation.shape)
print(x_test.shape, y_test.shape)

(3500, 15) (3500, 3)
(0, 15) (0, 3)
(1500, 15) (1500, 3)


<p> Data were split randomly, checking with number of samples for each class are somewhat distributed </p>

In [16]:
print(np.sum(y_train, axis=0))
print(np.sum(y_test, axis=0))

[1736  275 1489]
[765 137 598]


## Building Network

### 3.1 Testing number of nodes

In [None]:
train_acc = []
test_acc = []
response_list = []
hidden_nodes_list = [16, 32, 64, 128, 256, 512, 1024, 2048]

for hidden_nodes in hidden_nodes_list :
    model = Sequential()
    model.add(Dense(hidden_nodes, activation='relu', input_shape=([15])))
    model.add(Dense(3, activation='relu'))
    
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    
    response = model.fit(x_train, y_train, 
              batch_size=128, epochs=20, verbose=0)
    score = model.evaluate(x_test, y_test, verbose=0)
    
    train_acc.append(response.history['acc'][-1])
    test_acc.append(score[1])
    response_list.append(response.history['acc'])
    keras.backend.clear_session()

print("Train acc: ", train_acc)
print("Test acc: ", test_acc)

### 3.2 Testing number of nodes in second hidden layer

In [None]:
train_acc = []
test_acc = []
response_list = []
hidden_nodes_list = [16, 32, 64, 128, 256, 512, 1024, 2048]

for hidden_nodes in hidden_nodes_list :
    model = Sequential()
    model.add(Dense(512, activation='relu', input_shape=([15])))
    model.add(Dense(hidden_nodes, activation='relu'))
    model.add(Dense(3, activation='relu'))
    
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    
    response = model.fit(x_train, y_train, 
              batch_size=128, epochs=20, verbose=0)
    score = model.evaluate(x_test, y_test, verbose=0)
    
    train_acc.append(response.history['acc'][-1])
    test_acc.append(score[1])
    response_list.append(response.history['acc'])
    keras.backend.clear_session()### 3.1 Testing number of nodes

print("Train acc: ", train_acc)
print("Test acc: ", test_acc)

### 3.3 Testing learning rate

In [None]:
train_acc = []
test_acc = []
response_list = []
learning_rates = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05]

for learning_rate in learning_rates :
    model = Sequential()
    model.add(Dense(512, activation='relu', input_shape=([15])))
    model.add(Dense(1024, activation='relu'))
    model.add(Dense(3, activation='relu'))
    
    optimizer = keras.optimizers.Adam(lr=learning_rate)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    
    response = model.fit(x_train, y_train, 
              batch_size=128, epochs=20, verbose=0)
    score = model.evaluate(x_test, y_test, verbose=0)
    
    train_acc.append(response.history['acc'][-1])
    test_acc.append(score[1])
    response_list.append(response.history['acc'])
    keras.backend.clear_session()

print("Train acc: ", train_acc)
print("Test acc: ", test_acc)

In [None]:
model = Sequential()
model.add(Dense(512, activation='relu', input_shape=([15])))
model.add(Dense(1024, activation='relu'))
model.add(Dense(3, activation='relu'))

optimizer = keras.optimizers.Adam(lr=0.001, decay=1.5)
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])

response = model.fit(x_train, y_train, 
          batch_size=128, epochs=20, verbose=0)
score = model.evaluate(x_test, y_test, verbose=0)

train_acc = response.history['acc'][-1]
test_acc = score[1]
response_list.append(response.history['acc'])
keras.backend.clear_session()

print("Train acc: ", train_acc)
print("Test acc: ", test_acc)

### 3.4 Testing batch size

In [None]:
train_acc = []
test_acc = []
response_list = []
batch_sizes = [1, 8, 32, 64, 128, 256, 512]

for batch_size in batch_sizes:
    model = Sequential()
    model.add(Dense(512, activation='relu', input_shape=([15])))
    model.add(Dense(1024, activation='relu'))
    model.add(Dense(3, activation='relu'))
    
    optimizer = keras.optimizers.Adam(lr=0.0001)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    
    response = model.fit(x_train, y_train, 
              batch_size=batch_size, epochs=20, verbose=0)
    score = model.evaluate(x_test, y_test, verbose=0)
    
    train_acc.append(response.history['acc'][-1])
    test_acc.append(score[1])
    response_list.append(response.history['acc'])
    keras.backend.clear_session()
    
print("Train acc: ", train_acc)
print("Test acc: ", test_acc)

## Oversampling Quasar class

In [17]:
def oversample(x_data, y_data):
    i = 0
    x_oversampled = list(x_data)
    y_oversampled = list(y_data)
    for x,y in zip(x_data, y_data):
        if y[1] == 1:
            for i in range(5):
                x_oversampled.append(x)
                y_oversampled.append(y)
    return shuffle_lists_the_same_way(x_oversampled, y_oversampled)
            
x_data_oversampled, y_data_oversampled = oversample(x_data, y_data)

In [19]:
print(np.sum(y_data_oversampled, axis=0))
print(np.sum(y_data_oversampled))

[2501 2472 2087]
7060


## K fold

In [22]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=3)
train_acc = []
test_acc = []
response_list = []
for train_index, test_index in skf.split(x_data_oversampled, lb_encoding.inverse_transform(y_data_oversampled)):
    x_data_fold = x_data_oversampled[train_index]
    y_data_fold = y_data_oversampled[test_index]
    
    model = Sequential()
    model.add(Dense(512, activation='relu', input_shape=([15])))
    model.add(Dense(1024, activation='relu'))
    model.add(Dense(3, activation='relu'))
    
    optimizer = keras.optimizers.Adam(lr=0.0001)
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    
    response = model.fit(x_train, y_train, 
              batch_size=512, epochs=20, verbose=0)
    score = model.evaluate(x_test, y_test, verbose=0)
    
    train_acc.append(response.history['acc'][-1])
    test_acc.append(score[1])
    response_list.append(response.history['acc'])
    keras.backend.clear_session()

print("Train acc: ", train_acc)
print("Test acc: ", test_acc)

Train acc:  [0.9251428602082389, 0.9148571406773158, 0.910571425642286]
Test acc:  [0.9060000004768372, 0.8933333334922791, 0.9073333328564962]


In [25]:
print("Final accuracy in oversampled test data: {} +- {}".format(np.mean(test_acc), np.std(test_acc)))

Final accuracy in oversampled test data: 0.9022222222752041 +- 0.0063089196991281205


In [27]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               8192      
_________________________________________________________________
dense_2 (Dense)              (None, 1024)              525312    
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 3075      
Total params: 536,579
Trainable params: 536,579
Non-trainable params: 0
_________________________________________________________________
