# Imports

In [37]:
import numpy as np
import pandas as pd
import random as rand


from keras.utils import to_categorical
from keras.layers import Dense, Activation, Conv1D, Conv2D, Dropout
from keras.models import Sequential

# Data pre-processing

Due to the large size of dataset, many columns that are not needed and some rows containing null values have already been dropped

In [2]:
data = pd.read_csv("dataset.csv")

data = data.drop(['class', 'epoch_cal'], axis=1)
data = data.dropna()
data['neo'] = data['neo'].replace(('Y','N'), (1,0))
data['pha'] = data['pha'].replace(('Y','N'), (1,0))

#num_classes = data['class'].nunique()
#classes = to_categorical(data['class'], num_classes)

data = data.astype('float32')
data /= 255

print(data.shape)

print(data.info)

(131124, 16)
<bound method DataFrame.info of         neo  pha         H  diameter    albedo         e         a         q  \
0       0.0  0.0  0.013333  3.683922  0.000353  0.000298  0.010859  0.010034   
1       0.0  0.0  0.016471  2.137255  0.000396  0.000902  0.010878  0.008376   
2       0.0  0.0  0.020902  0.967043  0.000839  0.001008  0.010464  0.007775   
3       0.0  0.0  0.011765  2.060392  0.001658  0.000348  0.009260  0.008439   
4       0.0  0.0  0.027059  0.418427  0.001075  0.000749  0.010094  0.008167   
...     ...  ...       ...       ...       ...       ...       ...       ...   
573686  0.0  0.0  0.061176  0.017612  0.000094  0.000717  0.011518  0.009411   
573687  0.0  0.0  0.060000  0.018020  0.000361  0.000289  0.012456  0.011538   
573688  0.0  0.0  0.060392  0.016024  0.000455  0.000326  0.012663  0.011610   
573689  0.0  0.0  0.064314  0.012820  0.000408  0.000827  0.012432  0.009811   
573690  0.0  0.0  0.065882  0.008361  0.000243  0.000899  0.011454  0.00882

Training parameters


In [3]:
INPUT_SHAPE = (16,1)
OPTIMIZER = 'adam'
LOSS = 'binary_crossentropy'
METRICS = 'accuracy'

BATCH_SIZE = 128
N_EPOCHS = 20
VERBOSE = 1
VALIDATION_SPLIT = 0.2

# Generator model
18 outputs as there are 18 columns in the dataset

In [4]:
def make_generator(latent_dim):
  model = Sequential()
  
  model.add(Dense(40, activation='relu', input_dim=latent_dim))
  model.add(Dense(18, activation='sigmoid'))

  return model

# Discriminator model
This is a binary CNN classifier, to determine whether the input is 'real' or not

In [40]:
def make_discriminator():
  model = Sequential()

  model.add(Conv1D(64, kernel_size=5, padding='same', activation='relu', input_shape=INPUT_SHAPE))

  model.add(Dense(1, activation='sigmoid'))

  return model

In [39]:
model = make_discriminator()

model.compile(loss=LOSS, optimizer=OPTIMIZER, metrics=METRICS)
model.summary()

ValueError: ignored

In [33]:
x_real = data
y_real = np.ones((131124,1))

Generate random, fake samples for training discriminator

In [28]:
def generate_fakes():
  neo = rand.choice([1,0])
  pha = rand.choice([1,0])
  diameter = rand.uniform(0,100000)
  albedo = rand.uniform(0,1)
  e = rand.uniform(0, 0.002)
  a = rand.uniform(0,1)
  q = rand.uniform(0,1)
  i = rand.uniform(0,1)
  om = rand.uniform(0,1)
  w = rand.uniform(0,2)
  ad = rand.uniform(0,0.05)
  n = rand.uniform(0,0.01)
  tp_cal = rand.uniform(0,100000)
  per = rand.uniform(0,10)
  moid = rand.uniform(0,0.1)

  X = pd.DataFrame(data = [neo, pha, diameter, albedo, e, a, q, i, om, w, ad, n, tp_cal, per, moid], columns = x_real.columns)
  print(X.shape)
  return X

In [29]:

x_fake = pd.DataFrame(columns = x_real.columns)
for i in range(131124):
  # fix below line - needs to add as a new row, not each value from the function as its own separate row
  current_fake = generate_fakes()

  print(current_fake)
  x_fake = pd.concat([x_fake, current_fake], ignore_index=True)
  print(x_fake.shape)
  print(x_fake)
y_fake = np.zeros(131124)

x = pd.concat([x_real, x_fake])
y = pd.DataFrame(pd.Series(y_real))
y = pd.concat([y, pd.Series(y_fake)])


ValueError: ignored

In [34]:
print(x_real.shape, y_real.shape)
print(y_real)
history = model.fit(x_real, y_real, epochs=N_EPOCHS, verbose=VERBOSE, validation_split=VALIDATION_SPLIT)

(131124, 16) (131124, 1)
[[1.]
 [1.]
 [1.]
 ...
 [1.]
 [1.]
 [1.]]
Epoch 1/20


ValueError: ignored

# Putting the GAN together

In [None]:
def make_gan(generator, discriminator):
  model = Sequential()
  model.add(generator)
  model.add(discriminator)
  model.compile(loss='binary_crossentropy', optimizer=OPTIMIZER)
  return model

In [None]:
generator = make_generator()
discriminator = make_discriminator()
gan = make_gan(generator, discriminator)