# Imports

In [1]:
import numpy as np
import pandas as pd
import random as rand


from keras.utils import to_categorical
from keras.layers import Dense, Activation, Conv1D, Dropout
from keras.models import Sequential

# Data pre-processing

Due to the large size of dataset, many columns that are not needed and some rows containing null values have already been dropped

In [2]:
data = pd.read_csv("dataset.csv")

data = data.drop(['class', 'epoch_cal'], axis=1)
data = data.dropna()
data['neo'] = data['neo'].replace(('Y','N'), (1,0))
data['pha'] = data['pha'].replace(('Y','N'), (1,0))

#num_classes = data['class'].nunique()
#classes = to_categorical(data['class'], num_classes)

data = data.astype('float32')
data /= 255

print(data.shape)

print(data.info)

(131124, 16)
<bound method DataFrame.info of         neo  pha         H  diameter    albedo         e         a         q  \
0       0.0  0.0  0.013333  3.683922  0.000353  0.000298  0.010859  0.010034   
1       0.0  0.0  0.016471  2.137255  0.000396  0.000902  0.010878  0.008376   
2       0.0  0.0  0.020902  0.967043  0.000839  0.001008  0.010464  0.007775   
3       0.0  0.0  0.011765  2.060392  0.001658  0.000348  0.009260  0.008439   
4       0.0  0.0  0.027059  0.418427  0.001075  0.000749  0.010094  0.008167   
...     ...  ...       ...       ...       ...       ...       ...       ...   
573686  0.0  0.0  0.061176  0.017612  0.000094  0.000717  0.011518  0.009411   
573687  0.0  0.0  0.060000  0.018020  0.000361  0.000289  0.012456  0.011538   
573688  0.0  0.0  0.060392  0.016024  0.000455  0.000326  0.012663  0.011610   
573689  0.0  0.0  0.064314  0.012820  0.000408  0.000827  0.012432  0.009811   
573690  0.0  0.0  0.065882  0.008361  0.000243  0.000899  0.011454  0.00882

In [None]:
data

Unnamed: 0,neo,pha,H,diameter,albedo,e,a,q,i,om,w,ad,n,tp_cal,per,moid
0,0.0,0.0,0.013333,3.683922,0.000353,0.000298,0.010859,0.010034,0.041545,0.314924,0.288618,0.011685,0.000839,79138.937500,6.600572,0.006254
1,0.0,0.0,0.016471,2.137255,0.000396,0.000902,0.010878,0.008376,0.136600,0.678528,1.216480,0.013379,0.000837,79140.085938,6.617298,0.004840
2,0.0,0.0,0.020902,0.967043,0.000839,0.001008,0.010464,0.007775,0.050945,0.666084,0.972809,0.013152,0.000887,79141.664062,6.243191,0.004056
3,0.0,0.0,0.011765,2.060392,0.001658,0.000348,0.009260,0.008439,0.028007,0.407101,0.591092,0.010082,0.001065,79139.257812,5.197775,0.004469
4,0.0,0.0,0.027059,0.418427,0.001075,0.000749,0.010094,0.008167,0.021049,0.555180,1.406464,0.012021,0.000936,79216.929688,5.915351,0.004297
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
573686,0.0,0.0,0.061176,0.017612,0.000094,0.000717,0.011518,0.009411,0.109669,0.674466,0.631966,0.013625,0.000768,79137.734375,7.209965,0.005500
573687,0.0,0.0,0.060000,0.018020,0.000361,0.000289,0.012456,0.011538,0.090955,0.168830,0.812197,0.013374,0.000683,79297.695312,8.108079,0.007620
573688,0.0,0.0,0.060392,0.016024,0.000455,0.000326,0.012663,0.011610,0.064425,0.247204,0.285630,0.013716,0.000666,79256.156250,8.311297,0.007947
573689,0.0,0.0,0.064314,0.012820,0.000408,0.000827,0.012432,0.009811,0.052861,0.314412,0.295110,0.015053,0.000685,79256.945312,8.085117,0.006092


Training parameters


In [3]:
INPUT_SHAPE = (16,1)
OPTIMIZER = 'adam'
LOSS = 'binary_crossentropy'
METRICS = 'accuracy'

BATCH_SIZE = 128
N_EPOCHS = 20
VERBOSE = 1
VALIDATION_SPLIT = 0.2

# Generator model
18 outputs as there are 18 columns in the dataset

In [4]:
def make_generator(latent_dim):
  model = Sequential()
  
  model.add(Dense(40, activation='relu', input_dim=latent_dim))
  model.add(Dense(18, activation='sigmoid'))

  return model

# Discriminator model
This is a binary CNN classifier, to determine whether the input is 'real' or not

In [5]:
def make_discriminator():
  model = Sequential()

  model.add(Conv1D(64, kernel_size=5, padding='same', activation='relu', input_shape=INPUT_SHAPE))

  model.add(Dense(1, activation='sigmoid'))

  return model

In [6]:
model = make_discriminator()

model.compile(loss=LOSS, optimizer=OPTIMIZER, metrics=METRICS)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 16, 64)            384       
                                                                 
 dense (Dense)               (None, 16, 1)             65        
                                                                 
Total params: 449
Trainable params: 449
Non-trainable params: 0
_________________________________________________________________


In [7]:
x_real = data
y_real = np.ones((131124))

Generate random, fake samples for training discriminator

In [8]:
def generate_fakes():
  neo = rand.choice([1,0])
  pha = rand.choice([1,0])
  diameter = rand.uniform(0,100000)
  albedo = rand.uniform(0,1)
  e = rand.uniform(0, 0.002)
  a = rand.uniform(0,1)
  q = rand.uniform(0,1)
  i = rand.uniform(0,1)
  om = rand.uniform(0,1)
  w = rand.uniform(0,2)
  ad = rand.uniform(0,0.05)
  n = rand.uniform(0,0.01)
  tp_cal = rand.uniform(0,100000)
  per = rand.uniform(0,10)
  moid = rand.uniform(0,0.1)

  X = np.array([neo, pha, diameter, albedo, e, a, q, i, om, w, ad, n, tp_cal, per, moid])
  
  return X

In [None]:
x_fake = pd.DataFrame()
for i in range(131124):
  x_fake = x_fake.append(pd.Series(generate_fakes()), ignore_index=True)
y_fake = np.zeros((131124))

x = x_real.append(x_fake)
y = pd.DataFrame(pd.Series(y_real))
y = y.append(pd.Series(y_fake), ignore_index=True)
history = model.fit(x, y, epochs=N_EPOCHS, verbose=VERBOSE, validation_split=VALIDATION_SPLIT)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  x_fake = x_fake.append(pd.Series(generate_fakes()), ignore_index=True)
  x_fake = x_fake.append(pd.Series(generate_fakes()), ignore_index=True)
  x_fake = x_fake.append(pd.Series(generate_fakes()), ignore_index=True)
  x_fake = x_fake.append(pd.Series(generate_fakes()), ignore_index=True)
  x_fake = x_fake.append(pd.Series(generate_fakes()), ignore_index=True)
  x_fake = x_fake.append(pd.Series(generate_fakes()), ignore_index=True)
  x_fake = x_fake.append(pd.Series(generate_fakes()), ignore_index=True)
  x_fake = x_fake.append(pd.Series(generate_fakes()), ignore_index=True)
  x_fake = x_fake.append(pd.Series(generate_fakes()), ignore_index=True)
  x_fake = x_fake.append(pd.Series(generate_fakes()), ignore_index=True)
  x_fake = x_fake.append(pd.Series(generate_fakes()), ignore_index=True)
  x_fake = x_fake.append(pd.Series(generate_fakes()), ignore_index=True)
  x_fake = x_fake.append(pd.Series(generate_fakes()), ignor

# Putting the GAN together

In [None]:
def make_gan(generator, discriminator):
  model = Sequential()
  model.add(generator)
  model.add(discriminator)
  model.compile(loss='binary_crossentropy', optimizer=OPTIMIZER)
  return model

In [None]:
generator = make_generator()
discriminator = make_discriminator()
gan = make_gan(generator, discriminator)