**Open this notebook from google drive**<br>
**Go to "Edit" -> "Notebook settings" and enable GPU.**


In [None]:
# Run nvidia-smi in a shell to confirm that an NVIDIA GPU is enabled
# for this runtime (see the notebook settings step above).
!nvidia-smi

**Connect Google Drive to Google Colab and authorize access:**

In [None]:
# Mount Google Drive under /content/gdrive so that later cells can reach
# the project and data folders stored there.
from google.colab import drive
drive.mount('/content/gdrive')
# List the current working directory.
!ls

**Open our project "Galaxy Classifier" directory in Google Drive:**

In [None]:
# Make the project folder on the mounted Drive the working directory.
# Alternative (disabled): change to the Drive root instead.
# %cd /content/gdrive/My Drive/
%cd /content/gdrive/My Drive/Colab Notebooks/galaxy_classifier/
# List the contents of the project folder.
!ls

**Open the data directory for our "Galaxy Classifier" project in Google Drive:**

In [None]:
# Make the data folder the working directory; later cells build every
# dataset path from os.getcwd(), so this %cd decides where they look.
# NOTE(review): a later cell joins os.getcwd() with 'galaxy_data', which from
# this directory resolves to .../galaxy_data/galaxy_data - confirm that this
# matches the folder layout on Drive.
%cd /content/gdrive/My Drive/data/galaxy_data/
# List the contents of the data folder.
!ls

**Install all required libraries for our project:**

In [None]:
# !pip install -r ./requirements.txt

In [None]:
import os, random, shutil

import matplotlib.pyplot as plt
import pandas            as pd

from keras_preprocessing import image

%matplotlib inline  

import tensorflow as tf
# Print the installed TensorFlow version.
print(tf.__version__)
# Last expression of the cell, so its value is displayed: the name of the
# GPU device TensorFlow found.
tf.test.gpu_device_name()

**Check that TensorFlow can use the GPU; the output should look similar to this:**
```
2.2.0
'/device:GPU:0'
```

In [None]:
# Every dataset location is derived from the current working directory,
# i.e. the folder selected by the most recent %cd.
current_dir = os.getcwd()
data_dir = os.path.join(current_dir, 'galaxy_data')

# Inputs inside the data directory: the solutions CSV and the image folder.
training_solutions, training_images = (
    os.path.join(data_dir, entry)
    for entry in ('training_solutions_rev1.csv', 'images_training_rev1')
)

# Show the resolved locations, one per line.
print(data_dir, training_solutions, training_images, sep='\n')

**1. Introduction**

We are only interested in three major types of galaxies:

- Elliptical galaxies (early type);
- Lenticular galaxies (in-between);
- Spiral galaxies (late type);

To come up with a collection of images for each one of these classes, we subset the images in the Galaxy Zoo dataset based on the probability distributions available in the CSV file. The entire dataset comprises 61578 images.
To find a subset for each type, we define a path in the decision tree to select observations as follows:

- Elliptical: Class1.1 > 0.8 and Class7.1 > 0.4
- Lenticular: Class1.1 > 0.8 and Class7.2 > 0.4
- Spiral: Class1.2 > 0.8 and Class2.1 > 0.4

**2. Create the Dataset**

Read the training probabilities CSV file into a pandas DataFrame.

In [None]:
# Load the solutions table and shorten every column label ('Class' -> 'C')
# in one expression, so the filters below can use names such as 'C1.1'.
df = pd.read_csv(training_solutions).rename(
    columns=lambda label: label.replace('Class', 'C')
)

# Quick look at the table: size, column labels and the first rows.
print(df.shape)  # (61578, 38)
print(df.columns)
print(df.head())

In [None]:
# Boolean row masks implementing the decision-tree paths defined above.
is_elliptical = (df['C1.1'] > 0.8) & (df['C7.1'] > 0.4)
is_lenticular = (df['C1.1'] > 0.8) & (df['C7.2'] > 0.4)
is_spiral     = (df['C1.2'] > 0.8) & (df['C2.1'] > 0.4)

# Galaxy IDs of the rows selected for each type.
ellipticals = list(df.loc[is_elliptical, 'GalaxyID'])
lenticulars = list(df.loc[is_lenticular, 'GalaxyID'])
spirals     = list(df.loc[is_spiral, 'GalaxyID'])

print('Total number of elliptical examples: {0}'.format(len(ellipticals)))
print('Total number of lenticular examples: {0}'.format(len(lenticulars)))
print('Total number of spiral examples: {0}'.format(len(spirals)))

In [None]:
def _split_training_valid_datasets(cat_data, category = '', train_ratio = 0.75, seed = None):
    """Randomly copy the images of one class into training/validation folders.

    Source images are read from ``training_images`` and copied to
    ``<training_dir>/<category>`` and ``<valid_dir>/<category>``. These three
    notebook-level paths must be defined before the function is called. The
    resulting folders are later used by the dataset generators.

    NOTE: files already present in the destination folders are not removed,
    so calling this again with a different shuffle can leave the same image
    in both the training and the validation folder.

    Args:
        cat_data: Sequence of galaxy IDs; ID ``i`` refers to the source file
            ``<i>.jpg``. The sequence itself is not modified.
        category: Name of the class sub-directory to create and fill.
        train_ratio: Fraction of the IDs copied to the training folder; the
            remaining IDs are copied to the validation folder.
        seed: Optional seed for a reproducible split. With the default
            ``None`` the global ``random`` state is used.
    """
    training_dest = os.path.join(training_dir, category)
    valid_dest    = os.path.join(valid_dir, category)

    # exist_ok avoids the separate "does it exist?" check and is re-run safe.
    os.makedirs(training_dest, exist_ok=True)
    os.makedirs(valid_dest, exist_ok=True)

    # Shuffle a copy so the caller's list keeps its original order.
    shuffled = list(cat_data)
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)

    # The first `idx` shuffled IDs form the training set, the rest validation.
    idx = int(len(shuffled) * train_ratio)

    for dest, ids in ((training_dest, shuffled[:idx]), (valid_dest, shuffled[idx:])):
        for i in ids:
            file_name = str(i) + '.jpg'
            shutil.copyfile(os.path.join(training_images, file_name), os.path.join(dest, file_name))

    # Progress marker: this category is done.
    print(category + ' <<')

In [None]:
# Create directories for training and validation datasets
data_path    = os.path.join(current_dir, 'data')
training_dir = os.path.join(data_path, 'training')
valid_dir    = os.path.join(data_path, 'validation')

# exist_ok=True replaces the separate os.path.exists() checks: the cell can
# be re-run safely and there is no gap between checking and creating.
for directory in (data_path, training_dir, valid_dir):
    os.makedirs(directory, exist_ok=True)

print(data_path)

In [None]:
# Fill the training/validation folders one galaxy class at a time,
# keeping 75% of each class for training.
for galaxy_ids, class_name in (
    (ellipticals, 'elliptical'),
    (lenticulars, 'lenticular'),
    (spirals, 'spiral'),
):
    _split_training_valid_datasets(galaxy_ids, category = class_name, train_ratio = 0.75)

In [None]:
# Report how many files ended up in each split/class folder.
def _count_files(split, class_name):
    """Return the number of directory entries in data_path/<split>/<class_name>."""
    return len(os.listdir(os.path.join(data_path, split, class_name)))

print('Total train elliptical:', _count_files('training', 'elliptical'))
print('Total train lenticular:', _count_files('training', 'lenticular'))
print('Total train spiral:', _count_files('training', 'spiral'))


print('Total validation elliptical:', _count_files('validation', 'elliptical'))
print('Total validation lenticular:', _count_files('validation', 'lenticular'))
print('Total validation spiral:', _count_files('validation', 'spiral'))

In [None]:
# Finally, show a few examples from each galaxy class.
# One figure per class: up to three training images side by side under a
# figure-level title.
for class_name, figure_title in (
    ('elliptical', 'Elliptical galaxies'),
    ('lenticular', 'Lenticular galaxies'),
    ('spiral', 'Spiral galaxies'),
):
    class_dir = os.path.join(data_path, 'training', class_name)
    # os.listdir returns entries in arbitrary order; sorting makes the cell
    # show the same files on every run.
    sample_files = sorted(os.listdir(class_dir))[:3]

    fig, axes = plt.subplots(1, 3)
    for ax in axes:
        ax.axis('off')
    for ax, file_name in zip(axes, sample_files):
        img = image.load_img(os.path.join(class_dir, file_name), target_size=(150, 150))
        ax.imshow(img)

    # suptitle labels the whole figure; plt.title would only label the
    # last subplot that was drawn.
    fig.suptitle(figure_title)
    plt.show()
    # Release the figure so repeated runs do not accumulate open figures.
    plt.close(fig)


print('The final dataset : ' + data_path)