# Build Datasets

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

## SDSS

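The datasets are built from the SDSS `SpecPhoto` table. The data is downloaded as a CSV file produced by the following SQL query (note that the table's own `z` column is exported as `redshift`, while the name `z` is reused for the z-band model magnitude `modelMag_z`):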

```sql
SELECT objID,ra,dec,class,modelMag_u AS u,modelMag_g AS g,modelMag_r AS r,modelMag_i AS i,modelMag_z AS z, z as redshift
INTO mydb.SpecPhoto
FROM SpecPhoto
```

In [2]:
# Path of the CSV export that the dataset cells below read with pd.read_csv.
filename = 'SpecPhoto_data.csv'

### `sdss_specphoto_ugriz_redshift`

In [3]:
# read the SpecPhoto export and discard the columns this dataset does not use
df = pd.read_csv(filename).drop(columns=['ra', 'dec', 'class'])

df.head()

Unnamed: 0,objID,u,g,r,i,z,redshift
0,1237661464385356095,19.78247,18.51326,17.70357,17.26893,16.97296,0.15327
1,1237661463848550513,20.85782,18.6041,17.62333,17.17327,16.83212,0.079166
2,1237662336799408157,18.25609,17.21735,16.89063,16.75081,16.66703,7.5e-05
3,1237661464921964802,21.8164,19.32997,17.96913,17.32537,17.02673,0.223294
4,1237661465458835623,18.13901,17.30386,17.09862,16.98777,16.85346,0.022392


In [4]:
# build set of np arrays
# Column layout of df is positional: first column = object id,
# last column = target, everything in between = features.
ids = df.iloc[:, 0].to_numpy()
X = df.iloc[:, 1:-1].to_numpy()
# Index.to_numpy() yields an object-dtype array, which np.savez has to pickle and
# np.load then refuses to read with its default allow_pickle=False. Casting to a
# unicode string array (the dtype y_cols already has) keeps the archive pickle-free.
X_cols = df.columns[1:-1].to_numpy().astype(str)
y = df.iloc[:, -1].to_numpy()
y_cols = np.array([df.columns[-1]])

ids.shape, X.shape, X_cols.shape, y.shape, y_cols.shape

((4613773,), (4613773, 5), (5,), (4613773,), (1,))

In [5]:
# write all arrays into one .npz archive, one entry per keyword
np.savez(
    'sdss_specphoto_ugriz_redshift.npz',
    ids=ids,
    X=X,
    y=y,
    X_cols=X_cols,
    y_cols=y_cols,
)

### `sdss_specphoto_ugriz_class`

In [6]:
# re-read the SpecPhoto export, discard the unused columns and
# order the remaining ones as: id, magnitudes, class label
df = (
    pd.read_csv(filename)
    .drop(columns=['ra', 'dec', 'redshift'])
    [['objID', 'u', 'g', 'r', 'i', 'z', 'class']]
)

df.head()

Unnamed: 0,objID,u,g,r,i,z,class
0,1237661464385356095,19.78247,18.51326,17.70357,17.26893,16.97296,GALAXY
1,1237661463848550513,20.85782,18.6041,17.62333,17.17327,16.83212,GALAXY
2,1237662336799408157,18.25609,17.21735,16.89063,16.75081,16.66703,STAR
3,1237661464921964802,21.8164,19.32997,17.96913,17.32537,17.02673,GALAXY
4,1237661465458835623,18.13901,17.30386,17.09862,16.98777,16.85346,GALAXY


In [7]:
# build set of np arrays
ids = df[df.columns[0]].to_numpy()
X = df[df.columns[1:-1]].to_numpy()
X_cols = df.columns[1:-1].to_numpy()
y_cols = np.array([df.columns[-1]])

In [8]:
# one hot encode classes
# Column k of `y` corresponds to classes[k].
classes = ['GALAXY', 'STAR', 'QSO']
labels = df[df.columns[-1]]

# Look up every label's position in `classes` in one vectorised pass instead of
# calling list.index() once per row; labels missing from `classes` get code -1.
codes = pd.Categorical(labels, categories=classes).codes
if (codes < 0).any():
    # Keep the ValueError that the per-row list.index() lookup raised for an
    # unlisted label, so a bad label fails loudly instead of being mis-encoded.
    raise ValueError(
        'unexpected class labels: {}'.format(labels[codes < 0].unique().tolist())
    )

# Row k of the identity matrix is the one-hot vector for class k; the result is
# float64 with shape (n_rows, len(classes)), also when df has no rows.
y = np.eye(len(classes))[codes]

y.shape

(4613773, 3)

In [9]:
# write all arrays into one .npz archive, one entry per keyword
# `ids` is stored as well, so the rows of X / y can be traced back to their
# objID, matching the entries saved in sdss_specphoto_ugriz_redshift.npz.
np.savez(
    'sdss_specphoto_ugriz_class.npz',
    ids=ids,
    X=X,
    X_cols=X_cols,
    y=y,
    y_cols=y_cols,
    classes=np.array(classes),
)