# Build Available Data

In [1]:
import pandas as pd
import numpy as np

## SDSS

Datasets built with data from the `SpecPhoto` SDSS data table. The data is downloaded as a CSV file, produced by the following SQL query:

```sql
SELECT objID,ra,dec,class,modelMag_u AS u,modelMag_g AS g,modelMag_r AS r,modelMag_i AS i,modelMag_z AS z, z as redshift
INTO mydb.SpecPhoto
FROM SpecPhoto
```

In [2]:
# Path of the CSV export produced by the SQL query above (relative path,
# so it is resolved against the notebook's working directory).
filename = 'SpecPhoto_data.csv'

### `sdss_specphoto`

In [3]:
# load csv to dataframe
# (read with pandas' default parsing options; `filename` is set in an earlier cell)
df = pd.read_csv(filename)

# preview the first rows to check the column layout
df.head()

Unnamed: 0,objID,ra,dec,class,u,g,r,i,z,redshift
0,1237661464385356095,254.71961,20.675319,GALAXY,19.78247,18.51326,17.70357,17.26893,16.97296,0.15327
1,1237661463848550513,254.52927,20.236403,GALAXY,20.85782,18.6041,17.62333,17.17327,16.83212,0.079166
2,1237662336799408157,254.43515,21.19604,STAR,18.25609,17.21735,16.89063,16.75081,16.66703,7.5e-05
3,1237661464921964802,254.52014,21.203996,GALAXY,21.8164,19.32997,17.96913,17.32537,17.02673,0.223294
4,1237661465458835623,254.88178,21.637263,GALAXY,18.13901,17.30386,17.09862,16.98777,16.85346,0.022392


In [6]:
# build set of np arrays
# Columns are selected by name rather than by position, so a reordered or
# extended CSV export (for example one carrying an extra index column)
# cannot silently shift the wrong field into an array.
# NOTE: `id` shadows the builtin of the same name; the name is kept because
# the save cell further down refers to it.
id = df['objID'].to_numpy()
class_ = df['class'].to_numpy()
# the five magnitude columns, in u, g, r, i, z order -> one row per object
ugriz = df[['u', 'g', 'r', 'i', 'z']].to_numpy()
redshift = df['redshift'].to_numpy()

# sanity check: all arrays must share the same first dimension
id.shape, class_.shape, ugriz.shape, redshift.shape

((4613773,), (4613773,), (4613773, 5), (4613773,))

In [13]:
# one hot encode classes
# Encoding order: GALAXY -> [1, 0, 0], STAR -> [0, 1, 0], QSO -> [0, 0, 1].
classes = ['GALAXY', 'STAR', 'QSO']

# Map every label to its position in `classes` in one vectorized pass
# instead of a Python loop over millions of rows; labels that are missing
# or not listed in `classes` receive the code -1.
class_codes = pd.Categorical(df['class'], categories=classes).codes
if (class_codes < 0).any():
    # Fail loudly, as the per-row `classes.index` lookup would, rather than
    # letting -1 silently pick the last row of the identity matrix.
    unknown = df['class'][class_codes < 0].unique().tolist()
    raise ValueError(f'unexpected class labels: {unknown}')

# Row k of the identity matrix is the one-hot vector for class k (float64,
# the same dtype np.zeros produces).
class1hot = np.eye(len(classes))[class_codes]

class1hot.shape

(4613773, 3)

In [15]:
# Write every array into a single .npz archive.
# `class_` holds Python strings, which pandas hands over as an object-dtype
# array; np.savez would have to pickle it, and np.load then refuses to read
# that entry unless allow_pickle=True is passed. Converting to a fixed-width
# unicode array keeps the archive pickle-free and loadable with defaults.
np.savez(
    'sdss_specphoto.npz',
    id=id,
    ugriz=ugriz,
    redshift=redshift,
    class_=np.asarray(class_, dtype=str),
    class1hot=class1hot,
)