# PCA to Stellar Spectral Classification


From Deeming 1963 applied to late stars

He discussed the specific case of late-type giants. He used 84 G and K class III giants. 


In [6]:
#Bokeh plotting
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import Span, Label, Arrow, NormalHead
from bokeh.models import HoverTool, tools, ColumnDataSource, CustomJS, Slider, BoxAnnotation
from bokeh.layouts import  column, row
from bokeh.palettes import Category20_18
import re
import glob, os
from astropy.io import fits
import urllib
#from urllib import urlretrieve
from sklearn.decomposition import TruncatedSVD
#Garbage collector
import gc

from pyraf import iraf
output_notebook()
iraf.noao()
iraf.noao.onedspec()
iraf.dataio()

Defined the classes. Lines and spec. Add a new attribute with the general type of star. 

In [7]:
#Emission Lines
class line(object):
    """Line with the name of the line, line center and the region around it. 
    This region around it will be use to fit it using IRAF function. It has to be a list of numbers"""
    def __init__(self,name,linecenter,regiontofit):
        self.name = name
        self.linecenter = linecenter
        self.regiontofit = regiontofit
        
def find_nearest(array,value):
    idx = (np.abs(array-value)).argmin()
    return idx, array[idx]

def isDigit(x):
    try:
        float(x)
        return True
    except ValueError:
        return False
    
def gaussian(x, mu, sig,core):
    """Gaussian"""
    return core*np.exp(-np.power(x - mu, 2.) / (2 * np.power(sig, 2.)))

        
class spec2(object):
    """Spectra created from the url. Probably in the future better with the fiber
    , plate and MJD"""
    def __init__(self,url):
        self.url = url
        self.name = url.split('/')[-1]
        self.wave, self.flux, self.model, self.type, self.typegeneral = self.get_type()
       
            
    def get_type(self, verbose=False):
        """Get the type, flux, wavelenght and model from the SDSS TABLE. 
        It downloadst he fit if it doesnt exist. Had to do it like that becase it would waint until the download
        to call the other method"""
        #Download if it doesnt't exists. 
        if not os.path.isfile(self.name):
            file_name = self.name
            u = urllib.urlopen(self.url)
            f = open(file_name, 'wb')
            meta = u.info()
            file_size = int(meta.getheaders("Content-Length")[0])
            #print "Downloading: %s Bytes: %s" % (file_name, file_size)

            file_size_dl = 0
            block_sz = 8192
            while True:
                buffer = u.read(block_sz)
                if not buffer:
                    break

                file_size_dl += len(buffer)
                f.write(buffer)
                status = r"%10d  [%3.2f%%]" % (file_size_dl, file_size_dl * 100. / file_size)
                status = status + chr(8)*(len(status)+1)
                if verbose:
                    print status,

            f.close()
            #print('Downloaded '+ self.name)
        
        ob = fits.open(self.name,memmap=False)
        #Get data and save wavelenft in x, and flux on y
        dataob = ob[1].data
        x = 10**(dataob['loglam'])
        y2 = dataob['model']
        y = dataob['flux']
        typee = ob[2].data['SUBCLASS'][0]
        ob.close()
        #Delete
        del ob
        del dataob
        gc.collect()
        
        npattern = re.compile('[O,B,A,F,G,K,WD,M,CV]{1,2}')
        typegeneral = npattern.findall(typee)
        if 'WD' in typegeneral:
            typegeneral = 'WD'
        elif len(typegeneral) == 0:
            typegeneral = 'Other'
        else:
            typegeneral = typegeneral[0]
        #delete to aboid too many oepenf files see
        #http://docs.astropy.org/en/stable/io/fits/appendix/faq.html#i-m-opening-many-fits-files-in-a-loop-and-getting-oserror-too-many-open-files
        return x,y,y2, typee, typegeneral
    
    def get_iraffits(self, modelfit = True):
        """From the SDSS table creates a fits file to work with iraf"""
        #Which one to convert to iraf fts
        if modelfit:
            fluxormodel = self.model
        else:
            fluxormodel = self.flux
        #Name of iraf .txt file and create text file
        
        namenewfits = '{name}.txt'.format(name=self.name)
        with open(namenewfits,'w') as file:
            for x,y in zip(self.wave,fluxormodel):
                file.write('{}\t{}\n'.format(x,y))
                
        #Create fits file from text file interpolation to work wtih IRAF functions
        iraffitsname = 'iraf'+self.name
        if os.path.exists(iraffitsname+'.fits'):
            os.remove(iraffitsname+'.fits')
        iraf.rspectext(namenewfits,iraffitsname,dtype='interp')
        #Remove the text file
        os.remove(namenewfits)
        
    
    def fit_lines(self, dicoflines, errorestimate = True, verbose='No'):
        """Fit lines using gaussian fitport. It populates a diccionary with the lines
         and the fit of the lines. The parameter if a diccionary of lines objects. 
         The class is define before. Verbose can be yes or NO"""
        self.linesdicall = []
        errorparam = []
        if not os.path.exists('iraf'+self.name):
            self.get_iraffits()
            
        #Initialize files log and lines
        ! echo '' > fited.log

        for indexline,linesfit in enumerate(dicoflines):
            regionf = "{} {}".format(linesfit.regiontofit[0],linesfit.regiontofit[1])
            #wavelenght
            xlimns = [find_nearest(self.wave,i)[0] for i in linesfit.regiontofit   ]
            wavex = self.wave[xlimns[0]:xlimns[1]]
            lineszero = linesfit.linecenter
            ! echo '$lineszero' > lines.lines

            filename = 'iraf'+self.name
            #Error estimation iraf: http://stsdas.stsci.edu/cgi-bin/gethelp.cgi?fitprofs
            if errorestimate:        
                iraf.fitprofs(filename,pos='lines.lines', reg=regionf ,
                              fitbackground= 'yes', 
                              logfile='fited.log'
                              ,nerrsample='100',sigma0='4',invgain='4',verbose=verbose)
            else:
                iraf.fitprofs(filename,pos='lines.lines', reg=regionf ,
                              fitbackground= 'yes', 
                              logfile='fited.log', verbose=verbose)

            #Plotting the gaussian
            #Find in log file of fitprofs.  
            npattern = re.compile('[-\d.]+')
            npattern2 = re.compile('[-\d.E?]+') #Gets the exponentioals

            gparameters=[]
            with open('fited.log','r') as file:
                for lines in file:

                    if '(' not in lines:
                        temp = npattern2.findall(lines)
                        if 'INDEF' in lines:
                            gparameters.append(7*[0])
                    else:
                        errorparam.append(npattern2.findall(lines))

                    if len(temp) == 7 and all(isDigit(i) for i in temp) and 'INDEF' not in lines:
                        gparameters.append(temp)

            #gaussian
            if len(gparameters) > 0:
                gparamfinal = [ float(i) for i in gparameters[-1] ]
                centerg, contg, fluxg, eqwg, coreg, fwhmg, fwhml = gparamfinal
                yg = gaussian(wavex,centerg,fwhmg/2.3538,coreg) + contg


            if errorestimate == True:
                errorparamfinal = [ float(i) for i in errorparam[-1] ]
                #print(gparamfinal)
                #print(errorparamfinal)


                linesdic = {'linename':linesfit.name,
                             'center':centerg,
                              'EW': eqwg,
                              'EWerror':errorparamfinal[3],
                            'fluxg':fluxg,
                            'coregaus':coreg,
                            'fwgmgaus':fwhmg,
                            'contgaus':contg,
                            'gaussian':{'x':wavex,'y':yg}
                            }
            else:
                 linesdic = {'linename':linesfit.name,
                             'center':centerg,
                              'EW': eqwg,
                              'EWerror':0,
                            'fluxg':fluxg,
                            'coregaus':coreg,
                            'fwgmgaus':fwhmg,
                            'contgaus':contg,
                            'gaussian':{'x':wavex,'y':yg}
                            }
                
                
                
            self.linesdicall.append(linesdic)

        
        


In [8]:
#Define the new object
tryspec = spec2('https://dr14.sdss.org/sas/dr14/sdss/spectro/redux/26/spectra/0266/spec-0266-51630-0015.fits')
tryspec.typegeneral

'A'

In [9]:
%%time
diclines = {line('P15',8547,[8520,8560]),line('P14',8600,[8600-20,8600+20]),
           line('Halpha',6562,[6550,6575]),line('P16',8504,[8504-20,8504+20]),
            line('P13',8667,[8667-20,8667+20]),line('P12',8752,[8752-20,8752+20]),
           line('P11',8865,[8865-20,8865+20]),line('P10',9017,[9017-20,9017+20])}




dicofspectra = [];
filestoread = ['../listG1V.txt',
              '../listG2.txt','../listG4V.txt','../listG5III+.txt',
               '../listG8V.txt','../listG9IV.txt','../listK1III.txt',
               '../listK2III.txt','../listK3III.txt','../listK4III.txt',
               '../listK5III.txt','../listK7.txt']

#filestoread = glob.glob('../list*.txt')

limitonlines = 10
print('Reading {} files and {} lines per file'.format(len(filestoread),limitonlines))

for index,files in enumerate(filestoread):
    with open(files,'r') as f:
        linereads = f.readlines()
        for i in range(min(limitonlines, len(linereads))):
            lineread = linereads[i]
            #print(lineread)
            url = lineread.split(',')[1]
            #Strip Return a copy of the string with leading and trailing characters removed. 
            url = url.strip()
            url = url.replace('segue1','sdss')
            #print(url)
            tempspec = spec2(url)
            dicofspectra.append(tempspec)
            gc.collect()
            #print(index)
            
            
#Plot parameters
colors = 100*Category20_18 #+ Viridis8 + Dark2_8 + Paired8 + Set1_4
#Plot range
xr = (5000,8900)
yr = (0,40)


#Tool to get wavelength
hover2 = HoverTool(
        tooltips=[
            ("(x,y)", "($x{1}, $y)"),
        ]
    )
#Add the tool
#Start index of color
index = 0
fit = True
plot = False

if plot:
        #Create the Bokeh Figure
    pl =  figure(x_axis_label='Angstrom', y_axis_label='Y',title="Click on the desired stellar template to overplot", x_range=xr, y_range=yr
                  ,active_drag='pan', active_scroll='wheel_zoom',
                  plot_width=900, plot_height=1000
                 )

    pl.add_tools(hover2)





# Loop over spec object list:

for spectra in dicofspectra:

    # Plot the 
    if plot:
        t = pl.line(x=spectra.wave,y=spectra.model,color=colors[index], line_alpha=1.0, 
                line_width=4,legend=spectra.type+str(index),muted_alpha=0.,muted_color=colors[index])

    if fit:
        try:
            spectra.fit_lines(diclines, verbose='No', errorestimate=False)
        except:
            pass
        for i in spectra.linesdicall:
            if plot:
                pl.line(i['gaussian']['x'],i['gaussian']['y'],color='red', line_alpha=1.,
                    line_width=5,legend=tryspec.name,muted_alpha=0.,muted_color=colors[index])

    index = index +1

if plot:    
    pl.legend.location = "top_left"
    pl.legend.click_policy="mute"

    show(pl)  

            

Reading 12 files and 10 lines per file




CPU times: user 48 s, sys: 9.01 s, total: 57 s
Wall time: 3min 2s


In [14]:
typess = []
for star in dicofspectra:
    typess.append(star.type)
np.unique(typess,return_counts=True)

(array(['G1V (95128)', 'G2', 'G4V (32923)', 'G5III+... (157910)',
        'G8V (101501)', 'G9IV (100030)', 'K1III (18322)', 'K2III (115136)',
        'K3III (101673)', 'K4III (136726)', 'K5III (111335)', 'K7'],
       dtype='|S18'), array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10]))

In [28]:
for index,i in enumerate(diclines):
    print(index,i.name)

(0, 'P16')
(1, 'P13')
(2, 'P12')
(3, 'P11')
(4, 'P10')
(5, 'P15')
(6, 'P14')
(7, 'Halpha')


In [38]:
matrix = []
#Loop over each line in the diccionart of lines we created. 
for linesfit in diclines:
    listofeq = []
    starclass = []
    #Loop over all the stars in the diccionary of spectra.
    #For each star if we were able to fit a line get the EW
    #Of the line
    for indexs,stars in enumerate(dicofspectra):
        if hasattr(stars,'linesdicall'):
            lines = stars.linesdicall
            #We find the line and ew of the lne and append
            for line in lines:
                if line['linename'] == linesfit.name:
                    eq = line['EW']
                    listofeq.append(eq)
                    starclass.append(stars.type)
                    
    matrix.append(listofeq)
        #    print(stars.typegeneral)

In [39]:
mat = np.array(matrix)
mat.shape

(8, 120)

In [40]:
np.matmul(mat, mat.T).shape

(8, 8)

In [144]:
U, D, V = np.linalg.svd(mat, full_matrices=False)
D

array([  1.18448443e+05,   2.45185121e+04,   2.25401800e+04,
         1.88988985e+04,   1.87375894e+04,   9.07187025e+03,
         3.72158756e+03,   4.21935775e+00])

In [145]:
svd2 = TruncatedSVD(n_components=2,algorithm='arpack')
svdfit = svd2.fit_transform(mat.T)

In [150]:
svdfit[:,1]

array([  3.62483645e+00,   2.21618854e+00,   3.99900696e+00,
         3.88359334e+00,   1.66905152e+00,   3.63353716e+00,
         4.42623580e+00,   1.69529121e+00,   2.33492286e+00,
         2.28317220e+00,   1.10719024e+00,   1.23406434e+00,
         9.19153741e+03,   8.97023579e-01,   1.56643084e+00,
         1.69119959e+00,   1.65402440e+00,   1.32549293e+00,
         1.52545259e+00,   1.71000424e+00,   6.78102210e+02,
         3.27276615e+01,   6.22064741e+02,   1.69348553e+00,
         1.92182823e+00,   2.38103166e+00,   2.32821223e+00,
         1.50933247e+04,   3.54523251e+02,   2.70294354e+00,
         2.78824141e+00,   1.87924483e+00,   2.46993704e+00,
         1.00427193e+00,   2.58452816e+00,   2.29301767e+00,
         3.03151504e+00,   1.94503514e+00,   3.98329307e+00,
         6.05194249e+00,   1.67219853e+00,   2.06729265e+00,
        -6.41337588e+01,   2.44483091e+00,   6.40403915e+03,
         2.05914677e+00,   2.24772313e+00,  -4.03407377e+01,
         1.14996639e+00,

In [151]:
V[0]

array([ -1.76577802e-06,  -1.07506148e-06,  -1.98033896e-06,
        -1.84103056e-06,  -8.18652816e-07,  -1.65923950e-06,
        -2.04071301e-06,  -8.20362639e-07,  -1.12176838e-06,
        -1.11461138e-06,  -1.03873860e-06,  -1.17414681e-06,
        -6.19107917e-05,  -8.69277227e-07,  -1.52587069e-06,
        -1.61725175e-06,  -1.46986768e-06,  -1.27643449e-06,
        -1.48579232e-06,  -1.67540293e-06,  -4.84333177e-04,
        -2.07249216e-05,  -3.04650319e-05,  -3.46693033e-06,
        -6.95139088e-06,  -1.14136688e-05,  -7.65214467e-06,
        -5.12119508e-03,  -7.23530188e-02,  -7.42211224e-06,
        -1.29100765e-06,  -1.95404969e-06,  -2.61411803e-06,
        -3.30490524e-06,  -1.18333549e-06,  -1.06042584e-06,
        -1.36367833e-06,  -8.08261731e-06,   3.74991754e-05,
        -2.81326063e-06,  -9.59246973e-07,  -1.18935856e-06,
        -1.21308195e-01,  -1.54665377e-05,  -6.52624769e-02,
         2.34608387e-06,  -2.82403231e-05,  -8.27858877e-02,
        -1.15769354e-06,

# Plot

In [148]:
coldic ={}
for index,typestar in enumerate(np.unique([i.type for i in dicofspectra])):
    coldic[str(typestar)] = colors[index]
coldic
label = starclass
colorsvd = [ coldic[i] for i in starclass ]


dd = {}
dd['x'] = svdfit[:,0]
dd['y'] = svdfit[:,1]
dd['labels'] = label
dd['colors'] = colorsvd
source = ColumnDataSource(dd)


p = figure()


p.circle(x='x', y='y', color='colors', legend = 'labels',source=source , size =5)
show(p)

In [149]:
coldic ={}
for index,typestar in enumerate(np.unique([i.type for i in dicofspectra])):
    coldic[str(typestar)] = colors[index]
coldic
label = starclass
colorsvd = [ coldic[i] for i in starclass ]


dd = {}
dd['x'] = V[0]
dd['y'] = V[1]
dd['labels'] = label
dd['colors'] = colorsvd
source = ColumnDataSource(dd)


p2 = figure()


p2.circle(x='x', y='y', color='colors', legend = 'labels',source=source , size =5)
show(p2)

The matrix actually returned by Trucanted SVD is the dot product of U and  D matrix. 

In [162]:
svdfit[0:5]

array([[ 0.20915366,  3.62483645],
       [ 0.12733936,  2.21618854],
       [ 0.23456807,  3.99900696],
       [ 0.2180672 ,  3.88359334],
       [ 0.09696815,  1.66905152]])

In [205]:
new_D = D[1]
new_U = V[:,0]
new_U.dot(new_D)

array([ -4.32942498e-02,   3.62483645e+00,   2.84472279e+00,
        -3.12304611e+00,  -2.60668163e+00,  -7.21225485e+00,
        -3.34014082e+01,  -4.61427887e+03])

In [229]:
mat

array([[  1.70100000e+00,   1.05200000e+00,   1.86800000e+00,
          1.82100000e+00,   7.86800000e-01,   1.70300000e+00,
          2.05700000e+00,   8.11000000e-01,   1.11600000e+00,
          1.08300000e+00,   4.87700000e-01,   5.38200000e-01,
         -3.01300000e+02,   4.00900000e-01,   6.89300000e-01,
          7.32600000e-01,   7.19600000e-01,   5.79700000e-01,
          6.87700000e-01,   7.60100000e-01,   0.00000000e+00,
          8.18300000e-01,   7.27000000e+02,   8.39800000e-01,
          9.39000000e-01,   1.17100000e+00,   1.15200000e+00,
          1.18380000e+04,   2.51200000e+02,   1.33200000e+00,
          1.36400000e+00,   9.10500000e-01,   1.20300000e+00,
          1.17900000e+00,   1.26100000e+00,   1.11900000e+00,
          1.45700000e+00,   9.54700000e-01,   1.94100000e+00,
          2.87900000e+00,   7.82900000e-01,   9.55900000e-01,
          8.07600000e-01,   1.12900000e+00,   7.54200000e+03,
          9.81900000e-01,   1.04300000e+00,   2.11300000e+00,
        

In [226]:
V.shape

(8, 120)

In [234]:
np.diag(D)

array([[  1.18448443e+05,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   2.45185121e+04,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   2.25401800e+04,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          1.88988985e+04,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   1.87375894e+04,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   9.07187025e+03,
          0.00000000e+00,   0.0

In [236]:
np.dot(np.dot(U,np.diag(D)) , V)

array([[  1.70100000e+00,   1.05200000e+00,   1.86800000e+00,
          1.82100000e+00,   7.86800000e-01,   1.70300000e+00,
          2.05700000e+00,   8.11000000e-01,   1.11600000e+00,
          1.08300000e+00,   4.87700000e-01,   5.38200000e-01,
         -3.01300000e+02,   4.00900000e-01,   6.89300000e-01,
          7.32600000e-01,   7.19600000e-01,   5.79700000e-01,
          6.87700000e-01,   7.60100000e-01,   3.05003616e-12,
          8.18300000e-01,   7.27000000e+02,   8.39800000e-01,
          9.39000000e-01,   1.17100000e+00,   1.15200000e+00,
          1.18380000e+04,   2.51200000e+02,   1.33200000e+00,
          1.36400000e+00,   9.10500000e-01,   1.20300000e+00,
          1.17900000e+00,   1.26100000e+00,   1.11900000e+00,
          1.45700000e+00,   9.54700000e-01,   1.94100000e+00,
          2.87900000e+00,   7.82900000e-01,   9.55900000e-01,
          8.07600000e-01,   1.12900000e+00,   7.54200000e+03,
          9.81900000e-01,   1.04300000e+00,   2.11300000e+00,
        

In [291]:
prueba = np.array([ [1,2,5,6], [2,3,7,8],[6,7,9,50]])
print(prueba.shape)
Up, Dp, Vp = np.linalg.svd(prueba,full_matrices=False)
print(Up.shape)
print(Vp.shape)
print(prueba)

(3, 4)
(3, 3)
(3, 4)
[[ 1  2  5  6]
 [ 2  3  7  8]
 [ 6  7  9 50]]


In [292]:
np.dot(np.dot(Up, np.diag(Dp)), Vp)

array([[  1.,   2.,   5.,   6.],
       [  2.,   3.,   7.,   8.],
       [  6.,   7.,   9.,  50.]])

In [293]:
Dp

array([ 53.01701453,   6.85938137,   0.38086351])

In [307]:
dim = 3
Upn = Up[:,0:dim]
Dpn= Dp[0:dim]
Vpn = Vp[0:dim,:]
Dpn

array([ 53.01701453,   6.85938137,   0.38086351])

In [308]:
np.dot(np.dot(Upn, np.diag(Dpn)), Vpn)

array([[  1.,   2.,   5.,   6.],
       [  2.,   3.,   7.,   8.],
       [  6.,   7.,   9.,  50.]])

In [312]:
svd2.fit(prueba)

ValueError: matrix type must be 'f', 'd', 'F', or 'D'

In [91]:
iraf.noao()
iraf.noao.onedspec()
iraf.dataio()

AttributeError: 'NoneType' object has no attribute 'Expand'

In [95]:
import sys, os
executable = sys.argv[0]
while os.path.islink(executable):
    executable = os.readlink(executable)
if os.path.split(executable)[1] == "pyraf":
# this code executes only if this is a pyraf session
    from pyraf import iraf
    startup = iraf.osfn("home$pyraflogin.py")
    if os.path.exists(startup):
        execfile(startup)
    del startup
del executable   # clean up namespace
    

In [4]:
iraf.noao()