**Steps involved :**

**Data Collection**
* Preprocessing the Data
* Splitting the data(spectrum) into separate filter lengths
* Passing the filters through a CNN for reconstruction of the spectrum
* With the trained model, a spectrum can be reconstructed from the images alone
---
**For the redshift part :**

* Smooth the data using a Savitzky–Golay filter
* Construct the XGBoost model from scratch or from library
* Predict the redshift
* Compare the results

**Step 1 : Data collection :**
The CNN model we are using requires roughly 700 samples, each of which includes 4 filters of the galaxy and its corresponding spectrum. Downloading a dataset of that size manually is not practical, and we could not find a ready-made set online, so we had to write an automation tool that mimics the manual form entry and download.

* For the automation task we used the Selenium library. The data is downloaded from the SDSS sky survey website. The automated process opens the website and navigates to the form that asks for the SpaceID of the object of interest. The program then fills in the form with one of the IDs stored in a CSV file that we prepared by selecting suitable galaxies.
* After navigating, a new page is loaded from which the filters of the galaxy are downloaded as corrected frames.


---


* The file is then downloaded and stored in the following format:
* Data/galaxy/galaxy_id_filter.fits.bz2/galaxy_id_spectrum.fits.bz2

---

* The downloaded files, which are in `.fits.bz2` format, are then unpacked to `.fits` format.

In [29]:
# Importing the base modules
import pandas as pd
import numpy as np
import os
import shutil
import glob
import time
import bz2
from colorist import Color

# Import the web automation library
from selenium import webdriver
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options

# Create a default (empty) Firefox Options object.
# NOTE(review): `options` is not passed to any webdriver.Firefox() call visible
# in this cell — confirm whether it is still needed.
options = Options()

# NOTE: no driver is created here; each navigate_* function below starts its own Firefox driver.


# Read the object ids from the CSV file and build the SDSS explore-page URLs
class Id_collection:
    """Read SDSS object ids from a CSV file and build the matching URLs.

    Args:
        n_points: Number of leading CSV rows to use.
        csv_file: Path to a CSV file that contains a ``specobj_id`` column.
    """

    def __init__(self,n_points,csv_file):
        self.n_points = n_points
        self.csv_file = csv_file

    def get_Ids(self):
        """Return the first ``n_points`` ``specobj_id`` values as integers.

        String values are expected to carry one wrapping character on each
        side (e.g. ``'299489677444933632'``); that character pair is dropped
        before conversion.  Values that pandas already parsed as numbers are
        converted directly instead of failing on the slice.

        Returns:
            list[int]: The ids, in file order.
        """
        table = pd.read_csv(self.csv_file)
        raw_ids = table["specobj_id"].head(self.n_points).tolist()
        return [
            int(raw[1:-1]) if isinstance(raw, str) else int(raw)
            for raw in raw_ids
        ]

    def url_const_im(self,SpaceIds):
        """Return the explore "summary" page URL for every id in ``SpaceIds``."""
        return [
            f"http://cas.sdss.org/dr18/VisualTools/explore/summary?sId={space_id}"
            for space_id in SpaceIds
        ]

    def url_const_sp(self,SpaceIds):
        """Return the explore "fitsspec" page URL for every id in ``SpaceIds``."""
        return [
            f"https://cas.sdss.org/dr18/VisualTools/explore/fitsspec?spec={space_id}"
            for space_id in SpaceIds
        ]

class Unpack_move:
    """Helpers that decompress downloaded FITS archives and file them per galaxy.

    All methods are static; call them on the class, e.g.
    ``Unpack_move.unpack_fits(path)``.
    """

    # Root folder under which the ``Galaxy_<id>`` folders are created by default.
    DATA_ROOT = "C:/Users/HP/Desktop/Python/data"

    @staticmethod
    def unpack_fits(file_path):
        """Decompress a bz2 file next to itself and return the new file's path.

        The output name is ``file_path`` without its last four characters
        (the ``.bz2`` suffix).  The compressed file is left in place.

        Args:
            file_path: Path of the ``.bz2`` file to decompress.

        Returns:
            str: Path of the decompressed file.
        """
        fits_file_n = file_path[:-4]
        with bz2.open(file_path,"rb") as packed, open(fits_file_n,"wb") as fits_file:
            # Copy in chunks so the whole frame never has to sit in memory.
            shutil.copyfileobj(packed, fits_file)
        return fits_file_n

    @staticmethod
    def _move_into(file_name, galaxy_id, kind, base_dir):
        """Move ``file_name`` into ``<base_dir>/Galaxy_<galaxy_id>/<kind>``."""
        collection_unit = os.path.join(base_dir, f"Galaxy_{galaxy_id}/{kind}")
        os.makedirs(collection_unit, exist_ok=True)
        # The destination is the directory, so the file keeps its base name.
        # Nothing is passed as shutil.move's third argument: that slot is
        # ``copy_function`` and must be a callable, not a file name.
        return shutil.move(file_name, collection_unit)

    @staticmethod
    def move_rename_images(file_name,id,base_dir=DATA_ROOT):
        """Move an image file into the ``images`` folder of galaxy ``id``.

        Args:
            file_name: Path of the file to move.
            id: Galaxy folder number used in the ``Galaxy_<id>`` folder name.
            base_dir: Root data folder; defaults to ``DATA_ROOT``.

        Returns:
            str: Path of the moved file.
        """
        return Unpack_move._move_into(file_name, id, "images", base_dir)

    @staticmethod
    def move_rename_spectrum(file_name,id,base_dir=DATA_ROOT):
        """Move a spectrum file into the ``spectrum`` folder of galaxy ``id``.

        Args:
            file_name: Path of the file to move.
            id: Galaxy folder number used in the ``Galaxy_<id>`` folder name.
            base_dir: Root data folder; defaults to ``DATA_ROOT``.

        Returns:
            str: Path of the moved file.
        """
        return Unpack_move._move_into(file_name, id, "spectrum", base_dir)


    
# Create the id collector for the first 10 rows of the search export.
# NOTE(review): this assignment rebinds the name `Id_collection` from the class
# to an instance, so the class is no longer reachable under that name afterwards.
Id_collection = Id_collection(10,"./optical_search_412544.csv")
# Integer ids read from the "specobj_id" column.
spaceIds = Id_collection.get_Ids()
#print(spaceIds)
# One explore "summary" page URL per id (used to reach the image downloads).
url_collection = Id_collection.url_const_im(spaceIds)
print(url_collection)
# One explore "fitsspec" page URL per id (used to reach the spectrum download).
url_collection_spectrum = Id_collection.url_const_sp(spaceIds)
print(url_collection_spectrum)

# Now we have the URLs for all n_points galaxies in two lists.
# The next part is to open each URL and download the filters and the spectrum.

def navigate_to_fits(url):
    """Open ``url`` in Firefox and click the download link of every filter band.

    The function clicks the first link whose text contains ``FITS``, switches
    to the second browser window that this click is expected to open, and then
    clicks the links labelled ``u``, ``g``, ``r``, ``i`` and ``z`` in turn.

    Args:
        url: Address of the page that contains the ``FITS`` link.

    Raises:
        selenium.common.exceptions.TimeoutException: If the ``FITS`` link, the
            second window, or one of the band links does not appear in time.
    """
    driver = webdriver.Firefox()
    try:
        driver.get(url)
        # Explicit wait of up to 15 seconds for the summary page.
        wait = WebDriverWait(driver, 15)

        # Wait until the "FITS" link becomes clickable, then follow it.
        fits_link = wait.until(EC.element_to_be_clickable((By.PARTIAL_LINK_TEXT, 'FITS')))
        fits_link.click()

        # The second window may not exist yet right after the click; wait for
        # it instead of indexing window_handles[1] immediately.
        wait.until(lambda drv: len(drv.window_handles) > 1)
        driver.switch_to.window(driver.window_handles[1])

        # Click the first link of every filter band.
        link_wait = WebDriverWait(driver, 10)
        for band in ("u", "g", "r", "i", "z"):
            download_link = link_wait.until(EC.presence_of_element_located((By.LINK_TEXT, band)))
            # Fixed pause before each click — presumably to let the page and
            # the previous download settle; TODO confirm it is still needed.
            time.sleep(3)
            download_link.click()
        time.sleep(4)
    finally:
        # Always close the browser, even when a wait times out or a click fails.
        driver.quit()

def navigate_to_spectrum(url):
    """Open ``url`` in Firefox and click the first link containing ``Download``.

    Args:
        url: Address of the page that contains the ``Download`` link.

    Raises:
        selenium.common.exceptions.TimeoutException: If the link does not
            become clickable within 15 seconds.
    """
    driver = webdriver.Firefox()
    try:
        driver.get(url)
        wait = WebDriverWait(driver,15)

        spectrum_link = wait.until(EC.element_to_be_clickable((By.PARTIAL_LINK_TEXT,"Download")))
        spectrum_link.click()
        # Short fixed pause before the browser is closed — presumably to let
        # the download start; TODO confirm the duration is sufficient.
        time.sleep(2)
    finally:
        # Always close the browser, even when the wait times out.
        driver.quit()
   
#running a loop

# Download the filter frames and the spectrum for the first three objects.
for index in range(3):
    navigate_to_fits(url_collection[index])
    navigate_to_spectrum(url_collection_spectrum[index])


downloads_path = "C:/Users/HP/Downloads"

# Compressed frames found in the downloads folder, newest first.
file_list = sorted(
    glob.glob(os.path.join(downloads_path,"*.fits.bz2")),
    key=os.path.getmtime,
    reverse=True,
)

# Every run of five consecutive frames goes into one galaxy folder (1, 2, ...).
for position, ftbu in enumerate(file_list):
    folder_id = position//5 + 1
    unpacked_file = Unpack_move.unpack_fits(ftbu)
    # Move the decompressed frame into the images folder of that galaxy.
    Unpack_move.move_rename_images(unpacked_file,folder_id)

# Files matching "*fits" that remain in the downloads folder are filed as
# spectra, newest first, one per galaxy folder.
spec_file_list = sorted(
    glob.glob(os.path.join(downloads_path,"*fits")),
    key=os.path.getmtime,
    reverse=True,
)

for folder_id, spectrum_file in enumerate(spec_file_list, start=1):
    Unpack_move.move_rename_spectrum(spectrum_file,folder_id)


['http://cas.sdss.org/dr18/VisualTools/explore/summary?sId=299489677444933632', 'http://cas.sdss.org/dr18/VisualTools/explore/summary?sId=299489952322840576', 'http://cas.sdss.org/dr18/VisualTools/explore/summary?sId=299490502078654464', 'http://cas.sdss.org/dr18/VisualTools/explore/summary?sId=299491051834468352', 'http://cas.sdss.org/dr18/VisualTools/explore/summary?sId=299491326712375296', 'http://cas.sdss.org/dr18/VisualTools/explore/summary?sId=299491876468189184', 'http://cas.sdss.org/dr18/VisualTools/explore/summary?sId=299492151346096128', 'http://cas.sdss.org/dr18/VisualTools/explore/summary?sId=299492426224003072', 'http://cas.sdss.org/dr18/VisualTools/explore/summary?sId=299492701101910016', 'http://cas.sdss.org/dr18/VisualTools/explore/summary?sId=299492975979816960']
['https://cas.sdss.org/dr18/VisualTools/explore/fitsspec?spec=299489677444933632', 'https://cas.sdss.org/dr18/VisualTools/explore/fitsspec?spec=299489952322840576', 'https://cas.sdss.org/dr18/VisualTools/explo