## 1. Imports and Functions (Run Every Time)

The following block of code consists of all of the imports and functions required to run the code in all of the sections below. Thus, this block must be run every time the kernel is restarted (important to run when first opening the document and when using Section 5).

In [1]:
### Imports
%matplotlib inline
from matplotlib import rcParams
rcParams['axes.labelsize'] = '16'
rcParams['legend.frameon'] = False
import numpy as np
import matplotlib.pyplot as plt
import random
import warnings
warnings.filterwarnings("ignore")
from matplotlib.colors import LogNorm
import os
from astropy.table import Table
import glob
import tensorflow as tf
import keras
import pandas as pd
from alive_progress import alive_bar

### Underlying Functions

def random_uniform_array(start,stop,size):
    """Draw ``size`` samples from ``random.uniform(start, stop)``.

    The samples are drawn one after another with Python's ``random`` module
    and returned as a 1-D float NumPy array of length ``size``.
    """
    draws=(random.uniform(start,stop) for _ in range(size))
    return np.fromiter(draws,dtype=float)

def Rotation_Matrix(alpha,beta,gamma): ### z, y, x / yaw, pitch, roll
    """Return the 3x3 rotation matrix ``Rz(alpha) @ Ry(beta) @ Rx(gamma)``.

    ``alpha`` rotates about z (yaw), ``beta`` about y (pitch) and ``gamma``
    about x (roll); all angles are in radians (they go straight to
    ``np.cos`` / ``np.sin``).
    """
    ca,sa=np.cos(alpha),np.sin(alpha)
    cb,sb=np.cos(beta),np.sin(beta)
    cg,sg=np.cos(gamma),np.sin(gamma)
    row_x=[ca*cb,ca*sb*sg-sa*cg,ca*sb*cg+sa*sg]
    row_y=[sa*cb,sa*sb*sg+ca*cg,sa*sb*cg-ca*sg]
    row_z=[-sb,cb*sg,cb*cg]
    return np.array([row_x,row_y,row_z])

def Projected_Points(points,alpha,beta,gamma):
    """Rotate every row of ``points`` with ``Rotation_Matrix(alpha, beta, gamma)``.

    Returns a new float array with the same shape as ``points`` whose i-th
    row is ``R @ points[i]``; the input array is not modified.
    """
    rotated=np.zeros(points.shape)
    rotation=Rotation_Matrix(alpha,beta,gamma)
    for index,vector in enumerate(points):
        rotated[index]=rotation@vector
    return rotated

def Tensor_Image(points):
    """Render the first two columns of ``points`` as a 2-D histogram image.

    Rows whose x or y value is NaN are dropped, the rest are binned into a
    50x50 log-normalised ``hist2d`` drawn without axes, and the rendered
    canvas is read back as pixels.

    Returns a ``tf.float32`` tensor of shape (height, width, 4) in RGBA
    channel order with values scaled to the range 0..1.
    """
    # NOTE: this changes the session-wide default figure size; it is kept so
    # that the existing behaviour of the notebook is unchanged.
    # NOTE(review): 25/36 inch is presumably chosen to give 50x50 pixels at
    # 72 dpi -- confirm against the backend in use.
    plt.rcParams["figure.figsize"]=(25/36,25/36)
    fig=plt.figure()
    try:
        plot=fig.add_subplot(111)
        has_nan=np.isnan(points[:,0]) | np.isnan(points[:,1])
        keep=~has_nan
        plot.hist2d(points[:,0][keep],points[:,1][keep],bins=(50,50),norm=LogNorm())
        plot.axis("equal")
        plot.axis("off")
        fig.canvas.draw()
        w,h=fig.canvas.get_width_height()
        # frombuffer replaces the deprecated binary mode of np.fromstring.
        # The canvas buffer is laid out row by row, hence (height, width, 4).
        Image=np.frombuffer(fig.canvas.tostring_argb(),dtype=np.uint8).reshape(h,w,4)
        # Rolling the channel axis by 3 turns ARGB into RGBA (and copies the
        # read-only buffer view into a fresh array).
        Image=np.roll(Image,3,axis=2)
        Image_Tensor=tf.cast(Image,tf.float32)/255
    finally:
        # Close exactly this figure, even if rendering failed, so figures do
        # not accumulate when many images are generated.
        plt.close(fig)
    return Image_Tensor

def Save_Image_Set(Images,File_Name):
    """Save ``Images`` to ``File_Name`` with ``np.save``.

    TensorFlow tensors are converted to NumPy arrays first; anything else
    (lists, NumPy arrays) is handed to ``np.save`` unchanged.
    """
    # isinstance (rather than an exact type comparison) also recognises eager
    # tensors, whose concrete class is a subclass of tf.Tensor.
    if isinstance(Images,tf.Tensor):
        Images=Images.numpy()
    np.save(File_Name,Images)
    
def Load_Array(File_Name):
    """Load a ``.npy`` file and return its content as a TensorFlow tensor.

    If the first element of the loaded array is itself a NumPy array the
    content is cast to ``tf.float32``; otherwise it is cast to ``tf.string``.

    NOTE(review): the file is read with ``allow_pickle=True``, which can
    execute arbitrary code for untrusted files -- only load files you created.
    """
    loaded=np.load(File_Name,allow_pickle=True)
    first_is_array=type(loaded[0])==np.ndarray
    target_dtype=tf.float32 if first_is_array else tf.string
    return tf.cast(loaded,target_dtype)

def Image_Set_Classifications(N1,File_Name,classification):
    """Save ``N1`` copies of ``classification`` next to an image-set file.

    The output path is ``File_Name`` without its last four characters plus
    ``"_classifications.npy"``.

    NOTE(review): the suffix is lower-case here, while the other functions in
    this notebook read and write ``"_Classifications.npy"`` -- confirm which
    spelling is intended before relying on this helper.
    """
    stem=File_Name[:-4]
    np.save(stem+"_classifications.npy",np.repeat(classification,N1))
    
def Classifications_to_Labels(Data_Classifications,class_names):
    """Map each classification to the index of its name in ``class_names``.

    Returns a float NumPy array with one entry per classification. Entries
    that match no class name keep the initial value 1; if a name occurs more
    than once in ``class_names`` the last matching index is used.
    """
    count=len(Data_Classifications)
    Labels=np.ones(count)
    for row in range(count):
        entry=Data_Classifications[row]
        for label,name in enumerate(class_names):
            if entry==name:
                Labels[row]=label
    return Labels

def Image_Reader(Data_Folder):
    """Turn every ``*.fits`` file directly inside ``Data_Folder`` into an image.

    Each file is read with ``Table.read``; its ``pos_x``, ``pos_y`` and
    ``pos_z`` columns form an (n, 3) point array that is rendered with
    ``Tensor_Image``.

    Returns ``(Files, Data)``: the list of file names (relative to
    ``Data_Folder``) and a ``tf.float32`` tensor holding one image per file,
    in the same order.

    Raises ``IndexError`` if the folder contains no ``*.fits`` file (the same
    exception type as before, now with an explicit message).
    """
    cwd=os.getcwd()
    os.chdir(Data_Folder)
    try:
        Files=glob.glob("*.fits")
        if not Files:
            raise IndexError("No .fits files found in "+str(Data_Folder))

        Data=[]
        for File_name in Files:
            alldata=Table.read(File_name)
            xdata=alldata["pos_x"]
            ydata=alldata["pos_y"]
            zdata=alldata["pos_z"]

            # Fill the coordinate columns in one step each instead of row by row.
            points=np.empty((len(xdata),3))
            points[:,0]=xdata
            points[:,1]=ydata
            points[:,2]=zdata

            Data.append(Tensor_Image(points))
    finally:
        # Always return to the original working directory, even when a file
        # fails to load; otherwise later relative paths in the notebook break.
        os.chdir(cwd)

    Data=tf.cast(Data,tf.float32)
    return Files,Data

def Import_Sheet_Classifications(Sheet_ID,Sheet_Names):
    """Download the "Halo ID" and "Classification" columns of a Google sheet.

    Every sheet (tab) named in ``Sheet_Names`` of the spreadsheet ``Sheet_ID``
    is fetched as CSV and the two columns are concatenated in the given order.

    Returns ``(Files, Classifications)`` as two string NumPy arrays of equal
    length. Missing cells become the string ``"nan"``. Both arrays are empty
    when ``Sheet_Names`` is empty.
    """
    from urllib.parse import quote

    Files=[]
    Classifications=[]
    for Sheet_Name in Sheet_Names:
        # Percent-encode the sheet name so names containing spaces or other
        # reserved characters still produce a valid URL.
        URL=f"https://docs.google.com/spreadsheets/d/{Sheet_ID}/gviz/tq?tqx=out:csv&sheet={quote(str(Sheet_Name))}"
        # The former ``warn_bad_lines=False`` argument was removed in
        # pandas 2.0; malformed lines raise an error by default, as before.
        Sheet=pd.read_csv(URL)
        Files.append(np.array(Sheet["Halo ID"],dtype=str))
        Classifications.append(np.array(Sheet["Classification"],dtype=str))
    if not Files:
        return np.array([],dtype=str),np.array([],dtype=str)
    return np.concatenate(Files),np.concatenate(Classifications)

def Import_Folder_Files(Folder):
    """Return the paths of all ``*.fits`` files below ``Folder``.

    The search is recursive (``Folder`` itself and every sub-folder); the
    result is a string NumPy array, empty when nothing matches.
    """
    pattern=Folder+"/**/*.fits"
    matches=[str(path) for path in glob.glob(pattern,recursive=True)]
    return np.array(matches,dtype=str)

def Large_Set_Reader(Prediction_Folder):
    """Collect the per-file results written by ``Large_Set_Image_Classifier``.

    Every entry of ``Prediction_Folder`` whose name matches ``*.fits`` is
    treated as a result folder containing ``Tensor_Image.npy`` and
    ``Classification.npy``.

    Returns ``(Files, Tensor_Images, Classifications)`` as NumPy arrays: the
    folder names, the stacked image arrays and the first entry of each
    classification file, all in the same order.

    Raises ``FileNotFoundError`` if ``Prediction_Folder`` is not a directory
    or a result folder is missing one of its two files.
    """
    if not os.path.isdir(Prediction_Folder):
        raise FileNotFoundError("Prediction folder not found: "+str(Prediction_Folder))

    # Build explicit paths instead of changing the working directory, so a
    # failure half-way through cannot leave the notebook in another folder.
    # glob.escape keeps characters such as "[" in the folder name literal.
    pattern=os.path.join(glob.escape(Prediction_Folder),"*.fits")

    Files=[]
    Tensor_Images=[]
    Classifications=[]
    for Folder_Path in glob.glob(pattern):
        Files.append(os.path.basename(Folder_Path))
        Tensor_Images.append(np.load(os.path.join(Folder_Path,"Tensor_Image.npy")))
        Classification=np.load(os.path.join(Folder_Path,"Classification.npy"))
        Classifications.append(Classification[0])

    Files=np.array(Files)
    Tensor_Images=np.array(Tensor_Images)
    Classifications=np.array(Classifications)

    return Files,Tensor_Images,Classifications

### Primary Functions

def Sheet_Data_Set_Creator(Folder,Sheet_ID,Sheet_Names,Set_Name):
    """Build and save an image data set for all classified halos of a sheet.

    For every sheet row whose classification is not ``"nan"``, the first file
    below ``Folder`` whose path contains the Halo ID is rendered with
    ``Tensor_Image``. Three files are written to ``External_Data_Image_Sets/``:
    ``<Set_Name>_Files.npy``, ``<Set_Name>.npy`` (images) and
    ``<Set_Name>_Classifications.npy``.

    Raises ``IndexError`` when no sheet row matches any file (same exception
    type as before, now with an explicit message).

    NOTE(review): matching is a plain substring test on the whole path, so a
    Halo ID such as "1" also matches a path containing "10" -- confirm this
    is acceptable for the ID scheme in use.
    """
    Sheet_Files,Sheet_Classifications=Import_Sheet_Classifications(Sheet_ID,Sheet_Names)
    Folder_Files=Import_Folder_Files(Folder)
    Included_Files=[]
    Included_Classifications=[]
    for Halo_ID,Classification in zip(Sheet_Files,Sheet_Classifications):
        if Classification=="nan":
            continue
        # Compute the substring match once per row and reuse it.
        Matches=np.flatnonzero(np.char.find(Folder_Files,Halo_ID)!=-1)
        if Matches.size!=0:
            Included_Files.append(Folder_Files[Matches[0]])
            Included_Classifications.append(Classification)
    if not Included_Files:
        raise IndexError("No classified sheet entry matches a .fits file in "+str(Folder))

    Images=[]
    with alive_bar(len(Included_Files),force_tty=True) as bar:
        for File_name in Included_Files:
            alldata=Table.read(File_name)
            xdata=alldata["pos_x"]
            ydata=alldata["pos_y"]
            zdata=alldata["pos_z"]
            points=np.empty((len(xdata),3))
            points[:,0]=xdata
            points[:,1]=ydata
            points[:,2]=zdata
            Images.append(Tensor_Image(points))
            bar()

    # Stack once at the end; concatenating inside the loop copied the whole
    # data set again for every new image.
    Data=tf.stack(Images)
    Included_Files=np.array(Included_Files)

    # Make sure the output folder exists so the work above is not lost.
    os.makedirs("External_Data_Image_Sets",exist_ok=True)
    Save_Image_Set(Included_Files,"External_Data_Image_Sets/"+Set_Name+"_Files.npy")
    Save_Image_Set(Data,"External_Data_Image_Sets/"+Set_Name+".npy")
    Save_Image_Set(Included_Classifications,"External_Data_Image_Sets/"+Set_Name+"_Classifications.npy")

def nan_Set_Creator(Folder,Sheet_ID,Sheet_Names,Set_Name):
    """Build and save an image data set for all unclassified halos of a sheet.

    For every sheet row whose classification is ``"nan"``, the first file
    below ``Folder`` whose path contains the Halo ID is rendered with
    ``Tensor_Image``. Three files are written to ``External_Data_Image_Sets/``:
    ``<Set_Name>_Files.npy``, ``<Set_Name>.npy`` (images) and
    ``<Set_Name>_Classifications.npy``.

    Raises ``IndexError`` when no sheet row matches any file (same exception
    type as before, now with an explicit message).

    NOTE(review): matching is a plain substring test on the whole path, so a
    Halo ID such as "1" also matches a path containing "10" -- confirm this
    is acceptable for the ID scheme in use.
    """
    Sheet_Files,Sheet_Classifications=Import_Sheet_Classifications(Sheet_ID,Sheet_Names)
    Folder_Files=Import_Folder_Files(Folder)
    Included_Files=[]
    Included_Classifications=[]
    for Halo_ID,Classification in zip(Sheet_Files,Sheet_Classifications):
        if Classification!="nan":
            continue
        # Compute the substring match once per row and reuse it.
        Matches=np.flatnonzero(np.char.find(Folder_Files,Halo_ID)!=-1)
        if Matches.size!=0:
            Included_Files.append(Folder_Files[Matches[0]])
            Included_Classifications.append(Classification)
    if not Included_Files:
        raise IndexError("No unclassified sheet entry matches a .fits file in "+str(Folder))

    Images=[]
    with alive_bar(len(Included_Files),force_tty=True) as bar:
        for File_name in Included_Files:
            alldata=Table.read(File_name)
            xdata=alldata["pos_x"]
            ydata=alldata["pos_y"]
            zdata=alldata["pos_z"]
            points=np.empty((len(xdata),3))
            points[:,0]=xdata
            points[:,1]=ydata
            points[:,2]=zdata
            Images.append(Tensor_Image(points))
            bar()

    # Stack once at the end; concatenating inside the loop copied the whole
    # data set again for every new image.
    Data=tf.stack(Images)
    Included_Files=np.array(Included_Files)

    # Make sure the output folder exists so the work above is not lost.
    os.makedirs("External_Data_Image_Sets",exist_ok=True)
    Save_Image_Set(Included_Files,"External_Data_Image_Sets/"+Set_Name+"_Files.npy")
    Save_Image_Set(Data,"External_Data_Image_Sets/"+Set_Name+".npy")
    Save_Image_Set(Included_Classifications,"External_Data_Image_Sets/"+Set_Name+"_Classifications.npy")

def Data_Set_Extender(Current_Set_Name,Final_Set_Name):
    """Balance and enlarge a saved data set with randomly rotated copies.

    The set ``Current_Set_Name`` is loaded from ``External_Data_Image_Sets/``.
    First, randomly chosen members of the smaller class ("Single Node" or
    "Multiple Nodes") are re-read from their .fits file, rotated by three
    random angles in [-2*pi, 2*pi] and rendered, until both classes have the
    same size; the result is saved as ``Final_Set_Name``. Then the current
    length is printed, the user is asked how many more images to generate,
    and that many rotated copies are added, alternating between the classes,
    before the set is saved again under the same name.

    Raises ``ValueError`` if either class has no member, because no copies
    could be drawn for it.
    """
    Data=Load_Array("External_Data_Image_Sets/"+Current_Set_Name+".npy")
    Current_Classifications=Load_Array("External_Data_Image_Sets/"+Current_Set_Name+"_Classifications.npy")
    Current_Files=Load_Array("External_Data_Image_Sets/"+Current_Set_Name+"_Files.npy")

    def _indices_of(class_name):
        # Positions of all original entries carrying the given class name.
        return tf.where(Current_Classifications==class_name)[:,0].numpy().tolist()

    Single_Node_Indices=_indices_of("Single Node")
    Multiple_Node_Indices=_indices_of("Multiple Nodes")
    if not Single_Node_Indices or not Multiple_Node_Indices:
        raise ValueError("The data set needs at least one 'Single Node' and one 'Multiple Nodes' entry")

    # New images and the original index each one was generated from. The
    # classification and file name of a copy are those of its source entry.
    Added_Images=[]
    Added_Indices=[]

    def _add_rotated_copy(source_index):
        # Re-read the source file, rotate its points randomly and render them.
        File_name=Current_Files[source_index].numpy().decode("utf-8")
        alldata=Table.read(File_name)
        xdata=alldata["pos_x"]
        ydata=alldata["pos_y"]
        zdata=alldata["pos_z"]
        points=np.empty((len(xdata),3))
        points[:,0]=xdata
        points[:,1]=ydata
        points[:,2]=zdata
        alpha,beta,gamma=random_uniform_array(-2*np.pi,2*np.pi,3)
        projections=Projected_Points(points,alpha,beta,gamma)
        Added_Images.append(Tensor_Image(projections))
        Added_Indices.append(int(source_index))

    def _save_extended_set():
        # Join originals and copies once, save all three files, return length.
        if Added_Images:
            Extended_Data=tf.concat((Data,tf.stack(Added_Images)),axis=0)
            Extended_Classifications=tf.concat((Current_Classifications,tf.gather(Current_Classifications,Added_Indices)),axis=0)
            Extended_Files=tf.concat((Current_Files,tf.gather(Current_Files,Added_Indices)),axis=0)
        else:
            Extended_Data=Data
            Extended_Classifications=Current_Classifications
            Extended_Files=Current_Files
        Save_Image_Set(Extended_Data,"External_Data_Image_Sets/"+Final_Set_Name+".npy")
        Save_Image_Set(Extended_Classifications,"External_Data_Image_Sets/"+Final_Set_Name+"_Classifications.npy")
        Save_Image_Set(Extended_Files,"External_Data_Image_Sets/"+Final_Set_Name+"_Files.npy")
        return len(Extended_Data)

    # Step 1: top up the smaller class until both classes have equal size.
    N=abs(len(Multiple_Node_Indices)-len(Single_Node_Indices))
    if N>0:
        if len(Multiple_Node_Indices)>len(Single_Node_Indices):
            Smaller_Class_Indices=Single_Node_Indices
        else:
            Smaller_Class_Indices=Multiple_Node_Indices
        with alive_bar(int(N),force_tty=True) as bar:
            for i in range(N):
                _add_rotated_copy(np.random.choice(Smaller_Class_Indices))
                bar()

    Current_Length=_save_extended_set()

    print("Current data set length: ",Current_Length)
    N=int(input("How many more images should be generated? "))

    # Step 2: add N more copies, always drawing from the class opposite to
    # the most recently appended entry. If that entry is neither class, start
    # with "Single Node" (previously this case failed on an unset variable).
    Last_Index=Added_Indices[-1] if Added_Indices else -1
    Pick_Multiple=bool(Current_Classifications[Last_Index]=="Single Node")
    with alive_bar(N,force_tty=True) as bar:
        for i in range(N):
            if Pick_Multiple:
                random_i=np.random.choice(Multiple_Node_Indices)
            else:
                random_i=np.random.choice(Single_Node_Indices)
            _add_rotated_copy(random_i)
            Pick_Multiple=not Pick_Multiple
            bar()

    _save_extended_set()
     
def Check_Set_Length(File_List,Set_Fraction):
    """Print how many entries of each data set file would be used.

    For every file in ``File_List`` the leading ``Set_Fraction`` of its
    entries is counted and printed, followed by the total over all files.
    """
    lengths=[]
    for File in File_List:
        Data=Load_Array(File)
        kept=len(Data[:(int(Set_Fraction*len(Data)))])
        print(File,":",kept)
        lengths.append(kept)
    print("Total Set Length :",sum(lengths))

def Model_Creator(File_List,Validation_Fraction,Set_Fraction,epoch_number,class_names,classifier,save,Folder,Model_Name):
    """Train a small convolutional classifier on saved image sets.

    Parameters:
        File_List: paths of image ``.npy`` files; the labels are read from
            the matching ``<name>_Classifications.npy`` next to each file.
        Validation_Fraction: fraction of the shuffled data held out; it is
            used both as validation data during training and for the final
            evaluation.
        Set_Fraction: leading fraction of every file that is used.
        epoch_number: number of training epochs.
        class_names: class names; a label is the index of its name here.
        classifier: optimizer passed to ``Model.compile``.
        save: when true, the model is saved to ``Folder/Model_Name``.

    The loss and accuracy on the held-out data are printed by
    ``Model.evaluate``; nothing is returned.

    Raises ``IndexError`` when ``File_List`` is empty.
    """
    if len(File_List)==0:
        raise IndexError("File_List is empty; at least one data set file is required")

    Data_Parts=[]
    Label_Parts=[]
    with alive_bar(int(len(File_List)),force_tty=True) as bar:
        for File_Name in File_List:
            Images=Load_Array(File_Name)
            Data_Parts.append(Images[:(int(Set_Fraction*len(Images)))])
            Classifications=Load_Array(File_Name[:-4]+"_Classifications.npy")
            Classifications=Classifications[:(int(Set_Fraction*len(Classifications)))]
            Label_Parts.append(Classifications_to_Labels(Classifications,class_names))
            bar()

    # Join all parts once instead of re-copying the growing set per file.
    Data_Set=tf.concat(Data_Parts,axis=0)
    Data_Labels=tf.concat(Label_Parts,axis=0)

    # Shuffle images and labels with the same permutation.
    indices = tf.range(start=0, limit=tf.shape(Data_Set)[0], dtype=tf.int32)
    shuffled_indices = tf.random.shuffle(indices)

    Data_Set=tf.gather(Data_Set,shuffled_indices)
    Data_Labels=tf.gather(Data_Labels,shuffled_indices)

    # The first Validation_Fraction of the shuffled data is held out.
    Data_Training=Data_Set[int(Validation_Fraction*len(Data_Set)):]
    Data_Test=Data_Set[:int(Validation_Fraction*len(Data_Set))]

    Labels_Training=Data_Labels[int(Validation_Fraction*len(Data_Labels)):]
    Labels_Test=Data_Labels[:int(Validation_Fraction*len(Data_Labels))]

    Data_shape=(Data_Training.shape[1],Data_Training.shape[2],Data_Training.shape[3])
    Model=keras.Sequential()
    Model.add(keras.layers.Conv2D(16,3,padding="same",activation="relu",input_shape=Data_shape))
    Model.add(keras.layers.MaxPooling2D())
    Model.add(keras.layers.Conv2D(32,3,padding="same",activation="relu"))
    Model.add(keras.layers.MaxPooling2D())
    Model.add(keras.layers.Conv2D(64,3,padding="same",activation="relu"))
    Model.add(keras.layers.MaxPooling2D())
    Model.add(keras.layers.Flatten())
    Model.add(keras.layers.Dense(128,activation="relu"))
    # The last layer outputs raw logits, matching from_logits=True below.
    Model.add(keras.layers.Dense(len(class_names)))
    Model.compile(optimizer=classifier,
                 loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                 metrics=["accuracy"])
    Model.fit(Data_Training,Labels_Training,epochs=epoch_number,validation_data=(Data_Test,Labels_Test))
    # verbose=2 prints the final loss and accuracy; the values are not used here.
    Model.evaluate(Data_Test,Labels_Test,verbose=2)

    if save:
        Model.save(Folder+"/"+Model_Name)
        
def New_Image_Classifier(Data_Folder,Model_Folder,Model_Name,class_names,save,save_path,save_Folder):
    """Classify every ``*.fits`` file in ``Data_Folder`` with a saved model.

    The model ``Model_Folder/Model_Name`` is loaded, every file is rendered
    by ``Image_Reader`` and the class with the highest score is looked up in
    ``class_names``. A three-entry record (header text, file names,
    classifications) is written to ``save_path/save_Folder`` as
    ``Predictions.txt`` and ``Predictions.npy``.

    NOTE: ``save`` is accepted for interface compatibility but is not read;
    the results are always written.
    """
    Data_Folder=os.path.abspath(Data_Folder)
    Model_Folder=os.path.abspath(Model_Folder)
    save_path=os.path.abspath(save_path)
    Model=tf.keras.models.load_model(Model_Folder+"/"+Model_Name,compile=True)
    Files,Data=Image_Reader(Data_Folder)
    Labels=np.argmax(Model.predict(Data),axis=1)
    Classifications=np.empty(Labels.shape,object)
    for i in range(len(Labels)):
        Classifications[i]=class_names[Labels[i]]

    # The three entries have different lengths, so build the object array
    # explicitly; np.array([...]) on such ragged input raises ValueError on
    # NumPy >= 1.24.
    Save_Data=np.empty(3,dtype=object)
    Save_Data[0]="Predictions Made by: "+Model_Name
    Save_Data[1]=Files
    Save_Data[2]=Classifications

    New_directory=str(save_path+"/"+save_Folder)
    # Also creates save_path itself when it does not exist yet.
    os.makedirs(New_directory,exist_ok=True)

    Text_File_Name=str(New_directory+"/Predictions.txt")
    NPY_File_Name=str(New_directory+"/Predictions.npy")
    np.savetxt(Text_File_Name,Save_Data,fmt="%s")
    np.save(NPY_File_Name,Save_Data)
    
def Large_Set_Image_Classifier(Data_Folder,Model_Folder,Model_Name,class_names,save,save_path,save_Folder):
    """Classify ``*.fits`` files one at a time, saving each result separately.

    For every ``*.fits`` file directly inside ``Data_Folder`` a sub-folder
    named after the file is created in ``save_path/save_Folder`` and filled
    with ``Tensor_Image.npy``, ``Classification.txt`` and
    ``Classification.npy``. A file is skipped when its ``Classification.npy``
    already exists, so an interrupted run can simply be restarted; a
    sub-folder left incomplete by an interruption is processed again.

    Raises ``FileNotFoundError`` if ``Data_Folder`` is not a directory.

    NOTE: ``save`` is accepted for interface compatibility but is not read;
    the results are always written.
    """
    Data_Folder=os.path.abspath(Data_Folder)
    Model_Folder=os.path.abspath(Model_Folder)
    save_path=os.path.abspath(save_path)
    Model=tf.keras.models.load_model(Model_Folder+"/"+Model_Name,compile=True)
    if not os.path.isdir(Data_Folder):
        raise FileNotFoundError("Data folder not found: "+Data_Folder)

    New_directory=str(save_path+"/"+save_Folder)
    os.makedirs(New_directory,exist_ok=True)

    # Use explicit paths instead of changing the working directory, so an
    # error cannot leave the notebook in another folder. glob.escape keeps
    # characters such as "[" in the folder name literal.
    for File_Path in glob.glob(os.path.join(glob.escape(Data_Folder),"*.fits")):
        File=os.path.basename(File_Path)
        File_Info_Folder=New_directory+"/"+str(File)
        # Classification.npy is written last, so its presence means the
        # result for this file is complete.
        if os.path.isfile(File_Info_Folder+"/Classification.npy"):
            continue
        os.makedirs(File_Info_Folder,exist_ok=True)
        alldata=Table.read(File_Path)
        xdata=alldata["pos_x"]
        ydata=alldata["pos_y"]
        zdata=alldata["pos_z"]
        points=np.empty((len(xdata),3))
        points[:,0]=xdata
        points[:,1]=ydata
        points[:,2]=zdata
        # Keep a leading axis of length 1: the model expects a batch.
        Data=[Tensor_Image(points)]
        Data=np.array(Data)
        np.save(File_Info_Folder+"/Tensor_Image.npy",Data)
        Label=np.argmax(Model.predict(Data))
        Classification=np.array([class_names[Label]])
        np.savetxt(File_Info_Folder+"/Classification.txt",Classification,fmt="%s")
        np.save(File_Info_Folder+"/Classification.npy",Classification)
    
def Export_Results(Prediction_Folder,save_path,save_Folder):
    """Condense per-file prediction folders into three summary files.

    The results are read with ``Large_Set_Reader(Prediction_Folder)`` and
    written to ``save_path/save_Folder``: ``Results.txt`` (one
    "file,classification" line per entry), ``Results.npy`` (object array
    holding the file names and the classifications) and ``Images.npy``.
    """
    Files,Tensor_Images,Classifications=Large_Set_Reader(Prediction_Folder)
    Text_File_Data=np.column_stack([Files,Classifications])
    NPY_File_Data=np.array([Files,Classifications],dtype="object")

    New_directory=str(save_path+"/"+save_Folder)
    # Also creates save_path itself when it does not exist yet.
    os.makedirs(New_directory,exist_ok=True)

    # Write with explicit paths instead of changing the working directory,
    # so a failed write cannot leave the notebook in another folder.
    np.savetxt(os.path.join(New_directory,"Results.txt"),Text_File_Data,delimiter=",",fmt="%s")
    np.save(os.path.join(New_directory,"Results.npy"),NPY_File_Data)
    np.save(os.path.join(New_directory,"Images.npy"),Tensor_Images)
        
def Accuracy_Metrics(File_List,Validation_Fraction,Set_Fraction,class_names,Model_Folder,Model_Name):
    """Print precision, recall, F1, accuracy and MCC of a saved model.

    The data sets in ``File_List`` are loaded (leading ``Set_Fraction`` of
    each), shuffled, and the first ``Validation_Fraction`` of the result is
    classified with the model ``Model_Folder/Model_Name``. Class 0 is treated
    as the positive class. A ratio whose denominator is zero is reported as
    0.0 instead of raising ``ZeroDivisionError``.

    NOTE(review): the data are re-shuffled here without a fixed seed, so the
    evaluated subset is presumably not the subset held out while the model
    was trained -- confirm before quoting these numbers as test metrics.

    Raises ``IndexError`` when ``File_List`` is empty.
    """
    Model=tf.keras.models.load_model(Model_Folder+"/"+Model_Name,compile=True)
    print("Loading Data Files")
    if len(File_List)==0:
        raise IndexError("File_List is empty; at least one data set file is required")

    Data_Parts=[]
    Label_Parts=[]
    with alive_bar(int(len(File_List)),force_tty=True) as bar:
        for File_Name in File_List:
            Images=Load_Array(File_Name)
            Data_Parts.append(Images[:(int(Set_Fraction*len(Images)))])
            Classifications=Load_Array(File_Name[:-4]+"_Classifications.npy")
            Classifications=Classifications[:(int(Set_Fraction*len(Classifications)))]
            Label_Parts.append(Classifications_to_Labels(Classifications,class_names))
            bar()

    Data_Set=tf.concat(Data_Parts,axis=0)
    Data_Labels=tf.concat(Label_Parts,axis=0)

    indices = tf.range(start=0, limit=tf.shape(Data_Set)[0], dtype=tf.int32)
    shuffled_indices = tf.random.shuffle(indices)

    Data_Set=tf.gather(Data_Set,shuffled_indices)
    Data_Labels=tf.gather(Data_Labels,shuffled_indices)

    Data_Test=Data_Set[:int(Validation_Fraction*len(Data_Set))]
    Labels_Test=Data_Labels[:int(Validation_Fraction*len(Data_Labels))]

    Prediction_Labels=np.argmax(Model.predict(Data_Test),axis=1)
    # Convert once so the loop compares plain NumPy scalars, not tensors.
    Labels_Test=np.asarray(Labels_Test)

    def _ratio(numerator,denominator):
        # Division that yields 0.0 when the denominator is zero.
        return numerator/denominator if denominator else 0.0

    # Confusion counts with class 0 as "positive" and class 1 as "negative".
    TP=0
    FP=0
    TN=0
    FN=0
    print("Evaluating Data Set")
    with alive_bar(int(len(Prediction_Labels)),force_tty=True) as bar:
        for Predicted,Actual in zip(Prediction_Labels,Labels_Test):
            if Predicted==0 and Actual==0:
                TP+=1
            elif Predicted==0 and Actual==1:
                FP+=1
            elif Predicted==1 and Actual==1:
                TN+=1
            elif Predicted==1 and Actual==0:
                FN+=1
            bar()

    p0=_ratio(TP,TP+FP)
    r0=_ratio(TP,TP+FN)
    F1_0=_ratio(2*(p0*r0),p0+r0)
    print("For Class 0:")
    print("Precision: (worst:0,best:1)",p0)
    print("Recall: (worst:0,best:1)",r0)
    print("F1 score: (worst:0,best:1)",F1_0)
    print("")
    p1=_ratio(TN,TN+FN)
    r1=_ratio(TN,TN+FP)
    F1_1=_ratio(2*(p1*r1),p1+r1)
    print("For Class 1:")
    print("Precision: (worst:0,best:1)",p1)
    print("Recall:(worst:0,best:1)",r1)
    print("F1 score: (worst:0,best:1)",F1_1)
    print("")
    print("Independent of Class:")
    ACC=_ratio(TP+TN,TP+FP+TN+FN)
    print("Accuracy: (worst:0,best:1)",ACC)
    MCC=_ratio((TP*TN)-(FP*FN),np.sqrt((TP+FP)*(TP+FN)*(TN+FP)*(TN+FN)))
    print("Matthews Correlation Coefficient: (worst:-1,best:1)",MCC)

## 2. Image Set Creator

Use the following code to create and save data sets consisting of images and classifications of external data. The classification information is imported from a public google spreadsheet (must have share settings turned to "anyone with the link") with columns denoted "Halo ID" and "Classification". Visit https://docs.google.com/spreadsheets/d/1kW7Veg-SqdSNLBt9pQYPB3_xrPDxJOiVp-jpcQK2_s8/edit#gid=1114089073 to see an example of the required format. The variable "Folder" refers to the folder in which the input data is stored in the form of .fits files with names including the Halo ID; "Sheet_ID" refers to a unique string found within the URL of the desired google sheet (for this example, Sheet_ID="1kW7Veg-SqdSNLBt9pQYPB3_xrPDxJOiVp-jpcQK2_s8"); "Sheet_Names" refers to a list of the names of the sheets in the instance that there are multiple located at this google address (use the "Sheet_Names" function in the example spreadsheet appscript to create this list more easily); "Set_Name" refers to the name of the resulting .npy files that are created to store input images, classifications, and corresponding file names.

In [2]:
# Folder that is searched (including sub-folders) for the input .fits files.
Folder="Example_Fits_Files"
# Identifier of the Google spreadsheet, taken from its URL.
Sheet_ID="1kW7Veg-SqdSNLBt9pQYPB3_xrPDxJOiVp-jpcQK2_s8"
# Names of the sheets (tabs) whose rows are read.
Sheet_Names=['1', '7', '10', '23', '25' ]
# Base name of the .npy files written to External_Data_Image_Sets/.
Set_Name="Example"

# Builds the image set from every row that has a classification.
Sheet_Data_Set_Creator(Folder,Sheet_ID,Sheet_Names,Set_Name)

|████████████████████████████████████████| 14/14 [100%] in 7.9s (1.76/s)                           ▆█▆ 4/14 [29%] in 3s (1.3/s, eta: 8s)            


For cases in which the classifications are not known, the Halo IDs can be given the label "nan" and combined into a data set using the following function. All of the input variables are the same as for the previous function, and it produces a similar output; the aim is to use this data to obtain new classifications.

Nan Set Creator

In [3]:
# Folder that is searched (including sub-folders) for the input .fits files.
Folder="Example_Fits_Files"
# Identifier of the Google spreadsheet, taken from its URL.
Sheet_ID="1kW7Veg-SqdSNLBt9pQYPB3_xrPDxJOiVp-jpcQK2_s8"
# Names of the sheets (tabs) whose rows are read.
Sheet_Names=['1', '7', '10' ]
# Base name of the .npy files written to External_Data_Image_Sets/.
Set_Name="nan_Example"

# Builds the image set from every row whose classification is "nan".
nan_Set_Creator(Folder,Sheet_ID,Sheet_Names,Set_Name)

|████████████████████████████████████████| 2/2 [100%] in 2.0s (0.99/s)                      


The next function is used to lengthen the dataset and ensure that each class has approximately an equal number of entries. This is the final step for preparing a data set for model creation. The variables "Current_Set_Name" and "Final_Set_Name" refer to the name of the saved dataset file and that of the output file respectively (note that these names can be the same if desired). Once this function begins running, it will indicate the current length of the dataset and then ask the user for how many new images should be generated and added to the dataset.

In [4]:
# Name of the saved set to read from External_Data_Image_Sets/.
Current_Set_Name="Example"
# Name under which the balanced and extended set is saved.
Final_Set_Name="Extended_Example"

# Interactive: asks how many extra images to generate after balancing.
Data_Set_Extender(Current_Set_Name,Final_Set_Name)

|████████████████████████████████████████| 7/7 [100%] in 10.1s (0.69/s)                     1/7 [14%] in 2s (0.4/s, eta: 9s)                                    
Current data set length:  21
How many more images should be generated? 19
|████████████████████████████████████████| 19/19 [100%] in 22.8s (0.83/s)                   


## 3. Model Creator

The following code is used to check the length of the input data before they are used to train the model. Because there is an approximate maximum of 200000 total images due to memory limitations, a specified portion can be selected so as not to exceed this value. "File_List" refers to the selection of dataset files used to train the model; "Set_Fraction" refers to the fraction (between 0 and 1) of each dataset file that is used: only that leading portion of every file is counted here and, later, loaded for training.

In [5]:
# Image set files whose lengths are reported.
File_List=["External_Data_Image_Sets/Extended_Example.npy"]
# Leading fraction of each file that is counted (1 = the whole file).
Set_Fraction=1

Check_Set_Length(File_List,Set_Fraction)

External_Data_Image_Sets/Extended_Example.npy : 40
Total Set Length : 40


Use the following code to create and train an image classification model. This section introduces an entirely new set of variables. "Validation_Fraction" refers to the fraction (between 0 and 1) of the total data set aside to test the accuracy of the final model; "epoch_number" refers to the number of epochs that will occur during the model training; "class_names" represents the list of total classifications present (in this case, it will always remain "Single Node" and "Multiple Nodes"); "classifier" refers to the optimizer used for training the model (see https://www.tensorflow.org/api_docs/python/tf/keras/optimizers and look under "Classes" for more options); "save" represents the option to save the model that has just been created: set it to "True" to record the trained model or "False" if you do not want it to be saved; "Folder" refers to the directory to which the model can be saved (this folder is externally created); and "Model_Name" refers to the name given to the model if it is saved (it will appear as a new folder in the directory chosen for saved models).

In [6]:
# Image set files used for training; labels are read from the matching
# "<name>_Classifications.npy" files.
File_List=["External_Data_Image_Sets/Extended_Example.npy"]
# Fraction of the shuffled data held out for validation and evaluation.
Validation_Fraction=0.2
# Leading fraction of each file that is used (1 = the whole file).
Set_Fraction=1
epoch_number=50
# The position of a name in this list is its integer label.
class_names=["Single Node","Multiple Nodes"]
# Optimizer handed to Model.compile.
classifier="SGD"
# When True the trained model is written to Folder/Model_Name.
save=True
Folder="Saved_Models"
Model_Name="Example_Model"

Model_Creator(File_List,Validation_Fraction,Set_Fraction,epoch_number,class_names,classifier,save,Folder,Model_Name)

|████████████████████████████████████████| 1/1 [100%] in 0.1s (9.72/s)                      
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
1/1 - 0s - loss: 0.7064 - accuracy: 0.5000
INFO:tensorflow:Assets written to: Saved_Models/Example_Model/assets


Use the following code to save the inputs used to create the model. In this block, only the information of "Loss" and "Accuracy" will need to be changed. These values will be printed when the model training has completed. The rest of the variables will be the same as those defined previously.

In [7]:
# Copy these two values by hand from the last line printed by Model_Creator.
Loss,Accuracy=0.7064, 0.5000

# Flat list of alternating names and values describing the training run.
Summary=["Files",File_List,
         "Validation_Fraction=",Validation_Fraction,
         "Set_Fraction=",Set_Fraction,
         "epoch_number=",epoch_number,
         "classifier=",classifier,
         "Loss=",Loss,
         "Accuracy=",Accuracy]


New_directory=str(Folder+"/"+Model_Name+"_Parameters/")
# makedirs also creates Folder itself when it does not exist yet (for
# example when the model was trained with save=False).
os.makedirs(New_directory,exist_ok=True)

# dtype=object keeps the mixed strings, numbers and list as they are.
Textfile_Data=np.array(Summary,dtype=object)
Name=str(New_directory+Model_Name+"_Overview.txt")
np.savetxt(Name,Textfile_Data,fmt="%s")
NPY_Name=str(New_directory+Model_Name+"_Overview.npy")
np.save(NPY_Name,Textfile_Data)

## 4. New Data Predictions

In running this code, fits files from other sources are evaluated by a model and given either the classification of "Single Node" or "Multiple Nodes." Important data will be saved as a .txt file for easy viewing as well as a .npy file to be easily loaded into python notebooks. The final block also introduces new variables used to specify the original folder containing the fits files to be evaluated and the final location for saving the predictions that the model makes. More specifically, "Data_Folder" refers to the folder of input data / externally obtained fits files (Note: every file ending in .fits contained in the provided folder will be evaluated); "Model_Folder" refers to the directory that contains the model selected to make the evaluations; "save_path" refers to the directory in which saved evaluations will be stored; "save_Folder" refers to the name given to the folder storing the .txt and .npy outputs.

In [8]:
# Folder whose *.fits files are classified.
Data_Folder="New_Data"
# The model is loaded from Model_Folder/Model_Name.
Model_Folder="Saved_Models"
Model_Name="Final_Model"
# The position of a name in this list is the label it stands for.
class_names=["Single Node","Multiple Nodes"]
# NOTE: New_Image_Classifier does not read this flag; results are always written.
save=True
# Predictions.txt and Predictions.npy are written to save_path/save_Folder.
save_path="New_Image_Predictions"
save_Folder="Example_Predictions[Final_Model]"

New_Image_Classifier(Data_Folder,Model_Folder,Model_Name,class_names,save,save_path,save_Folder)

The following code serves the same purpose as the previous function, however this block is better suited to evaluating extremely large datasets. The input variables are all the same, but the output format is very different. Each image evaluation is given its own subdirectory in which a .txt and .npy output is stored. If the process is interrupted and restarted, the previously completed evaluations will be recognized and the process can resume by running on new data. Check that the most recently created subdirectory contains the output data before rerunning. If this is not the case, delete this subdirectory before resuming the evaluation process.

In [9]:
# Folder whose *.fits files are classified.
Data_Folder="New_Data"
# The model is loaded from Model_Folder/Model_Name.
Model_Folder="Saved_Models"
Model_Name="Final_Model"
# The position of a name in this list is the label it stands for.
class_names=["Single Node","Multiple Nodes"]
# NOTE: Large_Set_Image_Classifier does not read this flag; results are always written.
save=True
# One sub-folder per input file is created inside save_path/save_Folder.
save_path="New_Image_Predictions"
save_Folder="Example_Long_Set_Predictions[Final_Model]"

Large_Set_Image_Classifier(Data_Folder,Model_Folder,Model_Name,class_names,save,save_path,save_Folder)

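If "Large_Set_Image_Classifier" is used instead of "New_Image_Classifier", the following code can be used to condense all of the output data into a single location for easier interpretation. The "Prediction_Folder" variable should be the path of the folder written in the previous step, i.e. the previous "save_path" and "save_Folder" joined together (for example "New_Image_Predictions/Example_Long_Set_Predictions[Final_Model]").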

In [10]:
# Folder holding the per-file result sub-folders (save_path/save_Folder of
# the Large_Set_Image_Classifier run above).
Prediction_Folder="New_Image_Predictions/Example_Long_Set_Predictions[Final_Model]"
# Results.txt, Results.npy and Images.npy are written to save_path/save_Folder.
save_path="New_Image_Predictions"
save_Folder="Example_Long_Set_Complete_Predictions"

Export_Results(Prediction_Folder,save_path,save_Folder)

In running this code, several accuracy metrics of the trained model can be calculated and printed by selecting the same parameters used in the model's training.

In [11]:
# Image set files to evaluate on; labels are read from the matching
# "<name>_Classifications.npy" files.
File_List=["External_Data_Image_Sets/Extended_Example.npy"]
# Fraction of the shuffled data that is classified and scored.
Validation_Fraction=0.2
# Leading fraction of each file that is used (1 = the whole file).
Set_Fraction=1
# The position of a name in this list is its integer label.
class_names=["Single Node","Multiple Nodes"]
# The model is loaded from Folder/Model_Name.
Folder="Saved_Models"
Model_Name="Final_Model"

Accuracy_Metrics(File_List,Validation_Fraction,Set_Fraction,class_names,Folder,Model_Name)

Loading Data Files
|████████████████████████████████████████| 1/1 [100%] in 0.1s (9.81/s)                      
Evaluating Data Set
|████████████████████████████████████████| 8/8 [100%] in 0.1s (78.03/s)                     
For Class 0:
Precision: (worst:0,best:1) 1.0
Recall: (worst:0,best:1) 0.6666666666666666
F1 score: (worst:0,best:1) 0.8

For Class 1:
Precision: (worst:0,best:1) 0.8333333333333334
Recall:(worst:0,best:1) 1.0
F1 score: (worst:0,best:1) 0.9090909090909091

Independent of Class:
Accuracy: (worst:0,best:1) 0.875
Matthews Correlation Coefficient: (worst:-1,best:1) 0.7453559924999299
