## Data Processing for machine learning

In [None]:
# pip install missingno

In [None]:
import numpy as np
import matplotlib.pyplot as pl
import pandas as pd
import os 
import os.path as osp
import itertools
import astropy.io.fits as fits

from sources.preprocessor import data_processor # A well defined function to sample columns of interest from a full catalogue
import missingno as msno

# plot style
import seaborn as sns
sns.set_style("ticks")

In [None]:
# Folder that receives the scaled train/test CSV files written at the end of the notebook.
directory_path = "normalised"
# exist_ok=True makes this cell safe to re-run when the folder is already there.
os.makedirs(directory_path, exist_ok=True)

In [None]:
# Catalogue file, expected in the notebook's working directory.
file = osp.join("COSMOSXMATCH+classes_040422_withphotometry.fits")
# table = Table.read("COSMOSXMATCH+classes_040422_withphotometry.fits")

# Open the catalogue inside a context manager so the file handle is always
# released.  The previous version wrote `fp.close` without parentheses, which
# only looks the method up and never calls it, so the file stayed open.
# Header and data are read inside the block so they are loaded before the
# handle is closed.
with fits.open(file, memmap=True) as fp:
    head = fp[1].header
    data = fp[1].data

# number of elements in the data
N_all = len(data)

In [None]:
# Columns sampled from the full catalogue for the machine-learning data set.
# The four SPLASH fluxes are the mid-infrared flux densities that get turned into
# colours further down; the original note credits Donley et al. with showing that
# such colours are good features for telling AGN apart from SFGs.
irac_flux_columns = [f'SPLASH_{band}_FLUX' for band in range(1, 5)]
input_features = [
    *irac_flux_columns,
    'L14',
    'LIR_WHz',
    'MASS_lephare',
    'class_star',
    'qir',
]

# Alternative selection that also carries the catalogue identifier:
# input_features = [ 'CATID', 'SPLASH_1_FLUX', 'SPLASH_2_FLUX', 'SPLASH_3_FLUX',
#  'SPLASH_4_FLUX', 'L14','LIR_WHz','MASS_lephare', 'class_star', 'qir', '', '' ]

# Catalogue columns flagging each source as AGN, SFG, probable SFG or unclassified.
output_features = [
    'AGN',
    'SFG',
    'probSFG',
    'unclass',
]

# Sample the requested input and output columns with the project's preprocessor.
mightee = data_processor(data, input_features, output_features)

In [None]:


# Nullity matrix of the sampled catalogue, to see where each feature has gaps.
matrix_axes = msno.matrix(mightee, figsize=(10, 8))

# The call returns an Axes; the figure that owns it is what gets written to disk.
matrix_axes.get_figure().savefig('missingno.png', bbox_inches='tight')

In [None]:
# Bar chart summarising how complete each sampled feature is.
completeness_axes = msno.bar(mightee)

# Remove the right-hand frame line.
completeness_axes.spines['right'].set_visible(False)

# Axis labels and title.
completeness_axes.set_title('Missing Data Overview', fontsize=15)
completeness_axes.set_xlabel('Features', fontsize=12)
completeness_axes.set_ylabel('Count of non-null values', fontsize=12)

# Render the figure.
pl.show()

In [None]:
# In the catalogue the IRAC fluxes S3.6, S4.5, S5.8 and S8.0 are stored as
# SPLASH_1_FLUX, SPLASH_2_FLUX, SPLASH_3_FLUX and SPLASH_4_FLUX respectively.
flux_columns = ['SPLASH_1_FLUX', 'SPLASH_2_FLUX', 'SPLASH_3_FLUX', 'SPLASH_4_FLUX']

# Force the flux columns to numeric; anything that cannot be parsed becomes NaN.
for flux_column in flux_columns:
    mightee[flux_column] = pd.to_numeric(mightee[flux_column], errors='coerce')

# -99 is treated as the "missing" marker in the cleaning step below, so it has to be
# masked *before* the ratios are taken: otherwise two missing fluxes give
# -99 / -99 = 1 and a colour of exactly 0 that the later cleaning cannot detect.
fluxes = mightee[flux_columns].replace(-99, np.nan)

# Mid-infrared colours.  Zero or negative ratios produce +/-inf or NaN here and
# are turned into NaN in the cleaning step.
S8_S45 = np.log10(fluxes['SPLASH_4_FLUX'] / fluxes['SPLASH_2_FLUX'])
S58_S36 = np.log10(fluxes['SPLASH_3_FLUX'] / fluxes['SPLASH_1_FLUX'])
S45_S36 = np.log10(fluxes['SPLASH_2_FLUX'] / fluxes['SPLASH_1_FLUX'])

# putting the features together (one row per source, two colour columns)
mid_data = np.vstack([np.array(S8_S45),
                      np.array(S58_S36)]).T

# The raw fluxes and luminosities are not used as features once the colours exist.
mightee_data = mightee.drop(['SPLASH_1_FLUX', 'SPLASH_2_FLUX', 'SPLASH_3_FLUX',
                             'SPLASH_4_FLUX', 'L14', 'LIR_WHz'], axis=1)

# Attach the colours as new columns.
mightee_data['log(S8/S45)'] = S8_S45
mightee_data['log(S58/S36)'] = S58_S36
mightee_data['log(S45/S36)'] = S45_S36

# Lastly, convert the remaining -99 markers and infinities to NaN.
# `np.nan` replaces the `np.NaN` alias, which no longer exists in NumPy 2.0, and
# `regex=True` is dropped because the targets are numbers, not patterns.
mightee_data1 = mightee_data.replace([-np.inf, -99, np.inf], np.nan)
# mightee_data1 = mightee_data.replace([ np.inf, -99], np.nan)

mightee_data1.rename(columns={'AGN': 'class_labels', 'MASS_lephare': 'Mstar'}, inplace=True)


In [None]:
# Frame used for the missingness plots: every column of mightee_data1 except the
# log(S45/S36) colour.
# A preceding assignment that dropped "class_labels" was overwritten on the very
# next line and so never took effect; it has been removed.  "class_labels" is
# therefore still a column of msno1 at this point, exactly as before.
msno1 = mightee_data1.drop("log(S45/S36)", axis='columns')

In [None]:
msno1

In [None]:
# Copy of msno1 with LaTeX-formatted column names for display.
# Raw strings are required: in a normal string literal the "\r" in "{\rm" is a
# carriage return, and "\s" / "\m" are invalid escape sequences.
# Keys that are not columns of msno1 are ignored by rename().
msno2 = msno1.rename(columns={'Mstar': r'$log (M_{\rm \star})$',
                              'qir': r'$q_\mathrm{IR}$',
                              'log(S8/S45)': r'$log(S8.0/S4.5)$',
                              'log(S58/S36)': r'$log(S5.8/S3.6)$',
                              'log(S45/S36)': r'$log(S4.5/S3.6)$',
                              })

In [None]:
msno2

In [None]:
# import matplotlib.pyplot as plt
# import pandas as pd

# # Example data with LaTeX-like labels
# columns = [r'$log (M_{\rm \star})$', r'$class_{\rm star}$', r'$q_{ir}$', r'$log (S_{8}/S_{45})$', 
#            r'$log (S_{58}/S_{36})$', r'$log (S_{45}/S_{36})$']
# values = [10, 15, 20, 5, 12, 7]

# # Create a DataFrame
# df = pd.DataFrame({'Column Names': columns, 'Values': values})

# # Plotting
# fig, ax = pl.subplots()
# ax.bar(range(len(df)), df['Values'])  # Plot using the index for tick positions

# # Set x-tick positions and LaTeX-style labels
# ax.set_xticks(range(len(df)))
# ax.set_xticklabels(df['Column Names'], rotation=45, ha='right')  # LaTeX-style labels are handled here

# pl.tight_layout()  # Adjust layout
# pl.show()


In [None]:
# # We use missingno to view the missingness in each feature in the data
# fig = msno.bar(msno1,figsize=(10, 8), fontsize=26, color = 'g')

# fig_copy = fig.get_figure()
# fig_copy.savefig('missingno.pdf', bbox_inches = 'tight')

In [None]:
msno1

In [None]:
# LaTeX tick labels keyed by DataFrame column name.  Looking labels up by name
# keeps every bar correctly labelled whatever columns msno1 holds and in whatever
# order; a fixed positional list silently mislabels bars when the two differ.
feature_labels = {
    'Mstar': r'$log (M_{\rm star})$',
    'class_star': 'class_star',
    'qir': r'$q_\mathrm{IR}$',
    'log(S8/S45)': r'$log (S_{8.0}/S_{4.5})$',
    'log(S58/S36)': r'$log (S_{5.8}/S_{3.6})$',
    'log(S45/S36)': r'$log (S_{4.5}/S_{3.6})$',
}

# One label per plotted column, in the DataFrame's own order; columns without
# a LaTeX label keep their raw name.
columns = [feature_labels.get(name, name) for name in msno1.columns]

# Generate the missingno bar plot
ax = msno.bar(msno1, figsize=(12, 8), fontsize=22, color='g')

# Customize: hide the right vertical axis (right spine)
ax.spines['right'].set_visible(False)

# Axis labels and per-column tick labels
ax.set_xlabel('Features', fontsize=22)
ax.set_ylabel('Count of Valid Measurements', fontsize=22)
ax.set_xticks(range(len(columns)))  # one tick per plotted column
ax.set_xticklabels(columns)

# Save, then show the plot
pl.savefig('missingno.pdf', bbox_inches='tight')
pl.show()

In [None]:
# Nullity matrix (not a bar chart) of the plotting frame.
nullity_axes = msno.matrix(msno1, figsize=(12, 8), fontsize=22)

# Remove the right-hand frame line.
nullity_axes.spines['right'].set_visible(False)

# Title and axis labels.
nullity_axes.set_title('Missing Data Overview', fontsize=22)
nullity_axes.set_xlabel('Features', fontsize=20)
nullity_axes.set_ylabel('Count of non-null values', fontsize=20)

# Render the figure.
pl.show()

### For plot purposes

In [None]:
# # We rename AGN column to labels
# mightee_data1.rename(columns = {'class_labels':'class'}, inplace = True)
# col = ['qir', 'class_star', 'Mstar', 'log(S8/S45)','log(S58/S36)', 'log(S45/S36)', 'class']
# mightee_plot1 =  mightee_data1[col]


# fig = msno.bar(mightee_plot1,figsize=(10, 8), fontsize=26, color = 'g')

# fig_copy = fig.get_figure()
# fig_copy.savefig('missingno_plot.png', bbox_inches = 'tight')

In [None]:
# Only the features of interest remain, so any observation that is missing a value
# in at least one of them is discarded (row-wise drop, triggered by a single NaN).
catalogue = mightee_data1.dropna(axis=0, how='any')

# catalogue = catalog.drop("unclass", axis='columns')

In [None]:
print(mightee_data1.columns)

In [None]:
# Helper column for the missingness plot: "qir" taken from `catalogue`, the frame
# left after dropping incomplete rows.  pandas aligns the assignment on the row
# index, so rows of msno1 that are absent from `catalogue` receive NaN and the
# column's non-null count reflects how many sources survive the dropna().
msno1["ML dataset"] = catalogue["qir"]

In [None]:
msno1

In [None]:
# The label column is not a feature, so leave it out of the missingness plot.
msno1 = msno1.drop(columns="class_labels")


In [None]:
msno1

In [None]:
# columns = [r'$log (M_{\rm star})$', 'class_star', r'$q_\mathrm{IR}$', r'$log (S_{8.0}/S_{4.5})$', 
#            r'$log (S_{5.8}/S_{3.6})$', "ML dataset"]
# # r'$log (S_{4.5}/S_{3.6})$'

# # cols_true = msno1.columns

# # Generate the missingno bar plot
# ax1 = msno.bar(msno1,figsize=(12, 8), fontsize=22, color = 'g')

# # Customize: hide the right vertical axis (right spine)
# ax1.spines['right'].set_visible(False)

# # Further customization (optional)
# ax1.set_xlabel('Features', fontsize=22)
# ax1.set_ylabel('Count of Valid Measurements', fontsize=22)
# # ax.set_xticks(cols_true) 
# ax1.set_xticks(range(len(columns)))  # Set tick positions to match the indices
# ax1.set_xticklabels(columns)

# ax1.spines['right'].set_visible(False)

# # Show the plot
# pl.savefig('missingno.pdf', bbox_inches = 'tight')
# pl.show()

In [None]:
# Columns to plot, in plotting order, mapped to their tick labels.
feature_labels = {
    'Mstar': r'$log (M_{\rm star})$',
    'class_star': 'class_star',
    'qir': r'$q_\mathrm{IR}$',
    'log(S8/S45)': r'$log (S_{8.0}/S_{4.5})$',
    'log(S58/S36)': r'$log (S_{5.8}/S_{3.6})$',
    'ML dataset': "ML dataset",
}

# Check ACTUAL columns in the DataFrame
print("Actual columns:", msno1.columns.tolist())  # Debug step

# Restrict the plot to the labelled columns that are really present.  The previous
# `msno1[msno1.columns.tolist()]` selected every column, i.e. filtered nothing, so
# the six labels were spread over whatever columns msno1 happened to contain.
# msno1 itself is left untouched; only the plotted view is narrowed.
plot_columns = [name for name in feature_labels if name in msno1.columns]
columns = [feature_labels[name] for name in plot_columns]

# Generate the plot
ax1 = msno.bar(msno1[plot_columns], figsize=(12, 8), fontsize=22, color='g')
ax1.spines['right'].set_visible(False)
ax1.set_xlabel('Features', fontsize=22)
ax1.set_ylabel('Count of Valid Measurements', fontsize=22)
ax1.set_xticks(range(len(columns)))  # one tick per plotted column
ax1.set_xticklabels(columns, rotation=45)  # Rotate labels if crowded

pl.savefig('missingno.pdf', bbox_inches='tight')
pl.show()

In [None]:
# # We plot the distribution of each feature 

# fig, axs = pl.subplots(2, 3, figsize = (18, 13))
# fig.suptitle('Important features', fontweight ='bold', fontsize =18)

# ## IRAC colors 'S58_S36'
# axs[0, 0].hist(np.array(catalogue[catalogue['class_labels']=='SFG'][['log(S58/S36)']]), bins = 100, 
#              histtype = "step", linewidth = 3, alpha= 1, color= "r", label = 'SFG')
# axs[0, 0].hist(np.array(catalogue[catalogue['class_labels']=='AGN'][['log(S58/S36)']]), bins = 100, 
#              histtype = "step", linewidth = 3, alpha= 1, color= "b", label = 'AGN')
# axs[0, 0].legend()
# # axs[0, 0].set_title("IRAC color")
# axs[0, 0].set_xlabel('IRAC color - log(S58/S36)',fontweight ='bold', fontsize =18)
# axs[0, 0].set_ylabel('frequency',fontweight ='bold', fontsize =18)
# axs[0, 0].set_xlim(-0.6, 0.6)
            
# ## IRAC colors ''S8_S45''
# axs[0, 1].hist(np.array(catalogue[catalogue['class_labels']=='SFG'][['log(S8/S45)']]), bins = 100, 
#              histtype = "step", linewidth = 3, alpha= 1, color= "r", label = 'SFG')
# axs[0, 1].hist(np.array(catalogue[catalogue['class_labels']=='AGN'][['log(S8/S45)']]), bins = 100, 
#              histtype = "step", linewidth = 3, alpha= 1, color= "b", label = 'AGN')
# axs[0, 1].legend()
# # axs[0, 1].set_title("IRAC colors")
# axs[0, 1].set_xlabel('IRAC color - log(S8/S45)', fontweight ='bold', fontsize =18)
# axs[0, 1].set_ylabel('frequency', fontweight ='bold', fontsize =18)
# axs[0, 1].set_xlim(-1, 1)

# # Colour 'log(S45/S36)'
# axs[0, 2].hist(np.array(catalogue[catalogue['class_labels']=='SFG'][['log(S45/S36)']]), bins = 100, 
#              histtype = "step", linewidth = 3, alpha= 1, color= "r", label = 'SFG')
# axs[0, 2].hist(np.array(catalogue[catalogue['class_labels']=='AGN'][['log(S45/S36)']]), bins = 100, 
#              histtype = "step", linewidth = 3, alpha= 1, color= "b", label = 'AGN')
# axs[0, 2].legend()
# axs[0, 2].set_xlabel('log(S45/S36)',fontweight ='bold', fontsize =18)
# axs[0, 2].set_ylabel('frequency',fontweight ='bold', fontsize =18)
# axs[0, 2].set_xlim(-.25, .25)

# ## QIR
# axs[1, 0].hist(np.array(catalogue[catalogue['class_labels']=='SFG'][['qir']]), bins = 100, 
#              histtype = "step", linewidth = 3, alpha= 1, color= "r", label = 'SFG')
# axs[1, 0].hist(np.array(catalogue[catalogue['class_labels']=='AGN'][['qir']]), bins = 100, 
#              histtype = "step", linewidth = 3, alpha= 1, color= "b", label = 'AGN')
# axs[1, 0].legend()
# # axs[1, 0].set_title("QIR")
# axs[1, 0].set_xlabel('qir',fontweight ='bold', fontsize =18)
# axs[1, 0].set_ylabel('frequency',fontweight ='bold', fontsize =18)
# axs[1, 0].set_xlim(-1, 3.5)

# ## Mstar
# axs[1, 1].hist(np.array(catalogue[catalogue['class_labels']=='SFG'][['Mstar']]), bins = 100, 
#              histtype = "step", linewidth = 3, alpha= 1, color= "r", label = 'SFG')
# axs[1, 1].hist(np.array(catalogue[catalogue['class_labels']=='AGN'][['Mstar']]), bins = 200, 
#              histtype = "step", linewidth = 3, alpha= 1, color= "b", label = 'AGN')
# axs[1, 1].legend()
# axs[1, 1].set_xlabel('stellar mass', fontweight ='bold', fontsize =18)
# axs[1, 1].set_ylabel('frequency', fontweight ='bold', fontsize =18)
# axs[1, 1].set_xlim(8.5, 12.3)

# ## Class Star
# axs[1, 2].hist(np.array(catalogue[catalogue['class_labels']=='SFG'][['class_star']]), bins = 10, 
#              histtype = "step", linewidth = 3, alpha= 1, color= "r", label = 'SFG')
# axs[1, 2].hist(np.array(catalogue[catalogue['class_labels']=='AGN'][['class_star']]), bins = 10, 
#              histtype = "step", linewidth = 3, alpha= 1, color= "b", label = 'AGN')
# axs[1, 2].legend()
# axs[1, 2].set_xlabel('class_star', fontweight ='bold', fontsize =18)
# axs[1, 2].set_ylabel('frequency', fontweight ='bold', fontsize =18)
# axs[1, 2].set_xlim(-0.02, 1)

# pl.savefig('source_distribution')
# fig.tight_layout()

## For write-up

In [None]:
# # We plot the distribution of each feature 

# pl.figure(figsize = (10, 8))
# # fig.suptitle('Important features', fontweight ='bold', fontsize =18)

# ## IRAC colors ''S8_S45''
# pl.hist(np.array(catalogue[catalogue['class_labels']=='SFG'][['log(S8/S45)']]), bins = 100, 
#              histtype = "step", linewidth = 3, alpha= 1, color= "r", label = 'SFG')
# pl.hist(np.array(catalogue[catalogue['class_labels']=='AGN'][['log(S8/S45)']]), bins = 100, 
#              histtype = "step", linewidth = 3, alpha= 1, color= "b", label = 'AGN')
# pl.xlabel('IRAC color - log(S8/S45)', fontweight ='bold', fontsize =18)
# pl.ylabel('frequency', fontweight ='bold', fontsize =18)
# pl.xlim(-1, 1)
# pl.legend(prop={'size': 22})
# pl.savefig('hist2')
# pl.show()

## Number Of Sources

### Original Catalogue

In [None]:
# !pip install tabulate

In [None]:
# mightee

In [None]:
# agn = mightee[mightee['AGN']==True]
# sfg = mightee[mightee['SFG']==True]
# probsfg = mightee[mightee['probSFG']==True]
# noclass = mightee[mightee['unclass']==True]

In [None]:
# # Drawing a table for results
# from tabulate import tabulate

# print("The Clean Mightee Catalogue with a total of 4589 sources classified as AGNs, SFGs or ProbSFG) ")
# # create data
# mydata = [["AGN", len(agn), len(agn)/len(mightee)],
#           ["SFG", len(sfg), len(sfg)/len(mightee)], 
#          ["probSFG", len(probsfg), len(probsfg)/len(mightee)],
#          ["no class", len(noclass), len(noclass)/len(mightee)]]
  
# # create header
# head = ["Class", "Number of sources", "percentage(%)" ]
  
# # display table
# ML_results = tabulate(mydata, headers=head, tablefmt="grid")
# print(ML_results)


### Reduced and Clean Catalogue

In [None]:
# print("The Original Mightee Catalogue with a total of 4273 sources classified as AGNs or SFGs) ")
# # create data
# mydata = [["AGN", 1484, 35, 15],
#           ["SFG", 2789, 65, 2]]
  
# # create header
# head = ["Class", "Number of sources", "percentage(%)", 'frac of sources lost(%)' ]
  
# # display table
# ML_results = tabulate(mydata, headers=head, tablefmt="grid")
# print(ML_results)
# print("About 7% of source has been discarded due to the missing value in one of the band.")

In [None]:
# Write the cleaned catalogue to disk.  reset_index() keeps the previous row labels
# as an "index" column, which therefore also appears in raw_data1.csv.
catalogue = catalogue.dropna().reset_index()
catalogue.to_csv('raw_data1.csv', index=False, header=True)
# catalogue = catalogue.set_index('CATID')

# Tools for scaling the features and splitting them into training and test sets.
from sklearn.model_selection import train_test_split
# from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler

# Alternative scalers kept for reference:
# scaler = MinMaxScaler()
# scaler = MaxAbsScaler()
# with_mean=False: scale each column without centring it.
scaler = StandardScaler(with_mean=False)

# Separate the target column from everything else.
labels = catalogue['class_labels']
X = catalogue.drop(columns=['class_labels'])

# NOTE(review): the scaler is fitted on the full data set before the split below,
# so test-set statistics influence the scaling — confirm this is intended.
X_norm = scaler.fit_transform(X)

# Rebuild a labelled frame, discard the carried-over "index" column and re-attach
# the unscaled class labels (both frames share the same 0..n-1 row index).
scaled_X = pd.DataFrame(X_norm, columns=X.columns).drop(columns="index")
scaled_X['class_labels'] = labels

# Encode the target as integers.  factorize() numbers the classes in order of first
# appearance; `clas` holds the class value that corresponds to each code.
y, clas = pd.factorize(labels)
y_target = pd.DataFrame({'labels': y})

# Stratified 75/25 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_X,
    y_target,
    stratify=y,
    test_size=0.25,
    random_state=42,
)

# Write the four partitions, then the complete scaled table.
for stem, frame in (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
):
    frame.to_csv(f'normalised/{stem}_zs.csv', index=False, header=True)

scaled_X.to_csv('normalised/scaled_raw_zs.csv', index=False, header=True)


In [None]:
len(catalogue)

In [None]:
print(X_train.isna().sum().sum())
