In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os 
import os.path as osp
import itertools
import astropy.io.fits as fits
from astropy.table import Table
from imblearn.under_sampling import RandomUnderSampler
from sources.preprocessor import data_processor # A well defined function to sample columns of interest from a full catalogue
import missingno as msno

# plot style
import seaborn as sns
# sns.set_style("ticks")
%matplotlib inline

In [None]:
# Load the COSMOS cross-matched catalogue (with photometry) from the working directory.
# file = osp.join("MIGHTEEXMATCH+allmulti+classes2.fits")

file = osp.join("COSMOSXMATCH+classes_040422_withphotometry.fits")
# table = Table.read("COSMOSXMATCH+classes_040422_withphotometry.fits")

fp = fits.open(file, memmap=True)
head = fp[1].header   # header of the first extension HDU
data = fp[1].data     # table data of the first extension HDU

# Actually call close(): the bare attribute access `fp.close` did nothing and
# left the file handle open. Per the astropy FAQ, the memory map stays alive
# while `data` still references it, so `data` and the already-loaded `fp[1]`
# remain usable after this call.
fp.close()

# number of rows in the catalogue
N_all = len(data)

In [None]:
# Load the second catalogue file (the "xrayonly" variant of the cross-match).
file2 = osp.join("COSMOSXMATCH+classes_040422_withphotometry_xrayonly.fits")
# table = Table.read("COSMOSXMATCH+classes_040422_withphotometry.fits")

fp2 = fits.open(file2, memmap=True)
head2 = fp2[1].header   # header of the first extension HDU
data2 = fp2[1].data     # table data of the first extension HDU

# Actually call close(): the bare attribute access `fp2.close` did nothing and
# left the file handle open. Per the astropy FAQ, the memory map stays alive
# while `data2` still references it, so `data2` and the already-loaded
# `fp2[1]` remain usable after this call.
fp2.close()

# number of rows in the catalogue
N_all2 = len(data2)

In [None]:
# print(data.columns)

In [None]:
# FITS record array -> astropy Table -> pandas DataFrame, in one chained step.
df_data = Table(fp[1].data).to_pandas()


In [None]:
# FITS record array of the second file -> astropy Table -> pandas DataFrame.
df_data2 = Table(fp2[1].data).to_pandas()


In [None]:
# Cell output: show the DataFrame built from the second FITS table.
df_data2

In [None]:
# merged_df = pd.merge(df1, df2, on='CATID', how='inner')

In [None]:
# Columns of the original catalogue that the rest of the analysis needs:
# the source ID, the four SPLASH fluxes, luminosities, stellar mass, the
# star/qir quantities and the four boolean class flags.
features = [
    'CATID',
    'SPLASH_1_FLUX', 'SPLASH_2_FLUX', 'SPLASH_3_FLUX', 'SPLASH_4_FLUX',
    'L14', 'LIR_WHz', 'MASS_lephare',
    'class_star', 'qir',
    'AGN', 'SFG', 'probSFG', 'unclass',
]  # , 'X-ray-only', 'VLBI-only'

# Restrict the second catalogue to those columns only.
mightee = df_data2.loc[:, features]

In [None]:
# One sub-frame per class flag: keep the rows whose flag column equals True.
AGN = mightee.loc[mightee["AGN"].eq(True)]
SFG = mightee.loc[mightee["SFG"].eq(True)]
probSFG = mightee.loc[mightee["probSFG"].eq(True)]
unclass = mightee.loc[mightee["unclass"].eq(True)]

In [None]:
# Report the (rows, columns) shape of every class subset, then the summed row count.
for _caption, _subset in (
    ('shape of the AGN: ', AGN),
    ('shape of the SFG: ', SFG),
    ('shape of the probSFG: ', probSFG),
    ('shape of the unclass: ', unclass),
):
    print(_caption, _subset.shape)

print('total sample: ', sum(len(_subset) for _subset in (AGN, SFG, probSFG, unclass)))

In [None]:
y_features = ['AGN', 'SFG', 'probSFG', 'unclass']

# Drop the 'SFG' and 'probSFG' flag columns from every class subset. The 'AGN'
# column is kept because it is reused below to carry the class label.
_dropped_flags = [y_features[1], y_features[2]]
mightee_agn = AGN.drop(_dropped_flags, axis=1)
mightee_sfg = SFG.drop(_dropped_flags, axis=1)
mightee_probsfg = probSFG.drop(_dropped_flags, axis=1)
mightee_unclass = unclass.drop(_dropped_flags, axis=1)

# Write the class label explicitly into the 'AGN' column of each subset.
# The previous whole-frame `replace(True/False, label, regex=True)` only
# labelled a row when its 'AGN' flag held the expected boolean, so a row with
# a mismatching flag kept a raw bool as its label. It also scanned every
# column instead of just the label column.
# probSFG sources are labelled 'SFG'; unclassified sources are labelled 'NONE'.
mightee_agn1 = mightee_agn.assign(AGN='AGN')
mightee_sfg1 = mightee_sfg.assign(AGN='SFG')
mightee_probsfg1 = mightee_probsfg.assign(AGN='SFG')
mightee_unclass1 = mightee_unclass.assign(AGN='NONE')

# Stack the four labelled subsets into one frame.
complete_mightee = pd.concat(
    [mightee_agn1, mightee_sfg1, mightee_probsfg1, mightee_unclass1], sort=False
)
# Treat -inf entries as missing. `regex=True` was dropped because it has no
# meaning for a numeric target.
complete_mightee1 = complete_mightee.replace(-np.inf, np.nan)
# The 'unclass' flag is no longer needed once the label column exists.
catalogue = complete_mightee1.drop("unclass", axis='columns')

In [None]:
# In the catalogues the IRAC fluxes S3.6, S4.5, S5.8, S8.0 are labelled
# SPLASH_1, SPLASH_2, SPLASH_3 and SPLASH_4 flux respectively.
_flux_columns = ['SPLASH_1_FLUX', 'SPLASH_2_FLUX', 'SPLASH_3_FLUX', 'SPLASH_4_FLUX']

# Mask the -99 "missing" sentinel BEFORE forming ratios. Otherwise a pair of
# missing fluxes gives -99 / -99 = 1, i.e. a valid-looking colour of
# log10(1) = 0 that the later -99 -> NaN replacement can no longer catch.
_splash = catalogue[_flux_columns].replace(-99, np.nan)

S8_S45 = np.log10(_splash['SPLASH_4_FLUX'] / _splash['SPLASH_2_FLUX'])
S58_S36 = np.log10(_splash['SPLASH_3_FLUX'] / _splash['SPLASH_1_FLUX'])
S45_S36 = np.log10(_splash['SPLASH_2_FLUX'] / _splash['SPLASH_1_FLUX'])

# Two-column array of the first two colours (one row per source).
mid_data = np.vstack([np.array(S8_S45),
                      np.array(S58_S36)]).T

# Drop the raw fluxes and luminosities; only the derived colours are kept.
mightee_data = catalogue.drop(_flux_columns + ['L14', 'LIR_WHz'], axis=1)

# Attach the colours as new columns.
mightee_data['log(S8/S45)'] = S8_S45
mightee_data['log(S58/S36)'] = S58_S36
mightee_data['log(S45/S36)'] = S45_S36

# Convert the remaining sentinels (-99) and infinities to NaN.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0, and
# `regex=True` was dropped because it has no meaning for numeric targets.
mightee_data1 = mightee_data.replace([-np.inf, -99, np.inf], np.nan)
# mightee_data1 = mightee_data.replace([ np.inf, -99], np.nan)

# Give the label and mass columns their final names, without in-place mutation.
mightee_data1 = mightee_data1.rename(columns={'AGN': 'class_labels', 'MASS_lephare': 'Mstar'})


In [None]:
# ID plus the two single-method flags from the second catalogue.
subsample = df_data2.loc[:, ['CATID', 'X-ray-only', 'VLBI-only']]  # 'XAGN', 'VLBAAGN'

# Attach those flags to the feature table; only CATIDs present in both survive.
merged_df = mightee_data1.merge(subsample, how='inner', on='CATID')

# print(merged_df)

# print(merged_df)

In [None]:
# Shorter names for the two flag columns used throughout the rest of the notebook.
merged_df = merged_df.rename({'X-ray-only': "XAGN", 'VLBI-only': "VLBAAGN"}, axis='columns')

In [None]:
# Cell output: show the feature table after the merge on CATID.
merged_df

In [None]:
# subset_xray = merged_df[merged_df["XAGN"] == True]

# subset_vlbi = merged_df[(merged_df["VLBAAGN"] == True) & (merged_df["unclass"] == True)]
# subset_vlbi = merged_df[merged_df["VLBAAGN"] == True]

In [None]:
# Keep only rows with no missing value in any column, then report how many remain.
catalog = merged_df.dropna(axis=0, how='any')

print(catalog.shape[0])

In [None]:
# Sources with a definite class: everything whose label is not "NONE".
catalog_agn_sfg = catalog.loc[catalog['class_labels'].ne("NONE")]

# Boolean masks for "not flagged by X-ray" and "not flagged by VLBI".
_not_xray = catalog_agn_sfg['XAGN'] == False
_not_vlbi = catalog_agn_sfg['VLBAAGN'] == False

catalog_agn_sfg_noxray = catalog_agn_sfg[_not_xray]
catalog_agn_sfg_noxray_novlbi = catalog_agn_sfg[_not_xray & _not_vlbi]

# Row counts: all classified, without X-ray flag, without either flag.
for _subset in (catalog_agn_sfg, catalog_agn_sfg_noxray, catalog_agn_sfg_noxray_novlbi):
    print(len(_subset))


In [None]:
# catalog_agn_sfg_noxray

In [None]:
# Classified sources flagged by X-ray, flagged by VLBI, and both sets stacked
# (VLBI rows first). A source carrying both flags appears twice in the stack.
catalog_agn_sfg_xray = catalog_agn_sfg.loc[catalog_agn_sfg['XAGN'].eq(True)]
catalog_agn_sfg_vlbi = catalog_agn_sfg.loc[catalog_agn_sfg['VLBAAGN'].eq(True)]
catalog_agn_sfg_vlbi_xray = pd.concat(
    [catalog_agn_sfg_vlbi, catalog_agn_sfg_xray],
    sort=False,
)

for _subset in (catalog_agn_sfg_xray, catalog_agn_sfg_vlbi, catalog_agn_sfg_vlbi_xray):
    print(len(_subset))
# print(len(catalog_agn_sfg_vlbi_xray2))


In [None]:
# Recode the XAGN flag: entries that are exactly `True` become 0, everything
# else is kept as is. `assign` builds a new frame instead of writing into
# `catalog_agn_sfg`, which is a filtered slice of `catalog` and would raise
# pandas' SettingWithCopyWarning on column assignment.
# NOTE(review): after this step no row has XAGN == True any more; the X-ray /
# VLBI subsets were already extracted in an earlier cell. Confirm this recoding
# is intended.
catalog_agn_sfg = catalog_agn_sfg.assign(
    XAGN=[0 if item is True else item for item in catalog_agn_sfg['XAGN']]
)

In [None]:
# Flag every CATID that occurs more than once in the stacked X-ray + VLBI sample
# (keep=False marks all occurrences) and print the offending IDs.
duplicate_ids = catalog_agn_sfg_vlbi_xray['CATID'].duplicated(keep=False)
print(f"Duplicate IDs: {catalog_agn_sfg_vlbi_xray['CATID'][duplicate_ids]}")

# Keep the first occurrence of each CATID.
catalog_agn_sfg_vlbi_xray_unique = catalog_agn_sfg_vlbi_xray.drop_duplicates(
    subset='CATID', keep='first'
)


In [None]:
# Sanity check: de-duplicated X-ray/VLBI rows plus unflagged rows, printed
# next to the size of the full classified sample.
_n_flagged = len(catalog_agn_sfg_vlbi_xray_unique)
_n_unflagged = len(catalog_agn_sfg_noxray_novlbi)
print("Total Betwwen the two Samples: ", _n_flagged + _n_unflagged)
print("The Original length is: ", len(catalog_agn_sfg))

In [None]:
# catalog_agn_sfg_vlbi_xray_unique = catalog_agn_sfg_vlbi_xray_unique.drop(["XAGN",	"VLBAAGN"], axis='columns')


In [None]:
# catalog_agn_sfg_vlbi_xray_unique

In [None]:
# catalog_agn_sfg_noxray_novlbi = catalog_agn_sfg_noxray_novlbi.drop(["XAGN",	"VLBAAGN"], axis='columns')

In [None]:
# Cell output: classified sources with neither the XAGN nor the VLBAAGN flag set.
catalog_agn_sfg_noxray_novlbi

In [None]:
# Cell output: classified sources with the XAGN flag set.
catalog_agn_sfg_xray 

In [None]:
# Cell output: classified sources with the VLBAAGN flag set.
catalog_agn_sfg_vlbi 

In [None]:
from sklearn.model_selection import train_test_split

# Shared settings for all three splits. test_size=0.2 means the "df_75_*"
# frames hold 80 % of the rows and the "df_25_*" frames hold 20 %, despite
# their names.
_split_options = {'test_size': 0.2, 'random_state': 42}

# VLBI-flagged sources, stratified on the VLBAAGN flag.
df_75_vlbi, df_25_vlbi = train_test_split(
    catalog_agn_sfg_vlbi,
    stratify=catalog_agn_sfg_vlbi['VLBAAGN'],
    **_split_options,
)

# X-ray-flagged sources, stratified on the XAGN flag.
df_75_xray, df_25_xray = train_test_split(
    catalog_agn_sfg_xray,
    stratify=catalog_agn_sfg_xray['XAGN'],
    **_split_options,
)

# Sources with neither flag, stratified on the AGN/SFG class label.
df_75_all, df_25_all = train_test_split(
    catalog_agn_sfg_noxray_novlbi,
    stratify=catalog_agn_sfg_noxray_novlbi['class_labels'],
    **_split_options,
)



In [None]:
# Cell output: training part of the X-ray-flagged split.
df_75_xray

In [None]:
# Cell output: training part of the VLBI-flagged split.
df_75_vlbi

In [None]:
# Stack the three training parts (VLBI, X-ray, unflagged) row-wise and
# renumber the index from 0.
_train_parts = [df_75_vlbi, df_75_xray, df_75_all]
df_train = pd.concat(_train_parts, axis=0, ignore_index=True)
# df_train = df_75_all


In [None]:
# Cell output: the combined training frame.
df_train

In [None]:
# Stack the three test parts (VLBI, X-ray, unflagged) row-wise and renumber
# the index from 0.
_test_parts = [df_25_vlbi, df_25_xray, df_25_all]
df_test = pd.concat(_test_parts, axis=0, ignore_index=True)


In [None]:
# Cell output: the combined test frame.
df_test

In [None]:
# Columns written to the X_* files.
# NOTE(review): 'CATID' is saved alongside the features; presumably it is an
# identifier to drop before fitting a model. Confirm against the training code.
X_cols = ['CATID', 'Mstar', 'class_star', 'qir', 'log(S8/S45)', 'log(S58/S36)', 'log(S45/S36)']
# X_cols = ['class_star', 'qir', 'log(S8/S45)', 'log(S58/S36)']

# Fixed label encoding shared by every split: 0 = AGN, 1 = SFG.
# The previous independent pd.factorize() calls numbered classes by order of
# first appearance in each frame, so the same class could receive different
# codes in the train, test, X-ray and VLBI label files.
_CLASS_ORDER = ['AGN', 'SFG']


def _encode_class_labels(labels):
    """Encode class-label strings with the fixed `_CLASS_ORDER` mapping.

    Parameters
    ----------
    labels : array-like of str
        Class labels ('AGN' / 'SFG').

    Returns
    -------
    codes : numpy.ndarray of int64
        Position of each label in `_CLASS_ORDER`; -1 for any label that is not
        listed there (the same sentinel pd.factorize uses for missing values).
    classes : pandas.Index
        The class names, so that `classes[code]` decodes a code.
    """
    codes = pd.Categorical(labels, categories=_CLASS_ORDER).codes.astype('int64')
    return codes, pd.Index(_CLASS_ORDER)


# encoding target class
y_te, clas_te = _encode_class_labels(df_test["class_labels"])
y_tr, clas_tr = _encode_class_labels(df_train["class_labels"])
y_te_xray, clas_tr_xray = _encode_class_labels(df_25_xray["class_labels"])
y_te_vlbi, clas_tr_vlbi = _encode_class_labels(df_25_vlbi["class_labels"])

y_train = pd.DataFrame(y_tr, columns=['labels'])
y_test = pd.DataFrame(y_te, columns=['labels'])
y_test_vlbi = pd.DataFrame(y_te_vlbi, columns=['labels'])
y_test_xray = pd.DataFrame(y_te_xray, columns=['labels'])

X_test = df_test[X_cols]
X_test_vlbi = df_25_vlbi[X_cols]
X_test_xray = df_25_xray[X_cols]

X_train = df_train[X_cols]

# Make sure the output directory exists; to_csv does not create it.
os.makedirs('final-train-test', exist_ok=True)

_outputs = {
    # feature / label tables
    'final-train-test/X_train.csv': X_train,
    'final-train-test/y_train.csv': y_train,
    'final-train-test/X_test.csv': X_test,
    'final-train-test/y_test.csv': y_test,
    'final-train-test/y_test_xray.csv': y_test_xray,
    'final-train-test/y_test_vlbi.csv': y_test_vlbi,
    'final-train-test/X_test_vlbi.csv': X_test_vlbi,
    'final-train-test/X_test_xray.csv': X_test_xray,
    # original (un-encoded) frames
    'final-train-test/original_test_df.csv': df_test,
    'final-train-test/original_train_df.csv': df_train,
    'final-train-test/vlbi_only_test_original.csv': df_25_vlbi,
    'final-train-test/xray_only_test_original.csv': df_25_xray,
    'final-train-test/noxrayvlbi-test.csv': df_25_all,
    'final-train-test/vlbi_only_train20.csv': df_75_vlbi,
    'final-train-test/xray_only_train20.csv': df_75_xray,
    'final-train-test/noxrayvlbi-train20.csv': df_75_all,
}
for _path, _frame in _outputs.items():
    _frame.to_csv(_path, index=False, header=True)

---

In [None]:
# shuffled_test = catalog_agn_sfg_vlbi_xray_unique.sample(frac=1).reset_index(drop=True)

# shuffled_train = catalog_agn_sfg_noxray_novlbi.sample(frac=1).reset_index(drop=True)

# # Set 'ID' column as the index
# # shuffled_test = shuffled_test.set_index('CATID')
# # # Set 'ID' column as the index
# # shuffled_train = shuffled_train.set_index('CATID')

In [None]:
# print('Number of AGN', len(catalog_agn_sfg_noxray_novlbi[catalog_agn_sfg_noxray_novlbi["class_labels"] == 'AGN']))
# print('Number of SFG', len(catalog_agn_sfg_noxray_novlbi[catalog_agn_sfg_noxray_novlbi["class_labels"] == 'SFG']))


In [None]:
# shuffled_train[shuffled_train["class_labels"] == 'AGN']

In [None]:
# shuffled_train

In [None]:
# X_cols_origin = ['Mstar', 'class_star', 'qir', 'log(S8/S45)', 'log(S58/S36)', 'log(S45/S36)']
# X_cols = ['class_star', 'qir', 'log(S8/S45)', 'log(S58/S36)']

# # encoding target class
# y_te, clas_te = pd.factorize(shuffled_test["class_labels"]) #getting the class 0 = agn, 1 =notagn, 2 = no class
# y_tr, clas_tr = pd.factorize(shuffled_train["class_labels"]) #getting the class 0 = agn, 1 =notagn, 2 = no class
# y_test = pd.DataFrame(y_te, columns = ['labels'])
# y_train = pd.DataFrame(y_tr, columns = ['labels'])

# X_test = shuffled_test[X_cols]
# X_train = shuffled_train[X_cols]

# x_xray = catalog_agn_sfg_xray[X_cols_origin]
# y_xray = catalog_agn_sfg_xray['class_labels']

# x_vlba = catalog_agn_sfg_vlbi[X_cols_origin]
# y_vlba = catalog_agn_sfg_vlbi['class_labels']

# ## Save the catalogue with ID's
# shuffled_train.to_csv('train_with_id.csv', index = False, header=True)
# shuffled_test.to_csv('test_with_id.csv', index = False, header=True)

# # # # saving the dataframe as raw_data
# # X_train.to_csv('X_train_bal.csv', index = False, header=True)
# # y_train.to_csv('y_train_bal.csv', index = False, header=True)
# # # X_test.to_csv('X_test_new.csv', index = False, header=True)
# # # y_test.to_csv('y_test_new.csv', index = False, header=True)

# # # XRAY
# # x_vlba.to_csv('X_vlba.csv', index = False, header=True)
# # y_vlba.to_csv('y_vlba.csv', index = False, header=True)
# # x_xray.to_csv('X_xray.csv', index = False, header=True)
# # y_xray.to_csv('y_xray.csv', index = False, header=True)

In [None]:
# catalog_agn = catalog_agn_sfg[catalog_agn_sfg['class_labels'] == "AGN"]
# catalog_sfg = catalog_agn_sfg[catalog_agn_sfg['class_labels'] == "SFG"]

# Xagn_x = catalog_agn[X_cols]
# Xsfg_x = catalog_sfg[X_cols]
# Xray_x = catalog_agn_sfg_xray[X_cols]
# Xvlbi_x = catalog_agn_sfg_vlbi[X_cols]

## t-SNE calculations

In [None]:
# from sklearn.manifold import TSNE

# tsne = TSNE(n_components=2, random_state=42)
# X_tsne = tsne.fit_transform(X_train)
# X_tsne_xray_vlbi = tsne.fit_transform(X_test)
# X_tsne_xray = tsne.fit_transform(Xray_x)
# X_tsne_vlbi = tsne.fit_transform(Xvlbi_x)
# X_tsne_sfg = tsne.fit_transform(Xsfg_x)
# X_tsne_agn = tsne.fit_transform(Xagn_x)


# tsne.kl_divergence_

In [None]:
# plt.figure(figsize = (12, 8))
# plt.scatter(x=X_tsne[:, 0], y=X_tsne[:, 1], marker='d', facecolor='none', c=y_train['labels'],alpha = 0.9)
# # plt.scatter(x=X_tsne_xray_vlbi[:, 0], y=X_tsne_xray_vlbi[:, 1], c='r', alpha = 0.5)
# plt.scatter(x=X_tsne_xray[:, 0], y=X_tsne_xray[:, 1], c='r', marker='>', alpha = 0.5, label = "XAGN")
# plt.scatter(x=X_tsne_vlbi[:, 0], y=X_tsne_vlbi[:, 1], c='b', marker='+', alpha = 0.5, label = "VLBI")
# # plt.scatter(x=X_tsne_sfg[:, 0], y=X_tsne_sfg[:, 1], c='g', alpha = 0.5, label = "AGB")
# # plt.scatter(x=X_tsne_agn[:, 0], y=X_tsne_agn[:, 1], c='y', alpha = 0.5, label = "SFG")
# plt.xlabel('X1', fontweight ='bold', fontsize =18)
# plt.ylabel('X2', fontweight ='bold', fontsize =18)
# plt.xticks(fontsize=16)
# plt.yticks(fontsize=16)
# plt.show()

In [None]:
# from sources.ellipses import *
# from sklearn.metrics import mean_squared_error, r2_score

# drawGraph(df_sne[df_sne['labels']==1][['X1']],  
#             df_sne[df_sne['labels']==1][['X2']],
#             df_sne[df_sne['labels']==0][['X1']],
#             df_sne[df_sne['labels']==0][['X2']],
#            'X1', 'X2', "cor1",[-65, 85], [-66,60] )


### Transform full set

In [None]:
# X_full = catalog_agn_sfg

In [None]:
# X_full = X_full.reset_index()

In [None]:
# X_ful = X_full[X_cols]

In [None]:
# catalog_agn_sfg

In [None]:
# X_tsne_ful = tsne.fit_transform(X_ful)


In [None]:
# X_tsne_ful

In [None]:
# x_sne = np.array([X_tsne_ful[:, 0], X_tsne_ful[:, 1]]).T

# df_sne = pd.DataFrame(x_sne, columns=[ 'X1', 'X2'])

# df_sne['labels'] = X_full['class_labels']
# df_sne['VLBAAGN'] = X_full['VLBAAGN']
# df_sne['XAGN'] = X_full['XAGN']
# # df_sne[''] = catalog_agn_sfg
# # df_sne[''] = catalog_agn_sfg

In [None]:
# df_sne

In [None]:
# plt.figure(figsize = (12, 8))
# # df_sne[df_sne['labels']=='SFG']
# # plt.scatter(x=df_sne[:, 0], y=X_tsne[:, 1], marker='d', facecolor='none', c=y_train['labels'],alpha = 0.9)
# # plt.scatter(x=X_tsne_xray_vlbi[:, 0], y=X_tsne_xray_vlbi[:, 1], c='r', alpha = 0.5)
# plt.scatter(df_sne[df_sne['labels']=='AGN']['X1'], df_sne[df_sne['labels']=='AGN']['X2'], c='g', alpha = 0.5, label = "AGN")
# plt.scatter(df_sne[df_sne['labels']=='SFG']['X1'], df_sne[df_sne['labels']=='SFG']['X2'], c='y', alpha = 0.5, label = "SFG")
# plt.scatter(df_sne[df_sne['XAGN']==True]['X1'], df_sne[df_sne['XAGN']==True]['X2'], c='r', marker='>', alpha = 0.5, label = "XAGN")
# plt.scatter(df_sne[df_sne['VLBAAGN']==True]['X1'], df_sne[df_sne['VLBAAGN']==True]['X2'], c='b', marker='+', alpha = 0.5, label = "VLBI")
# plt.xlabel('X1', fontweight ='bold', fontsize =18)
# plt.ylabel('X2', fontweight ='bold', fontsize =18)
# plt.xticks(fontsize=16)
# plt.yticks(fontsize=16)
# plt.legend()
# plt.show()

In [None]:
#---