In [None]:
# Loading necessary modules

import os 
import os.path as osp
import itertools
import astropy.io.fits as fits
import astropy.units as u
import astropy.constants as c
from astropy.cosmology import FlatLambdaCDM
import numpy as np
import matplotlib.pyplot as pl
pl.rcParams['axes.labelsize'] = 16
pl.rcParams['axes.titlesize'] = 16

import pandas as pd
import seaborn as sns
# pl.style.use('seaborn')
import timeit

from astropy.stats import bayesian_blocks
from astropy.wcs import WCS
from astropy.coordinates import SkyCoord
from astropy.table import Table

from scipy.stats import binned_statistic,norm, bayes_mvs
import sklearn
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score 
# from sklearn.model_selection import Kfold
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import f1_score as f1
from sklearn.metrics import recall_score as recall
from sklearn.metrics import precision_score as precision
from sklearn.metrics import classification_report, make_scorer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import gaussian_kde

from sklearn.linear_model import LinearRegression
from astropy.wcs import WCS # my additional package
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from scipy.stats import gaussian_kde
import xgboost
from xgboost import plot_importance
from pprint import pprint
# pl.style.use('fivethirtyeight')
pl.style.use('seaborn-ticks')
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
%matplotlib inline


In [None]:
# Load the cross-matched catalogue from a FITS file in the working directory.
# file = osp.join("MIGHTEEXMATCH+allmulti+classes2.fits")
# file = osp.join("MIGHTEEXMATCH allmulti classes_081221.fits")

file = osp.join("COSMOSXMATCH+classes_040422_withphotometry.fits")

# The context manager closes the file handle on exit. The original statement
# `fp.close` had no parentheses, so it never actually closed the file.
# With memmap=True astropy keeps the memory map alive, so `data` remains
# readable after the handle is closed (see the astropy FITS memmap notes).
with fits.open(file, memmap=True) as fp:
    # HDU 1 (the first extension) carries the table used throughout the notebook.
    head = fp[1].header
    data = fp[1].data

# number of elements in the data
N_all = len(data)

# IMPORTANT FUNCTIONS

In [None]:
# DATA SAMPLING AND PREPROCESSING
def data_processor( data, x_features, y_features, binary_classification = True ):
    """Build a labelled DataFrame for machine learning from a wide catalogue.

    Parameters
    ----------
    data : mapping-like
        Any object indexable by column name (``data[name]``) that returns
        equal-length 1-D arrays, e.g. a FITS record array or a dict of arrays.
    x_features : list of str
        Names of the input (feature) columns to extract.
    y_features : list of str
        Names of the boolean class-flag columns. The first three entries are
        used, in the order ``[AGN flag, SFG flag, probable-SFG flag]``.
    binary_classification : bool, default True
        If True, only rows flagged AGN, SFG or probable-SFG are returned.
        If False, rows with all three flags False are also returned, with the
        label ``'noclass'``.

    Returns
    -------
    pandas.DataFrame
        The ``x_features`` columns plus one label column, named
        ``y_features[0]``, holding ``'AGN'``, ``'SFG'`` (used for both the SFG
        and probable-SFG groups) or ``'noclass'``. Every ``-inf`` value is
        replaced by NaN. Rows are ordered AGN, SFG, probable-SFG (then
        unclassified) and keep their original positional index.

    Notes
    -----
    A row flagged in more than one class appears once per matching group.
    """
    # Stack the requested columns into (n_rows, n_columns) matrices.
    feature_matrix = np.vstack([np.array(data[name]) for name in x_features]).T
    flag_matrix = np.vstack([data[name] for name in y_features]).T

    # Join features and class flags into a single frame.
    mightee_data = pd.concat(
        [pd.DataFrame(feature_matrix, columns = x_features),
         pd.DataFrame(flag_matrix, columns = y_features)],
        axis=1, sort=False)

    label_col, sfg_col, probsfg_col = y_features[0], y_features[1], y_features[2]

    # (message printed with the group shape, label written to the output, rows)
    groups = [
        ('shape of the AGN: ', 'AGN', mightee_data[mightee_data[label_col] == True]),
        ('shape of the SFG: ', 'SFG', mightee_data[mightee_data[sfg_col] == True]),
        ('shape of the probSFG: ', 'SFG', mightee_data[mightee_data[probsfg_col] == True]),
    ]
    if not binary_classification:
        unclassified = ((mightee_data[label_col] == False)
                        & (mightee_data[sfg_col] == False)
                        & (mightee_data[probsfg_col] == False))
        groups.append(('shape of unclassified: ', 'noclass', mightee_data[unclassified]))

    for message, _, rows in groups:
        print(message, rows.shape)
    print('total sample: ', sum(len(rows) for _, _, rows in groups))

    labelled = []
    for _, label, rows in groups:
        # Only the first flag column is kept; it becomes the label column.
        subset = rows.drop([sfg_col, probsfg_col], axis = 1)
        # Write the label straight into the label column. The previous
        # whole-frame replace(True/False, ...) relied on pandas never matching
        # 1.0/0.0 feature values against the booleans.
        subset[label_col] = label
        labelled.append(subset)

    # Combine the groups and flag -inf values as missing.
    complete_mightee1 = pd.concat(labelled, sort=False).replace(-np.inf, np.nan)
    print("DONE PROCESSING")

    return complete_mightee1

In [None]:
# CLASSIFIER AND METRICS ALGORITHMS

def classifier(model, X_features, y_features):
    """Fit `model` on a random 80/20 split and print timing, accuracy and a report.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in place.
    X_features : pandas.DataFrame
        Input features; non-numeric columns are one-hot encoded with
        ``pd.get_dummies``.
    y_features : array-like
        Class labels. They are integer-encoded with ``pd.factorize`` (codes are
        assigned in order of first appearance).

    Returns
    -------
    None
        Results are printed only. The split is not seeded, so numbers vary
        between calls.
    """
    X_dummies = pd.get_dummies(X_features)

    # `clas` holds the original label for each integer code in `y_target`.
    y_target, clas = pd.factorize(y_features)

    # Split the sample into training and testing sets.
    x_train, x_test, y_train, y_test = train_test_split(X_dummies, y_target, test_size=0.20)

    # Time the fit + predict step.
    start_time = timeit.default_timer()
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    elapsed = timeit.default_timer() - start_time

    # The original also computed `xgb_model.predict_proba(x_test)` here, using a
    # global that is not the `model` argument (NameError when it is undefined).
    # The result was never used, so the call is removed.

    acu = accuracy(y_test, y_pred)

    # The messages say "XGB" regardless of which estimator is passed in.
    print('Elapsed time for XGB: {} seconds'.format(elapsed))
    print(len(y_pred))
    print('Accuracy for XGB is: {}'.format(acu))
    print(metrics.classification_report(y_test, y_pred, target_names=clas, digits=4))
    
def confusion_matrix(cm, classes, 
                        name = '',
                        normalize=False,
                        title='Confusion matrix',
                        cmap=pl.cm.Greens):
    """
    Print and plot a confusion matrix on the current matplotlib figure.

    Parameters
    ----------
    cm : numpy.ndarray
        Square confusion matrix (rows = true class, columns = predicted class).
    classes : sequence of str
        Tick labels, in the same order as the rows/columns of `cm`.
    name : str, default ''
        Output file name passed to ``pl.savefig``. Nothing is saved when empty.
    normalize : bool, default False
        If True, each row is divided by its sum before printing and plotting.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap for the heat-map.
    """
    # Normalise BEFORE drawing so the colours, the colour bar and the text
    # threshold all refer to the same numbers that are annotated in the cells.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    pl.imshow(cm, interpolation='nearest', cmap=cmap)
    pl.title(title)
    pl.colorbar()
    tick_marks = np.arange(len(classes))
    pl.xticks(tick_marks, classes, rotation=45)
    pl.yticks(tick_marks, classes)

    # Annotate every cell; use white text on dark (high-valued) cells.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        pl.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    pl.ylabel('True label')
    pl.xlabel('Predicted label')
    # Lay out after the axis labels exist so they are included in the fit.
    pl.tight_layout()
    # Only write a file when a name was given (the default is the empty string).
    if name:
        pl.savefig(name)
    pl.show()
    
# Feature importance for the experiment
def feature_importance(data, model=None):
    """Bar-plot the impurity-based feature importances of a fitted model.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns, after dropping the ``'AGN'`` column, are the
        features the model was trained on (same order).
    model : fitted estimator, optional
        Object exposing ``feature_importances_``. When omitted, the notebook
        global ``xgb_model`` is used, as in the original implementation.
    """
    if model is None:
        # Backward-compatible fallback to the global the original relied on.
        model = xgb_model

    importances = (
        pd.DataFrame({
            'Feature': data.drop('AGN', axis=1).columns,
            'Importance': model.feature_importances_
        })
        .sort_values(by='Importance', ascending=False)
        .set_index('Feature')
    )

    # DataFrame.plot.bar() opens its own figure, so the size is passed here.
    # A preceding pl.figure(figsize=...) only produced an extra empty figure.
    importances.plot.bar(figsize = (16, 10))
    pl.show()
    
    
    


In [None]:
# The train_VS_score manual cross validation
def train_vs_score_cv( model, X, y, train_size, binary = True, n_repeats = 20, fig_name = 'color_ml_results'):
    """Score a binary classifier as a function of training-set fraction.

    For every fraction in `train_size` the data are split `n_repeats` times
    with fixed seeds (10, 30, 50, ...). The model is refitted on each split,
    and per-class F1, precision and recall on the held-out part are averaged
    over the repeats. The three curves are plotted, saved and returned.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``. It is refitted in place.
    X, y : array-like / DataFrame
        Features and integer-encoded targets. Class code 0 is reported as
        'AGN' and code 1 as 'SFG'.
    train_size : sequence of float
        Training fractions in (0, 1); the test fraction is ``1 - train_size``.
    binary : bool, default True
        Must be truthy; otherwise an error message is printed and None returned.
    n_repeats : int, default 20
        Number of random splits per training fraction.
    fig_name : str, default 'color_ml_results'
        File name passed to ``pl.savefig``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame) or None
        AGN and SFG score tables with columns
        ``['train_size', 'f1', 'precision', 'recall']``.
    """
    if not binary:
        print('Error; only works for a binary classification')
        return None

    scorers = {'f1': f1, 'precision': precision, 'recall': recall}
    class_codes = {'AGN': 0, 'SFG': 1}

    # curves[class][metric] -> mean score for each training fraction
    curves = {cls: {metric: [] for metric in scorers} for cls in class_codes}

    for fraction in train_size:
        test_size = 1 - fraction
        runs = {cls: {metric: [] for metric in scorers} for cls in class_codes}

        for repeat in range(n_repeats):
            # Same seed sequence as np.arange(10, 400, 20) for the default 20 repeats.
            seed = 10 + 20 * repeat
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size = test_size, random_state = seed)

            model.fit(X_train, np.ravel(y_train))
            y_pred = model.predict(X_test)

            # With a single entry in `labels`, the weighted average is just
            # that class's own score.
            for cls, code in class_codes.items():
                for metric, scorer in scorers.items():
                    runs[cls][metric].append(
                        scorer(y_test, y_pred, average = 'weighted', labels = [code]))

        for cls in class_codes:
            for metric in scorers:
                curves[cls][metric].append(np.mean(runs[cls][metric]))

    # plot style (affects the global seaborn/matplotlib style)
    sns.set_style("whitegrid", {"axes.facecolor": ".98"})

    # Three stacked panels sharing the x axis: precision, recall, F1.
    f, (ax1, ax2, ax3) = pl.subplots(3, figsize=(13, 10), sharex=True)
    line_styles = {'AGN': ('--', 'r'), 'SFG': ('-.', 'g')}
    panels = ((ax1, 'precision', 'precision'), (ax2, 'recall', 'recall'), (ax3, 'f1', 'F1'))
    for ax, metric, ylabel in panels:
        for cls, (dash, colour) in line_styles.items():
            ax.plot(train_size, curves[cls][metric], dash, label = cls, c = colour,
                    linewidth = 3, alpha = 1)
        ax.set_ylabel(ylabel, fontweight ='bold', fontsize =15)
        ax.legend(loc = 'lower right')
    ax3.set_xlabel('size of train data', fontweight ='bold', fontsize =15)

    f.subplots_adjust(hspace=0)
    pl.setp([a.get_xticklabels() for a in f.axes[:-1]], visible=False)
    pl.savefig(fig_name)
    pl.show()

    def _score_table(cls):
        """Return the averaged scores of one class as a DataFrame."""
        table = np.vstack([np.array(train_size),
                           np.array(curves[cls]['f1']),
                           np.array(curves[cls]['precision']),
                           np.array(curves[cls]['recall'])]).T
        return pd.DataFrame(table, columns = ['train_size', 'f1', 'precision', 'recall'])

    return _score_table('AGN'), _score_table('SFG')

In [None]:
# Drawing a table for results
from tabulate import tabulate
  
def mytable(agn_cat, sfg_cat, features):
    """Print a grid table with the mean F1, precision and recall of each class.

    Parameters
    ----------
    agn_cat, sfg_cat : mapping / DataFrame
        Score tables with ``'f1'``, ``'precision'`` and ``'recall'`` entries.
    features : str
        Description of the feature set, appended to the printed heading.
    """
    metric_columns = ('f1', 'precision', 'recall')

    # One row per class: the class tag followed by the mean of each metric.
    rows = [
        [tag] + [np.mean(scores[column]) for column in metric_columns]
        for tag, scores in (("AGN", agn_cat), ("SFG", sfg_cat))
    ]

    column_titles = ["Class", "F1_score", "Precision Score", "Recall score" ]

    # Render the table first, then print the heading and the table.
    rendered = tabulate(rows, headers=column_titles, tablefmt="grid")
    print('The table for ' + features)
    print(rendered)

---
---

# DATA ANALYSIS

In [None]:
# Sky positions of all catalogue sources: RADIORA on x, RADIODEC on y.

pl.figure(figsize = (10, 10))
pl.scatter(
    x=data['RADIORA'],
    y=data['RADIODEC'],
    s=4,
    c='r',
    lw=2,
)
pl.title('Spatial distribution of MIGHTEE Sources',  fontweight ='bold', fontsize =21)
pl.xlabel('RA (deg)',  fontweight ='bold', fontsize =18)
pl.ylabel('DEC (deg)',  fontweight ='bold', fontsize =18)
pl.show()

In [None]:
# Catalogue columns used for machine learning.
# x_fea: input columns pulled from the catalogue
#        (presumably redshift, SPLASH fluxes, luminosities, stellar mass,
#        star/galaxy class, q_IR and optical/NIR fluxes, going by the column
#        names — TODO confirm against the catalogue description).
# y_fea: the three boolean class-flag columns, in the order data_processor
#        indexes them (y_features[0], [1], [2]).

x_fea = ['COS_best_z_v5', 'SPLASH_1_FLUX', 'SPLASH_2_FLUX', 'SPLASH_3_FLUX', 'SPLASH_4_FLUX',
            'L14','LIR_WHz','MASS_lephare', 'class_star','qir','flux_HSC-G','flux_HSC-R', 'flux_HSC-I','flux_HSC-Z',
            'flux_J', 'flux_H', 'flux_Ks', 'flux_Y']
y_fea = ['AGN', 'SFG', 'probSFG']


# binary_classification is left at its default (True).
trad_flux = data_processor(data, x_fea, y_fea)

In [None]:
# Rename the label column ('AGN' -> 'class_labels') and the stellar-mass column
# ('MASS_lephare' -> 'Mstar'); the rename modifies trad_flux in place.
trad_flux.rename(
    columns = {'AGN': 'class_labels', 'MASS_lephare': 'Mstar'},
    inplace = True,
)

# trad_flux_clean1: rows with any missing value removed (all columns kept).
# trad_flux_clean : the same rows without the 'class_star' and 'qir' columns.
trad_flux_clean1 = trad_flux.dropna(axis = 0, how = 'any')
trad_flux_clean = trad_flux_clean1.drop(columns = ['class_star', 'qir'])
# trad_flux_clean1 = trad_flux


In [None]:
# Pearson correlation between the numeric columns of the cleaned sample.
# Author's note: the sources do not have to be separated for the correlation
# matrix (the original comment associates separating them with data leakage).
# The string-valued 'class_labels' column is excluded explicitly: recent pandas
# versions raise on non-numeric columns in DataFrame.corr() instead of
# silently dropping them.
pl.figure(figsize=(12,10))
cor = trad_flux_clean.select_dtypes(include='number').corr()
sns.heatmap(cor, annot=True, cmap=pl.cm.Reds)
pl.show()

In [None]:
# ## Making a pair plot using seaborn
# pl.figure(figsize=(12,10))
# sns.set(font_scale=2)
# sns.set_style('ticks')
# sns.pairplot(trad_flux_clean, hue="class_labels", 
#              plot_kws = { 'edgecolor': 'k', 'alpha':0.4, 'lw':0.5, 's':20 }, size = 4)
# pl.show()

In [None]:
# Display the column names that remain after cleaning.
trad_flux_clean.columns

In [None]:
# importance
from sklearn.inspection import permutation_importance

# Machine learning models
from sklearn.ensemble import RandomForestClassifier
# Machine Learning models
# Seed the forest so that Restart & Run All reproduces the same model, scores
# and importances; an unseeded forest gives different results on every run.
rf_model = RandomForestClassifier(random_state=42)

In [None]:
# Input columns for the flux-based model.
x_cols = [
    'COS_best_z_v5',
    'SPLASH_1_FLUX', 'SPLASH_2_FLUX', 'SPLASH_3_FLUX', 'SPLASH_4_FLUX',
    'L14', 'LIR_WHz', 'Mstar',
    'flux_HSC-G', 'flux_HSC-R', 'flux_HSC-I', 'flux_HSC-Z',
    'flux_J', 'flux_H', 'flux_Ks', 'flux_Y',
]

# The same columns, in the same order, without the stellar mass.
x_cols_no_mstar = [column for column in x_cols if column != 'Mstar']

x_trad = trad_flux_clean[x_cols]
y_trad = trad_flux_clean['class_labels']

# Integer-encode the target: `y` holds the codes, `clas` the label of each code.
y, clas = pd.factorize(y_trad)
y_target = pd.DataFrame({'labels': y})

# Stratified 75/25 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    x_trad,
    y_target,
    test_size=0.25,
    stratify = y,
    random_state=42,
)

In [None]:
# x_trad

In [None]:
def importance (rf_model, X_train, y_train, fig_name = 'all_perm_importance.pdf'):
    """Fit a model and plot its permutation feature importances.

    The model is fitted on ``(X_train, y_train)`` and the permutation
    importance is then evaluated on that same training data, scored with F1
    (``make_scorer(f1)`` with its default arguments), using 10 shuffles per
    feature and a fixed seed. Features are shown in decreasing order of mean
    importance with the standard deviation as error bars.

    Parameters
    ----------
    rf_model : estimator
        Object exposing ``fit`` and ``predict``. It is fitted in place.
    X_train : pandas.DataFrame
        Training features; column names are used as tick labels.
    y_train : array-like
        Training targets; flattened with ``np.ravel`` before use.
    fig_name : str, default 'all_perm_importance.pdf'
        Output file for the figure. Pass a different name on each call to
        avoid overwriting an earlier plot.
    """
    rf_model.fit(X_train, np.ravel(y_train))
    result = permutation_importance(rf_model, X_train, np.ravel(y_train), 
                                    n_repeats=10, random_state=42, 
                                    scoring = make_scorer(f1), n_jobs=2)

    mean_importances = result.importances_mean
    std_importances = result.importances_std

    column_names = np.array(X_train.columns)

    # Indices that sort the features from most to least important.
    order = mean_importances.argsort()[::-1]

    x_pos = np.arange(len(column_names))
    x_class = column_names[order]

    # Build the plot
    fig, ax = pl.subplots(figsize = (12, 9))
    ax.bar(x_pos, mean_importances[order], yerr=std_importances[order], align='center', alpha=0.5, color = 'green', ecolor='black', capsize=10)
    ax.set_ylabel('Importance', fontsize=25)
    ax.set_xticks(x_pos)
    ax.set_xticklabels(x_class, ha='right', rotation = 45, fontsize = 20)

    # Save the figure and show
    pl.setp(ax.get_yticklabels(), fontsize=20, ha="right",
         rotation_mode="anchor")
    pl.tight_layout()
    pl.savefig(fig_name)
    pl.show()
    
    
# Permutation importances for the flux-based feature set.
# This call fits rf_model on the training split as a side effect.
importance (rf_model, X_train, y_train)

# Machine Learning For Fluxes

In [None]:
# Grids of 50 evenly spaced training fractions, intended for train_vs_score_cv
# (see the commented-out calls below).
tr_size = np.linspace(0.01, 0.99, 50)

# Same upper bound, but starting at a 10% training fraction.
tr_size_02 = np.linspace(0.1, 0.99, 50)

In [None]:
# Fit the random-forest model on the training split and predict the test split.
# y_pred = knn.fit(x_train,y_train).predict(x_test)
y_pred = rf_model.fit(X_train, np.ravel(y_train)).predict(X_test)


# Compute the confusion matrix for the random-forest classifier
# (rows = true class, columns = predicted class).
cm = metrics.confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)

# Plot normalized confusion matrix. The figure is created here and the
# notebook's confusion_matrix helper draws on it; the tick labels are
# hard-coded, so they assume class code 0 is AGN and 1 is SFG.
pl.figure(figsize=(11,11))
confusion_matrix(cm,classes=['AGN','SFG'], name = 'cm_fluxes', normalize=True,
                      title='Normalized confusion matrix')

In [None]:
# agn_flux, sfg_flux = train_vs_score_cv( rf_model, X_train, y_train, tr_size)

---
---

# ANALYSIS FOR COLOURS AS INPUT FEATURES

In [None]:
# Mid-infrared colours: log10 flux ratios of the SPLASH flux columns.
# The column labels pair SPLASH_1..4 with S36, S45, S58 and S8 respectively.

S8_S45 = np.log10(trad_flux_clean1['SPLASH_4_FLUX'].div(trad_flux_clean1['SPLASH_2_FLUX']))
S58_S36 = np.log10(trad_flux_clean1['SPLASH_3_FLUX'].div(trad_flux_clean1['SPLASH_1_FLUX']))
S45_S36 = np.log10(trad_flux_clean1['SPLASH_2_FLUX'].div(trad_flux_clean1['SPLASH_1_FLUX']))


# Append the colours as new columns of trad_flux_clean1.
trad_flux_clean1['log(S8/S45)'] = S8_S45
trad_flux_clean1['log(S58/S36)'] = S58_S36
# trad_flux_clean1['log(S58/S36)'] = S8_S45
# trad_flux_clean1['log(S8/S45)'] = S58_S36
trad_flux_clean1['log(S45/S36)'] = S45_S36

In [None]:
# Optical colours as features: log10 flux ratios between pairs of HSC bands.
# features: [g/r, r/i, i/z, g/i, g/z, r/z]
g_hsc = trad_flux_clean1['flux_HSC-G']
r_hsc = trad_flux_clean1['flux_HSC-R']
i_hsc = trad_flux_clean1['flux_HSC-I']
z_hsc = trad_flux_clean1['flux_HSC-Z']

# Compute every colour first (as plain numpy arrays) ...
g_r = np.array(np.log10(g_hsc / r_hsc))
r_i = np.array(np.log10(r_hsc / i_hsc))
i_z = np.array(np.log10(i_hsc / z_hsc))
g_i = np.array(np.log10(g_hsc / i_hsc))
g_z = np.array(np.log10(g_hsc / z_hsc))
r_z = np.array(np.log10(r_hsc / z_hsc))

# ... then append them as new columns, in this order.
trad_flux_clean1['log(g/r)'] = g_r
trad_flux_clean1['log(r/i)'] = r_i
trad_flux_clean1['log(i/z)'] = i_z
trad_flux_clean1['log(g/i)'] = g_i
trad_flux_clean1['log(g/z)'] = g_z
trad_flux_clean1['log(r/z)'] = r_z

In [None]:
# Near-infrared colours: log10 flux ratios between the Y, J, H and Ks columns.
# (The `_hsc` suffix on the variable names is kept from the optical cell.)

y_hsc = trad_flux_clean1['flux_Y']
h_hsc = trad_flux_clean1['flux_H']
k_hsc = trad_flux_clean1['flux_Ks']
j_hsc = trad_flux_clean1['flux_J']

# Compute every colour first (as plain numpy arrays) ...
y_j = np.array(np.log10(y_hsc / j_hsc))
j_h = np.array(np.log10(j_hsc / h_hsc))
h_k = np.array(np.log10(h_hsc / k_hsc))
y_h = np.array(np.log10(y_hsc / h_hsc))
y_k = np.array(np.log10(y_hsc / k_hsc))
j_k = np.array(np.log10(j_hsc / k_hsc))

# hsc_columns = ['y/j', 'j/h', 'h/k']
# ... then append them as new columns, in this order.
trad_flux_clean1['log(Y/J)'] = y_j
trad_flux_clean1['log(J/H)'] = j_h
trad_flux_clean1['$log(H/K_{s})$'] = h_k
trad_flux_clean1['log(Y/H)'] = y_h
trad_flux_clean1['$log(Y/K_{s})$'] = y_k
trad_flux_clean1['$log(J/K_{s})$'] = j_k

In [None]:
# Display the frame with the colour columns added in the cells above.
trad_flux_clean1

In [None]:
# Keep only the colour columns (plus Mstar, class_star, qir and the labels) by
# dropping the raw flux / luminosity / redshift inputs.
colors_data = trad_flux_clean1.drop(x_cols_no_mstar, axis = 1)
# log10 of a flux ratio is -inf for a zero numerator and +inf for a zero
# denominator; mark both as missing so dropna() removes them (the original
# handled -inf only).
colors_data1 = colors_data.replace([np.inf, -np.inf], np.nan)
colors = colors_data1.dropna()

In [None]:
# Pearson correlation between the numeric colour features.
# Author's note: the sources do not have to be separated for the correlation
# matrix (the original comment associates separating them with data leakage).
# The string-valued 'class_labels' column is excluded explicitly: recent pandas
# versions raise on non-numeric columns in DataFrame.corr() instead of
# silently dropping them.
pl.figure(figsize=(12,10))
cor1 = colors.select_dtypes(include='number').corr()
sns.heatmap(cor1, annot=True, cmap=pl.cm.Reds)
pl.savefig('pearson_corr')
pl.show()

In [None]:
## Making a pair plot using seaborn
# sns.pairplot builds its own figure, so no pl.figure(...) is opened first
# (that only produced an extra empty figure). The facet size is set with
# `height`, the current name of the deprecated `size` argument.
sns.set(font_scale=2)
sns.set_style('ticks')
sns.pairplot(colors, hue="class_labels", 
             plot_kws = { 'edgecolor': 'k', 'alpha':0.4, 'lw':0.5, 's':20 }, height = 4)
pl.savefig('colors_pairplot')
pl.show()

In [None]:
# List the columns of the colour-based feature table.
print(colors.columns)

---

In [None]:
# Features: every column of `colors` except the label column.
x_color = colors.drop('class_labels', axis = 1)

y_color = colors['class_labels']

# encoding target class: y1 holds integer codes, clas1 the label of each code
y1, clas1 = pd.factorize(y_color)
target = pd.DataFrame(y1, columns = ['labels'])

# Stratified 85/15 split with a fixed seed.
X_train1, X_test1, y_train1, y_test1 = train_test_split(x_color, target, stratify = y1, test_size=0.15, random_state=42)

In [None]:
# Give selected columns LaTeX-formatted names for use as plot tick labels.
# The q_IR label is a raw string: '\m' is not a valid escape sequence, and
# newer Python versions warn about it in a normal string literal. The
# resulting text is unchanged.
X_train1_final= X_train1.rename(columns = {'Mstar':'$log (M_{star})$', 
                               'qir':r'$q_\mathrm{IR}$', 
                               'log(S8/S45)':'$log(S_{8.0}/S_{4.5})$', 
                               'log(S58/S36)': '$log(S_{5.8}/S_{3.6})$',
                               'log(S45/S36)': '$log(S_{4.5}/S_{3.6})$'
                              } )

In [None]:
# Display the training features with their renamed columns.
X_train1_final

In [None]:
# Save the training features to CSV, with the original row index kept as an
# 'index' column.
# NOTE(review): the file is called 'scaled_all_colors.csv' but the frame is
# written BEFORE scaling — confirm which was intended.
# reset_index() returns a new frame instead of modifying X_train1_final in
# place, so re-running this cell no longer piles up 'index' / 'level_0'
# columns. X_train1_final itself keeps its original index.
X_train1_indexed = X_train1_final.reset_index()
X_train1_indexed.to_csv('scaled_all_colors.csv', index = False, header=True)

# We split the data into training and test size for further analysis
from sklearn.model_selection import train_test_split
# from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler

# Rescale every column independently to the [0, 1] range.
scaler = MinMaxScaler()
# scaler = MaxAbsScaler()
# scaler = StandardScaler()

# labels = catalogue['class_labels']
# X = catalogue.drop(['class_labels'], axis = 1)

X_norm = scaler.fit_transform(X_train1_indexed)
scaled_X = pd.DataFrame( X_norm, columns = X_train1_indexed.columns)

# The helper 'index' column is not a feature; remove it after scaling.
scaled_X = scaled_X.drop("index", axis="columns")


In [None]:
# scaled_X = scaled_X.drop("level_0", axis="columns")

In [None]:
# Summary statistics of the scaled features (min/max should be 0 and 1 per column).
scaled_X.describe()

In [None]:
# Permutation importances for the scaled colour-based feature set.
# NOTE: importance() refits rf_model, and this call writes its figure to the
# same 'all_perm_importance.pdf' file as the earlier flux-based call,
# overwriting it.
importance (rf_model, scaled_X, y_train1) 

# importance (rf_model, X_train1_final, y_train1) 

In [None]:
# agn_flux1, sfg_flux1 = train_vs_score_cv( rf_model, X_train1, y_train1, tr_size)

In [None]:
# colors.to_csv('all_colors.csv', index = False, header=True)

---
---

# UMAP

In [None]:
# from umap import UMAP

# plt.figure(figsize=(20,15))
# model = UMAP(n_neighbors = 15, min_dist = 0.25, n_components = 2, verbose = True)
# umap = model.fit_transform(X_train1)
# plt.scatter(umap[:, 0], umap[:, 1], c = y_train.astype(int), cmap = 'tab10', s = 50)
# plt.title('UMAP', fontsize = 20)
# plt.xlabel("UMAP1", fontsize = 20)
# plt.ylabel("UMAP2", fontsize = 20)
# plt.show()

In [None]:
# We use missingno to view the missingness in each feature in the data
# import missingno as msno

# fig = msno.bar(x_color,figsize=(10, 8), color = 'g')

# fig_copy = fig.get_figure()
# fig_copy.savefig('all-missingno.pdf', bbox_inches = 'tight')

# Generate the missingno bar plot
# ax = msno.bar(x_color, color = 'g')

# # Customize: hide the right vertical axis (right spine)
# ax.spines['right'].set_visible(False)

# # Further customization (optional)
# ax.set_xlabel('Features', fontsize=12)
# ax.set_ylabel('Count of non-null values', fontsize=12)
# ax.set_title('Missing Data Overview', fontsize=15)

# # Show the plot
# plt.show()