In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os 
import os.path as osp
import itertools
import astropy.io.fits as fits

from sources.preprocessor import data_processor # A well defined function to sample columns of interest from a full catalogue
from sources.normalisation import*
from sklearn.metrics import f1_score as f1
import timeit
import random
import seaborn as sns
sns.set_style("whitegrid")

In [None]:
# Load the full catalogue from a CSV located in the notebook's working directory.
# NOTE(review): presumably the MIGHTEE catalogue, judging by the variable name - confirm provenance.

mightee_data = pd.read_csv('raw_data_original.csv')


In [None]:
# mightee_data

In [None]:
# Hold out a stratified 25% of the catalogue as a test set for further analysis.
from sklearn.model_selection import train_test_split

labels = mightee_data['class_labels']
X = mightee_data.drop(columns=['class_labels'])

# Integer-encode the target; `clas` keeps the original label behind each code.
# Original note: 0 = agn, 1 = notagn, 2 = no class.
# NOTE(review): pd.factorize numbers codes by order of first appearance in the
# data, so verify that mapping against `clas` rather than assuming it.
y, clas = pd.factorize(labels)
y_target = pd.DataFrame({'labels': y})
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_target,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [None]:
# y_train

In [None]:
# Normalisation constants: a 6x6 table in which row i is the base list of six
# weights cyclically shifted left by i positions.

_base_weights = [0.5, 0.7, 1, 1.2, 1.4, 1.7]
norm_par = np.array([
    _base_weights[shift:] + _base_weights[:shift]
    for shift in range(len(_base_weights))
])

In [None]:
# Normalise the six training features with norm_arr.
# norm_arr is not defined in this notebook; presumably it comes from the star
# import of sources.normalisation - TODO confirm what it returns.
# Each column is copied into its own array (np.array copies by default), in a
# fixed order that later cells rely on.
X_tr_norm = norm_arr([
    np.array(X_train[column])
    for column in ('Mstar', 'qir', 'log(S8/S45)', 'log(S58/S36)', 'log(S45/S36)', 'class_star')
])

In [None]:
# X_tr_norm

In [None]:
# One feat_multiplication result per row of normalisation constants in norm_par.
product = [feat_multiplication(X_tr_norm, weights) for weights in norm_par]

In [None]:
# product

In [None]:
# Generate the combinations of normalised features.
# `product` holds one feat_multiplication output per row of `norm_par`.
# generate_combinations is not defined in this notebook (presumably provided by
# the star import of sources.normalisation - TODO confirm). Later cells index
# `result[k]` and stack each element with np.vstack, so each element is
# expected to be a sequence of 1-D feature arrays.

result = generate_combinations(product)

In [None]:
# Number of candidate feature combinations available to the search below.
len(result)

In [None]:
def classifier(model, X, y, s, average="binary", random_state=1):
    """Fit ``model`` on a stratified split of the data and return its validation F1.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place, so a previous fit on the same instance is overwritten.
    X : sequence of 1-D arrays (or 2-D array with one row per feature)
        The rows are stacked and transposed into a
        ``(n_samples, n_features)`` matrix before splitting.
    y : array-like
        Class label for every sample; also used to stratify the split.
    s : float or int
        Size of the validation part, forwarded to ``train_test_split`` as
        ``test_size``.
    average : str or None, default "binary"
        Forwarded to ``f1_score``. The default only suits two-class labels;
        pass e.g. ``"macro"`` or ``"weighted"`` for multiclass targets, where
        ``"binary"`` makes ``f1_score`` raise ``ValueError``.
    random_state : int, default 1
        Seed for the shuffled split, so repeated calls compare models on the
        same partition.

    Returns
    -------
    float
        F1 score of the predictions on the validation part (an array of
        per-class scores when ``average=None``).
    """
    features = np.vstack(X).T
    X_fit, X_vald, y_fit, y_vald = train_test_split(
        features,
        y,
        test_size=s,
        random_state=random_state,
        stratify=y,
        shuffle=True,
    )

    # Only hard predictions are needed for F1, so no probability estimates are
    # computed and the estimator is not required to implement predict_proba.
    model.fit(X_fit, y_fit)
    y_pred = model.predict(X_vald)

    return f1(y_vald, y_pred, average=average)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# KNN model built with the library's default hyper-parameters (none are passed).
# This single instance is handed to every classifier(...) call in later cells,
# and each call refits it.
knn_model = KNeighborsClassifier()


In [None]:
# Random search over the candidate feature combinations: score a random subset
# with the KNN model and remember which combination produced each score.
N_TRIALS = 100    # maximum number of combinations to evaluate
SEARCH_SEED = 42  # fixed seed so a re-run picks the same combinations

result_index = np.arange(len(result))

# A private generator keeps the draw reproducible without touching the global
# `random` state. Sampling without replacement avoids scoring a combination twice.
search_rng = random.Random(SEARCH_SEED)
feat_index = search_rng.sample(range(len(result)), min(N_TRIALS, len(result)))

# score[j] is the validation F1 obtained with result[feat_index[j]].
score = [classifier(knn_model, result[k], y_train['labels'], 0.2) for k in feat_index]

In [None]:
# Best validation F1 among the randomly sampled feature combinations.
print(np.max(score))

In [None]:
# Using the min-max scaler
# NOTE(review): no scaling is done in this cell; the CSVs are presumably already
# min-max scaled by an earlier step - confirm how they were produced.

# The train data
# NOTE(review): rows of X_train.csv and y_train.csv are assumed to be aligned - verify.
norm_X = pd.read_csv('X_train.csv')
norm_y = pd.read_csv('y_train.csv')

# The Unseen test data
norm_X_test = pd.read_csv('X_test.csv')
norm_y_test = pd.read_csv('y_test.csv')

# Rebinds `labels`, which an earlier cell set to mightee_data['class_labels'].
labels = norm_y['labels']

In [None]:
# Stack the six classifier features row-wise (one row per feature), in the same
# fixed order for both tables, so the two scores below are comparable.
feature_columns = ('Mstar', 'qir', 'log(S8/S45)', 'log(S58/S36)', 'log(S45/S36)', 'class_star')

# Features read from X_train.csv.
norm_X_arr = np.array([np.array(norm_X[name]) for name in feature_columns])

# Untouched features from the in-notebook training split.
true_X_arr = np.array([np.array(X_train[name]) for name in feature_columns])

In [None]:
# Validation F1 of the KNN on the features read from X_train.csv
# (presumably min-max scaled), using a 0.2 validation fraction.
minmax_score = classifier(knn_model, norm_X_arr, norm_y['labels'], 0.2)

In [None]:
# Reference score: same model and validation fraction on the un-normalised
# training features.
ref_score = classifier(knn_model, true_X_arr, y_train['labels'], 0.2)

In [None]:
# Display the reference (un-normalised) validation F1.
ref_score

In [None]:
# Summary table: one row per normalisation scheme with its validation F1.
# 'base_norm' is the best score found by the random search.
colors = ['blue', 'green', 'orange']
score_model = ['minmax', 'original', 'base_norm']
scores = [minmax_score, ref_score, np.max(score)]

df = pd.DataFrame({
    "model": score_model,
    "scores": scores,
    "id": list(range(len(score_model))),
})
df

In [None]:
# Compare the validation F1 obtained with each normalisation scheme.
fig, ax = plt.subplots(figsize=(7, 5))

# Draw explicitly on the axes created above and keep `ax` bound to it, so the
# figure can still be adjusted after this point.
# NOTE(review): seaborn >= 0.13 deprecates join=False in favour of
# linestyle="none"; kept as-is for compatibility with older versions.
sns.pointplot(x="model", y="scores", data=df, join=False, ax=ax)

# Fixed zoom on the high-score region; any score below 0.93 falls outside the
# visible range.
ax.set_ylim(0.93, 1)
ax.set(xlabel="Normalisation scheme", ylabel="F1 score")
ax.tick_params(axis="x", labelrotation=45)

fig.tight_layout()
plt.show()