In [1]:
%%time
import time
import os
from pathlib import Path
import numpy as np
import seaborn as sns
from tqdm import tqdm
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from MicroLIA import training_set
from MicroLIA import ensemble_model as models
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score

def set_data(dataset_path, model_name, app_w = True):
    '''Locate the train/test folders, load the training set and archive the generated files.

    Parameters
    ----------
    dataset_path : str
        Folder that contains one entry whose name includes "test" and one
        whose name includes "train".
    model_name : str
        Combined with the last component of ``dataset_path`` to name the
        results folder ``Models_results/<model_name>-<dataset>``.
    app_w : bool, default True
        Forwarded to ``training_set.load_all`` as ``apply_weights``.

    Returns
    -------
    data_x, data_y
        The two values returned by ``training_set.load_all``.
    path_train, path_test : Path
        The selected train and test folders.
    folders : list of str
        Entries of ``path_test`` whose name does not contain ".txt".
    '''
    entries = os.listdir(dataset_path)
    path_test = [Path(dataset_path, entry) for entry in entries if "test" in entry][0]
    path_train = [Path(dataset_path, entry) for entry in entries if "train" in entry][0]  # directories with lightcurves associated to a class
    folders = [name for name in os.listdir(path_test) if '.txt' not in name]
    data_x, data_y = training_set.load_all(path_train, apply_weights=app_w)

    # Only the last path component names the results folder, which is the same
    # convention `metrics` uses; a nested dataset path such as "a/b" therefore
    # no longer requires a pre-existing "Models_results/<model>-a" folder.
    model_result = Path("Models_results", f"{model_name}-{Path(str(dataset_path)).name}")
    # parents/exist_ok make the cell re-runnable and create "Models_results" on first use.
    model_result.mkdir(parents=True, exist_ok=True)

    generated_files = ["all_features_.txt", "lightcurves__.fits", "MicroLIA_Training_Set.csv"]
    for file in generated_files:
        # NOTE(review): assumes load_all drops its files in the parent of the
        # current working directory - confirm against the MicroLIA output path.
        # os.replace overwrites a file left by a previous run on every platform.
        os.replace(Path("..", file), Path(model_result, file))
    return data_x, data_y, path_train, path_test, folders

def set_model(model_name,  data_x, data_y, path_train, path_test, folders, zp=27.5):
    '''Create the classifier and run it on every light curve of the test set.

    The predictions are saved as "predictions.txt" (space separated) inside
    ``Models_results/<model_name>-<dataset>``, where ``<dataset>`` is the name
    of the folder that contains ``path_train``.

    Parameters
    ----------
    model_name : str
        Passed to ``models.Classifier`` as ``clf`` and used to name the
        results folder.
    data_x, data_y
        Training data handed to ``models.Classifier``.
    path_train : str or Path
        Training folder; only its parent's name is used, to locate the results folder.
    path_test : str or Path
        Folder holding one sub-folder of light-curve files per entry of ``folders``.
    folders : list of str
        Sub-folders of ``path_test`` to classify; also used as the names of
        the per-class probability columns.
    zp : float, default 27.5
        Forwarded to ``model.predict`` as ``zp``.

    Returns
    -------
    model
        The created ``models.Classifier``.
    predict_df : pandas.DataFrame
        One row per light curve, stored as strings, with the columns:
        class, max_prob, [one column per folder], lc_file.
    '''
    # The requested classifier is used instead of a hard-coded "xgb".
    model = models.Classifier(data_x, data_y, clf=model_name, optimize=False, impute=True, balance=True)
    model.create()

    # Built as a list so folder names containing spaces stay in one column.
    # NOTE(review): the probability columns are labelled in `folders` order;
    # this assumes the rows of `pred` come in that same order - TODO confirm.
    headers = ["class", "max_prob", *folders, "lc_file"]

    # Rows are gathered in a list and converted once; np.append inside the
    # loop re-copied the whole array for every light curve.
    rows = []
    for i, folder in enumerate(folders):
        print(f"Folder: {folder}    {i+1}/{len(folders)}")
        for file in tqdm(os.listdir(Path(path_test, folder))):
            data = np.loadtxt(Path(path_test, folder, file))
            # `timestamps` rather than `time`, which would shadow the imported module.
            timestamps, mag, magerr = data[:,0], data[:,1], data[:,2]
            pred = model.predict(timestamps, mag, magerr, convert=True, zp=zp)
            # Row: source folder, first column of the row of `pred` with the
            # largest second column, every second-column value, file name.
            rows.append([folder, pred[np.argmax(pred[:,1])][0], *pred[:,1], file])

    # dtype=str keeps every cell a string, as the np.append version produced;
    # reshape keeps the column count when no light curve was found.
    predict_array = np.array(rows, dtype=str).reshape(-1, len(headers))
    predict_df = pd.DataFrame(predict_array, columns=headers)

    # set_data builds path_train as Path(dataset_path, <train folder>), so its
    # parent's name is the dataset name used for the results folder.
    model_result = Path("Models_results", f"{model_name}-{Path(path_train).parent.name}")
    model_result.mkdir(parents=True, exist_ok=True)
    # Written straight to its final location: the previous rename step used an
    # undefined `model_result` and the last light-curve name as the target.
    predict_df.to_csv(Path(model_result, "predictions.txt"), index=False, sep=' ')
    return model, predict_df # class, max_prob [prob of each class] file

def metrics(y_true, y_pred, model_name, dataset_path, folders=None):
    '''Score the predictions, plot the confusion matrix and save both.

    Writes "cf_matrix.png" and "score_metrics.txt" (precision, recall,
    accuracy and F1 score) to ``Models_results/<model_name>-<dataset>``, where
    ``<dataset>`` is the last component of ``dataset_path``.

    Parameters
    ----------
    y_true : sequence of str
        True class (folder name) of every light curve.
    y_pred : sequence
        Predicted class of every light curve. When ``model_name`` is "xgb"
        the values are numeric codes (0, "0", 0.0, "0.0", ...) that index the
        alphabetically sorted ``folders``.
    model_name : str
        Classifier name; "xgb" enables the numeric-label decoding.
    dataset_path : str
        Dataset path; its last component names the results folder and its
        first "/"-separated component appears in the plot title.
    folders : list of str, optional
        Class folder names, in the order wanted for the confusion matrix.
        Defaults to the distinct values of ``y_true`` in order of first
        appearance, so the function does not depend on a notebook global.

    Returns
    -------
    cf_matrix, precision, recall, accuracy, f1_score
        Row-normalised confusion matrix and the four scores rounded to 3 decimals.
    '''
    model_result = Path("Models_results", f"{model_name}-{Path(str(dataset_path)).name}")
    # The figure and the score file are written here, so the folder must exist.
    model_result.mkdir(parents=True, exist_ok=True)

    if folders is None:
        # dict.fromkeys de-duplicates while keeping first-appearance order.
        folders = list(dict.fromkeys(y_true))
    else:
        folders = list(folders)

    if model_name == "xgb":
        # Codes index the sorted class names; normalising through float() lets
        # 0, "0", 0.0 and "0.0" all resolve to the same key.
        xgb_labels = {str(float(i)): folder for i, folder in enumerate(sorted(folders))}
        y_pred = [xgb_labels[str(float(y))] for y in y_pred]

    Labels = [name.split("_")[-1] for name in folders]

    cf_matrix = confusion_matrix(y_true, y_pred, labels=folders, normalize='true')
    precision_raw = precision_score(y_true, y_pred, average='macro')
    recall_raw = recall_score(y_true, y_pred, average='macro')
    # F1 is taken from the unrounded scores and guarded against 0/0.
    denominator = precision_raw + recall_raw
    f1_raw = (2*precision_raw*recall_raw)/denominator if denominator else 0.0

    precision = round(precision_raw,3)
    recall = round(recall_raw,3)
    accuracy = round(accuracy_score(y_true, y_pred),3)
    f1_score = round(f1_raw,3)
    print('Precision score [tp/(tp+fp)] = ', precision)
    print('Recall score [tp/(tp+fn)] = ', recall)
    print('Accuracy = ', accuracy)
    print('F1_score [2*presicion*recall/(pres+recall)] = ', f1_score)

    fig, ax = plt.subplots(figsize=(5,5))
    fontsize=10
    sns.set(font_scale=fontsize/10)
    sns.heatmap(cf_matrix, annot=True, cmap='viridis', xticklabels=Labels, yticklabels=Labels, ax=ax)  # cmap='OrRd'
    plt.xticks(rotation=90,fontsize=fontsize)
    plt.yticks(rotation=0,fontsize=fontsize)
    plt.ylabel('True target',fontsize=fontsize)
    plt.xlabel('Prediction (max_prom)',fontsize=fontsize)
    plt.title(f'{model_name} {dataset_path.split("/")[0]}\n{precision}/{recall}/{accuracy}/{f1_score}')
    fig.savefig(Path(model_result, "cf_matrix.png"))
    # Close this specific figure so repeated runs do not accumulate open figures.
    plt.close(fig)

    with open(Path(model_result,"score_metrics.txt"), "w") as f:
        f.write(f'Precision score [tp/(tp+fp)] = {precision}\n' )
        f.write(f'Recall score [tp/(tp+fn)] = {recall}\n' )
        f.write(f'Accuracy = {accuracy}\n' )
        f.write(f'F1_score [2*presicion*recall/(pres+recall)] = {f1_score}\n' )
    return cf_matrix, precision, recall, accuracy, f1_score


2023-06-14 18:35:14.161028: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-06-14 18:35:14.221650: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-06-14 18:35:14.222753: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


CPU times: user 6.04 s, sys: 2.23 s, total: 8.27 s
Wall time: 22.2 s


clf (str) – The machine learning classifier to optimize. Can either be ‘rf’ for Random Forest, ‘nn’ for Neural Network, or ‘xgb’ for Extreme Gradient Boosting. Defaults to ‘rf’.
models_names = ["xgb", "rf", "nn", "cnn"]

In [None]:
# TRY THIS:
# End-to-end run: load the data, train and predict, then score the predictions.

# The two values below are examples (the pair used by the batch run further
# down); edit them before running.
dataset_path = "Anibal_dataset"   # Path of the folder that holds the test-set and train-set folders
model_name = "xgb"                # "xgb", "nn", "rf"

data_x, data_y, path_train, path_test, folders = set_data(dataset_path, model_name, app_w = True)
model, predict_df = set_model(model_name, data_x, data_y, path_train, path_test, folders)
y_true = predict_df["class"]
# set_model names the predicted-label column "max_prob".
y_pred = predict_df["max_prob"]
cf_matrix, precision, recall, accuracy, f1_score = metrics(y_true, y_pred, model_name, dataset_path)

In [12]:
# Each entry of `runs` is a (model name, dataset path) pair handed to run_model.
#
# Alternative run lists that were used before (kept here for reference):
#   * ("xgb", "2305222212Y") followed by every ("rf" | "nn", "2305222212" + band)
#     pair, with band taken from "ugrizY";
#   * one (model, "Anibal_dataset") pair for every entry of models_names;
#   * the single dataset path "2305222212/2305222212r".
runs = [("xgb", "Anibal_dataset")]

for run in runs:
    classifier_name, dataset_name = run
    print("--------------------------------------------------------------------------------------------")
    print(classifier_name)
    print(dataset_name)
    run_model(classifier_name, dataset_name)


--------------------------------------------------------------------------------------------
xgb
Anibal_dataset
Writing files to home directory...
(7388, 148) 148
Complete! Files saved in: /home/nowokaren/
XGBoost classifier requires numerical class labels! Converting class labels as follows:
________________________________
ELASTICC_TRAIN_EB  ------------->     0
ELASTICC_TRAIN_Mdwarf-flare  ------------->     1
ELASTICC_TRAIN_RRL  ------------->     2
ELASTICC_TRAIN_uLens-Single_PyLIMA  ------------->     3
________________________________
Returning base xgb model...
Folder: ELASTICC_TRAIN_EB    1/4


100%|██████████| 793/793 [01:49<00:00,  7.27it/s]


Folder: ELASTICC_TRAIN_RRL    2/4


100%|██████████| 793/793 [01:54<00:00,  6.95it/s]


Folder: ELASTICC_TRAIN_Mdwarf-flare    3/4


100%|██████████| 793/793 [01:45<00:00,  7.54it/s]


Folder: ELASTICC_TRAIN_uLens-Single_PyLIMA    4/4


100%|██████████| 793/793 [03:15<00:00,  4.05it/s]


ELASTICC_TRAIN_EB ELASTICC_TRAIN_EB
Precision score [tp/(tp+fp)] =  0.859
Recall score [tp/(tp+fn)] =  0.834
Accuracy =  0.834
