# In [None]:
import pandas as pd
import numpy as np
import iFeatureOmegaCLI
import os
import pickle
from sklearn.metrics import accuracy_score, matthews_corrcoef, confusion_matrix, roc_auc_score

def extract_feature(input_txt_path, feature_name):
    """Compute one iFeatureOmega RNA descriptor for the sequences in a file.

    Args:
        input_txt_path: Path of the sequence file handed to
            ``iFeatureOmegaCLI.iRNA``.
        feature_name: Descriptor name passed to ``get_descriptor``
            (for example ``"Kmer type 1"``).

    Returns:
        The ``encodings`` table produced by the descriptor call, with its
        index replaced by a fresh 0..n-1 range so rows are addressed by
        position.

    Raises:
        ValueError: If no encodings are available after the descriptor call.
    """
    feature = iFeatureOmegaCLI.iRNA(input_txt_path)
    feature.get_descriptor(feature_name)

    encodings = feature.encodings
    if encodings is None:
        # Fail with the descriptor name instead of an opaque AttributeError
        # raised by calling ``reset_index`` on ``None``.
        raise ValueError(
            f"No encodings were produced for descriptor {feature_name!r} "
            f"from {input_txt_path!r}"
        )

    return encodings.reset_index(drop=True)

# Descriptor names requested from iFeatureOmega, one feature table each.
feature_names = [
    "CKSNAP type 1", "Z_curve_48bit", "Kmer type 1", "Mismatch", "DBE",
    "ENAC", "NAC", "MMI", "NCP", "PS2", "ASDC",
]

inputfile = '/ind_drosophila.fasta'


def _read_fasta(path):
    """Return ``{record_id: sequence}`` for a FASTA file, in file order.

    The record id is the first whitespace-delimited token of the header
    line (empty string for a bare ``>``); sequence lines are concatenated
    with all whitespace removed. Parsing uses only the standard library so
    the script does not rely on a Biopython import.
    """
    records = {}
    current_id = None
    chunks = []
    with open(path, encoding="utf-8") as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line:
                continue
            if line.startswith(">"):
                # A new header closes the previous record, if any.
                if current_id is not None:
                    records[current_id] = "".join(chunks)
                header_tokens = line[1:].split(None, 1)
                current_id = header_tokens[0] if header_tokens else ""
                chunks = []
            elif current_id is not None:
                chunks.append("".join(line.split()))
    if current_id is not None:
        records[current_id] = "".join(chunks)
    return records


# Raw sequences keyed by record id; the dict keeps file order.
sequences = _read_fasta(inputfile)

all_features = {}

for feature_name in feature_names:
    feature_data = extract_feature(inputfile, feature_name)
    all_features[feature_name] = feature_data

    # Labels are assigned purely by position: the first half of the rows is
    # treated as positive (1) and the second half as negative (0).
    length = len(feature_data)
    if length % 2:
        raise ValueError(
            f"Descriptor {feature_name!r} produced {length} rows; an even "
            "number is required to label the first half positive and the "
            "second half negative"
        )
    half = length // 2
    labels = np.concatenate((np.ones(half), np.zeros(half)), axis=0)

    # Take the feature matrix before the label column is attached.
    data_io = feature_data.to_numpy()
    feature_data['new_column'] = labels

    # NOTE: rows stay in file order; a seeded shuffle of (data, labels) that
    # used to sit here is disabled.

    # Map the descriptor name to a short key, e.g. "CKSNAP type 1" -> "cksnap",
    # "Z_curve_48bit" -> "z_curve", "PS2" -> "ps".
    feature_key = feature_name.lower().replace(" type 1", "").replace("_48bit", "").replace(" ", "_").replace("2","")
    # Published as module-level names (data_<key>, labels_<key>) because the
    # dataset table further down refers to them by name.
    globals()[f"data_{feature_key}"] = np.asarray(data_io)
    globals()[f"labels_{feature_key}"] = np.asarray(labels)

# Names of the base classifiers; used to build the pickle file names.
classifiers = [
    "LGBMClassifier",
    "SVC",
    "LogisticRegression",
    "MLPClassifier",
    "KNeighborsClassifier",
    "AdaBoostClassifier",
    "ExtraTreesClassifier",
    "CatBoostClassifier",
    "GaussianNB",
    "XGBClassifier",
    "RandomForestClassifier",
]

# (dataset name, feature matrix, label vector) for every descriptor, in the
# order in which the base-model predictions are later stacked.
_feature_arrays = (
    ('cksnap', data_cksnap, labels_cksnap),
    ('Z_curve', data_z_curve, labels_z_curve),
    ('kmer', data_kmer, labels_kmer),
    ('mismatch', data_mismatch, labels_mismatch),
    ('dbe', data_dbe, labels_dbe),
    ('enac', data_enac, labels_enac),
    ('nac', data_nac, labels_nac),
    ('mmi', data_mmi, labels_mmi),
    ('ncp', data_ncp, labels_ncp),
    ('ps', data_ps, labels_ps),
    ('asdc', data_asdc, labels_asdc),
)

# Every dataset is paired with the same shared classifier-name list.
datasets = {
    dataset_name: (features, targets, classifiers)
    for dataset_name, features, targets in _feature_arrays
}


independent_preds_list = []
independent_probs_list = []
all_y_tests = []
model_accuracies = []
model_accuracies_dict = {}

# Run every saved base model (one per feature set / classifier pair) on the
# independent data and collect its hard predictions, and its positive-class
# probabilities when the model exposes ``predict_proba``.
# NOTE: the loop variable is deliberately called ``labels``; the metric code
# after this loop reads the value left bound by the last iteration.
for model_type, (data, labels, classifiers) in datasets.items():

    for classifier in classifiers:
        model_name = classifier
        model_path = f'/danio_{model_name}_{model_type}_fold1.pickle'

        # Combinations without a saved model are skipped.
        if not os.path.exists(model_path):
            continue

        # NOTE(security): unpickling executes arbitrary code; only load model
        # files that come from a trusted source.
        with open(model_path, 'rb') as model_file:
            loaded_classifier = pickle.load(model_file)

        y_pred = loaded_classifier.predict(data)
        accuracy = accuracy_score(labels, y_pred)
        independent_preds_list.append(y_pred)

        model_accuracies_dict[f'{model_name}_{model_type}'] = accuracy

        if hasattr(loaded_classifier, "predict_proba"):
            # Column 1 holds the probability of the positive class.
            y_prob = loaded_classifier.predict_proba(data)[:, 1]
            independent_probs_list.append(y_prob)

if not independent_preds_list:
    # Without this check np.column_stack fails with an unrelated-looking
    # "need at least one array" error.
    raise FileNotFoundError(
        "No base-model pickle named '/danio_<classifier>_<dataset>_fold1.pickle' "
        "was found; cannot build the meta-model input"
    )

# One column per loaded base model, one row per sample.
independent_y_preds = np.column_stack(independent_preds_list)
independent_y_probs = np.column_stack(independent_probs_list) if independent_probs_list else None

meta_model_path = '/danio_META_fold0.pickle'
# NOTE(security): unpickling executes arbitrary code; only load model files
# that come from a trusted source.
with open(meta_model_path, 'rb') as model_file:
    meta_classifier = pickle.load(model_file)

# The meta-classifier consumes the stacked hard predictions of the base models.
y_pred_ens = meta_classifier.predict(independent_y_preds)

acc_indep = accuracy_score(labels, y_pred_ens)
mcc_indep = matthews_corrcoef(labels, y_pred_ens)


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator`` as a float, or 0.0 if the denominator is 0."""
    if not denominator:
        return 0.0
    return float(numerator) / float(denominator)


confusion = confusion_matrix(labels, y_pred_ens)
TN, FP, FN, TP = confusion.ravel()
# Ratios fall back to 0.0 when their denominator is empty (for example no
# positive predictions), instead of producing NaN with a runtime warning.
sensitivity = _safe_ratio(TP, TP + FN)
specificity = _safe_ratio(TN, TN + FP)
F1Score = _safe_ratio(2 * TP, 2 * TP + FP + FN)
precision = _safe_ratio(TP, TP + FP)
recall = sensitivity  # recall and sensitivity are the same quantity

y_pred_prob = meta_classifier.predict_proba(independent_y_preds)
y_probs = y_pred_prob[:, 1]
# Probability of whichever class was predicted: p when p >= 0.5, else 1 - p.
y_probs_change = np.where(y_probs < 0.5, 1 - y_probs, y_probs)

ROCArea = roc_auc_score(labels, y_probs)

output_file = '/results.txt'

# Predictions are matched to sequences by position, not by parsing the record
# id as a 1-based number: ids need not be numeric or sorted, whereas the
# labels upstream are already assigned purely by row position.
# NOTE(review): assumes the feature rows follow the record order of the
# input file - confirm against iFeatureOmega's output ordering.
if len(sequences) != len(y_pred_ens):
    # A mismatch (for example duplicate record ids collapsing into one dict
    # entry) would silently pair sequences with the wrong predictions.
    raise ValueError(
        f"{len(sequences)} sequences but {len(y_pred_ens)} predictions; "
        "cannot align results by position"
    )

with open(output_file, 'w') as file:
    # One line per sequence: "<sequence>, <predicted class>, <confidence>".
    for idx, seq in enumerate(sequences.values()):
        pred = int(y_pred_ens[idx])
        prob = y_probs_change[idx]

        file.write(f"{seq}, {pred}, {prob:.6f}\n")

# Disabled reporting code, kept as an inert string literal: the string is
# evaluated and discarded, so none of the statements inside it run.
# If re-enabled, it would write the ensemble metrics and the per-model
# accuracies to CSV files and print the ensemble metrics.
'''
results = {
    'Accuracy': [acc_indep],
    'Sensitivity': [sensitivity],
    'Specificity': [specificity],
    'MCC': [mcc_indep],
    'AUC': [ROCArea],
    'F1 Score': [F1Score]
}

df = pd.DataFrame(results)

df.to_csv('/classification_results.csv', index=False)

accuracies_df = pd.DataFrame(list(model_accuracies_dict.items()), columns=['Model', 'Accuracy'])
accuracies_df.to_csv('/model_accuracies.csv', index=False)

print('Acc: ', acc_indep)
print('Sn: ', sensitivity)
print('Sp: ', specificity)
print('Mcc: ', mcc_indep)
print('AUC: ', ROCArea)
print('F1: ', F1Score)
'''