In [1]:
import pandas as pd
import numpy as np

import sys
from os.path import expanduser
import os

sys.path.append('./ML_Components/')

from featgen import Generate_Features
from model_fitting import *

from sklearn.model_selection import train_test_split

In [2]:
## Load the training inputs: the feature table and the per-SIRT label table

### FEATURES - the first CSV column is used as the row index
sirt_x = pd.read_csv('./ML_Components/sirt_training_feature_set.csv', index_col=0)

### LABELS - read, then re-key the rows on the 'uid_pos' column
sirt_y = (
    pd.read_csv('./ML_Components/All_Sirts_Feature_Y_labels.csv', index_col=0)
    .set_index('uid_pos', drop=True)
)

# Put both tables in the same row order. The features are reordered to follow
# the labels first (this raises KeyError if a label key is absent from the
# feature index), then the labels are re-selected with the resulting feature index.
# NOTE(review): presumably the feature CSV's first column holds the same
# 'uid_pos' keys - confirm against the file.
sirt_x = sirt_x.loc[sirt_y.index, :]
sirt_y = sirt_y.loc[sirt_x.index, :]

In [3]:
### IMPORT EXPERIMENTAL DATA
# Table that the fitted models are later asked to score; the first CSV column
# is used as the row index.
exp = pd.read_csv(
    filepath_or_buffer='./ML_Components/Surface_Exposed_Proteome_13_mer_Base_Features_MuSite_Deep_Scored.csv',
    index_col=0,
)

In [5]:
# Re-key the experimental table on 'uid_pos' so it lines up with the training tables.
# NOTE: this cell consumes the 'uid_pos' column, so it can only be run once per
# load of `exp` (a second run raises KeyError).
exp = exp.set_index('uid_pos', drop=True)

# Keep the 'Site' column in its own one-column frame (the base of the output
# table built later) and remove it from the model inputs.
exp_predicted = exp.loc[:, ['Site']]
exp = exp.drop(columns='Site')

In [6]:
# Split each 'uid_pos' key on '_' and expose the first two pieces as their own
# columns; the frame is re-indexed on 'uid_pos' afterwards so the cell can be re-run.
# 'Position' stays a string - no numeric conversion is done here.
exp_predicted = (
    exp_predicted
    .reset_index(drop=False)
    .assign(**{
        'Uniprot ID': lambda frame: frame['uid_pos'].str.split('_', expand=True)[0],
        'Position': lambda frame: frame['uid_pos'].str.split('_', expand=True)[1],
    })
    .set_index('uid_pos', drop=True)
)

In [7]:
def experimental_prediction(meta_model, base_model, base_sampling, feat_x, feat_y, exp, random_state=None):
    """Score an experimental feature table with the base model and an ensemble on top of it.

    Two ensemble modes are supported, selected by the type of ``meta_model``:

    * ``str`` - a fixed weighted average of the base-model probability
      (``PRIMARY_ML_SCORE``) and the pre-computed ``SECONDARY_ML_SCORE``:
      ``'simp_avg_eq'`` weights them 1:1, ``'simp_avg_pr'`` weights primary 2:1,
      and any other string weights secondary 2:1. The base model is fitted on
      all of ``feat_x``.
    * estimator - ``feat_x`` is split 50/50 (stratified on ``feat_y``); the base
      model is fitted on one half, and the meta model is fitted on the other
      half using the columns ``['SECONDARY_ML_SCORE', 'PRIMARY_ML_SCORE']``.

    Parameters
    ----------
    meta_model : str or estimator
        Averaging scheme name, or an object exposing ``fit`` / ``predict_proba``.
    base_model : estimator
        Object exposing ``fit`` / ``predict_proba``. It is (re)fitted in place.
    base_sampling : sampler, "none" or None
        Object exposing ``fit_resample`` applied to the base training data, or
        ``"none"`` / ``None`` to skip resampling.
    feat_x : pandas.DataFrame
        Training features; must contain a ``SECONDARY_ML_SCORE`` column.
    feat_y : pandas.DataFrame or pandas.Series
        Training labels (a single label column).
    exp : pandas.DataFrame
        Experimental features; must contain a ``SECONDARY_ML_SCORE`` column.
        Not modified.
    random_state : int or None, optional
        Seed forwarded to ``train_test_split`` in the estimator mode so that the
        base/meta split is reproducible. ``None`` keeps the previous unseeded
        behaviour.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``exp`` with columns ``SECONDARY_ML_SCORE``,
        ``PRIMARY_ML_SCORE`` and ``META_ML_SCORE``.
    """
    secondary_col = 'SECONDARY_ML_SCORE'

    # Explicit copy: columns are added to this frame below, and writing into a
    # bare column slice triggers pandas' SettingWithCopyWarning.
    exp_scores = exp[[secondary_col]].copy()
    exp_features = exp.drop(columns=[secondary_col])

    skip_sampling = base_sampling is None or (
        isinstance(base_sampling, str) and base_sampling == "none"
    )

    def _fit_base(train_x, train_y):
        # Optionally rebalance, then fit. Labels are flattened to 1-D only at
        # fit time so a single-column DataFrame does not raise sklearn's
        # column-vector DataConversionWarning.
        if not skip_sampling:
            train_x, train_y = base_sampling.fit_resample(train_x, train_y)
        return base_model.fit(train_x, np.ravel(train_y))

    if isinstance(meta_model, str):
        # Fixed-weight averaging: the base model can use every training row.
        base_model_fit = _fit_base(feat_x.drop(columns=[secondary_col]), feat_y)

        # Column 1 of predict_proba is taken as the positive-class probability.
        exp_scores['PRIMARY_ML_SCORE'] = base_model_fit.predict_proba(exp_features)[:, 1]

        if meta_model == 'simp_avg_eq':
            w_primary, w_secondary = 1, 1
        elif meta_model == 'simp_avg_pr':
            w_primary, w_secondary = 2, 1
        else:
            w_primary, w_secondary = 1, 2

        exp_scores['META_ML_SCORE'] = (
            exp_scores['PRIMARY_ML_SCORE'] * w_primary
            + exp_scores[secondary_col] * w_secondary
        ) / (w_primary + w_secondary)
    else:
        # Learned ensemble: hold half of the data back from the base model so
        # the meta model is trained on base-model scores for unseen rows.
        base_x, meta_x, base_y, meta_y = train_test_split(
            feat_x, feat_y, test_size=0.5, stratify=feat_y, random_state=random_state
        )

        base_model_fit = _fit_base(base_x.drop(columns=[secondary_col]), base_y)

        # Meta training inputs: secondary score plus the base-model probability.
        meta_train_x = meta_x[[secondary_col]].copy()
        meta_train_x['PRIMARY_ML_SCORE'] = base_model_fit.predict_proba(
            meta_x.drop(columns=[secondary_col])
        )[:, 1]
        exp_scores['PRIMARY_ML_SCORE'] = base_model_fit.predict_proba(exp_features)[:, 1]

        meta_model_fitted = meta_model.fit(meta_train_x, np.ravel(meta_y))

        # Feed the meta model the same columns, in the same order, it was fitted on.
        meta_inputs = exp_scores[list(meta_train_x.columns)]
        exp_scores['META_ML_SCORE'] = meta_model_fitted.predict_proba(meta_inputs)[:, 1]

    return exp_scores

In [8]:
# Run for all SIRTs
# For each SIRT: fit a base model, fit a meta (ensemble) model, assess the
# pre-computed secondary score on its own, score the experimental set, and
# write metrics / predictions / a short text summary under ./SIRT<i>_Files/.
totest = [1, 2, 3, 4, 5, 6, 7]
base_models = []
base_bals = []
meta_models = []
sirt_number = []
base_params = []

for i in totest:
    # Isolate y data for running our ML fitting with
    name = 'SIRT_' + str(i) + '_EXPERIMENTALLY_ACTIVE'
    s_y = sirt_y[[name]]

    # Class balance, computed once and reused for the console and the info file
    n_pos = int(s_y[name].sum())
    n_total = len(s_y)
    pct_pos = round(n_pos / n_total * 100, 2)

    # Output to user
    print('Now running', str(name), 'with', str(n_pos), 'positives -', str(pct_pos), '% positive')

    # Format for input into ML models
    s_y = s_y.rename(columns={name:'EXPERIMENTALLY_ACTIVE'})

    # Output locations; create them up front so a long fit cannot end in a
    # failed write because a directory is missing.
    resfile = './SIRT'+str(i)+'_Files/'
    df_file = resfile + 'dfs/SIRT' + str(i)
    os.makedirs(resfile + 'dfs', exist_ok=True)
    savefile = resfile + 'S' + str(i)

    ### BASE MODEL FITTING ###
    # Initial base model fitting (the returned metrics are saved below as the base model metrics)
    s_model, s_unfit, s_bal, s_metrics, s_train_x, s_train_y, s_params, s_scoring = model_fitting(sirt_x, s_y, True, None, None, (savefile+'_Base_Model_'))

    # Run base model fit on training data for over/underfitting analysis
    # Generate metrics (training data is deliberately passed as both train and test set)
    s_metrics_tr, s_roc_tr, s_roc_auc_tr = model_metric_generation(s_train_x, s_train_y, s_train_x, s_train_y, s_model, s_bal)
    # Plot
    Plot(s_metrics_tr, (savefile+'_Training_Data_as_Test_Set_Base_Model_PR_'), s_roc_tr, s_roc_auc_tr, (savefile+'_Training_Data_as_Test_Set_Base_Model_ROC_'), (savefile+'_Training_Data_as_Test_Set_Base_Model_Metric_Curve_'))

    ### META MODEL FITTING ###
    # Run meta-model
    s_mmodel, s_unfit_m, s_mbal, s_mmetrics, s_base_train_x, s_base_train_y, s_meta_combo_train_x, s_meta_train_y, s_meta_test_x, s_meta_test_y = meta_model_fitting(s_unfit, s_bal, sirt_x, s_y, sirt_x, None, None, (savefile+'_Meta_Model_'), s_scoring)

    # Run meta model fit on training data
    # Generate metrics
    s_mmetrics_tr, s_mroc_tr, s_mroc_auc_tr = model_metric_generation(s_meta_combo_train_x, s_meta_train_y, s_meta_combo_train_x, s_meta_train_y, s_mmodel, s_mbal)
    # Plot
    Plot(s_mmetrics_tr, (savefile+'_Training_Data_as_Test_Set_Meta_Model_PR_'), s_mroc_tr, s_mroc_auc_tr, (savefile+'_Training_Data_as_Test_Set_Meta_Model_ROC_'), (savefile+'_Training_Data_as_Test_Set_Meta_Model_Metric_Curve_'))

    # Run fit assessment on secondary scores alone
    sec_score = sirt_x[['SECONDARY_ML_SCORE']]
    s_sec_roc_auc, s_sec_metrics, s_sec_roc = pre_scored_metric_generation(s_y, sec_score)
    # Plot
    s_sec_metrics_no_f = s_sec_metrics.drop(columns=['F-score'])
    Plot(s_sec_metrics_no_f, (savefile+'_MuSite_Deep_PR'), s_sec_roc, s_sec_roc_auc, (savefile+'_MuSite_Deep_ROC'), (savefile+'_MuSite_Deep_Metric_Curve'))

    ## Run experimental predictions
    # NOTE: experimental_prediction calls .fit on the objects it is given, so
    # s_model (and s_mmodel when it is an estimator) are re-fitted here.
    exp_preds = experimental_prediction(s_mmodel, s_model, s_bal, sirt_x, s_y, exp)

    # Format experimental predictions df for output
    s_exp_lab = 'Base SIRT' + str(i) + ' ML Model Score'
    m_exp_lab = 'SIRT' + str(i) +' Ensemble ML Score'

    # Real copy: pd.DataFrame(exp_predicted) is not guaranteed to be independent
    # of exp_predicted, which would let one SIRT's columns bleed into the shared
    # frame and into the next SIRT's output file.
    new_exp_predicted = exp_predicted.copy()
    new_exp_predicted[s_exp_lab] = exp_preds['PRIMARY_ML_SCORE']
    new_exp_predicted['MuSite Deep Score'] = exp_preds['SECONDARY_ML_SCORE']
    new_exp_predicted[m_exp_lab] = exp_preds['META_ML_SCORE']

    # Cutoff for meta model positives: the threshold with the best F-score
    # (the highest such threshold if several tie)
    f_max = s_mmetrics[s_mmetrics['F-score'] == s_mmetrics['F-score'].max()]
    threshold = f_max['Threshold'].max()
    t_idx = f_max['Threshold'].idxmax()
    m_bin_lab = 'SIRT' + str(i) + ' Deacetylation Prediction (0 = no, 1 = yes)'
    new_exp_predicted[m_bin_lab] = (new_exp_predicted[m_exp_lab] >= threshold).astype('int64')
    n_pred_pos = int(new_exp_predicted[m_bin_lab].sum())
    n_pred_total = len(new_exp_predicted)
    print(n_pred_pos, 'Positives within dataset')

    ### NOW SAVE EVERYTHING TO OUTPUT
    # Dataframes of metrics
    s_metrics.to_csv((df_file + '_base_model_metrics.csv')) # base model metrics
    s_metrics_tr.to_csv((df_file + '_base_model_trainingset_as_testingset_metrics.csv')) # base model metrics with training data as test data
    s_mmetrics.to_csv((df_file + '_meta_model_metrics.csv')) # meta model metrics
    s_mmetrics_tr.to_csv((df_file + '_meta_model_trainingset_as_testingset_metrics.csv')) # meta model metrics with training data as test data
    s_sec_metrics_no_f.to_csv((df_file + '_secondary_score_musite_deep_metrics.csv')) # secondary score metrics

    # Dataframe of experimental predictions
    new_exp_predicted.to_csv((resfile+'SIRT'+str(i)+'_ML_predictions_experimental_surface_exposed_lysine_set.csv'))

    # Text file with base model, meta model
    # `with` guarantees the file is closed even if one of the writes raises;
    # every record ends in '\n' so the lines no longer run together.
    with open((resfile + 'SIRT' + str(i) + '_ML_info.txt'), 'w') as metric_file_out:
        metric_file_out.write('Training Data: ' + str(n_pos) + ' positives in ' +
                              str(n_total) + ' samples (' +
                              str(pct_pos) + '% positive)\n')
        metric_file_out.write('Base Model: ' + str(s_model) + ' with params: ' + str(s_params) +
                              ' and balancing: ' + str(s_bal) + '\n')
        metric_file_out.write('Meta Model: ' + str(s_mmodel) + '\n')
        metric_file_out.write('Meta Model Threshold of: ' + str(threshold) +
                              ', Precision: ' + str(round(s_mmetrics.loc[t_idx, 'Precision'], 2)) +
                              ', Recall: ' + str(round(s_mmetrics.loc[t_idx, 'Recall'], 2)) +
                              ', Sensitivity: ' + str(round(s_mmetrics.loc[t_idx, 'Sensitivity'], 2)) +
                              ', Specificity: ' + str(round(s_mmetrics.loc[t_idx, 'Specificity'], 2)) +
                              ', F-Score: '+ str(round(s_mmetrics.loc[t_idx, 'F-score'], 2)) + '\n'
                             )
        metric_file_out.write('Predicted positives: ' + str(n_pred_pos) +
                              ' within a dataset of ' + str(n_pred_total) + ' (' +
                              str(round((n_pred_pos / n_pred_total) * 100, 4)) +
                              '% positive)\n')

    base_models.append(s_model)
    base_params.append(s_params)
    base_bals.append(s_bal)
    meta_models.append(s_mmodel)
    sirt_number.append(i)

    print('FINISHED SIRT' + str(i) + '!')

Now running SIRT_4_EXPERIMENTALLY_ACTIVE with 1038 positives - 15.26 % positive
0.15262461402734892
Data is imbalanced ( 15.26 % pos) adjusting model metric to f-score to best assess fit...
0.15261437908496733
Data is imbalanced ( 15.26 % pos) applying various sampling methods to remedy the issue...
Now onto the automatic model fitting...
Model fitting of DummyClassifier(strategy='most_frequent')
Model fitting of DummyClassifier(strategy='most_frequent') resulted in an f-score of 0.0
Model fitting of LogisticRegression(max_iter=1000)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Model fitting of LogisticRegression(max_iter=1000) resulted in an f-score of 0.004898122600498469
Model fitting of LinearDiscriminantAnalysis()
Model fitting of LinearDiscriminantAnalysis() resulted in an f-score of 0.08193230394859678
Model fitting of KNeighborsClassifier()
Model fitting of KNeighborsClassifier() resulted in an f-score of 0.07421546948171258
Model fitting of SVC()
Model fitting of SVC() resulted in an f-score of 0.0
Model fitting of BaggingClassifier()
Model fitting of BaggingClassifier() resulted in an f-score of 0.028281627052358187
Model fitting of RandomForestClassifier()
Model fitting of RandomForestClassifier() resulted in an f-score of 0.0049200572352868
Model fitting of ExtraTreesClassifier()
Model fitting of ExtraTreesClassifier() resulted in an f-score of 0.028489914463894705
Model fitting of GradientBoostingClassifier()
Model fitting of GradientBoostingClassifier() resulted in an f-score of 0.017312499350455757
Now onto the balancing methods...
Balancing te



Basic balancing testing of KMeansSMOTE() failed, trying other methods...
Balancing testing of KMeansSMOTE() resulted in an f-score of 0
Basic balancing testing of ADASYN()
Balancing testing of ADASYN() resulted in an f-score of 0.0021201941022769696
Basic balancing testing of RandomUnderSampler()
Balancing testing of RandomUnderSampler() resulted in an f-score of 0.2844129620923214
Basic balancing testing of TomekLinks()
Balancing testing of TomekLinks() resulted in an f-score of 0.10093467685912086
Basic balancing testing of EditedNearestNeighbours()
Balancing testing of EditedNearestNeighbours() resulted in an f-score of 0.17289478254869517
Basic balancing testing of NeighbourhoodCleaningRule()
Balancing testing of NeighbourhoodCleaningRule() resulted in an f-score of 0.16257405530413152
Basic balancing testing of OneSidedSelection()
Balancing testing of OneSidedSelection() resulted in an f-score of 0.103821155145059
Basic balancing testing of SMOTEENN()
Balancing testing of SMOTEENN

5 fits failed out of a total of 15.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/nashiragrigg/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/nashiragrigg/anaconda3/lib/python3.11/site-packages/sklearn/discriminant_analysis.py", line 631, in fit
    self._solve_eigen(
  File "/Users/nashiragrigg/anaconda3/lib/python3.11/site-packages/sklearn/discriminant_analysis.py", line 463, in _solve_eigen
    evals, evecs = linalg.eigh(Sb, Sw)
                   ^^^^^^^^^^^^^^^^^^^
  File "/Users/nashiragrigg/anaconda3/lib/python3.11/s

Best hyperparameters {'solver': 'svd'}


  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  ax = sns.lineplot(x='Threshold', y='value', hue='variable', data=pd.melt(pr_metrics, 'Threshold'), ci=None)


Finished generating metrics. Now plotting...
Running permutation importance...


  y = column_or_1d(y, warn=True)


Finished permutation importance... Now graphing...
           Mean Feature Importance  Standard Deviation
Gs(U)_NO              1.428052e-04            0.000286
Mw_NO                -3.197323e-03            0.003277
HP_NO                -7.304615e-07            0.000453
IP_NO                -4.521916e-03            0.000713
ECI_NO               -4.951828e-03            0.003359
...                            ...                 ...
162_maccs            -5.997794e-03            0.005460
163_maccs             2.314101e-03            0.004800
164_maccs             0.000000e+00            0.000000
165_maccs            -6.411046e-03            0.002364
166_maccs             0.000000e+00            0.000000

[443 rows x 2 columns]
              Mean Feature Importance  Standard Deviation  Abs Importance
ONE-HOT_12-K                -0.011978            0.003988        0.011978
97_maccs                    -0.010770            0.003337        0.010770
ONE-HOT_3-Q                  0.010679      

  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  ax = sns.lineplot(x='Threshold', y='value', hue='variable', data=pd.melt(pr_metrics, 'Threshold'), ci=None)


Initial length of features - x: 6801 , y: 6801
Test set contains x: 681 , y: 681
After removing the test set, remaining features are - x: 6120 , y: 6120
Base train x set: 3060
Meta train x set: 3060
Balancing the dataset via RandomOverSampler()
Base model: LinearDiscriminantAnalysis()


  y = column_or_1d(y, warn=True)


Base train x set: 5186
0.15261437908496733
Data is imbalanced ( 15.26 % pos) applying various sampling methods to remedy the issue...
Now onto the automatic model fitting...
Meta model fitting commencing...
Model fitting of LogisticRegression(max_iter=1000)
Model fitting of LogisticRegression(max_iter=1000) resulted in an f-score of 0.0
Model fitting of simp_avg_eq
k_fold: simp_avg in model name
Model fitting of simp_avg_eq resulted in an f-score of 0.2603188333870783
Model fitting of simp_avg_pr
k_fold: simp_avg in model name
Model fitting of simp_avg_pr resulted in an f-score of 0.26865486149017104
Model fitting of simp_avg_sec
k_fold: simp_avg in model name
Model fitting of simp_avg_sec resulted in an f-score of 0.24213972678303625
Now onto the balancing methods...
Finished model fitting, proceeding with simp_avg_pr and none with an f1 of 0.26865486149017104
Simple average primary weighted selected...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  ax = sns.lineplot(x='Threshold', y='value', hue='variable', data=pd.melt(pr_metrics, 'Threshold'), ci=None)


Finished generating metrics. Now plotting...
Sensitivity at 0.5: 0.5
Specificity at 0.5: 0.63
Precision at 0.5: 0.2
Recall at 0.5: 0.5
Maximised F-score of 0.29 at a threshold of 0.39 Recall: 0.74 Specificity: 0.4 Precision: 0.18 Sensitivity: 0.74
Simple average primary weighted selected...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  ax = sns.lineplot(x='Threshold', y='value', hue='variable', data=pd.melt(pr_metrics, 'Threshold'), ci=None)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, l

66921 Positives within dataset
FINISHED SIRT4!
Now running SIRT_7_EXPERIMENTALLY_ACTIVE with 1692 positives - 24.88 % positive
0.24878694309660343
Data is balanced 24.88 % pos) adjusting model metric to roc-auc to best assess fit...
0.2488562091503268
Data is balanced 24.89 % pos) will not be applying sampling methods, moving on...
Now onto the automatic model fitting...
Model fitting of DummyClassifier(strategy='most_frequent')
Model fitting of DummyClassifier(strategy='most_frequent') resulted in an f-score of 0.5
Model fitting of LogisticRegression(max_iter=1000)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Model fitting of LogisticRegression(max_iter=1000) resulted in an f-score of 0.5805349198616534
Model fitting of LinearDiscriminantAnalysis()
Model fitting of LinearDiscriminantAnalysis() resulted in an f-score of 0.5930123169446901
Model fitting of DecisionTreeClassifier()
Model fitting of DecisionTreeClassifier() resulted in an f-score of 0.515588519881213
Model fitting of KNeighborsClassifier()
Model fitting of KNeighborsClassifier() resulted in an f-score of 0.4934348034045572
Model fitting of SVC()
Model fitting of SVC() resulted in an f-score of 0.5296156374172012
Model fitting of BaggingClassifier()
Model fitting of BaggingClassifier() resulted in an f-score of 0.5432714881626595
Model fitting of RandomForestClassifier()
Model fitting of RandomForestClassifier() resulted in an f-score of 0.5714505136338847
Model fitting of ExtraTreesClassifier()
Model fitting of ExtraTreesClassifier() resulted in an f-score of 0.5820084730482049
Model fitting of GradientBoostingClassifier()
Mode

5 fits failed out of a total of 15.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/nashiragrigg/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/nashiragrigg/anaconda3/lib/python3.11/site-packages/sklearn/discriminant_analysis.py", line 631, in fit
    self._solve_eigen(
  File "/Users/nashiragrigg/anaconda3/lib/python3.11/site-packages/sklearn/discriminant_analysis.py", line 463, in _solve_eigen
    evals, evecs = linalg.eigh(Sb, Sw)
                   ^^^^^^^^^^^^^^^^^^^
  File "/Users/nashiragrigg/anaconda3/lib/python3.11/s

Best hyperparameters {'solver': 'lsqr'}


  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, m

Finished generating metrics. Now plotting...
Running permutation importance...


  y = column_or_1d(y, warn=True)


Finished permutation importance... Now graphing...
           Mean Feature Importance  Standard Deviation
Gs(U)_NO                  0.085420            0.006632
Mw_NO                     0.001109            0.005240
HP_NO                    -0.001507            0.001599
IP_NO                     0.002549            0.003409
ECI_NO                    0.005554            0.005400
...                            ...                 ...
162_maccs                -0.002469            0.003872
163_maccs                -0.000973            0.002646
164_maccs                 0.000000            0.000000
165_maccs                -0.000058            0.000393
166_maccs                 0.000000            0.000000

[443 rows x 2 columns]
              Mean Feature Importance  Standard Deviation  Abs Importance
Gs(U)_NO                     0.085420            0.006632        0.085420
Z1_NO                        0.027711            0.010989        0.027711
Z2_NO                        0.020144      

  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, m

Initial length of features - x: 6801 , y: 6801
Test set contains x: 681 , y: 681
After removing the test set, remaining features are - x: 6120 , y: 6120
Base train x set: 3060
Meta train x set: 3060
Did not balance the dataset as base_bal = none
Base model: LinearDiscriminantAnalysis(solver='lsqr')


  y = column_or_1d(y, warn=True)


Base train x set: 3060
0.24901960784313726
Data is balanced 24.9 % pos) will not be applying sampling methods, moving on...
Now onto the automatic model fitting...
Meta model fitting commencing...
Model fitting of LogisticRegression(max_iter=1000)
Model fitting of LogisticRegression(max_iter=1000) resulted in an f-score of 0.5692575574592452
Model fitting of simp_avg_eq
k_fold: simp_avg in model name
Model fitting of simp_avg_eq resulted in an f-score of 0.5488572117246184
Model fitting of simp_avg_pr
k_fold: simp_avg in model name
Model fitting of simp_avg_pr resulted in an f-score of 0.5324515274845034
Model fitting of simp_avg_sec
k_fold: simp_avg in model name
Model fitting of simp_avg_sec resulted in an f-score of 0.5360000884419308
Now onto the balancing methods...
Finished model fitting, proceeding with LogisticRegression(max_iter=1000) and none with an roc_auc of 0.5692575574592452


  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, m

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

Finished generating metrics. Now plotting...
Sensitivity at 0.5: 0.0
Specificity at 0.5: 1.0
Precision at 0.5: 0.0
Recall at 0.5: 0.0
Maximised F-score of 0.4 at a threshold of 0.19 Recall: 0.95 Specificity: 0.08 Precision: 0.25 Sensitivity: 0.95


  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, m

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exp_scores['PRIMARY_ML_SCORE'] = base_model_fit.predict_proba(exp)[:, [1]]
  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exp_scores['META_ML_SCORE'] = meta_model_fitted.predict_proba(exp_scores)[:, [1]]


125892 Positives within dataset
FINISHED SIRT7!


<Figure size 800x600 with 0 Axes>

<Figure size 1200x1000 with 0 Axes>

<Figure size 1200x1000 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 1200x1000 with 0 Axes>

<Figure size 1200x1000 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>