In [1]:
import pandas as pd
import numpy as np

import sys
from os.path import expanduser
import os

sys.path.append('./ML_Components/')

from featgen import Generate_Features
from model_fitting_v2 import *

from sklearn.model_selection import train_test_split

In [2]:
## First - import data required

### IMPORT FEATURE DATA
ptp_x = pd.read_csv('./features/final_dataset/palma_2017_x_features.csv', index_col=0)

### IMPORT SIRT LABELS
ptp_info = pd.read_csv('./features/final_dataset/palma_2017_y_features.csv', index_col=0)
ptp_y = ptp_info[['PTP1B_Binary',
       'DEP1_Binary', 'TC-PTP_Binary', 'HD-PTP_Binary', 'LAR_Binary',
       'LyP_Binary', 'MEG-1_Binary', 'MEG-2_Binary', 'PTP-PEST_Binary',
       'PTPH1_Binary', 'rPTP-alpha_Binary', 'rPTP-beta_Binary', 'SAP-1_Binary',
       'SHP-1_Binary', 'SHP-2_Binary', 'SECONDARY_ML_SCORE', 'Gene_Name',
       'ACC_ID', 'SITE_LOC']]


# Be sure they're sorted the same, and account for the removal of peptides flagged as 
#  anything but "GOOD" from the Palma et. al., 2017 dataset (as suggested by the authors)
ptp_x = ptp_x.loc[ptp_y.index]
ptp_y = ptp_y.loc[ptp_x.index]

In [3]:
ptp_x['SECONDARY_ML_SCORE'] = ptp_y['SECONDARY_ML_SCORE']

In [4]:
### IMPORT EXPERIMENTAL DATA
# Experimental dataset scoring
exp = pd.read_csv('Full_Tyr_Proteome_Exp_Set/full_proteome_tyr_peps_13mer_features.csv', index_col=0)
exp_details = pd.read_csv('Full_Tyr_Proteome_Exp_Set/full_proteome_tyr_peps_13mer_details.csv', index_col=0)

In [5]:
exp['SECONDARY_ML_SCORE'] = exp_details['SECONDARY_ML_SCORE']

In [6]:
## IMPORT MASS SPEC VALIDATION DATA
ms = pd.read_csv('./PTP1B_Exp_Set/PTP1B_SHP1_SHP2_final_for_ML_features.csv')
ms_details = pd.read_csv('./PTP1B_Exp_Set/PTP1B_SHP1_SHP2_final_for_ML.csv', index_col = 0)

In [7]:
ms['uid_pos'] = ms['uid_pos'] + '_' + ms_details['Dataset']

In [8]:
ms_details['uid_pos'] = ms_details['uid_pos'] + '_' + ms_details['Dataset']

In [9]:
ms['SECONDARY_ML_SCORE'] = ms_details['SECONDARY_ML_SCORE']

In [10]:
# Need to remove any peptides within the training dataset from our 
#  mass spec validation data to get a true reading on performance by assessing
#  only unlabeled, unseen data
ms_ptp = pd.merge(ms_details, ptp_info, on='Peptide', how='left', indicator=True)
ms_in_ptp = ms_ptp[ms_ptp['_merge'] == 'both']
ms_not_in_ptp = ms_ptp.drop(ms_in_ptp.index)

In [11]:
ms = ms.drop(ms_in_ptp.index)
ms_details = ms_details.drop(ms_in_ptp.index)

In [12]:
ms_details = ms_details.set_index('uid_pos', drop=True)
ms = ms.set_index('uid_pos', drop=True)

In [13]:
def experimental_prediction(meta_model, base_model, base_sampling, feat_x, feat_y, exp):
    exp_scores = exp[['SECONDARY_ML_SCORE']]
    exp = exp.drop(columns=['SECONDARY_ML_SCORE'])
    
    if type(meta_model) is str:
        feat_x = feat_x.drop(columns=['SECONDARY_ML_SCORE'])
        
        # FIRST FIT + PREDICT WITH BASE MODEL
        if base_sampling != "none":
            feat_x_s, feat_y_s = base_sampling.fit_resample(feat_x, feat_y)
            base_model_fit = base_model.fit(feat_x_s, feat_y_s)
        else:
            base_model_fit = base_model.fit(feat_x, feat_y)
        
        exp_scores['PRIMARY_ML_SCORE'] = base_model_fit.predict_proba(exp)[:,[1]]
        
        # Fit base training data to base model
        if meta_model == 'simp_avg_eq':
            exp_scores['META_ML_SCORE'] = (exp_scores['PRIMARY_ML_SCORE'] + exp_scores['SECONDARY_ML_SCORE'])/2
        elif meta_model == 'simp_avg_pr':
            exp_scores['META_ML_SCORE'] = ((exp_scores['PRIMARY_ML_SCORE']*2) + exp_scores['SECONDARY_ML_SCORE'])/3
        else:
            exp_scores['META_ML_SCORE'] = (exp_scores['PRIMARY_ML_SCORE'] + (exp_scores['SECONDARY_ML_SCORE']*2))/3
    else:
        # FIRST FIT + PREDICT WITH BASE MODEL
        # Split features up
        base_x, meta_x, base_y, meta_y = train_test_split(feat_x, feat_y, test_size = 0.5, stratify=feat_y)
        
        base_x = base_x.drop(columns=['SECONDARY_ML_SCORE'])
        
        # Fit base training data to base model
        if base_sampling != "none":
            base_x_s, base_y_s = base_sampling.fit_resample(base_x, base_y)
            base_model_fit = base_model.fit(base_x_s, base_y_s)
        else:
            base_model_fit = base_model.fit(base_x, base_y)
        
        meta_train_x = meta_x[['SECONDARY_ML_SCORE']]
        meta_x = meta_x.drop(columns=['SECONDARY_ML_SCORE'])
        
        # Predict meta model training scores + exp scores via base model
        meta_train_x['PRIMARY_ML_SCORE'] = base_model_fit.predict_proba(meta_x)[:, [1]]
        exp_scores['PRIMARY_ML_SCORE'] = base_model_fit.predict_proba(exp)[:, [1]]
        
        # Fit meta training data to meta model
        meta_model_fitted = meta_model.fit(meta_train_x, meta_y)
        
        # Predict meta model score using primary and secondary scores
        exp_scores['META_ML_SCORE'] = meta_model_fitted.predict_proba(exp_scores)[:, [1]]
        
    return exp_scores      

In [14]:
ptps_torun = ['PTP1B_Binary', 'DEP1_Binary', 'TC-PTP_Binary', 'HD-PTP_Binary',
       'LAR_Binary', 'LyP_Binary', 'MEG-1_Binary', 'MEG-2_Binary',
       'PTP-PEST_Binary', 'PTPH1_Binary', 'rPTP-alpha_Binary',
       'rPTP-beta_Binary', 'SAP-1_Binary', 'SHP-1_Binary', 'SHP-2_Binary']

In [16]:
base_models = []
base_bals = []
meta_models = []
sirt_number = []
base_params = []
i = 0

while i < len(ptps_torun):
    # Isolate y data for running our ML fitting with
    name = ptps_torun[i]
    s_y = ptp_y[[name]]
    adj_name = name[:-7]
    
    # Output to user
    print('Now running', adj_name, 'with', str(sum(s_y[name])), 'positives -', str(round(sum(s_y[name])/len(s_y[name])*100, 2)), '% positive')
    
    # Format for input into ML models
    s_y = s_y.rename(columns={name:'EXPERIMENTALLY_ACTIVE'})
    
    savefile = './PTP_Files/'+adj_name+'/'+adj_name
    
    ### BASE MODEL FITTING ###
    # Initial base model fitting on testing data
    s_model, s_unfit, s_bal, s_metrics, s_train_x, s_train_y, s_params, s_scoring = model_fitting(ptp_x, s_y, True, None, None, (savefile+'_Base_Model_'))
    
    # Run base model fit on training data for over/underfitting analysis
    # Generate metrics
    s_metrics_tr, s_roc_tr, s_roc_auc_tr = model_metric_generation(s_train_x, s_train_y, s_train_x, s_train_y, s_model, s_bal)
    # Plot
    Plot(s_metrics_tr, (savefile+'_Training_Data_as_Test_Set_Base_Model_PR_'), s_roc_tr, s_roc_auc_tr, (savefile+'_Training_Data_as_Test_Set_Base_Model_ROC_'), (savefile+'_Training_Data_as_Test_Set_Base_Model_Metric_Curve_'))
    
    ### META MODEL FITTING ###
    # Run meta-model
    s_mmodel, s_unfit_m, s_mbal, s_mmetrics, s_base_train_x, s_base_train_y, s_meta_combo_train_x, s_meta_train_y, s_meta_test_x, s_meta_test_y = meta_model_fitting(s_unfit, s_bal, ptp_x, s_y, ptp_x, None, None, (savefile+'_Meta_Model_'), s_scoring)
    
    
    # Run meta model fit on training data
    # Generate metrics
    s_mmetrics_tr, s_mroc_tr, s_mroc_auc_tr = model_metric_generation(s_meta_combo_train_x, s_meta_train_y, s_meta_combo_train_x, s_meta_train_y, s_mmodel, s_mbal)
    # Plot
    Plot(s_mmetrics_tr, (savefile+'_Training_Data_as_Test_Set_Meta_Model_PR_'), s_mroc_tr, s_mroc_auc_tr, (savefile+'_Training_Data_as_Test_Set_Meta_Model_ROC_'), (savefile+'_Training_Data_as_Test_Set_Meta_Model_Metric_Curve_'))

    
    # Run fit assessment on secondary scores alone
    sec_score = ptp_x[['SECONDARY_ML_SCORE']]
    s_sec_roc_auc, s_sec_metrics, s_sec_roc = pre_scored_metric_generation(s_y, sec_score)
    # Plot
    s_sec_metrics_no_f = s_sec_metrics.drop(columns=['F-score'])
    Plot(s_sec_metrics_no_f, (savefile+'_MuSite_Deep_PR'), s_sec_roc, s_sec_roc_auc, (savefile+'_MuSite_Deep_ROC'), (savefile+'_MuSite_Deep_Metric_Curve'))
    
    ## Run experimental predictions
    exp_preds = experimental_prediction(s_mmodel, s_model, s_bal, ptp_x, s_y, exp)
    
    # Format experimental predictions df for output
    s_exp_lab = 'Base ' + adj_name + ' ML Model Score'
    m_exp_lab = adj_name +' Ensemble ML Score'
    
    new_exp_predicted = pd.DataFrame(exp_details)
    new_exp_predicted[s_exp_lab] = exp_preds['PRIMARY_ML_SCORE']
    new_exp_predicted['MuSite Deep Score'] = exp_preds['SECONDARY_ML_SCORE']
    new_exp_predicted[m_exp_lab] = exp_preds['META_ML_SCORE']
    
    # Declare file for later
    resfile = './PTP_Files/'+adj_name+'/'
    
    # Run MS experimental predictions on dataset from Ren et. al.
    if name in ['SHP-1_Binary', 'SHP-2_Binary', 'PTP1B_Binary']:
        # We have additional experimental sets to run on (mass-spec data)
        s_model = clone(s_model)
        if type(s_mmodel) != str:
            s_mmodel = clone(s_mmodel)
        s_bal = clone(s_bal)
        
        ms_preds = []
        ms_preds = experimental_prediction(s_mmodel, s_model, s_bal, ptp_x, s_y, ms)
        
        new_ms_predicted = ms_details[['Position', 'Uniprot ID', 'Dataset', 'Peptide', 'SECONDARY_ML_SCORE']]
        new_ms_predicted[s_exp_lab] = ms_preds['PRIMARY_ML_SCORE']
        new_ms_predicted['MuSite Deep Score'] = ms_preds['SECONDARY_ML_SCORE']
        new_ms_predicted[m_exp_lab] = ms_preds['META_ML_SCORE']
        
        # Dataframe of experimental predictions
        new_ms_predicted.to_csv((resfile+adj_name+'_ML_predictions_ms_set_substrates_PTP1B_SHP1_SHP2.csv'))
    
    
    # Cutoff for meta model positives
    f_max = s_mmetrics[s_mmetrics['F-score'] == s_mmetrics['F-score'].max()]
    threshold = f_max['Threshold'].max()
    t_idx = f_max['Threshold'].idxmax()
    m_bin_lab = adj_name + ' Deacetylation Prediction (0 = no, 1 = yes)'
    new_exp_predicted[m_bin_lab] = 0
    new_exp_predicted.loc[new_exp_predicted[m_exp_lab] >= threshold, m_bin_lab] = 1
    print(sum(new_exp_predicted[m_bin_lab]), 'Positives within dataset')
    
    ### NOW SAVE EVERYTHING TO OUTPUT
    df_file = resfile + 'dfs/' + adj_name
    # Dataframes of metrics
    s_metrics.to_csv((df_file + '_base_model_metrics.csv')) # base model metrics
    s_metrics_tr.to_csv((df_file + '_base_model_trainingset_as_testingset_metrics.csv')) # base model metrics with training data as test data
    s_mmetrics.to_csv((df_file + '_meta_model_metrics.csv')) # meta model metrics
    s_mmetrics_tr.to_csv((df_file + '_meta_model_trainingset_as_testingset_metrics.csv')) # meta model metrics with training data as test data
    s_sec_metrics_no_f.to_csv((df_file + '_secondary_score_musite_deep_metrics.csv')) # secondary score metrics
    
    # Dataframe of experimental predictions
    new_exp_predicted.to_csv((resfile+adj_name+'_ML_predictions_experimental_surface_exposed_arginine_set.csv'))
    
    # Text file with base model, meta model 
    metric_file_out = open((resfile + adj_name + '_ML_info.txt'), 'w')
    metric_file_out.write('Training Data: ' + str(sum(s_y['EXPERIMENTALLY_ACTIVE'])) + ' positives in ' +
                          str(len(s_y['EXPERIMENTALLY_ACTIVE'])) + ' negatives (' + 
                          str(round(sum(s_y['EXPERIMENTALLY_ACTIVE'])/len(s_y['EXPERIMENTALLY_ACTIVE'])*100, 2)) + '% positive)')
    metric_file_out.write('Base Model: ' + str(s_model) + ' with params: ' + str(s_params) +
                          'and balancing:' + str(s_bal) + '\n')
    metric_file_out.write('Meta Model: ' + str(s_mmodel) + '\n')
    metric_file_out.write('Meta Model Threshold of: ' + str(threshold) +
                          ', Precision: ' + str(round(s_mmetrics.loc[t_idx, 'Precision'], 2)) +
                          ', Recall: ' + str(round(s_mmetrics.loc[t_idx, 'Recall'], 2)) +
                          ', Sensitivity: ' + str(round(s_mmetrics.loc[t_idx, 'Sensitivity'], 2)) +
                          ', Specificity: ' + str(round(s_mmetrics.loc[t_idx, 'Specificity'], 2)) +
                          ', F-Score: '+ str(round(s_mmetrics.loc[t_idx, 'F-score'], 2)) + '\n'
                         )
    metric_file_out.write('Predicted positives: ' + str(sum(new_exp_predicted[m_bin_lab])) +
                         ' within a dataset of ' + str(len(new_exp_predicted[m_bin_lab])) + ' (' +
                         str(round((sum(new_exp_predicted[m_bin_lab])/len(new_exp_predicted[m_bin_lab]))*100, 4)) + 
                         '% positive)')
    metric_file_out.close()
    
    base_models.append(s_model)
    base_params.append(s_params)
    base_bals.append(s_bal)
    meta_models.append(s_mmodel)
    sirt_number.append(i)
    
    print('FINISHED ' + adj_name + '!')
    
    i+=1

Now running SHP-1 with 188 positives - 4.72 % positive
0.04718875502008032
Data is imbalanced ( 4.72 % pos) adjusting model metric to f-score to best assess fit...
0.04714086471408647
Data is imbalanced ( 4.71 % pos) applying various sampling methods to remedy the issue...
Now onto the automatic model fitting...
Model fitting of DummyClassifier(strategy='most_frequent')
Model fitting of DummyClassifier(strategy='most_frequent') resulted in an f-score of 0.0
Model fitting of LogisticRegression(max_iter=1000)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Model fitting of LogisticRegression(max_iter=1000) resulted in an f-score of 0.05267906261041271
Model fitting of LinearDiscriminantAnalysis()
Model fitting of LinearDiscriminantAnalysis() resulted in an f-score of 0.2111613145490723
Model fitting of DecisionTreeClassifier()
Model fitting of DecisionTreeClassifier() resulted in an f-score of 0.12168811196481383
Model fitting of KNeighborsClassifier()
Model fitting of KNeighborsClassifier() resulted in an f-score of 0.013859649122807016
Model fitting of SVC()
Model fitting of SVC() resulted in an f-score of 0.0
Model fitting of BaggingClassifier()
Model fitting of BaggingClassifier() resulted in an f-score of 0.023561681982734613
Model fitting of RandomForestClassifier()
Model fitting of RandomForestClassifier() resulted in an f-score of 0.0
Model fitting of ExtraTreesClassifier()
Model fitting of ExtraTreesClassifier() resulted in an f-score of 0.0
Model fitting of GradientBoostingClassifier()
Model fitting of GradientBoostingClassifie



Basic balancing testing of KMeansSMOTE() failed, trying other methods...
Balancing testing of KMeansSMOTE() resulted in an f-score of 0
Basic balancing testing of ADASYN()
Balancing testing of ADASYN() resulted in an f-score of 0.0
Basic balancing testing of RandomUnderSampler()
Balancing testing of RandomUnderSampler() resulted in an f-score of 0.11742887663814343
Basic balancing testing of TomekLinks()
Balancing testing of TomekLinks() resulted in an f-score of 0.21316656241762638
Basic balancing testing of EditedNearestNeighbours()
Balancing testing of EditedNearestNeighbours() resulted in an f-score of 0.2501280429535639
Basic balancing testing of NeighbourhoodCleaningRule()
Balancing testing of NeighbourhoodCleaningRule() resulted in an f-score of 0.24602694820406826
Basic balancing testing of OneSidedSelection()
Balancing testing of OneSidedSelection() resulted in an f-score of 0.21327274922756426
Basic balancing testing of SMOTEENN()
Balancing testing of SMOTEENN() resulted in a

5 fits failed out of a total of 15.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/nashiragrigg/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/nashiragrigg/anaconda3/lib/python3.11/site-packages/sklearn/discriminant_analysis.py", line 631, in fit
    self._solve_eigen(
  File "/Users/nashiragrigg/anaconda3/lib/python3.11/site-packages/sklearn/discriminant_analysis.py", line 463, in _solve_eigen
    evals, evecs = linalg.eigh(Sb, Sw)
                   ^^^^^^^^^^^^^^^^^^^
  File "/Users/nashiragrigg/anaconda3/lib/python3.11/s

Best hyperparameters {'solver': 'svd'}


  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  ax = sns.lineplot(x='Threshold', y='value', hue='variable', data=pd.melt(pr_metrics, 'Threshold'), ci=None)


Finished generating metrics. Now plotting...
Running permutation importance...


  y = column_or_1d(y, warn=True)


Finished permutation importance... Now graphing...
           Mean Feature Importance  Standard Deviation
Gs(U)_NO                  0.000000            0.000000
Mw_NO                    -0.028653            0.054501
HP_NO                     0.000000            0.000000
IP_NO                    -0.002279            0.002791
ECI_NO                    0.000000            0.000000
...                            ...                 ...
162_maccs                 0.000000            0.000000
163_maccs                 0.000000            0.000000
164_maccs                 0.000000            0.000000
165_maccs                 0.000000            0.000000
166_maccs                 0.000000            0.000000

[443 rows x 2 columns]
              Mean Feature Importance  Standard Deviation  Abs Importance
ONE-HOT_5-K                 -0.066138            0.000000        0.066138
138_maccs                   -0.066138            0.000000        0.066138
ONE-HOT_11-W                 0.055749      

  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  ax = sns.lineplot(x='Threshold', y='value', hue='variable', data=pd.melt(pr_metrics, 'Threshold'), ci=None)


Initial length of features - x: 3984 , y: 3984
Test set contains x: 399 , y: 399
After removing the test set, remaining features are - x: 3585 , y: 3585
Base train x set: 1792
Meta train x set: 1793
Balancing the dataset via EditedNearestNeighbours()
Base model: LinearDiscriminantAnalysis()


  y = column_or_1d(y, warn=True)


Base train x set: 1603
0.047406581148912434
Data is imbalanced ( 4.74 % pos) applying various sampling methods to remedy the issue...
Now onto the automatic model fitting...
Meta model fitting commencing...
Model fitting of LogisticRegression(max_iter=1000)
Model fitting of LogisticRegression(max_iter=1000) resulted in an f-score of 0.0
Model fitting of simp_avg_eq
k_fold: simp_avg in model name
Model fitting of simp_avg_eq resulted in an f-score of 0.15067173653086963
Model fitting of simp_avg_pr
k_fold: simp_avg in model name
Model fitting of simp_avg_pr resulted in an f-score of 0.14176192400650606
Model fitting of simp_avg_sec
k_fold: simp_avg in model name
Model fitting of simp_avg_sec resulted in an f-score of 0.08698885699415745
Now onto the balancing methods...
Finished model fitting, proceeding with simp_avg_eq and none with an f1 of 0.15067173653086963
Simple average equal selected...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  ax = sns.lineplot(x='Threshold', y='value', hue='variable', data=pd.melt(pr_metrics, 'Threshold'), ci=None)


Finished generating metrics. Now plotting...
Sensitivity at 0.5: 0.11
Specificity at 0.5: 0.96
Precision at 0.5: 0.12
Recall at 0.5: 0.11
Maximised F-score of 0.12 at a threshold of 0.46 Recall: 0.21 Specificity: 0.88 Precision: 0.08 Sensitivity: 0.21
Simple average equal selected...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  ax = sns.lineplot(x='Threshold', y='value', hue='variable', data=pd.melt(pr_metrics, 'Threshold'), ci=None)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  ax = sns.lineplot(x='Threshold', y='value', hue='variable', data=pd.melt(pr_metrics, 'Threshold'), ci=None)
  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exp_scores['PRIMARY_ML_SCORE'] = base_model_fit.predict_proba(exp)[:,[1]]
A value is trying to be set on a c

4598 Positives within dataset
FINISHED SHP-1!
Now running SHP-2 with 148 positives - 3.71 % positive
0.03714859437751004
Data is imbalanced ( 3.71 % pos) adjusting model metric to f-score to best assess fit...
0.03709902370990237
Data is imbalanced ( 3.71 % pos) applying various sampling methods to remedy the issue...
Now onto the automatic model fitting...
Model fitting of DummyClassifier(strategy='most_frequent')
Model fitting of DummyClassifier(strategy='most_frequent') resulted in an f-score of 0.0
Model fitting of LogisticRegression(max_iter=1000)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Model fitting of LogisticRegression(max_iter=1000) resulted in an f-score of 0.026818394024276382
Model fitting of LinearDiscriminantAnalysis()
Model fitting of LinearDiscriminantAnalysis() resulted in an f-score of 0.14019253421730202
Model fitting of DecisionTreeClassifier()
Model fitting of DecisionTreeClassifier() resulted in an f-score of 0.07019578252064831
Model fitting of KNeighborsClassifier()
Model fitting of KNeighborsClassifier() resulted in an f-score of 0.0
Model fitting of SVC()
Model fitting of SVC() resulted in an f-score of 0.0
Model fitting of BaggingClassifier()
Model fitting of BaggingClassifier() resulted in an f-score of 0.0
Model fitting of RandomForestClassifier()
Model fitting of RandomForestClassifier() resulted in an f-score of 0.0
Model fitting of ExtraTreesClassifier()
Model fitting of ExtraTreesClassifier() resulted in an f-score of 0.0
Model fitting of GradientBoostingClassifier()
Model fitting of GradientBoostingClassifier() resulted in an f-score of 0.



Basic balancing testing of KMeansSMOTE() failed, trying other methods...
Balancing testing of KMeansSMOTE() resulted in an f-score of 0
Basic balancing testing of ADASYN()
Balancing testing of ADASYN() resulted in an f-score of 0.0
Basic balancing testing of RandomUnderSampler()
Balancing testing of RandomUnderSampler() resulted in an f-score of 0.07539191187325386
Basic balancing testing of TomekLinks()
Balancing testing of TomekLinks() resulted in an f-score of 0.14296945896219013
Basic balancing testing of EditedNearestNeighbours()
Balancing testing of EditedNearestNeighbours() resulted in an f-score of 0.1513000203432967
Basic balancing testing of NeighbourhoodCleaningRule()
Balancing testing of NeighbourhoodCleaningRule() resulted in an f-score of 0.14121075490270535
Basic balancing testing of OneSidedSelection()
Balancing testing of OneSidedSelection() resulted in an f-score of 0.14326840199131222
Basic balancing testing of SMOTEENN()
Balancing testing of SMOTEENN() resulted in a

5 fits failed out of a total of 15.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/nashiragrigg/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/nashiragrigg/anaconda3/lib/python3.11/site-packages/sklearn/discriminant_analysis.py", line 631, in fit
    self._solve_eigen(
  File "/Users/nashiragrigg/anaconda3/lib/python3.11/site-packages/sklearn/discriminant_analysis.py", line 463, in _solve_eigen
    evals, evecs = linalg.eigh(Sb, Sw)
                   ^^^^^^^^^^^^^^^^^^^
  File "/Users/nashiragrigg/anaconda3/lib/python3.11/s

Best hyperparameters {'solver': 'svd'}


  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  ax = sns.lineplot(x='Threshold', y='value', hue='variable', data=pd.melt(pr_metrics, 'Threshold'), ci=None)


Finished generating metrics. Now plotting...
Running permutation importance...


  y = column_or_1d(y, warn=True)


Finished permutation importance... Now graphing...
           Mean Feature Importance  Standard Deviation
Gs(U)_NO                 -0.000396            0.000792
Mw_NO                     0.023553            0.002891
HP_NO                    -0.000008            0.001240
IP_NO                     0.008955            0.010779
ECI_NO                    0.016536            0.002440
...                            ...                 ...
162_maccs                 0.000000            0.000000
163_maccs                 0.000000            0.000000
164_maccs                 0.000000            0.000000
165_maccs                 0.000000            0.000000
166_maccs                 0.000000            0.000000

[443 rows x 2 columns]
              Mean Feature Importance  Standard Deviation  Abs Importance
Mw_NO                        0.023553            0.002891        0.023553
ONE-HOT_5-Y                  0.019795            0.001116        0.019795
ONE-HOT_9-D                  0.018733      

  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  ax = sns.lineplot(x='Threshold', y='value', hue='variable', data=pd.melt(pr_metrics, 'Threshold'), ci=None)


Initial length of features - x: 3984 , y: 3984
Test set contains x: 399 , y: 399
After removing the test set, remaining features are - x: 3585 , y: 3585
Base train x set: 1792
Meta train x set: 1793
Balancing the dataset via RandomOverSampler()
Base model: LinearDiscriminantAnalysis()


  y = column_or_1d(y, warn=True)


Base train x set: 3452
0.0373675404350251
Data is imbalanced ( 3.74 % pos) applying various sampling methods to remedy the issue...
Now onto the automatic model fitting...
Meta model fitting commencing...
Model fitting of LogisticRegression(max_iter=1000)
Model fitting of LogisticRegression(max_iter=1000) resulted in an f-score of 0.0
Model fitting of simp_avg_eq
k_fold: simp_avg in model name
Model fitting of simp_avg_eq resulted in an f-score of 0.12587912494720846
Model fitting of simp_avg_pr
k_fold: simp_avg in model name
Model fitting of simp_avg_pr resulted in an f-score of 0.1368142857073081
Model fitting of simp_avg_sec
k_fold: simp_avg in model name
Model fitting of simp_avg_sec resulted in an f-score of 0.12021426646664007
Now onto the balancing methods...
Finished model fitting, proceeding with simp_avg_pr and none with an f1 of 0.1368142857073081
Simple average primary weighted selected...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  ax = sns.lineplot(x='Threshold', y='value', hue='variable', data=pd.melt(pr_metrics, 'Threshold'), ci=None)


Finished generating metrics. Now plotting...
Sensitivity at 0.5: 0.47
Specificity at 0.5: 0.84
Precision at 0.5: 0.1
Recall at 0.5: 0.47
Maximised F-score of 0.31 at a threshold of 0.89 Recall: 0.27 Specificity: 0.98 Precision: 0.36 Sensitivity: 0.27
Simple average primary weighted selected...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  ax = sns.lineplot(x='Threshold', y='value', hue='variable', data=pd.melt(pr_metrics, 'Threshold'), ci=None)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))

The `ci` parameter is deprecated. Use `errorbar=None` for the same effect.

  ax = sns.lineplot(x='Threshold', y='value', hue='variable', data=pd.melt(pr_metrics, 'Threshold'), ci=None)
  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exp_scores['PRIMARY_ML_SCORE'] = base_model_fit.predict_proba(exp)[:,[1]]
A value is trying to be set on a c

583 Positives within dataset
FINISHED SHP-2!


<Figure size 800x600 with 0 Axes>

<Figure size 1200x1000 with 0 Axes>

<Figure size 1200x1000 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 1200x1000 with 0 Axes>

<Figure size 1200x1000 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>

<Figure size 800x600 with 0 Axes>