### Running RFE on RF (for source types that met BA threshold)

In [1]:
#Refer to 231205-NTA-Paper2-Classifiers for previous code that includes tuning hyperparameters + performance evaluation
#Importing libraries to open data and run RFE + model
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, balanced_accuracy_score
from sklearn.model_selection import RandomizedSearchCV
import pprint
from sklearn.metrics import confusion_matrix
from matplotlib import pyplot
from numpy import mean
from numpy import std
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_score

# RF met the balanced-accuracy (BA) threshold for AFFF-GW, LL, WWTP, and PP; the RFE results for those sources follow.

### For AFFF-GW

In [42]:
# Load the log-transformed feature table (original note: 92 samples).
# The label-prep cell below reads a 'Type' column and drops column 0.
# NOTE(review): the file name and the RFE ranks further down suggest 30 features,
# not the 581 quoted in the earlier comment - confirm.
data_rf = pd.read_csv(
    r'240617-NTA-AVG-EUC-30-CLOSEST-FEATURE-logT.csv',
    header=0,
)

In [43]:
# Prompt for the source type to classify one-vs-rest. The entry must match a
# value of the 'Type' column exactly; surrounding whitespace is stripped.
preferred_type = input("Enter the source type of interest: ").strip()

# Fail fast on a typo or empty entry: an unmatched label would silently give an
# all-zero target in the label-prep cell. The check is skipped when 'Type' has
# already been dropped (this cell re-run after label prep).
if 'Type' in data_rf.columns and preferred_type not in set(data_rf['Type']):
    raise ValueError(
        "Unknown source type %r; available types: %s"
        % (preferred_type, list(data_rf['Type'].unique()))
    )

Enter the source type of interest: AFFF-GW


In [44]:
#Manipulating data frame based on user input to make "Type" column read 1 for all samples of source of interest and 0 for all other samples
#Set up for binary classification (one-vs-all format)

# Define a function to apply to each row
def set_type(row):
    """Return 1 when the row's 'Type' equals the chosen source type, else 0.

    Reads the module-level ``preferred_type`` captured from the user prompt.
    """
    return 1 if row['Type'] == preferred_type else 0

# Build the binary one-vs-rest target "Type_2" and move it to the first column.
if 'Type' not in data_rf.columns:
    # Happens when this cell is re-run without reloading the CSV first.
    raise KeyError("'Type' column not found - reload data_rf before re-running this cell")

# New column order: Type_2 first, then everything except the original first
# column (the 'Type' labels), which is dropped.
cols = ['Type_2'] + list(data_rf.columns[1:])

# Vectorised equivalent of applying set_type row by row: 1 for the chosen
# source, 0 otherwise. assign() returns a new frame instead of mutating in place.
data_rf = data_rf.assign(Type_2=(data_rf['Type'] == preferred_type).astype(int))[cols]

# Save the relabelled table for inspection.
# NOTE(review): every source-type section writes this same file name, so each
# run overwrites the previous one.
data_rf.to_csv('sample_data_with_labels_NEW10.csv', index=False)

In [45]:
# Convert the frame to a NumPy array and split it into target and features.
data_rf_np = data_rf.to_numpy()
# Slicing with :1 keeps the target as an (n_samples, 1) column - the same shape
# the reshape(-1, 1) produced; the remaining columns are the features.
target_1, data_1 = data_rf_np[:, :1], data_rf_np[:, 1:]

print(data_1)

[[2.54570928 2.54570928 4.11539313 ... 2.54570928 2.54570928 2.54570928]
 [2.54570928 4.54451372 4.87498812 ... 2.54570928 2.54570928 2.54570928]
 [2.54570928 3.22038648 4.8459347  ... 2.54570928 2.54570928 2.54570928]
 ...
 [2.54570928 2.54570928 2.54570928 ... 3.02475995 2.54570928 2.54570928]
 [2.54570928 2.54570928 2.54570928 ... 3.2965849  2.54570928 2.93484187]
 [2.54570928 2.54570928 3.58667926 ... 3.33173239 2.54570928 2.54570928]]


In [46]:
#class_names=np.array([0.0,1.0])
# Coarse sweep: one pipeline per candidate number of retained features.
def get_models(feature_counts=range(30, 0, -10), n_estimators=100, max_features='sqrt', random_state=None):
    """Build an RFE -> random-forest pipeline for each candidate feature count.

    Args:
        feature_counts: iterable of n_features_to_select values. The default
            tries 30, 20 and 10 (the earlier "610 features, step 2" comment
            did not match the code).
        n_estimators, max_features: forest settings shared by the RFE
            estimator and the final classifier.
        random_state: optional seed passed to both forests; None keeps the
            original unseeded behaviour.

    Returns:
        dict mapping str(count) to a Pipeline with steps 's' (RFE) and 'm' (classifier).
    """
    models = dict()
    for n_keep in feature_counts:
        selector = RFE(
            estimator=RandomForestClassifier(n_estimators=n_estimators, max_features=max_features, random_state=random_state),
            n_features_to_select=n_keep,
        )
        classifier = RandomForestClassifier(n_estimators=n_estimators, max_features=max_features, random_state=random_state)
        models[str(n_keep)] = Pipeline(steps=[('s', selector), ('m', classifier)])
    return models

# Evaluate model
def evaluate_model(model, X, y):
    """Cross-validate ``model`` and return the per-fold balanced accuracies.

    Uses the X and y that are passed in; the previous version ignored them and
    read the globals data_1/target_1. y is flattened to 1-D because the
    notebook stores the target as an (n, 1) column, which avoids scikit-learn's
    column-vector conversion warning.

    Returns:
        Array of 30 scores (10 stratified folds x 3 repeats, splits seeded with random_state=1).
    """
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(model, X, np.ravel(y), scoring='balanced_accuracy', cv=cv, n_jobs=-1, error_score='raise')


# Build the candidate pipelines, then cross-validate each one.
models = get_models()

# results holds the per-fold score arrays; names holds the matching feature counts.
results, names = [], []
for name, model in models.items():
    scores = evaluate_model(model, data_1, target_1)
    names.append(name)
    results.append(scores)
    print('>%s Features: %d, Balanced Accuracy: %.3f (%.3f)' % (name, int(name), np.mean(scores), np.std(scores)))

>30 Features: 30, Balanced Accuracy: 0.812 (0.223)
>20 Features: 20, Balanced Accuracy: 0.802 (0.225)
>10 Features: 10, Balanced Accuracy: 0.817 (0.229)


In [47]:
# This source met the BA threshold, so sweep every feature count (step 1, not
# "large steps") to estimate how many features to retain.
def get_models(feature_counts=range(1, 31), n_estimators=100, max_features='sqrt', random_state=None):
    """Build an RFE -> random-forest pipeline for each candidate feature count.

    Args:
        feature_counts: iterable of n_features_to_select values; the default
            covers 1 through 30.
        n_estimators, max_features: forest settings shared by the RFE
            estimator and the final classifier.
        random_state: optional seed passed to both forests; None keeps the
            original unseeded behaviour.

    Returns:
        dict mapping str(count) to a Pipeline with steps 's' (RFE) and 'm' (classifier).
    """
    models = dict()
    for n_keep in feature_counts:
        selector = RFE(
            estimator=RandomForestClassifier(n_estimators=n_estimators, max_features=max_features, random_state=random_state),
            n_features_to_select=n_keep,
        )
        classifier = RandomForestClassifier(n_estimators=n_estimators, max_features=max_features, random_state=random_state)
        models[str(n_keep)] = Pipeline(steps=[('s', selector), ('m', classifier)])
    return models

# Evaluate model
def evaluate_model(model, X, y):
    """Cross-validate ``model`` and return the per-fold balanced accuracies.

    Uses the X and y that are passed in; the previous version ignored them and
    read the globals data_1/target_1. y is flattened to 1-D because the
    notebook stores the target as an (n, 1) column, which avoids scikit-learn's
    column-vector conversion warning.

    Returns:
        Array of 30 scores (10 stratified folds x 3 repeats, splits seeded with random_state=1).
    """
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(model, X, np.ravel(y), scoring='balanced_accuracy', cv=cv, n_jobs=-1, error_score='raise')

# Build the candidate pipelines, then cross-validate each one.
models = get_models()

# results holds the per-fold score arrays; names holds the matching feature counts.
results, names = [], []
for name, model in models.items():
    scores = evaluate_model(model, data_1, target_1)
    names.append(name)
    results.append(scores)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>1 0.693 (0.227)
>2 0.709 (0.228)
>3 0.726 (0.222)
>4 0.789 (0.208)
>5 0.810 (0.232)
>6 0.783 (0.247)
>7 0.804 (0.224)
>8 0.827 (0.219)
>9 0.791 (0.222)
>10 0.798 (0.222)
>11 0.817 (0.219)
>12 0.804 (0.234)
>13 0.825 (0.222)
>14 0.827 (0.227)
>15 0.810 (0.223)
>16 0.827 (0.208)
>17 0.827 (0.223)
>18 0.823 (0.214)
>19 0.819 (0.226)
>20 0.815 (0.219)
>21 0.827 (0.218)
>22 0.787 (0.226)
>23 0.839 (0.209)
>24 0.802 (0.221)
>25 0.817 (0.219)
>26 0.808 (0.218)
>27 0.825 (0.226)
>28 0.825 (0.207)
>29 0.810 (0.214)
>30 0.815 (0.219)


In [48]:
# Fit RFE on the full data set to see which features survive at the chosen count.
# The forest is unseeded, so the surviving set can differ between runs.
rfe = RFE(estimator=RandomForestClassifier(n_estimators=100, max_features='sqrt'), n_features_to_select=2)
rfe.fit(data_1, target_1.ravel())
for i in range(data_1.shape[1]):
    print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))

# Feature names come from a separate labels file and are matched to the RFE
# columns by position only.
labels_dat = pd.read_csv("240128-NTA-Normalized-Labels-Feature.csv")
feature_names = labels_dat.columns
if len(feature_names) != data_1.shape[1]:
    # A positional lookup would return the wrong names (or raise IndexError), so
    # fall back to the header of the frame data_1 was sliced from: column 0 is
    # Type_2 and the remaining columns are the features in order.
    print('WARNING: labels file has %d columns but data_1 has %d features; using data_rf column names instead'
          % (len(feature_names), data_1.shape[1]))
    feature_names = data_rf.columns[1:]

# Positions of the features RFE kept (support_ is True for them).
selected_feature_indices = np.flatnonzero(rfe.support_).tolist()

for index in selected_feature_indices:
    print('Feature Name: %s, Index: %d, Rank: %.3f' % (feature_names[index], index, rfe.ranking_[index]))

Column: 0, Selected False, Rank: 4.000
Column: 1, Selected True, Rank: 1.000
Column: 2, Selected True, Rank: 1.000
Column: 3, Selected False, Rank: 11.000
Column: 4, Selected False, Rank: 16.000
Column: 5, Selected False, Rank: 23.000
Column: 6, Selected False, Rank: 13.000
Column: 7, Selected False, Rank: 2.000
Column: 8, Selected False, Rank: 8.000
Column: 9, Selected False, Rank: 15.000
Column: 10, Selected False, Rank: 5.000
Column: 11, Selected False, Rank: 25.000
Column: 12, Selected False, Rank: 7.000
Column: 13, Selected False, Rank: 14.000
Column: 14, Selected False, Rank: 21.000
Column: 15, Selected False, Rank: 6.000
Column: 16, Selected False, Rank: 9.000
Column: 17, Selected False, Rank: 18.000
Column: 18, Selected False, Rank: 28.000
Column: 19, Selected False, Rank: 27.000
Column: 20, Selected False, Rank: 29.000
Column: 21, Selected False, Rank: 12.000
Column: 22, Selected False, Rank: 3.000
Column: 23, Selected False, Rank: 20.000
Column: 24, Selected False, Rank: 22.0

In [49]:
# One-column table with the names of the features kept by the last RFE fit.
selected_features_df = pd.Series(
    [feature_names[idx] for idx in selected_feature_indices],
    name='Feature Name',
).to_frame()

# Write the table to an Excel workbook without the row index.
selected_features_df.to_excel("RFE_selected_features_norm_GW_RF.xlsx", index=False)

In [50]:
# Fit RFE on the full data set to see which features survive at the chosen count.
# The forest is unseeded, so the surviving set can differ between runs.
rfe = RFE(estimator=RandomForestClassifier(n_estimators=100, max_features='sqrt'), n_features_to_select=5)
rfe.fit(data_1, target_1.ravel())
for i in range(data_1.shape[1]):
    print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))

# Feature names come from a separate labels file and are matched to the RFE
# columns by position only.
labels_dat = pd.read_csv("240128-NTA-Normalized-Labels-Feature.csv")
feature_names = labels_dat.columns
if len(feature_names) != data_1.shape[1]:
    # A positional lookup would return the wrong names (or raise IndexError), so
    # fall back to the header of the frame data_1 was sliced from: column 0 is
    # Type_2 and the remaining columns are the features in order.
    print('WARNING: labels file has %d columns but data_1 has %d features; using data_rf column names instead'
          % (len(feature_names), data_1.shape[1]))
    feature_names = data_rf.columns[1:]

# Positions of the features RFE kept (support_ is True for them).
selected_feature_indices = np.flatnonzero(rfe.support_).tolist()

for index in selected_feature_indices:
    print('Feature Name: %s, Index: %d, Rank: %.3f' % (feature_names[index], index, rfe.ranking_[index]))

Column: 0, Selected False, Rank: 2.000
Column: 1, Selected True, Rank: 1.000
Column: 2, Selected True, Rank: 1.000
Column: 3, Selected False, Rank: 7.000
Column: 4, Selected False, Rank: 12.000
Column: 5, Selected False, Rank: 15.000
Column: 6, Selected False, Rank: 13.000
Column: 7, Selected True, Rank: 1.000
Column: 8, Selected False, Rank: 3.000
Column: 9, Selected False, Rank: 9.000
Column: 10, Selected True, Rank: 1.000
Column: 11, Selected False, Rank: 24.000
Column: 12, Selected False, Rank: 5.000
Column: 13, Selected False, Rank: 11.000
Column: 14, Selected False, Rank: 17.000
Column: 15, Selected False, Rank: 6.000
Column: 16, Selected False, Rank: 8.000
Column: 17, Selected False, Rank: 20.000
Column: 18, Selected False, Rank: 25.000
Column: 19, Selected False, Rank: 26.000
Column: 20, Selected False, Rank: 22.000
Column: 21, Selected False, Rank: 10.000
Column: 22, Selected True, Rank: 1.000
Column: 23, Selected False, Rank: 18.000
Column: 24, Selected False, Rank: 14.000
Co

### LL

In [2]:
# Reload the log-transformed feature table for this source type (original note: 92 samples).
# The label-prep cell below reads a 'Type' column and drops column 0.
# NOTE(review): the file name and the RFE ranks suggest 30 features, not the
# 581 quoted in the earlier comment - confirm.
data_rf = pd.read_csv(
    r'240617-NTA-AVG-EUC-30-CLOSEST-FEATURE-logT.csv',
    header=0,
)

In [3]:
# Prompt for the source type to classify one-vs-rest. The entry must match a
# value of the 'Type' column exactly; surrounding whitespace is stripped.
preferred_type = input("Enter the source type of interest: ").strip()

# Fail fast on a typo or empty entry: an unmatched label would silently give an
# all-zero target in the label-prep cell. The check is skipped when 'Type' has
# already been dropped (this cell re-run after label prep).
if 'Type' in data_rf.columns and preferred_type not in set(data_rf['Type']):
    raise ValueError(
        "Unknown source type %r; available types: %s"
        % (preferred_type, list(data_rf['Type'].unique()))
    )

Enter the source type of interest: LL


In [4]:
#Manipulating data frame based on user input to make "Type" column read 1 for all samples of source of interest and 0 for all other samples
#Set up for binary classification (one-vs-all format)

# Define a function to apply to each row
def set_type(row):
    """Return 1 when the row's 'Type' equals the chosen source type, else 0.

    Reads the module-level ``preferred_type`` captured from the user prompt.
    """
    return 1 if row['Type'] == preferred_type else 0

# Build the binary one-vs-rest target "Type_2" and move it to the first column.
if 'Type' not in data_rf.columns:
    # Happens when this cell is re-run without reloading the CSV first.
    raise KeyError("'Type' column not found - reload data_rf before re-running this cell")

# New column order: Type_2 first, then everything except the original first
# column (the 'Type' labels), which is dropped.
cols = ['Type_2'] + list(data_rf.columns[1:])

# Vectorised equivalent of applying set_type row by row: 1 for the chosen
# source, 0 otherwise. assign() returns a new frame instead of mutating in place.
data_rf = data_rf.assign(Type_2=(data_rf['Type'] == preferred_type).astype(int))[cols]

# Save the relabelled table for inspection.
# NOTE(review): every source-type section writes this same file name, so each
# run overwrites the previous one.
data_rf.to_csv('sample_data_with_labels_NEW10.csv', index=False)

In [5]:
# Convert the frame to a NumPy array and split it into target and features.
data_rf_np = data_rf.to_numpy()
# Slicing with :1 keeps the target as an (n_samples, 1) column - the same shape
# the reshape(-1, 1) produced; the remaining columns are the features.
target_1, data_1 = data_rf_np[:, :1], data_rf_np[:, 1:]

print(data_1)

[[2.54570928 2.54570928 4.11539313 ... 2.54570928 2.54570928 2.54570928]
 [2.54570928 4.54451372 4.87498812 ... 2.54570928 2.54570928 2.54570928]
 [2.54570928 3.22038648 4.8459347  ... 2.54570928 2.54570928 2.54570928]
 ...
 [2.54570928 2.54570928 2.54570928 ... 3.02475995 2.54570928 2.54570928]
 [2.54570928 2.54570928 2.54570928 ... 3.2965849  2.54570928 2.93484187]
 [2.54570928 2.54570928 3.58667926 ... 3.33173239 2.54570928 2.54570928]]


In [6]:
#class_names=np.array([0.0,1.0])
# Coarse sweep: one pipeline per candidate number of retained features.
def get_models(feature_counts=range(30, 0, -10), n_estimators=100, max_features='sqrt', random_state=None):
    """Build an RFE -> random-forest pipeline for each candidate feature count.

    Args:
        feature_counts: iterable of n_features_to_select values. The default
            tries 30, 20 and 10 (the earlier "610 features, step 2" comment
            did not match the code).
        n_estimators, max_features: forest settings shared by the RFE
            estimator and the final classifier.
        random_state: optional seed passed to both forests; None keeps the
            original unseeded behaviour.

    Returns:
        dict mapping str(count) to a Pipeline with steps 's' (RFE) and 'm' (classifier).
    """
    models = dict()
    for n_keep in feature_counts:
        selector = RFE(
            estimator=RandomForestClassifier(n_estimators=n_estimators, max_features=max_features, random_state=random_state),
            n_features_to_select=n_keep,
        )
        classifier = RandomForestClassifier(n_estimators=n_estimators, max_features=max_features, random_state=random_state)
        models[str(n_keep)] = Pipeline(steps=[('s', selector), ('m', classifier)])
    return models

# Evaluate model
def evaluate_model(model, X, y):
    """Cross-validate ``model`` and return the per-fold balanced accuracies.

    Uses the X and y that are passed in; the previous version ignored them and
    read the globals data_1/target_1. y is flattened to 1-D because the
    notebook stores the target as an (n, 1) column, which avoids scikit-learn's
    column-vector conversion warning.

    Returns:
        Array of 30 scores (10 stratified folds x 3 repeats, splits seeded with random_state=1).
    """
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(model, X, np.ravel(y), scoring='balanced_accuracy', cv=cv, n_jobs=-1, error_score='raise')


# Build the candidate pipelines, then cross-validate each one.
models = get_models()

# results holds the per-fold score arrays; names holds the matching feature counts.
results, names = [], []
for name, model in models.items():
    scores = evaluate_model(model, data_1, target_1)
    names.append(name)
    results.append(scores)
    print('>%s Features: %d, Balanced Accuracy: %.3f (%.3f)' % (name, int(name), np.mean(scores), np.std(scores)))

>30 Features: 30, Balanced Accuracy: 0.925 (0.131)
>20 Features: 20, Balanced Accuracy: 0.925 (0.115)
>10 Features: 10, Balanced Accuracy: 0.925 (0.131)


In [7]:
# This source met the BA threshold, so sweep every feature count (step 1, not
# "large steps") to estimate how many features to retain.
def get_models(feature_counts=range(1, 31), n_estimators=100, max_features='sqrt', random_state=None):
    """Build an RFE -> random-forest pipeline for each candidate feature count.

    Args:
        feature_counts: iterable of n_features_to_select values; the default
            covers 1 through 30.
        n_estimators, max_features: forest settings shared by the RFE
            estimator and the final classifier.
        random_state: optional seed passed to both forests; None keeps the
            original unseeded behaviour.

    Returns:
        dict mapping str(count) to a Pipeline with steps 's' (RFE) and 'm' (classifier).
    """
    models = dict()
    for n_keep in feature_counts:
        selector = RFE(
            estimator=RandomForestClassifier(n_estimators=n_estimators, max_features=max_features, random_state=random_state),
            n_features_to_select=n_keep,
        )
        classifier = RandomForestClassifier(n_estimators=n_estimators, max_features=max_features, random_state=random_state)
        models[str(n_keep)] = Pipeline(steps=[('s', selector), ('m', classifier)])
    return models

# Evaluate model
def evaluate_model(model, X, y):
    """Cross-validate ``model`` and return the per-fold balanced accuracies.

    Uses the X and y that are passed in; the previous version ignored them and
    read the globals data_1/target_1. y is flattened to 1-D because the
    notebook stores the target as an (n, 1) column, which avoids scikit-learn's
    column-vector conversion warning.

    Returns:
        Array of 30 scores (10 stratified folds x 3 repeats, splits seeded with random_state=1).
    """
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(model, X, np.ravel(y), scoring='balanced_accuracy', cv=cv, n_jobs=-1, error_score='raise')

# Build the candidate pipelines, then cross-validate each one.
models = get_models()

# results holds the per-fold score arrays; names holds the matching feature counts.
results, names = [], []
for name, model in models.items():
    scores = evaluate_model(model, data_1, target_1)
    names.append(name)
    results.append(scores)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>1 0.832 (0.161)
>2 0.829 (0.151)
>3 0.863 (0.142)
>4 0.924 (0.109)
>5 0.907 (0.134)
>6 0.920 (0.131)
>7 0.912 (0.133)
>8 0.923 (0.131)
>9 0.925 (0.131)
>10 0.925 (0.131)
>11 0.917 (0.134)
>12 0.925 (0.131)
>13 0.920 (0.113)
>14 0.923 (0.131)
>15 0.925 (0.131)
>16 0.925 (0.131)
>17 0.931 (0.110)
>18 0.908 (0.137)
>19 0.925 (0.131)
>20 0.929 (0.127)
>21 0.933 (0.111)
>22 0.921 (0.113)
>23 0.920 (0.130)
>24 0.923 (0.131)
>25 0.923 (0.131)
>26 0.937 (0.128)
>27 0.933 (0.128)
>28 0.925 (0.115)
>29 0.925 (0.115)
>30 0.900 (0.138)


In [8]:
# Fit RFE on the full data set to see which features survive at the chosen count.
# The forest is unseeded, so the surviving set can differ between runs.
rfe = RFE(estimator=RandomForestClassifier(n_estimators=100, max_features='sqrt'), n_features_to_select=4)
rfe.fit(data_1, target_1.ravel())
for i in range(data_1.shape[1]):
    print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))

# Feature names come from a separate labels file and are matched to the RFE
# columns by position only.
labels_dat = pd.read_csv("240128-NTA-Normalized-Labels-Feature.csv")
feature_names = labels_dat.columns
if len(feature_names) != data_1.shape[1]:
    # A positional lookup would return the wrong names (or raise IndexError), so
    # fall back to the header of the frame data_1 was sliced from: column 0 is
    # Type_2 and the remaining columns are the features in order.
    print('WARNING: labels file has %d columns but data_1 has %d features; using data_rf column names instead'
          % (len(feature_names), data_1.shape[1]))
    feature_names = data_rf.columns[1:]

# Positions of the features RFE kept (support_ is True for them).
selected_feature_indices = np.flatnonzero(rfe.support_).tolist()

for index in selected_feature_indices:
    print('Feature Name: %s, Index: %d, Rank: %.3f' % (feature_names[index], index, rfe.ranking_[index]))

Column: 0, Selected False, Rank: 14.000
Column: 1, Selected False, Rank: 10.000
Column: 2, Selected False, Rank: 3.000
Column: 3, Selected False, Rank: 11.000
Column: 4, Selected False, Rank: 19.000
Column: 5, Selected False, Rank: 9.000
Column: 6, Selected False, Rank: 5.000
Column: 7, Selected False, Rank: 2.000
Column: 8, Selected True, Rank: 1.000
Column: 9, Selected True, Rank: 1.000
Column: 10, Selected True, Rank: 1.000
Column: 11, Selected False, Rank: 20.000
Column: 12, Selected True, Rank: 1.000
Column: 13, Selected False, Rank: 6.000
Column: 14, Selected False, Rank: 15.000
Column: 15, Selected False, Rank: 13.000
Column: 16, Selected False, Rank: 8.000
Column: 17, Selected False, Rank: 24.000
Column: 18, Selected False, Rank: 25.000
Column: 19, Selected False, Rank: 27.000
Column: 20, Selected False, Rank: 26.000
Column: 21, Selected False, Rank: 4.000
Column: 22, Selected False, Rank: 7.000
Column: 23, Selected False, Rank: 16.000
Column: 24, Selected False, Rank: 17.000
C

In [9]:
# One-column table with the names of the features kept by the last RFE fit.
selected_features_df = pd.Series(
    [feature_names[idx] for idx in selected_feature_indices],
    name='Feature Name',
).to_frame()

# Write the table to an Excel workbook without the row index.
selected_features_df.to_excel("RFE_selected_features_norm_LL_RF.xlsx", index=False)

### WWTP

In [12]:
# Reload the log-transformed feature table for this source type (original note: 92 samples).
# The label-prep cell below reads a 'Type' column and drops column 0.
# NOTE(review): the file name and the RFE ranks suggest 30 features, not the
# 581 quoted in the earlier comment - confirm.
data_rf = pd.read_csv(
    r'240617-NTA-AVG-EUC-30-CLOSEST-FEATURE-logT.csv',
    header=0,
)

In [13]:
# Prompt for the source type to classify one-vs-rest. The entry must match a
# value of the 'Type' column exactly; surrounding whitespace is stripped.
preferred_type = input("Enter the source type of interest: ").strip()

# Fail fast on a typo or empty entry: an unmatched label would silently give an
# all-zero target in the label-prep cell. The check is skipped when 'Type' has
# already been dropped (this cell re-run after label prep).
if 'Type' in data_rf.columns and preferred_type not in set(data_rf['Type']):
    raise ValueError(
        "Unknown source type %r; available types: %s"
        % (preferred_type, list(data_rf['Type'].unique()))
    )

Enter the source type of interest: WWTP


In [14]:
#Manipulating data frame based on user input to make "Type" column read 1 for all samples of source of interest and 0 for all other samples
#Set up for binary classification (one-vs-all format)

# Define a function to apply to each row
def set_type(row):
    """Return 1 when the row's 'Type' equals the chosen source type, else 0.

    Reads the module-level ``preferred_type`` captured from the user prompt.
    """
    return 1 if row['Type'] == preferred_type else 0

# Build the binary one-vs-rest target "Type_2" and move it to the first column.
if 'Type' not in data_rf.columns:
    # Happens when this cell is re-run without reloading the CSV first.
    raise KeyError("'Type' column not found - reload data_rf before re-running this cell")

# New column order: Type_2 first, then everything except the original first
# column (the 'Type' labels), which is dropped.
cols = ['Type_2'] + list(data_rf.columns[1:])

# Vectorised equivalent of applying set_type row by row: 1 for the chosen
# source, 0 otherwise. assign() returns a new frame instead of mutating in place.
data_rf = data_rf.assign(Type_2=(data_rf['Type'] == preferred_type).astype(int))[cols]

# Save the relabelled table for inspection.
# NOTE(review): every source-type section writes this same file name, so each
# run overwrites the previous one.
data_rf.to_csv('sample_data_with_labels_NEW10.csv', index=False)

In [15]:
# Convert the frame to a NumPy array and split it into target and features.
data_rf_np = data_rf.to_numpy()
# Slicing with :1 keeps the target as an (n_samples, 1) column - the same shape
# the reshape(-1, 1) produced; the remaining columns are the features.
target_1, data_1 = data_rf_np[:, :1], data_rf_np[:, 1:]

# Show the binary target to check which samples were labelled as the source of interest.
print(target_1)

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]]


In [16]:
# This source met the BA threshold, so sweep every feature count (step 1, not
# "large steps") to estimate how many features to retain.
def get_models(feature_counts=range(1, 31), n_estimators=1000, max_features='log2', random_state=None):
    """Build an RFE -> random-forest pipeline for each candidate feature count.

    Args:
        feature_counts: iterable of n_features_to_select values; the default
            covers 1 through 30.
        n_estimators, max_features: forest settings shared by the RFE
            estimator and the final classifier (1000 trees, 'log2' here).
        random_state: optional seed passed to both forests; None keeps the
            original unseeded behaviour.

    Returns:
        dict mapping str(count) to a Pipeline with steps 's' (RFE) and 'm' (classifier).
    """
    models = dict()
    for n_keep in feature_counts:
        selector = RFE(
            estimator=RandomForestClassifier(n_estimators=n_estimators, max_features=max_features, random_state=random_state),
            n_features_to_select=n_keep,
        )
        classifier = RandomForestClassifier(n_estimators=n_estimators, max_features=max_features, random_state=random_state)
        models[str(n_keep)] = Pipeline(steps=[('s', selector), ('m', classifier)])
    return models

# Evaluate model
def evaluate_model(model, X, y):
    """Cross-validate ``model`` and return the per-fold balanced accuracies.

    Uses the X and y that are passed in; the previous version ignored them and
    read the globals data_1/target_1. y is flattened to 1-D because the
    notebook stores the target as an (n, 1) column, which avoids scikit-learn's
    column-vector conversion warning.

    Returns:
        Array of 30 scores (10 stratified folds x 3 repeats, splits seeded with random_state=1).
    """
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(model, X, np.ravel(y), scoring='balanced_accuracy', cv=cv, n_jobs=-1, error_score='raise')

# Build the candidate pipelines, then cross-validate each one.
models = get_models()

# results holds the per-fold score arrays; names holds the matching feature counts.
results, names = [], []
for name, model in models.items():
    scores = evaluate_model(model, data_1, target_1)
    names.append(name)
    results.append(scores)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>1 0.901 (0.138)
>2 0.909 (0.134)
>3 0.911 (0.131)
>4 0.909 (0.135)
>5 0.909 (0.135)
>6 0.916 (0.133)
>7 0.918 (0.134)
>8 0.916 (0.137)
>9 0.916 (0.137)
>10 0.916 (0.137)
>11 0.914 (0.140)
>12 0.914 (0.140)
>13 0.914 (0.140)
>14 0.914 (0.140)
>15 0.928 (0.130)
>16 0.930 (0.131)
>17 0.930 (0.131)
>18 0.930 (0.131)
>19 0.930 (0.131)
>20 0.930 (0.131)
>21 0.930 (0.131)
>22 0.930 (0.131)
>23 0.930 (0.131)
>24 0.930 (0.131)
>25 0.930 (0.131)
>26 0.930 (0.131)
>27 0.930 (0.131)
>28 0.932 (0.131)
>29 0.930 (0.131)
>30 0.930 (0.131)


In [18]:
# Fit RFE on the full data set to see which features survive at the chosen count.
# The forest is unseeded, so the surviving set can differ between runs.
rfe = RFE(estimator=RandomForestClassifier(n_estimators=1000, max_features='log2'), n_features_to_select=3)
rfe.fit(data_1, target_1.ravel())
for i in range(data_1.shape[1]):
    print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))

# Feature names come from a separate labels file and are matched to the RFE
# columns by position only.
labels_dat = pd.read_csv("240128-NTA-Normalized-Labels-Feature.csv")
feature_names = labels_dat.columns
if len(feature_names) != data_1.shape[1]:
    # A positional lookup would return the wrong names (or raise IndexError), so
    # fall back to the header of the frame data_1 was sliced from: column 0 is
    # Type_2 and the remaining columns are the features in order.
    print('WARNING: labels file has %d columns but data_1 has %d features; using data_rf column names instead'
          % (len(feature_names), data_1.shape[1]))
    feature_names = data_rf.columns[1:]

# Positions of the features RFE kept (support_ is True for them).
selected_feature_indices = np.flatnonzero(rfe.support_).tolist()

for index in selected_feature_indices:
    print('Feature Name: %s, Index: %d, Rank: %.3f' % (feature_names[index], index, rfe.ranking_[index]))

Column: 0, Selected False, Rank: 12.000
Column: 1, Selected False, Rank: 22.000
Column: 2, Selected False, Rank: 5.000
Column: 3, Selected True, Rank: 1.000
Column: 4, Selected False, Rank: 2.000
Column: 5, Selected False, Rank: 19.000
Column: 6, Selected False, Rank: 10.000
Column: 7, Selected False, Rank: 7.000
Column: 8, Selected False, Rank: 9.000
Column: 9, Selected False, Rank: 25.000
Column: 10, Selected False, Rank: 4.000
Column: 11, Selected False, Rank: 24.000
Column: 12, Selected False, Rank: 14.000
Column: 13, Selected False, Rank: 15.000
Column: 14, Selected False, Rank: 27.000
Column: 15, Selected False, Rank: 13.000
Column: 16, Selected False, Rank: 11.000
Column: 17, Selected False, Rank: 26.000
Column: 18, Selected False, Rank: 17.000
Column: 19, Selected False, Rank: 28.000
Column: 20, Selected False, Rank: 6.000
Column: 21, Selected False, Rank: 3.000
Column: 22, Selected True, Rank: 1.000
Column: 23, Selected True, Rank: 1.000
Column: 24, Selected False, Rank: 23.00

In [19]:
# One-column table with the names of the features kept by the last RFE fit.
selected_features_df = pd.Series(
    [feature_names[idx] for idx in selected_feature_indices],
    name='Feature Name',
).to_frame()

# Write the table to an Excel workbook without the row index.
selected_features_df.to_excel("RFE_selected_features_norm_WWTP_RF.xlsx", index=False)

### PP

In [20]:
# Reload the log-transformed feature table for this source type (original note: 92 samples).
# The label-prep cell below reads a 'Type' column and drops column 0.
# NOTE(review): the file name and the RFE ranks suggest 30 features, not the
# 581 quoted in the earlier comment - confirm.
data_rf = pd.read_csv(
    r'240617-NTA-AVG-EUC-30-CLOSEST-FEATURE-logT.csv',
    header=0,
)

In [21]:
# Prompt for the source type to classify one-vs-rest. The entry must match a
# value of the 'Type' column exactly; surrounding whitespace is stripped.
preferred_type = input("Enter the source type of interest: ").strip()

# Fail fast on a typo or empty entry: an unmatched label would silently give an
# all-zero target in the label-prep cell. The check is skipped when 'Type' has
# already been dropped (this cell re-run after label prep).
if 'Type' in data_rf.columns and preferred_type not in set(data_rf['Type']):
    raise ValueError(
        "Unknown source type %r; available types: %s"
        % (preferred_type, list(data_rf['Type'].unique()))
    )

Enter the source type of interest: PP


In [22]:
#Manipulating data frame based on user input to make "Type" column read 1 for all samples of source of interest and 0 for all other samples
#Set up for binary classification (one-vs-all format)

# Define a function to apply to each row
def set_type(row):
    """Return 1 when the row's 'Type' equals the chosen source type, else 0.

    Reads the module-level ``preferred_type`` captured from the user prompt.
    """
    return 1 if row['Type'] == preferred_type else 0

# Build the binary one-vs-rest target "Type_2" and move it to the first column.
if 'Type' not in data_rf.columns:
    # Happens when this cell is re-run without reloading the CSV first.
    raise KeyError("'Type' column not found - reload data_rf before re-running this cell")

# New column order: Type_2 first, then everything except the original first
# column (the 'Type' labels), which is dropped.
cols = ['Type_2'] + list(data_rf.columns[1:])

# Vectorised equivalent of applying set_type row by row: 1 for the chosen
# source, 0 otherwise. assign() returns a new frame instead of mutating in place.
data_rf = data_rf.assign(Type_2=(data_rf['Type'] == preferred_type).astype(int))[cols]

# Save the relabelled table for inspection.
# NOTE(review): every source-type section writes this same file name, so each
# run overwrites the previous one.
data_rf.to_csv('sample_data_with_labels_NEW10.csv', index=False)

In [23]:
# Convert the frame to a NumPy array and split it into target and features.
data_rf_np = data_rf.to_numpy()
# Slicing with :1 keeps the target as an (n_samples, 1) column - the same shape
# the reshape(-1, 1) produced; the remaining columns are the features.
target_1, data_1 = data_rf_np[:, :1], data_rf_np[:, 1:]

print(data_1)

[[2.54570928 2.54570928 4.11539313 ... 2.54570928 2.54570928 2.54570928]
 [2.54570928 4.54451372 4.87498812 ... 2.54570928 2.54570928 2.54570928]
 [2.54570928 3.22038648 4.8459347  ... 2.54570928 2.54570928 2.54570928]
 ...
 [2.54570928 2.54570928 2.54570928 ... 3.02475995 2.54570928 2.54570928]
 [2.54570928 2.54570928 2.54570928 ... 3.2965849  2.54570928 2.93484187]
 [2.54570928 2.54570928 3.58667926 ... 3.33173239 2.54570928 2.54570928]]


In [24]:
#class_names=np.array([0.0,1.0])
# Coarse sweep: one pipeline per candidate number of retained features.
def get_models(feature_counts=range(30, 0, -10), n_estimators=1000, max_features='sqrt', random_state=None):
    """Build an RFE -> random-forest pipeline for each candidate feature count.

    Args:
        feature_counts: iterable of n_features_to_select values. The default
            tries 30, 20 and 10 (the earlier "610 features, step 2" comment
            did not match the code).
        n_estimators, max_features: forest settings shared by the RFE
            estimator and the final classifier (1000 trees, 'sqrt' here).
        random_state: optional seed passed to both forests; None keeps the
            original unseeded behaviour.

    Returns:
        dict mapping str(count) to a Pipeline with steps 's' (RFE) and 'm' (classifier).
    """
    models = dict()
    for n_keep in feature_counts:
        selector = RFE(
            estimator=RandomForestClassifier(n_estimators=n_estimators, max_features=max_features, random_state=random_state),
            n_features_to_select=n_keep,
        )
        classifier = RandomForestClassifier(n_estimators=n_estimators, max_features=max_features, random_state=random_state)
        models[str(n_keep)] = Pipeline(steps=[('s', selector), ('m', classifier)])
    return models

# Evaluate model
def evaluate_model(model, X, y):
    """Cross-validate ``model`` and return the per-fold balanced accuracies.

    Uses the X and y that are passed in; the previous version ignored them and
    read the globals data_1/target_1. y is flattened to 1-D because the
    notebook stores the target as an (n, 1) column, which avoids scikit-learn's
    column-vector conversion warning.

    Returns:
        Array of 30 scores (10 stratified folds x 3 repeats, splits seeded with random_state=1).
    """
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(model, X, np.ravel(y), scoring='balanced_accuracy', cv=cv, n_jobs=-1, error_score='raise')


# Build the candidate pipelines, then cross-validate each one.
models = get_models()

# results holds the per-fold score arrays; names holds the matching feature counts.
results, names = [], []
for name, model in models.items():
    scores = evaluate_model(model, data_1, target_1)
    names.append(name)
    results.append(scores)
    print('>%s Features: %d, Balanced Accuracy: %.3f (%.3f)' % (name, int(name), np.mean(scores), np.std(scores)))

>30 Features: 30, Balanced Accuracy: 0.983 (0.062)
>20 Features: 20, Balanced Accuracy: 0.967 (0.107)
>10 Features: 10, Balanced Accuracy: 1.000 (0.000)


In [25]:
# This source met the BA threshold, so sweep every feature count (step 1, not
# "large steps") to estimate how many features to retain.
def get_models(feature_counts=range(1, 31), n_estimators=1000, max_features='sqrt', random_state=None):
    """Build an RFE -> random-forest pipeline for each candidate feature count.

    Args:
        feature_counts: iterable of n_features_to_select values; the default
            covers 1 through 30.
        n_estimators, max_features: forest settings shared by the RFE
            estimator and the final classifier (1000 trees, 'sqrt' here).
        random_state: optional seed passed to both forests; None keeps the
            original unseeded behaviour.

    Returns:
        dict mapping str(count) to a Pipeline with steps 's' (RFE) and 'm' (classifier).
    """
    models = dict()
    for n_keep in feature_counts:
        selector = RFE(
            estimator=RandomForestClassifier(n_estimators=n_estimators, max_features=max_features, random_state=random_state),
            n_features_to_select=n_keep,
        )
        classifier = RandomForestClassifier(n_estimators=n_estimators, max_features=max_features, random_state=random_state)
        models[str(n_keep)] = Pipeline(steps=[('s', selector), ('m', classifier)])
    return models

# Evaluate model
def evaluate_model(model, X, y):
    """Cross-validate ``model`` and return the per-fold balanced accuracies.

    Uses the X and y that are passed in; the previous version ignored them and
    read the globals data_1/target_1. y is flattened to 1-D because the
    notebook stores the target as an (n, 1) column, which avoids scikit-learn's
    column-vector conversion warning.

    Returns:
        Array of 30 scores (10 stratified folds x 3 repeats, splits seeded with random_state=1).
    """
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(model, X, np.ravel(y), scoring='balanced_accuracy', cv=cv, n_jobs=-1, error_score='raise')

# Build the candidate pipelines, then cross-validate each one.
models = get_models()

# results holds the per-fold score arrays; names holds the matching feature counts.
results, names = [], []
for name, model in models.items():
    scores = evaluate_model(model, data_1, target_1)
    names.append(name)
    results.append(scores)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>1 0.840 (0.213)
>2 0.943 (0.134)
>3 0.985 (0.048)
>4 0.960 (0.106)
>5 0.985 (0.048)
>6 0.996 (0.017)
>7 0.992 (0.045)
>8 0.992 (0.045)
>9 1.000 (0.000)
>10 1.000 (0.000)
>11 1.000 (0.000)


KeyboardInterrupt: 

In [26]:
# Fit RFE on the full data set to see which features survive at the chosen count.
# The forest is unseeded, so the surviving set can differ between runs.
rfe = RFE(estimator=RandomForestClassifier(n_estimators=1000, max_features='sqrt'), n_features_to_select=3)
rfe.fit(data_1, target_1.ravel())
for i in range(data_1.shape[1]):
    print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))

# Feature names come from a separate labels file and are matched to the RFE
# columns by position only.
labels_dat = pd.read_csv("240128-NTA-Normalized-Labels-Feature.csv")
feature_names = labels_dat.columns
if len(feature_names) != data_1.shape[1]:
    # A positional lookup would return the wrong names (or raise IndexError), so
    # fall back to the header of the frame data_1 was sliced from: column 0 is
    # Type_2 and the remaining columns are the features in order.
    print('WARNING: labels file has %d columns but data_1 has %d features; using data_rf column names instead'
          % (len(feature_names), data_1.shape[1]))
    feature_names = data_rf.columns[1:]

# Positions of the features RFE kept (support_ is True for them).
selected_feature_indices = np.flatnonzero(rfe.support_).tolist()

for index in selected_feature_indices:
    print('Feature Name: %s, Index: %d, Rank: %.3f' % (feature_names[index], index, rfe.ranking_[index]))

Column: 0, Selected False, Rank: 11.000
Column: 1, Selected False, Rank: 26.000
Column: 2, Selected False, Rank: 4.000
Column: 3, Selected False, Rank: 20.000
Column: 4, Selected False, Rank: 15.000
Column: 5, Selected False, Rank: 23.000
Column: 6, Selected False, Rank: 22.000
Column: 7, Selected False, Rank: 8.000
Column: 8, Selected False, Rank: 13.000
Column: 9, Selected False, Rank: 24.000
Column: 10, Selected False, Rank: 12.000
Column: 11, Selected False, Rank: 14.000
Column: 12, Selected False, Rank: 6.000
Column: 13, Selected False, Rank: 19.000
Column: 14, Selected False, Rank: 25.000
Column: 15, Selected False, Rank: 17.000
Column: 16, Selected False, Rank: 7.000
Column: 17, Selected False, Rank: 28.000
Column: 18, Selected False, Rank: 5.000
Column: 19, Selected False, Rank: 27.000
Column: 20, Selected False, Rank: 21.000
Column: 21, Selected False, Rank: 16.000
Column: 22, Selected False, Rank: 10.000
Column: 23, Selected False, Rank: 18.000
Column: 24, Selected False, Ran

In [27]:
# One-column table with the names of the features kept by the last RFE fit.
selected_features_df = pd.Series(
    [feature_names[idx] for idx in selected_feature_indices],
    name='Feature Name',
).to_frame()

# Write the table to an Excel workbook without the row index.
selected_features_df.to_excel("RFE_selected_features_norm_PP_RF.xlsx", index=False)

In [28]:
# Fit RFE on the full data set to see which features survive at the chosen count.
# The forest is unseeded, so the surviving set can differ between runs.
rfe = RFE(estimator=RandomForestClassifier(n_estimators=1000, max_features='sqrt'), n_features_to_select=5)
rfe.fit(data_1, target_1.ravel())
for i in range(data_1.shape[1]):
    print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))

# Feature names come from a separate labels file and are matched to the RFE
# columns by position only.
labels_dat = pd.read_csv("240128-NTA-Normalized-Labels-Feature.csv")
feature_names = labels_dat.columns
if len(feature_names) != data_1.shape[1]:
    # A positional lookup would return the wrong names (or raise IndexError), so
    # fall back to the header of the frame data_1 was sliced from: column 0 is
    # Type_2 and the remaining columns are the features in order.
    print('WARNING: labels file has %d columns but data_1 has %d features; using data_rf column names instead'
          % (len(feature_names), data_1.shape[1]))
    feature_names = data_rf.columns[1:]

# Positions of the features RFE kept (support_ is True for them).
selected_feature_indices = np.flatnonzero(rfe.support_).tolist()

for index in selected_feature_indices:
    print('Feature Name: %s, Index: %d, Rank: %.3f' % (feature_names[index], index, rfe.ranking_[index]))

Column: 0, Selected False, Rank: 8.000
Column: 1, Selected False, Rank: 24.000
Column: 2, Selected False, Rank: 2.000
Column: 3, Selected False, Rank: 15.000
Column: 4, Selected False, Rank: 11.000
Column: 5, Selected False, Rank: 22.000
Column: 6, Selected False, Rank: 20.000
Column: 7, Selected False, Rank: 6.000
Column: 8, Selected False, Rank: 10.000
Column: 9, Selected False, Rank: 21.000
Column: 10, Selected False, Rank: 13.000
Column: 11, Selected False, Rank: 12.000
Column: 12, Selected False, Rank: 5.000
Column: 13, Selected False, Rank: 19.000
Column: 14, Selected False, Rank: 23.000
Column: 15, Selected False, Rank: 18.000
Column: 16, Selected False, Rank: 4.000
Column: 17, Selected False, Rank: 26.000
Column: 18, Selected False, Rank: 3.000
Column: 19, Selected False, Rank: 25.000
Column: 20, Selected False, Rank: 14.000
Column: 21, Selected False, Rank: 16.000
Column: 22, Selected False, Rank: 9.000
Column: 23, Selected False, Rank: 17.000
Column: 24, Selected False, Rank:

In [29]:
# Fit RFE on the full data set to see which features survive at the chosen count.
# The forest is unseeded, so the surviving set can differ between runs.
rfe = RFE(estimator=RandomForestClassifier(n_estimators=1000, max_features='sqrt'), n_features_to_select=9)
rfe.fit(data_1, target_1.ravel())
for i in range(data_1.shape[1]):
    print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))

# Feature names come from a separate labels file and are matched to the RFE
# columns by position only.
labels_dat = pd.read_csv("240128-NTA-Normalized-Labels-Feature.csv")
feature_names = labels_dat.columns
if len(feature_names) != data_1.shape[1]:
    # A positional lookup would return the wrong names (or raise IndexError), so
    # fall back to the header of the frame data_1 was sliced from: column 0 is
    # Type_2 and the remaining columns are the features in order.
    print('WARNING: labels file has %d columns but data_1 has %d features; using data_rf column names instead'
          % (len(feature_names), data_1.shape[1]))
    feature_names = data_rf.columns[1:]

# Positions of the features RFE kept (support_ is True for them).
selected_feature_indices = np.flatnonzero(rfe.support_).tolist()

for index in selected_feature_indices:
    print('Feature Name: %s, Index: %d, Rank: %.3f' % (feature_names[index], index, rfe.ranking_[index]))

Column: 0, Selected False, Rank: 5.000
Column: 1, Selected False, Rank: 20.000
Column: 2, Selected True, Rank: 1.000
Column: 3, Selected False, Rank: 10.000
Column: 4, Selected False, Rank: 9.000
Column: 5, Selected False, Rank: 16.000
Column: 6, Selected False, Rank: 15.000
Column: 7, Selected True, Rank: 1.000
Column: 8, Selected False, Rank: 7.000
Column: 9, Selected False, Rank: 18.000
Column: 10, Selected False, Rank: 6.000
Column: 11, Selected False, Rank: 8.000
Column: 12, Selected True, Rank: 1.000
Column: 13, Selected False, Rank: 14.000
Column: 14, Selected False, Rank: 19.000
Column: 15, Selected False, Rank: 17.000
Column: 16, Selected False, Rank: 2.000
Column: 17, Selected False, Rank: 22.000
Column: 18, Selected True, Rank: 1.000
Column: 19, Selected False, Rank: 21.000
Column: 20, Selected False, Rank: 11.000
Column: 21, Selected False, Rank: 12.000
Column: 22, Selected False, Rank: 4.000
Column: 23, Selected False, Rank: 13.000
Column: 24, Selected False, Rank: 3.000
C

In [30]:
# Reload the log-transformed feature table for this source type (original note: 92 samples).
# The label-prep cell below reads a 'Type' column and drops column 0.
# NOTE(review): the file name and the RFE ranks suggest 30 features, not the
# 581 quoted in the earlier comment - confirm.
data_rf = pd.read_csv(
    r'240617-NTA-AVG-EUC-30-CLOSEST-FEATURE-logT.csv',
    header=0,
)

In [31]:
#Prompt user for source type of interest (AFFF-GW, LF, BSL, WWTP, PP or PG)
# The answer is later compared for exact equality with the 'Type' column, so strip
# surrounding whitespace; a stray space would silently label every sample as 0.
preferred_type = input("Enter the source type of interest: ").strip()

Enter the source type of interest: PG


In [32]:
#Manipulating data frame based on user input to make "Type" column read 1 for all samples of source of interest and 0 for all other samples
#Set up for binary classification (one-vs-all format)

# Define a function to apply to each row
def set_type(row):
    """Return 1 when the row's 'Type' equals the chosen source type, otherwise 0.

    `row` is any mapping-like object indexable by 'Type' (e.g. a DataFrame row);
    the comparison target is the module-level `preferred_type`.
    """
    return 1 if row['Type'] == preferred_type else 0

# Build the one-vs-all target: 1 where 'Type' equals the requested source type, 0 elsewhere.
# A vectorised comparison replaces the row-by-row apply and yields the same labels.
is_preferred = data_rf['Type'] == preferred_type
if not is_preferred.any():
    # An unmatched name would silently produce an all-zero target and a meaningless classifier
    raise ValueError(
        "No samples with Type %r; available types: %s"
        % (preferred_type, sorted(data_rf['Type'].astype(str).unique()))
    )
data_rf['Type_2'] = is_preferred.astype('int64')
del data_rf[data_rf.columns[0]] #Dropping original type column
#Reordering columns with Type_2 as first column
cols = list(data_rf.columns)
cols = [cols[-1]] + cols[:-1]
data_rf = data_rf[cols]

# Save the updated DataFrame to a new CSV file (if needed)
data_rf.to_csv('sample_data_with_labels_NEW10.csv', index=False)

In [33]:
# Split the labelled frame into scikit-learn inputs: label column vs. feature matrix
data_rf_np = data_rf.to_numpy()
target_1 = data_rf_np[:, 0, np.newaxis]  # column 0 (Type_2) kept as an (n_samples, 1) column
data_1 = data_rf_np[:, 1:]               # every remaining column is a feature

print(data_1)

[[2.54570928 2.54570928 4.11539313 ... 2.54570928 2.54570928 2.54570928]
 [2.54570928 4.54451372 4.87498812 ... 2.54570928 2.54570928 2.54570928]
 [2.54570928 3.22038648 4.8459347  ... 2.54570928 2.54570928 2.54570928]
 ...
 [2.54570928 2.54570928 2.54570928 ... 3.02475995 2.54570928 2.54570928]
 [2.54570928 2.54570928 2.54570928 ... 3.2965849  2.54570928 2.93484187]
 [2.54570928 2.54570928 3.58667926 ... 3.33173239 2.54570928 2.54570928]]


In [35]:
#class_names=np.array([0.0,1.0])
# Function to get a list of models to evaluate
def get_models(feature_counts=range(30, 0, -10), random_state=1):
    """Build one RFE + random-forest pipeline per candidate feature count.

    Parameters
    ----------
    feature_counts : iterable of int, optional
        Numbers of features RFE should retain. Defaults to the coarse sweep
        30, 20, 10.
    random_state : int or None, optional
        Seed given to every RandomForestClassifier so repeated runs produce the
        same selections and scores. Pass None for unseeded forests.

    Returns
    -------
    dict
        Maps str(feature count) to a Pipeline whose step 's' is the RFE
        selector and whose step 'm' is the final classifier.
    """
    models = dict()
    for i in feature_counts:
        # RFE ranks features with its own forest, then a fresh forest is fit on the survivors
        rfe = RFE(
            estimator=RandomForestClassifier(n_estimators=1000, max_features='sqrt', random_state=random_state),
            n_features_to_select=i,
        )
        model = RandomForestClassifier(n_estimators=1000, max_features='sqrt', random_state=random_state)
        models[str(i)] = Pipeline(steps=[('s', rfe), ('m', model)])
    return models

#Evaluate model
def evaluate_model(model, X, y):
    """Cross-validate `model` on (X, y) and return the per-fold balanced accuracies.

    Parameters
    ----------
    model : estimator or Pipeline
        Object passed to cross_val_score.
    X : array-like
        Feature matrix.
    y : array-like
        Target labels; flattened to 1-D before scoring, so a column vector is accepted.

    Returns
    -------
    Array of balanced-accuracy scores, one per fold of the repeated
    stratified 8-fold CV (3 repeats, random_state=1).
    """
    cv = RepeatedStratifiedKFold(n_splits=8, n_repeats=3, random_state=1)
    # Score the data that was passed in rather than the notebook-level globals
    scores = cross_val_score(model, X, np.ravel(y), scoring='balanced_accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores


# Candidate pipelines, keyed by the number of features RFE retains
models = get_models()

# Cross-validate each pipeline; keep the per-fold scores next to their labels
results = []
names = []
for name, model in models.items():
    scores = evaluate_model(model, data_1, target_1)
    names.append(name)
    results.append(scores)
    ba_mean, ba_std = np.mean(scores), np.std(scores)
    print('>%s Features: %d, Balanced Accuracy: %.3f (%.3f)' % (name, int(name), ba_mean, ba_std))

>30 Features: 30, Balanced Accuracy: 0.854 (0.227)
>20 Features: 20, Balanced Accuracy: 0.875 (0.217)
>10 Features: 10, Balanced Accuracy: 0.917 (0.186)


In [37]:
## Since this meets the BA threshold, run RFE for every feature count from 1 to 30 to pin down how many features to retain
# get a list of models to evaluate
def get_models(feature_counts=range(1, 31), random_state=1):
    """Build one RFE + random-forest pipeline per candidate feature count.

    Parameters
    ----------
    feature_counts : iterable of int, optional
        Numbers of features RFE should retain. Defaults to every count
        from 1 to 30 inclusive.
    random_state : int or None, optional
        Seed given to every RandomForestClassifier so repeated runs produce the
        same selections and scores. Pass None for unseeded forests.

    Returns
    -------
    dict
        Maps str(feature count) to a Pipeline whose step 's' is the RFE
        selector and whose step 'm' is the final classifier.
    """
    models = dict()
    for i in feature_counts:
        # RFE ranks features with its own forest, then a fresh forest is fit on the survivors
        rfe = RFE(
            estimator=RandomForestClassifier(n_estimators=1000, max_features='sqrt', random_state=random_state),
            n_features_to_select=i,
        )
        model = RandomForestClassifier(n_estimators=1000, max_features='sqrt', random_state=random_state)
        models[str(i)] = Pipeline(steps=[('s', rfe), ('m', model)])
    return models

#Evaluate model
def evaluate_model(model, X, y):
    """Cross-validate `model` on (X, y) and return the per-fold balanced accuracies.

    Parameters
    ----------
    model : estimator or Pipeline
        Object passed to cross_val_score.
    X : array-like
        Feature matrix.
    y : array-like
        Target labels; flattened to 1-D before scoring, so a column vector is accepted.

    Returns
    -------
    Array of balanced-accuracy scores, one per fold of the repeated
    stratified 8-fold CV (3 repeats, random_state=1).
    """
    cv = RepeatedStratifiedKFold(n_splits=8, n_repeats=3, random_state=1)
    # Score the data that was passed in rather than the notebook-level globals
    scores = cross_val_score(model, X, np.ravel(y), scoring='balanced_accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores

# Candidate pipelines, keyed by the number of features RFE retains
models = get_models()

# Cross-validate each pipeline; keep the per-fold scores next to their labels
results = []
names = []
for name, model in models.items():
    scores = evaluate_model(model, data_1, target_1)
    names.append(name)
    results.append(scores)
    ba_mean, ba_std = mean(scores), std(scores)
    print('>%s %.3f (%.3f)' % (name, ba_mean, ba_std))

>1 0.496 (0.018)
>2 0.636 (0.218)
>3 0.822 (0.229)
>4 0.782 (0.240)
>5 0.829 (0.233)
>6 0.875 (0.217)
>7 0.917 (0.186)
>8 0.896 (0.203)
>9 0.896 (0.203)
>10 0.938 (0.165)
>11 0.875 (0.217)
>12 0.917 (0.186)
>13 0.833 (0.236)
>14 0.833 (0.236)


KeyboardInterrupt: 

In [38]:
# Recursive feature elimination with a random-forest estimator, keeping 3 features.
# random_state pins the forest so the selected feature set is identical on every run;
# without it each execution of this cell can pick a different subset.
rfe = RFE(
    estimator=RandomForestClassifier(n_estimators=1000, max_features='sqrt', random_state=1),
    n_features_to_select=3,
)
rfe.fit(data_1, target_1.ravel())  # ravel(): hand scikit-learn a 1-D target
for i in range(data_1.shape[1]):
    print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))

# Feature names are taken from the header row of a separate labels file.
# NOTE(review): this assumes its columns are in the same order (and count) as the
# columns of data_1 — confirm, otherwise the names printed below are misattributed.
labels_dat = pd.read_csv("240128-NTA-Normalized-Labels-Feature.csv")
feature_names = labels_dat.columns

# Column positions RFE kept (support_ is True for retained features)
selected_feature_indices = [i for i, kept in enumerate(rfe.support_) if kept]

for index in selected_feature_indices:
    print('Feature Name: %s, Index: %d, Rank: %.3f' % (feature_names[index], index, rfe.ranking_[index]))

Column: 0, Selected False, Rank: 16.000
Column: 1, Selected False, Rank: 26.000
Column: 2, Selected False, Rank: 8.000
Column: 3, Selected False, Rank: 22.000
Column: 4, Selected False, Rank: 18.000
Column: 5, Selected False, Rank: 15.000
Column: 6, Selected False, Rank: 6.000
Column: 7, Selected False, Rank: 3.000
Column: 8, Selected False, Rank: 11.000
Column: 9, Selected False, Rank: 28.000
Column: 10, Selected False, Rank: 9.000
Column: 11, Selected False, Rank: 25.000
Column: 12, Selected False, Rank: 10.000
Column: 13, Selected False, Rank: 19.000
Column: 14, Selected False, Rank: 27.000
Column: 15, Selected False, Rank: 12.000
Column: 16, Selected False, Rank: 5.000
Column: 17, Selected True, Rank: 1.000
Column: 18, Selected True, Rank: 1.000
Column: 19, Selected True, Rank: 1.000
Column: 20, Selected False, Rank: 4.000
Column: 21, Selected False, Rank: 23.000
Column: 22, Selected False, Rank: 7.000
Column: 23, Selected False, Rank: 14.000
Column: 24, Selected False, Rank: 17.00

In [39]:
# Collect the names of the RFE-selected features into a one-column table
chosen_names = [feature_names[idx] for idx in selected_feature_indices]
selected_features_df = pd.DataFrame({'Feature Name': chosen_names})

# Write the table to an Excel workbook, without the row index
selected_features_df.to_excel("RFE_selected_features_norm_PG_RF.xlsx", index=False)

In [40]:
# Recursive feature elimination with a random-forest estimator, keeping 6 features.
# random_state pins the forest so the selected feature set is identical on every run;
# without it each execution of this cell can pick a different subset.
rfe = RFE(
    estimator=RandomForestClassifier(n_estimators=1000, max_features='sqrt', random_state=1),
    n_features_to_select=6,
)
rfe.fit(data_1, target_1.ravel())  # ravel(): hand scikit-learn a 1-D target
for i in range(data_1.shape[1]):
    print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))

# Feature names are taken from the header row of a separate labels file.
# NOTE(review): this assumes its columns are in the same order (and count) as the
# columns of data_1 — confirm, otherwise the names printed below are misattributed.
labels_dat = pd.read_csv("240128-NTA-Normalized-Labels-Feature.csv")
feature_names = labels_dat.columns

# Column positions RFE kept (support_ is True for retained features)
selected_feature_indices = [i for i, kept in enumerate(rfe.support_) if kept]

for index in selected_feature_indices:
    print('Feature Name: %s, Index: %d, Rank: %.3f' % (feature_names[index], index, rfe.ranking_[index]))

Column: 0, Selected False, Rank: 10.000
Column: 1, Selected False, Rank: 20.000
Column: 2, Selected False, Rank: 5.000
Column: 3, Selected False, Rank: 19.000
Column: 4, Selected False, Rank: 13.000
Column: 5, Selected False, Rank: 11.000
Column: 6, Selected False, Rank: 4.000
Column: 7, Selected True, Rank: 1.000
Column: 8, Selected False, Rank: 9.000
Column: 9, Selected False, Rank: 25.000
Column: 10, Selected False, Rank: 6.000
Column: 11, Selected False, Rank: 21.000
Column: 12, Selected False, Rank: 7.000
Column: 13, Selected False, Rank: 16.000
Column: 14, Selected False, Rank: 24.000
Column: 15, Selected False, Rank: 8.000
Column: 16, Selected False, Rank: 2.000
Column: 17, Selected True, Rank: 1.000
Column: 18, Selected True, Rank: 1.000
Column: 19, Selected True, Rank: 1.000
Column: 20, Selected True, Rank: 1.000
Column: 21, Selected False, Rank: 23.000
Column: 22, Selected False, Rank: 3.000
Column: 23, Selected False, Rank: 12.000
Column: 24, Selected False, Rank: 17.000
Col

In [41]:
# Recursive feature elimination with a random-forest estimator, keeping 7 features.
# random_state pins the forest so the selected feature set is identical on every run;
# without it each execution of this cell can pick a different subset.
rfe = RFE(
    estimator=RandomForestClassifier(n_estimators=1000, max_features='sqrt', random_state=1),
    n_features_to_select=7,
)
rfe.fit(data_1, target_1.ravel())  # ravel(): hand scikit-learn a 1-D target
for i in range(data_1.shape[1]):
    print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))

# Feature names are taken from the header row of a separate labels file.
# NOTE(review): this assumes its columns are in the same order (and count) as the
# columns of data_1 — confirm, otherwise the names printed below are misattributed.
labels_dat = pd.read_csv("240128-NTA-Normalized-Labels-Feature.csv")
feature_names = labels_dat.columns

# Column positions RFE kept (support_ is True for retained features)
selected_feature_indices = [i for i, kept in enumerate(rfe.support_) if kept]

for index in selected_feature_indices:
    print('Feature Name: %s, Index: %d, Rank: %.3f' % (feature_names[index], index, rfe.ranking_[index]))

Column: 0, Selected False, Rank: 10.000
Column: 1, Selected False, Rank: 22.000
Column: 2, Selected False, Rank: 4.000
Column: 3, Selected False, Rank: 18.000
Column: 4, Selected False, Rank: 13.000
Column: 5, Selected False, Rank: 8.000
Column: 6, Selected False, Rank: 2.000
Column: 7, Selected True, Rank: 1.000
Column: 8, Selected False, Rank: 7.000
Column: 9, Selected False, Rank: 24.000
Column: 10, Selected False, Rank: 5.000
Column: 11, Selected False, Rank: 19.000
Column: 12, Selected False, Rank: 6.000
Column: 13, Selected False, Rank: 15.000
Column: 14, Selected False, Rank: 23.000
Column: 15, Selected False, Rank: 11.000
Column: 16, Selected True, Rank: 1.000
Column: 17, Selected True, Rank: 1.000
Column: 18, Selected True, Rank: 1.000
Column: 19, Selected True, Rank: 1.000
Column: 20, Selected True, Rank: 1.000
Column: 21, Selected False, Rank: 20.000
Column: 22, Selected False, Rank: 3.000
Column: 23, Selected False, Rank: 9.000
Column: 24, Selected False, Rank: 17.000
Colum