In [1]:
#Import libraries to open data file
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from statistics import mean as stat_mean
from numpy import mean
from numpy import std

### One vs all classification: for feature selection

### Normalized Data

#### For AFFF-GW

In [19]:
data_rf = pd.read_csv(r'240617-NTA-AVG-EUC-30-CLOSEST-FEATURE-logT.csv', header=0) #Targets: 92 samples X 581 features
#del data_rf[data_rf.columns[0]] #Dropping sample information
#print(data_rf)

In [20]:
#Prompt user for source type of interest (AFFF-GW, LF, BSL, WWTP, PP or PG)
preferred_type = input("Enter the source type of interest: ")

Enter the source type of interest: AFFF-GW


In [21]:
#Manipulating data frame based on user input to make "Type" column read 1 for all samples of source of interest and 0 for all other samples
#Set up for binary classification (one-vs-all format)

# Define a function to apply to each row
def set_type(row):
    if row['Type'] == preferred_type:
        return 1
    else:
        return 0

# Create a new column "Type 2" with the updated values
data_rf['Type_2'] = data_rf.apply(set_type, axis=1)
del data_rf[data_rf.columns[0]] #Dropping original type column
#Reordering columns with Type_2 as first column
cols = list(data_rf.columns)
cols = [cols[-1]] + cols[:-1]
data_rf = data_rf[cols]

# Save the updated DataFrame to a new CSV file (if needed)
data_rf.to_csv('sample_data_with_labels_NEW10.csv', index=False)

In [22]:
#Changing pandas data frame to numpy for use in ML
data_rf_np = data_rf.to_numpy()
target_1 = data_rf_np[:,0].reshape(-1,1) #Convert target variables to 2D-array for sci-kit learn
data_1 = data_rf_np[:,1:]

#class_names=np.array([0.0,1.0])
#print(data_1.shape)
#print(data_1)
#data_1 = pd.DataFrame(data_1)
#data_1.to_csv('log10_dat.csv', index=False)

In [23]:
##Since this meets BA threshold. Let us run a quick RFE with large steps to estimate the exact number of features to retain
# get a list of models to evaluate
def get_models():
    models = dict()
    for i in range(1, 31):
        rfe = RFE(estimator=LogisticRegression(solver = 'newton-cg', C = 100, penalty = 'l2'), n_features_to_select=i)
        model = LogisticRegression(solver = 'newton-cg', C = 100, penalty = 'l2')
        models[str(i)] = Pipeline(steps=[('s',rfe),('m',model)])
    return models

#Evaluate model
def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, data_1, target_1, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores

In [24]:
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, data_1, target_1)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>1 0.842 (0.056)
>2 0.841 (0.072)
>3 0.877 (0.091)
>4 0.881 (0.099)
>5 0.862 (0.098)
>6 0.880 (0.100)
>7 0.894 (0.105)
>8 0.919 (0.107)
>9 0.916 (0.106)
>10 0.912 (0.101)
>11 0.934 (0.079)
>12 0.930 (0.101)
>13 0.926 (0.101)
>14 0.934 (0.084)
>15 0.926 (0.096)
>16 0.926 (0.092)
>17 0.923 (0.096)
>18 0.919 (0.103)
>19 0.919 (0.103)
>20 0.923 (0.096)
>21 0.923 (0.096)
>22 0.927 (0.092)
>23 0.927 (0.092)
>24 0.930 (0.083)
>25 0.930 (0.083)
>26 0.930 (0.083)
>27 0.930 (0.083)
>28 0.927 (0.082)
>29 0.930 (0.083)
>30 0.930 (0.083)


In [30]:
# Your existing code for feature selection
rfe = RFE(estimator=LogisticRegression(solver='newton-cg', C=100, penalty='l2'), n_features_to_select=7)
rfe.fit(data_1, target_1.ravel())
for i in range(data_1.shape[1]):
 print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))

# Load feature names from the second dataset
labels_dat = pd.read_csv("240128-NTA-Normalized-Labels-Feature.csv")
feature_names = labels_dat.columns

# Print selected features with their names
selected_feature_indices = [i for i in range(len(rfe.support_)) if rfe.support_[i]]

for index in selected_feature_indices:
    print('Feature Name: %s, Index: %d, Rank: %.3f' % (feature_names[index], index, rfe.ranking_[index]))

Column: 0, Selected False, Rank: 24.000
Column: 1, Selected True, Rank: 1.000
Column: 2, Selected True, Rank: 1.000
Column: 3, Selected False, Rank: 15.000
Column: 4, Selected False, Rank: 6.000
Column: 5, Selected False, Rank: 7.000
Column: 6, Selected True, Rank: 1.000
Column: 7, Selected False, Rank: 22.000
Column: 8, Selected False, Rank: 21.000
Column: 9, Selected True, Rank: 1.000
Column: 10, Selected False, Rank: 5.000
Column: 11, Selected False, Rank: 8.000
Column: 12, Selected True, Rank: 1.000
Column: 13, Selected False, Rank: 16.000
Column: 14, Selected False, Rank: 14.000
Column: 15, Selected False, Rank: 4.000
Column: 16, Selected True, Rank: 1.000
Column: 17, Selected False, Rank: 12.000
Column: 18, Selected False, Rank: 9.000
Column: 19, Selected False, Rank: 18.000
Column: 20, Selected False, Rank: 20.000
Column: 21, Selected False, Rank: 13.000
Column: 22, Selected True, Rank: 1.000
Column: 23, Selected False, Rank: 19.000
Column: 24, Selected False, Rank: 11.000
Colum

In [10]:
selected_features_df = pd.DataFrame({
    'Feature Name': [feature_names[index] for index in selected_feature_indices]
})

# Export the DataFrame to an Excel file
selected_features_df.to_excel("RFE_selected_features_norm_GW_NEW.xlsx", index=False)

#### For LL

In [31]:
data_rf = pd.read_csv(r'240617-NTA-AVG-EUC-30-CLOSEST-FEATURE-logT.csv', header=0) #Targets: 92 samples X 581 features
#del data_rf[data_rf.columns[0]] #Dropping sample information
#print(data_rf)

In [32]:
#Prompt user for source type of interest (GW, LL, BL, WWTP, PP or PG)
preferred_type = input("Enter the source type of interest: ")

Enter the source type of interest: LL


In [33]:
#Manipulating data frame based on user input to make "Type" column read 1 for all samples of source of interest and 0 for all other samples
#Set up for binary classification (one-vs-all format)

# Define a function to apply to each row
def set_type(row):
    if row['Type'] == preferred_type:
        return 1
    else:
        return 0

# Create a new column "Type 2" with the updated values
data_rf['Type_2'] = data_rf.apply(set_type, axis=1)
del data_rf[data_rf.columns[0]] #Dropping original type column
#Reordering columns with Type_2 as first column
cols = list(data_rf.columns)
cols = [cols[-1]] + cols[:-1]
data_rf = data_rf[cols]

# Save the updated DataFrame to a new CSV file (if needed)
data_rf.to_csv('sample_data_with_labels_NEW10.csv', index=False)

In [34]:
#Changing pandas data frame to numpy for use in ML
data_rf_np = data_rf.to_numpy()
target_1 = data_rf_np[:,0].reshape(-1,1) #Convert target variables to 2D-array for sci-kit learn
data_1 = data_rf_np[:,1:]
#class_names=np.array([0.0,1.0])
#print(data_1.shape)
#print(data_1)

In [35]:
##Since this meets BA threshold. Let us run a quick RFE with large steps to estimate the exact number of features to retain
# get a list of models to evaluate
def get_models():
    models = dict()
    for i in range(30, 0, -10):
        rfe = RFE(estimator=LogisticRegression(solver = 'newton-cg', C = 100, penalty = 'l2'), n_features_to_select=i)
        model = LogisticRegression(solver = 'newton-cg', C = 100, penalty = 'l2')
        models[str(i)] = Pipeline(steps=[('s',rfe),('m',model)])
    return models

#Evaluate model
def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, data_1, target_1, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores

In [36]:
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, data_1, target_1)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>30 0.949 (0.073)
>20 0.949 (0.073)
>10 0.927 (0.081)


In [17]:
##Since this meets BA threshold. Let us run a quick RFE with large steps to estimate the exact number of features to retain
# get a list of models to evaluate
def get_models():
    models = dict()
    for i in range(1, 31):
        rfe = RFE(estimator=LogisticRegression(solver = 'newton-cg', C = 100, penalty = 'l2'), n_features_to_select=i)
        model = LogisticRegression(solver = 'newton-cg', C = 100, penalty = 'l2')
        models[str(i)] = Pipeline(steps=[('s',rfe),('m',model)])
    return models

#Evaluate model
def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, data_1, target_1, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores

In [18]:
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, data_1, target_1)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>1 0.866 (0.078)
>2 0.899 (0.079)
>3 0.892 (0.083)
>4 0.903 (0.093)
>5 0.913 (0.098)
>6 0.917 (0.095)
>7 0.931 (0.082)
>8 0.916 (0.091)
>9 0.920 (0.079)
>10 0.927 (0.081)
>11 0.931 (0.082)
>12 0.939 (0.070)
>13 0.943 (0.070)
>14 0.950 (0.060)
>15 0.950 (0.060)
>16 0.950 (0.060)
>17 0.949 (0.060)
>18 0.956 (0.072)
>19 0.960 (0.065)
>20 0.949 (0.073)
>21 0.949 (0.073)
>22 0.953 (0.073)
>23 0.949 (0.073)
>24 0.945 (0.073)
>25 0.949 (0.073)
>26 0.949 (0.073)
>27 0.949 (0.073)
>28 0.949 (0.073)
>29 0.949 (0.073)
>30 0.949 (0.073)


In [37]:
rfe = RFE(estimator=LogisticRegression(solver = 'newton-cg', C = 100, penalty = 'l2'), n_features_to_select=4)
rfe.fit(data_1,target_1.ravel())
for i in range(data_1.shape[1]):
 print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))
# Print selected features with their names
selected_feature_indices = [i for i in range(len(rfe.support_)) if rfe.support_[i]]

for index in selected_feature_indices:
    print('Feature Name: %s, Index: %d, Rank: %.3f' % (feature_names[index], index, rfe.ranking_[index]))

Column: 0, Selected False, Rank: 12.000
Column: 1, Selected False, Rank: 3.000
Column: 2, Selected False, Rank: 11.000
Column: 3, Selected False, Rank: 7.000
Column: 4, Selected False, Rank: 17.000
Column: 5, Selected False, Rank: 9.000
Column: 6, Selected False, Rank: 13.000
Column: 7, Selected False, Rank: 20.000
Column: 8, Selected False, Rank: 14.000
Column: 9, Selected True, Rank: 1.000
Column: 10, Selected True, Rank: 1.000
Column: 11, Selected False, Rank: 16.000
Column: 12, Selected False, Rank: 2.000
Column: 13, Selected False, Rank: 24.000
Column: 14, Selected False, Rank: 15.000
Column: 15, Selected True, Rank: 1.000
Column: 16, Selected False, Rank: 4.000
Column: 17, Selected False, Rank: 19.000
Column: 18, Selected False, Rank: 18.000
Column: 19, Selected False, Rank: 25.000
Column: 20, Selected False, Rank: 21.000
Column: 21, Selected False, Rank: 8.000
Column: 22, Selected False, Rank: 27.000
Column: 23, Selected False, Rank: 5.000
Column: 24, Selected True, Rank: 1.000


In [38]:
selected_features_df = pd.DataFrame({
    'Feature Name': [feature_names[index] for index in selected_feature_indices]
})

# Export the DataFrame to an Excel file
selected_features_df.to_excel("RFE_selected_features_norm_LL.xlsx", index=False)

In [41]:
# Your existing code for feature selection
rfe = RFE(estimator=LogisticRegression(solver='newton-cg', C=100, penalty='l2'), n_features_to_select=4)
rfe.fit(data_1, target_1.ravel())
for i in range(data_1.shape[1]):
 print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))

# Load feature names from the second dataset
labels_dat = pd.read_csv("240128-NTA-Normalized-Labels-Feature.csv")
feature_names = labels_dat.columns

# Print selected features with their names
selected_feature_indices = [i for i in range(len(rfe.support_)) if rfe.support_[i]]

for index in selected_feature_indices:
    print('Feature Name: %s, Index: %d, Rank: %.3f' % (feature_names[index], index, rfe.ranking_[index]))

Column: 0, Selected False, Rank: 12.000
Column: 1, Selected False, Rank: 3.000
Column: 2, Selected False, Rank: 11.000
Column: 3, Selected False, Rank: 7.000
Column: 4, Selected False, Rank: 17.000
Column: 5, Selected False, Rank: 9.000
Column: 6, Selected False, Rank: 13.000
Column: 7, Selected False, Rank: 20.000
Column: 8, Selected False, Rank: 14.000
Column: 9, Selected True, Rank: 1.000
Column: 10, Selected True, Rank: 1.000
Column: 11, Selected False, Rank: 16.000
Column: 12, Selected False, Rank: 2.000
Column: 13, Selected False, Rank: 24.000
Column: 14, Selected False, Rank: 15.000
Column: 15, Selected True, Rank: 1.000
Column: 16, Selected False, Rank: 4.000
Column: 17, Selected False, Rank: 19.000
Column: 18, Selected False, Rank: 18.000
Column: 19, Selected False, Rank: 25.000
Column: 20, Selected False, Rank: 21.000
Column: 21, Selected False, Rank: 8.000
Column: 22, Selected False, Rank: 27.000
Column: 23, Selected False, Rank: 5.000
Column: 24, Selected True, Rank: 1.000


### For BL

In [42]:
data_rf = pd.read_csv(r'240617-NTA-AVG-EUC-30-CLOSEST-FEATURE-logT.csv', header=0) #Targets: 92 samples X 581 features
#del data_rf[data_rf.columns[0]] #Dropping sample information
#print(data_rf)

In [43]:
#Prompt user for source type of interest (GW, LF, BSL, WWTP, PP or PG)
preferred_type = input("Enter the source type of interest: ")

Enter the source type of interest: BL


In [44]:
#Manipulating data frame based on user input to make "Type" column read 1 for all samples of source of interest and 0 for all other samples
#Set up for binary classification (one-vs-all format)

# Define a function to apply to each row
def set_type(row):
    if row['Type'] == preferred_type:
        return 1
    else:
        return 0

# Create a new column "Type 2" with the updated values
data_rf['Type_2'] = data_rf.apply(set_type, axis=1)
del data_rf[data_rf.columns[0]] #Dropping original type column
#Reordering columns with Type_2 as first column
cols = list(data_rf.columns)
cols = [cols[-1]] + cols[:-1]
data_rf = data_rf[cols]

# Save the updated DataFrame to a new CSV file (if needed)
data_rf.to_csv('sample_data_with_labels_NEW10.csv', index=False)

In [45]:
#Changing pandas data frame to numpy for use in ML
data_rf_np = data_rf.to_numpy()
target_1 = data_rf_np[:,0].reshape(-1,1) #Convert target variables to 2D-array for sci-kit learn
data_1 = data_rf_np[:,1:]
#class_names=np.array([0.0,1.0])
#print(data_1.shape)
#print(data_1)

In [46]:
##Since this meets BA threshold. Let us run a quick RFE with large steps to estimate the exact number of features to retain
# get a list of models to evaluate
def get_models():
    models = dict()
    for i in range(30,0,-10):
        rfe = RFE(estimator=LogisticRegression(solver = 'liblinear', C = 100, penalty = 'l1'), n_features_to_select=i)
        model = LogisticRegression(solver = 'newton-cg', C = 100, penalty = 'l2')
        models[str(i)] = Pipeline(steps=[('s',rfe),('m',model)])
    return models

#Evaluate model
def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, data_1, target_1, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores

In [47]:
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, data_1, target_1)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>30 0.964 (0.058)
>20 0.946 (0.061)
>10 0.906 (0.082)


In [48]:
##Since this meets BA threshold. Let us run a quick RFE with large steps to estimate the exact number of features to retain
# get a list of models to evaluate
def get_models():
    models = dict()
    for i in range(1, 31):
        rfe = RFE(estimator=LogisticRegression(solver = 'liblinear', C = 100, penalty = 'l1'), n_features_to_select=i)
        model = LogisticRegression(solver = 'liblinear', C = 100, penalty = 'l1')
        models[str(i)] = Pipeline(steps=[('s',rfe),('m',model)])
    return models

#Evaluate model
def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, data_1, target_1, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores

In [49]:
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, data_1, target_1)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>1 0.856 (0.063)
>2 0.888 (0.110)
>3 0.928 (0.075)
>4 0.887 (0.115)
>5 0.898 (0.102)
>6 0.891 (0.110)
>7 0.891 (0.114)
>8 0.894 (0.105)
>9 0.895 (0.096)
>10 0.895 (0.091)
>11 0.903 (0.081)
>12 0.906 (0.083)
>13 0.899 (0.089)
>14 0.906 (0.083)
>15 0.907 (0.083)
>16 0.913 (0.086)
>17 0.913 (0.086)
>18 0.921 (0.088)
>19 0.914 (0.086)
>20 0.917 (0.087)
>21 0.914 (0.086)
>22 0.917 (0.087)
>23 0.914 (0.086)
>24 0.917 (0.087)
>25 0.914 (0.086)
>26 0.917 (0.087)
>27 0.921 (0.088)
>28 0.914 (0.086)
>29 0.921 (0.084)
>30 0.921 (0.084)


In [50]:
rfe = RFE(estimator=LogisticRegression(solver = 'liblinear', C = 100, penalty = 'l1'), n_features_to_select=3)
rfe.fit(data_1,target_1.ravel())
for i in range(data_1.shape[1]):
 print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))

Column: 0, Selected False, Rank: 28.000
Column: 1, Selected False, Rank: 2.000
Column: 2, Selected False, Rank: 14.000
Column: 3, Selected False, Rank: 27.000
Column: 4, Selected False, Rank: 24.000
Column: 5, Selected False, Rank: 13.000
Column: 6, Selected False, Rank: 17.000
Column: 7, Selected True, Rank: 1.000
Column: 8, Selected False, Rank: 6.000
Column: 9, Selected False, Rank: 19.000
Column: 10, Selected False, Rank: 8.000
Column: 11, Selected False, Rank: 20.000
Column: 12, Selected False, Rank: 10.000
Column: 13, Selected False, Rank: 22.000
Column: 14, Selected True, Rank: 1.000
Column: 15, Selected False, Rank: 5.000
Column: 16, Selected False, Rank: 3.000
Column: 17, Selected False, Rank: 25.000
Column: 18, Selected False, Rank: 12.000
Column: 19, Selected False, Rank: 26.000
Column: 20, Selected False, Rank: 21.000
Column: 21, Selected False, Rank: 7.000
Column: 22, Selected True, Rank: 1.000
Column: 23, Selected False, Rank: 23.000
Column: 24, Selected False, Rank: 16.0



In [52]:
# Your existing code for feature selection
rfe = RFE(estimator=LogisticRegression(solver='liblinear', C=100, penalty='l1'), n_features_to_select=4)
rfe.fit(data_1, target_1.ravel())
for i in range(data_1.shape[1]):
 print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))

# Load feature names from the second dataset
labels_dat = pd.read_csv("240128-NTA-Normalized-Labels-Feature.csv")
feature_names = labels_dat.columns

# Print selected features with their names
selected_feature_indices = [i for i in range(len(rfe.support_)) if rfe.support_[i]]

for index in selected_feature_indices:
    print('Feature Name: %s, Index: %d, Rank: %.3f' % (feature_names[index], index, rfe.ranking_[index]))

Column: 0, Selected False, Rank: 27.000
Column: 1, Selected False, Rank: 13.000
Column: 2, Selected False, Rank: 2.000
Column: 3, Selected False, Rank: 22.000
Column: 4, Selected False, Rank: 21.000
Column: 5, Selected False, Rank: 7.000
Column: 6, Selected False, Rank: 9.000
Column: 7, Selected True, Rank: 1.000
Column: 8, Selected False, Rank: 15.000
Column: 9, Selected False, Rank: 19.000
Column: 10, Selected False, Rank: 11.000
Column: 11, Selected False, Rank: 18.000
Column: 12, Selected False, Rank: 14.000
Column: 13, Selected False, Rank: 23.000
Column: 14, Selected False, Rank: 4.000
Column: 15, Selected True, Rank: 1.000
Column: 16, Selected True, Rank: 1.000
Column: 17, Selected False, Rank: 24.000
Column: 18, Selected False, Rank: 8.000
Column: 19, Selected False, Rank: 26.000
Column: 20, Selected False, Rank: 25.000
Column: 21, Selected False, Rank: 16.000
Column: 22, Selected False, Rank: 3.000
Column: 23, Selected False, Rank: 20.000
Column: 24, Selected False, Rank: 12.0

In [None]:
elected_features_df = pd.DataFrame({
    'Feature Name': [feature_names[index] for index in selected_feature_indices]
})

# Export the DataFrame to an Excel file
selected_features_df.to_excel("RFE_selected_features_norm_BL_NEW.xlsx", index=False)

### For PP

In [53]:
data_rf = pd.read_csv(r'240617-NTA-AVG-EUC-30-CLOSEST-FEATURE-logT.csv', header=0) #Targets: 92 samples X 581 features
#del data_rf[data_rf.columns[0]] #Dropping sample information
#print(data_rf)

In [54]:
#Prompt user for source type of interest (GW, LF, BSL, WWTP, PP or PG)
preferred_type = input("Enter the source type of interest: ")

Enter the source type of interest: PP


In [55]:
#Manipulating data frame based on user input to make "Type" column read 1 for all samples of source of interest and 0 for all other samples
#Set up for binary classification (one-vs-all format)

# Define a function to apply to each row
def set_type(row):
    if row['Type'] == preferred_type:
        return 1
    else:
        return 0

# Create a new column "Type 2" with the updated values
data_rf['Type_2'] = data_rf.apply(set_type, axis=1)
del data_rf[data_rf.columns[0]] #Dropping original type column
#Reordering columns with Type_2 as first column
cols = list(data_rf.columns)
cols = [cols[-1]] + cols[:-1]
data_rf = data_rf[cols]

# Save the updated DataFrame to a new CSV file (if needed)
data_rf.to_csv('sample_data_with_labels_NEW10.csv', index=False)

In [56]:
#Changing pandas data frame to numpy for use in ML
data_rf_np = data_rf.to_numpy()
target_1 = data_rf_np[:,0].reshape(-1,1) #Convert target variables to 2D-array for sci-kit learn
data_1 = data_rf_np[:,1:]
#class_names=np.array([0.0,1.0])
#print(data_1.shape)
#print(data_1)

In [57]:
##Since this meets BA threshold. Let us run a quick RFE with large steps to estimate the exact number of features to retain
# get a list of models to evaluate
def get_models():
    models = dict()
    for i in range(30,0,-10):
        rfe = RFE(estimator=LogisticRegression(solver = 'liblinear', C = 100, penalty = 'l1'), n_features_to_select=i)
        model = LogisticRegression(solver = 'liblinear', C = 100, penalty = 'l1')
        models[str(i)] = Pipeline(steps=[('s',rfe),('m',model)])
    return models

#Evaluate model
def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, data_1, target_1, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores

In [58]:
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, data_1, target_1)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>30 0.996 (0.020)
>20 0.993 (0.028)
>10 0.989 (0.033)


In [59]:
##Since this meets BA threshold. Let us run a quick RFE with large steps to estimate the exact number of features to retain
# get a list of models to evaluate
def get_models():
    models = dict()
    for i in range(1, 31):
        rfe = RFE(estimator=LogisticRegression(solver = 'liblinear', C = 100, penalty = 'l1'), n_features_to_select=i)
        model = LogisticRegression(solver = 'liblinear', C = 100, penalty = 'l1')
        models[str(i)] = Pipeline(steps=[('s',rfe),('m',model)])
    return models

#Evaluate model
def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, data_1, target_1, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores

In [60]:
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, data_1, target_1)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>1 0.964 (0.063)
>2 0.964 (0.058)
>3 0.964 (0.058)
>4 0.967 (0.058)
>5 0.971 (0.056)
>6 0.971 (0.056)
>7 0.971 (0.056)
>8 0.971 (0.056)
>9 0.981 (0.041)
>10 0.989 (0.033)
>11 0.993 (0.028)
>12 0.989 (0.032)
>13 0.989 (0.032)
>14 0.989 (0.032)
>15 0.989 (0.032)
>16 0.989 (0.033)
>17 0.993 (0.028)
>18 0.989 (0.033)
>19 0.989 (0.032)
>20 0.993 (0.028)
>21 0.993 (0.028)
>22 0.993 (0.028)
>23 0.993 (0.028)
>24 0.993 (0.028)
>25 0.993 (0.028)
>26 0.989 (0.033)
>27 0.993 (0.028)
>28 0.996 (0.020)
>29 0.996 (0.020)
>30 0.996 (0.020)


In [None]:
rfe = RFE(estimator=LogisticRegression(solver = 'liblinear', C = 100, penalty = 'l1'), n_features_to_select=1)
rfe.fit(data_1,target_1.ravel())
for i in range(data_1.shape[1]):
 print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))

In [64]:
# Your existing code for feature selection
rfe = RFE(estimator=LogisticRegression(solver='liblinear', C=100, penalty='l1'), n_features_to_select=4)
rfe.fit(data_1, target_1.ravel())
for i in range(data_1.shape[1]):
 print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))

# Load feature names from the second dataset
labels_dat = pd.read_csv("240128-NTA-Normalized-Labels-Feature.csv")
feature_names = labels_dat.columns

# Print selected features with their names
selected_feature_indices = [i for i in range(len(rfe.support_)) if rfe.support_[i]]

for index in selected_feature_indices:
    print('Feature Name: %s, Index: %d, Rank: %.3f' % (feature_names[index], index, rfe.ranking_[index]))

Column: 0, Selected False, Rank: 27.000
Column: 1, Selected False, Rank: 15.000
Column: 2, Selected False, Rank: 10.000
Column: 3, Selected True, Rank: 1.000
Column: 4, Selected False, Rank: 25.000
Column: 5, Selected False, Rank: 2.000
Column: 6, Selected True, Rank: 1.000
Column: 7, Selected False, Rank: 24.000
Column: 8, Selected False, Rank: 23.000
Column: 9, Selected False, Rank: 11.000
Column: 10, Selected False, Rank: 7.000
Column: 11, Selected False, Rank: 21.000
Column: 12, Selected False, Rank: 20.000
Column: 13, Selected False, Rank: 19.000
Column: 14, Selected False, Rank: 13.000
Column: 15, Selected False, Rank: 12.000
Column: 16, Selected False, Rank: 22.000
Column: 17, Selected False, Rank: 5.000
Column: 18, Selected False, Rank: 17.000
Column: 19, Selected False, Rank: 4.000
Column: 20, Selected False, Rank: 14.000
Column: 21, Selected False, Rank: 9.000
Column: 22, Selected False, Rank: 16.000
Column: 23, Selected False, Rank: 3.000
Column: 24, Selected False, Rank: 26

In [None]:
selected_features_df = pd.DataFrame({
    'Feature Name': [feature_names[index] for index in selected_feature_indices]
})

# Export the DataFrame to an Excel file
selected_features_df.to_excel("RFE_selected_features_norm_PP.xlsx", index=False)

### For PG

In [65]:
data_rf = pd.read_csv(r'240617-NTA-AVG-EUC-30-CLOSEST-FEATURE-logT.csv', header=0) #Targets: 92 samples X 581 features
#del data_rf[data_rf.columns[0]] #Dropping sample information
#print(data_rf)

In [66]:
#Prompt user for source type of interest (GW, LF, BSL, WWTP, PP or PG)
preferred_type = input("Enter the source type of interest: ")

Enter the source type of interest: PG


In [67]:
#Manipulating data frame based on user input to make "Type" column read 1 for all samples of source of interest and 0 for all other samples
#Set up for binary classification (one-vs-all format)

# Define a function to apply to each row
def set_type(row):
    if row['Type'] == preferred_type:
        return 1
    else:
        return 0

# Create a new column "Type 2" with the updated values
data_rf['Type_2'] = data_rf.apply(set_type, axis=1)
del data_rf[data_rf.columns[0]] #Dropping original type column
#Reordering columns with Type_2 as first column
cols = list(data_rf.columns)
cols = [cols[-1]] + cols[:-1]
data_rf = data_rf[cols]

# Save the updated DataFrame to a new CSV file (if needed)
data_rf.to_csv('sample_data_with_labels_NEW10.csv', index=False)

In [68]:
#Changing pandas data frame to numpy for use in ML
data_rf_np = data_rf.to_numpy()
target_1 = data_rf_np[:,0].reshape(-1,1) #Convert target variables to 2D-array for sci-kit learn
data_1 = data_rf_np[:,1:]
#class_names=np.array([0.0,1.0])
#print(data_1.shape)
#print(data_1)

In [71]:
##Since this meets BA threshold. Let us run a quick RFE with large steps to estimate the exact number of features to retain
# get a list of models to evaluate
def get_models():
    models = dict()
    for i in range(30, 0, -10):
        rfe = RFE(estimator=LogisticRegression(solver = 'newton-cg', C = 100, penalty = 'l2'), n_features_to_select=i)
        model = LogisticRegression(solver = 'newton-cg', C = 100, penalty = 'l2')
        models[str(i)] = Pipeline(steps=[('s',rfe),('m',model)])
    return models

#Evaluate model
def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=8, n_repeats=3, random_state=1)
    scores = cross_val_score(model, data_1, target_1, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores

In [72]:
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, data_1, target_1)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>30 0.945 (0.060)
>20 0.945 (0.060)
>10 0.916 (0.069)


In [73]:
##Since this meets BA threshold. Let us run a quick RFE with large steps to estimate the exact number of features to retain
# get a list of models to evaluate
def get_models():
    models = dict()
    for i in range(1, 31):
        rfe = RFE(estimator=LogisticRegression(solver = 'newton-cg', C = 100, penalty = 'l2'), n_features_to_select=i)
        model = LogisticRegression(solver = 'newton-cg', C = 100, penalty = 'l2')
        models[str(i)] = Pipeline(steps=[('s',rfe),('m',model)])
    return models

#Evaluate model
def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=8, n_repeats=3, random_state=1)
    scores = cross_val_score(model, data_1, target_1, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores

In [74]:
# get the models to evaluate
models = get_models()
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, data_1, target_1)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>1 0.952 (0.044)
>2 0.949 (0.051)
>3 0.934 (0.053)
>4 0.927 (0.060)
>5 0.912 (0.062)
>6 0.913 (0.075)
>7 0.906 (0.075)
>8 0.916 (0.069)
>9 0.923 (0.068)
>10 0.916 (0.069)
>11 0.930 (0.067)
>12 0.942 (0.060)
>13 0.942 (0.060)
>14 0.942 (0.060)
>15 0.945 (0.060)
>16 0.945 (0.060)
>17 0.945 (0.060)
>18 0.945 (0.060)
>19 0.945 (0.060)
>20 0.945 (0.060)
>21 0.945 (0.060)
>22 0.945 (0.060)
>23 0.945 (0.060)
>24 0.945 (0.060)
>25 0.945 (0.060)
>26 0.945 (0.060)
>27 0.945 (0.060)
>28 0.945 (0.060)
>29 0.945 (0.060)
>30 0.945 (0.060)


In [79]:
# Your existing code for feature selection
rfe = RFE(estimator=LogisticRegression(solver='newton-cg', C=100, penalty='l2'), n_features_to_select=5)
rfe.fit(data_1, target_1.ravel())
for i in range(data_1.shape[1]):
 print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))

# Load feature names from the second dataset
labels_dat = pd.read_csv("240128-NTA-Normalized-Labels-Feature.csv")
feature_names = labels_dat.columns

# Print selected features with their names
selected_feature_indices = [i for i in range(len(rfe.support_)) if rfe.support_[i]]

for index in selected_feature_indices:
    print('Feature Name: %s, Index: %d, Rank: %.3f' % (feature_names[index], index, rfe.ranking_[index]))

Column: 0, Selected False, Rank: 5.000
Column: 1, Selected False, Rank: 24.000
Column: 2, Selected False, Rank: 6.000
Column: 3, Selected False, Rank: 17.000
Column: 4, Selected False, Rank: 12.000
Column: 5, Selected False, Rank: 13.000
Column: 6, Selected False, Rank: 25.000
Column: 7, Selected False, Rank: 3.000
Column: 8, Selected False, Rank: 2.000
Column: 9, Selected False, Rank: 22.000
Column: 10, Selected False, Rank: 7.000
Column: 11, Selected False, Rank: 23.000
Column: 12, Selected False, Rank: 11.000
Column: 13, Selected False, Rank: 15.000
Column: 14, Selected False, Rank: 16.000
Column: 15, Selected False, Rank: 14.000
Column: 16, Selected False, Rank: 21.000
Column: 17, Selected True, Rank: 1.000
Column: 18, Selected True, Rank: 1.000
Column: 19, Selected True, Rank: 1.000
Column: 20, Selected False, Rank: 9.000
Column: 21, Selected False, Rank: 10.000
Column: 22, Selected False, Rank: 20.000
Column: 23, Selected False, Rank: 26.000
Column: 24, Selected False, Rank: 19.0

In [None]:
selected_features_df = pd.DataFrame({
    'Feature Name': [feature_names[index] for index in selected_feature_indices]
})

# Export the DataFrame to an Excel file
selected_features_df.to_excel("RFE_selected_features_norm_PG.xlsx", index=False)