                                    Forward Selection

Forward selection is an iterative method where features are added to the model one at a time, starting with the most significant or promising feature, and continuing until a stopping criterion is met.

In [7]:
import pandas as pd
import numpy as np
import time
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SequentialFeatureSelector as sfs

The SequentialFeatureSelector (SFS) from sklearn.feature_selection is a versatile tool for feature selection in scikit-learn. It implements greedy forward and backward selection: at each step it adds (forward) or removes (backward) the single feature that gives the best cross-validated score for the chosen estimator.

Scoring Functions: It supports different scoring functions to evaluate the quality of candidate feature subsets, such as accuracy, ROC AUC, or mean squared error. You select one with the scoring parameter; when scoring=None (as in the code below), the estimator's own score method is used, which is mean accuracy for classifiers.

In [40]:
#SFS_Feature: Performs feature selection using SequentialFeatureSelector with two models (LogisticRegression and RandomForestClassifier).
#It selects 2 features based on forward selection (direction="forward")
def SFS_Feature(indep_X,dep_Y,n_features=2):
    """Run forward sequential feature selection with two different estimators.

    A SequentialFeatureSelector is fitted once around a LogisticRegression and
    once around a RandomForestClassifier; each fitted selector then reduces
    ``indep_X`` to the columns it selected.

    Parameters
    ----------
    indep_X
        Feature matrix handed to the selector's ``fit`` and ``transform``.
    dep_Y
        Target values handed to the selector's ``fit``.
    n_features
        Forwarded as ``n_features_to_select``. Defaults to 2, the value that
        was previously hard-coded, so existing two-argument calls are unchanged.

    Returns
    -------
    list
        Two transformed feature matrices, in the order
        [LogisticRegression selector, RandomForestClassifier selector].
    """
    estimators=[
        LogisticRegression(solver='lbfgs'),
        RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0),
    ]

    selected_matrices=[]
    for estimator in estimators:
        selector = sfs(estimator,n_features_to_select=n_features, tol=None, direction="forward",
                       scoring=None, cv=None, n_jobs=None)
        # fit() returns the fitted selector, so the reduction can be chained.
        selected_matrices.append(selector.fit(indep_X,dep_Y).transform(indep_X))
    return selected_matrices
#Feature_Name: Retrieves the names of the features selected by SequentialFeatureSelector,
#using the same models and parameters as SFS_Feature.
def Feature_Name(indep_X,dep_Y,n_features=2):
    """Return the names of the features chosen by forward sequential selection.

    A SequentialFeatureSelector is fitted once around a LogisticRegression and
    once around a RandomForestClassifier, and the selected feature names are
    read from each fitted selector.

    Parameters
    ----------
    indep_X
        Feature matrix handed to the selector's ``fit``.
    dep_Y
        Target values handed to the selector's ``fit``.
    n_features
        Forwarded as ``n_features_to_select``. Defaults to 2, the value that
        was previously hard-coded, so existing two-argument calls are unchanged.

    Returns
    -------
    list
        Two arrays of selected feature names, in the order
        [LogisticRegression selector, RandomForestClassifier selector].
    """
    estimators=[
        LogisticRegression(solver='lbfgs'),
        RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0),
    ]

    selected_names=[]
    for estimator in estimators:
        selector = sfs(estimator,n_features_to_select=n_features, tol=None, direction="forward",
                       scoring=None, cv=None, n_jobs=None)
        selector.fit(indep_X,dep_Y)
        # Only the names are needed here, so the data is not transformed.
        selected_names.append(selector.get_feature_names_out(input_features=None))
    return selected_names

def split_scalar(indep_X,dep_Y):
    """Split the data 75/25 into train and test parts and standardise the features.

    The StandardScaler is fitted on the training features only and then
    applied to the test features. The split uses ``random_state=0``.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)`` with both feature parts scaled.
    """
    raw_train,raw_test,Y_train,Y_test=train_test_split(
        indep_X,dep_Y,test_size=0.25,random_state=0
    )
    scaler=StandardScaler()
    scaled_train=scaler.fit_transform(raw_train)
    scaled_test=scaler.transform(raw_test)
    return scaled_train,scaled_test,Y_train,Y_test

def cm_prediction(classifier,X_test,y_true=None):
    """Predict on ``X_test`` and score the predictions against the true labels.

    Parameters
    ----------
    classifier
        Fitted estimator; only its ``predict`` method is used.
    X_test
        Features to predict on.
    y_true
        True labels for ``X_test``. When omitted, the module-level variable
        ``Y_test`` is used, which is what this function always read before
        the parameter existed.

    Returns
    -------
    tuple
        ``(classifier, accuracy, report, X_test, y_true, cm)`` where ``cm`` is
        the confusion matrix and ``report`` the text classification report.

    Raises
    ------
    NameError
        If ``y_true`` is omitted and no module-level ``Y_test`` exists.
    """
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import classification_report
    from sklearn.metrics import confusion_matrix

    y_pred=classifier.predict(X_test)

    if y_true is None:
        # Backward-compatible fallback for callers that pass only two arguments.
        y_true=Y_test

    cm=confusion_matrix(y_true,y_pred)
    Accuracy=accuracy_score(y_true,y_pred)
    report=classification_report(y_true,y_pred)
    return classifier,Accuracy,report,X_test,y_true,cm

def logistic(X_train,Y_train,X_test):
    """Fit a LogisticRegression (random_state=0) and evaluate it on ``X_test``.

    Returns the six values produced by ``cm_prediction``:
    ``(classifier, accuracy, report, X_test, Y_test, cm)``.
    """
    from sklearn.linear_model import LogisticRegression
    model=LogisticRegression(random_state=0)
    model.fit(X_train,Y_train)
    fitted,acc,summary,features,labels,matrix=cm_prediction(model,X_test)
    return fitted,acc,summary,features,labels,matrix

def svm_linear(X_train,Y_train,X_test):
    """Fit a linear-kernel SVC (random_state=0) and evaluate it on ``X_test``.

    Returns the six values produced by ``cm_prediction``:
    ``(classifier, accuracy, report, X_test, Y_test, cm)``.
    """
    from sklearn.svm import SVC
    model=SVC(kernel="linear",random_state=0)
    model.fit(X_train,Y_train)
    fitted,acc,summary,features,labels,matrix=cm_prediction(model,X_test)
    return fitted,acc,summary,features,labels,matrix
    
def svm_NL(X_train,Y_train,X_test):
    """Fit an RBF-kernel SVC (random_state=0) and evaluate it on ``X_test``.

    Returns the six values produced by ``cm_prediction``:
    ``(classifier, accuracy, report, X_test, Y_test, cm)``.
    """
    from sklearn.svm import SVC
    model=SVC(kernel="rbf",random_state=0)
    model.fit(X_train,Y_train)
    fitted,acc,summary,features,labels,matrix=cm_prediction(model,X_test)
    return fitted,acc,summary,features,labels,matrix
    
def Naive(X_train,Y_train,X_test):
    """Fit a GaussianNB classifier and evaluate it on ``X_test``.

    Returns the six values produced by ``cm_prediction``:
    ``(classifier, accuracy, report, X_test, Y_test, cm)``.
    """
    from sklearn.naive_bayes import GaussianNB
    model=GaussianNB()
    model.fit(X_train,Y_train)
    fitted,acc,summary,features,labels,matrix=cm_prediction(model,X_test)
    return fitted,acc,summary,features,labels,matrix
    
def knn(X_train,Y_train,X_test):
    """Fit a 5-nearest-neighbours classifier (Minkowski metric, p=2) and evaluate it.

    Returns the six values produced by ``cm_prediction``:
    ``(classifier, accuracy, report, X_test, Y_test, cm)``.
    """
    from sklearn.neighbors import KNeighborsClassifier
    model=KNeighborsClassifier(n_neighbors=5,metric='minkowski',p=2)
    model.fit(X_train,Y_train)
    fitted,acc,summary,features,labels,matrix=cm_prediction(model,X_test)
    return fitted,acc,summary,features,labels,matrix

def DecisionTree(X_train,Y_train,X_test):
    """Fit an entropy-criterion DecisionTreeClassifier (random_state=0) and evaluate it.

    Returns the six values produced by ``cm_prediction``:
    ``(classifier, accuracy, report, X_test, Y_test, cm)``.
    """
    from sklearn.tree import DecisionTreeClassifier
    model=DecisionTreeClassifier(criterion="entropy",random_state=0)
    model.fit(X_train,Y_train)
    fitted,acc,summary,features,labels,matrix=cm_prediction(model,X_test)
    return fitted,acc,summary,features,labels,matrix

def RandomForest(X_train,Y_train,X_test):
    """Fit a 10-tree entropy-criterion RandomForestClassifier (random_state=0) and evaluate it.

    Returns the six values produced by ``cm_prediction``:
    ``(classifier, accuracy, report, X_test, Y_test, cm)``.
    """
    from sklearn.ensemble import RandomForestClassifier
    model=RandomForestClassifier(n_estimators=10, criterion="entropy",random_state=0)
    model.fit(X_train,Y_train)
    fitted,acc,summary,features,labels,matrix=cm_prediction(model,X_test)
    return fitted,acc,summary,features,labels,matrix

In [41]:
def rfe_classification(acclog,accsvm1,accsvmn1,accknn,accnav,accdes,accrf):
    """Tabulate classifier accuracies per feature-selection model.

    Each argument is a sequence of accuracies for one downstream classifier;
    position 0 belongs to the "Logistic" row and position 1 to the "Random"
    row of the result.

    Parameters
    ----------
    acclog, accsvm1, accsvmn1, accknn, accnav, accdes, accrf
        Indexable sequences with at least two entries each, filling the
        columns "Logistic", "SVM1", "SVMn1", "KNN", "Naive", "Decision" and
        "Random" respectively.

    Returns
    -------
    pandas.DataFrame
        Two rows ("Logistic", "Random") by seven classifier columns.

    Raises
    ------
    IndexError
        If any sequence has fewer than two entries.
    """
    scores_by_column={
        "Logistic":acclog,
        "SVM1":accsvm1,
        "SVMn1":accsvmn1,
        "KNN":accknn,
        "Naive":accnav,
        "Decision":accdes,
        "Random":accrf,
    }
    rfedataframe=pd.DataFrame(index=["Logistic","Random"],columns=list(scores_by_column))
    for number,idex in enumerate(rfedataframe.index):
        for column,scores in scores_by_column.items():
            # Single .loc assignment: chained df[col][row] = v writes to a
            # temporary under pandas Copy-on-Write and would leave NaN here.
            rfedataframe.loc[idex,column]=scores[number]
    return rfedataframe

In [42]:
# Load the preprocessed data and one-hot encode it, dropping the first level
# of every encoded column.
dataset1=pd.read_csv("prep.csv",index_col=None)
df2=dataset1
df=pd.get_dummies(df2,drop_first=True)

# "classification_yes" is the target; every other column is a feature.
dep_Y=df["classification_yes"]
indep_X=df.drop(columns="classification_yes")

In [43]:
# One reduced feature matrix per selector model (see SFS_Feature).
SFSlist=SFS_Feature(indep_X,dep_Y)

# One accuracy accumulator per downstream classifier; each gets one entry
# per item of SFSlist.
acclog,accsvm1,accsvmn1,accknn,accnav,accdes,accrf=[],[],[],[],[],[],[]

In [12]:
SFSlist

[array([[  3.        ,   0.        , 137.52875399,  12.51815562,
          38.86890244,   0.        ],
        [  2.        ,   0.        , 137.52875399,  10.7       ,
          34.        ,   0.        ],
        [  1.        ,   0.        , 138.        ,  12.        ,
          34.        ,   0.        ],
        ...,
        [  3.        ,   0.        , 134.        ,   9.1       ,
          26.        ,   0.        ],
        [  0.        ,   0.        , 142.        ,   8.5       ,
          38.86890244,   0.        ],
        [  0.        ,   0.        , 140.        ,  16.3       ,
          53.        ,   0.        ]]),
 array([[  3.        ,  57.48210526,  38.86890244,   1.        ,
           0.        ,   0.        ],
        [  2.        ,  22.        ,  34.        ,   1.        ,
           0.        ,   0.        ],
        [  1.        ,  23.        ,  34.        ,   0.        ,
           0.        ,   0.        ],
        ...,
        [  3.        , 115.        ,  26.    

In [44]:
Feature_Name=Feature_Name(indep_X,dep_Y)

In [14]:
Feature_Name

[array(['al', 'hrmo'], dtype=object), array(['al', 'pcv'], dtype=object)]

In [45]:
# Each evaluation function is paired with the list that collects its accuracy.
evaluators=[
    (logistic,acclog),
    (svm_linear,accsvm1),
    (svm_NL,accsvmn1),
    (knn,accknn),
    (Naive,accnav),
    (DecisionTree,accdes),
    (RandomForest,accrf),
]

# Start from empty accumulators so that re-running this cell does not append
# to an earlier run; rfe_classification only reads the first two entries of
# each list and would otherwise report stale numbers.
for _,scores in evaluators:
    scores.clear()

for i in SFSlist:
    # Y_test is assigned at module level here; cm_prediction reads it as a global.
    X_train,X_test,Y_train,Y_test=split_scalar(i,dep_Y)

    for evaluate,scores in evaluators:
        classifier,accuracy,report,X_test,Y_test,cm=evaluate(X_train,Y_train,X_test)
        scores.append(accuracy)

result=rfe_classification(acclog,accsvm1,accsvmn1,accknn,accnav,accdes,accrf)


In [16]:
result
#6

Unnamed: 0,Logistic,SVM1,SVMn1,KNN,Naive,Decision,Random
Logistic,1.0,0.99,0.99,1.0,0.87,1.0,1.0
Random,0.97,0.96,0.96,0.98,0.96,0.99,0.99


In [24]:
result
#5

Unnamed: 0,Logistic,SVM1,SVMn1,KNN,Naive,Decision,Random
Logistic,0.98,0.98,0.99,0.98,0.87,0.98,0.98
Random,0.96,0.96,0.96,0.99,0.96,0.99,0.99


In [30]:
result
#4

Unnamed: 0,Logistic,SVM1,SVMn1,KNN,Naive,Decision,Random
Logistic,0.98,0.97,0.99,0.98,0.83,0.95,0.97
Random,0.95,0.94,0.96,0.99,0.93,0.99,0.99


In [38]:
result
#3

Unnamed: 0,Logistic,SVM1,SVMn1,KNN,Naive,Decision,Random
Logistic,0.97,0.94,0.97,0.99,0.82,0.99,0.99
Random,0.98,0.95,0.96,0.99,0.91,0.99,0.99


In [46]:
result
#2

Unnamed: 0,Logistic,SVM1,SVMn1,KNN,Naive,Decision,Random
Logistic,0.94,0.94,0.94,0.95,0.82,0.97,0.94
Random,0.93,0.92,0.93,0.96,0.81,0.96,0.96
