In [14]:
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [15]:
dataset=pd.read_csv("prep.csv")
dataset=pd.get_dummies(dataset,drop_first=True)

In [16]:
x_indep=dataset.drop('classification_yes',axis=1)
y_dep=dataset['classification_yes']

In [17]:
def rfeFeature(x_indep,y_dep,n):
    from sklearn.feature_selection import RFE
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.svm import SVC
    
    rfeList=[]
    models=[
        LogisticRegression(solver='lbfgs'),
        RandomForestClassifier(n_estimators=10,criterion='entropy',random_state=0),
        DecisionTreeClassifier(criterion='gini',max_features='sqrt',splitter='best',random_state=0),
        SVC(kernel='linear',random_state=0)
    ]

    for model in models:
        rfe=RFE(estimator=model, n_features_to_select=n)
        transformed = rfe.fit_transform(x_indep, y_dep)
        rfeList.append(transformed)
    return rfeList

In [18]:
def split_scaler(x_indep,y_dep):
    from sklearn.model_selection import train_test_split
    x_train,x_test,y_train,y_test=train_test_split(x_indep,y_dep,test_size=0.25,random_state=0)
    
    from sklearn.preprocessing import StandardScaler
    sc=StandardScaler()
    x_train=sc.fit_transform(x_train)
    x_test=sc.transform(x_test)
    return x_train,x_test,y_train,y_test

def cm_prediction(classifier,x_test,y_test):
    y_pred=classifier.predict(x_test)
    from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
    cm=confusion_matrix(y_test,y_pred)
    acscore=accuracy_score(y_test,y_pred)
    creport=classification_report(y_test,y_pred)
    return cm,acscore,creport

In [19]:
def logistic(x_train,x_test,y_train,y_test):
    from sklearn.linear_model import LogisticRegression
    classifier=LogisticRegression(random_state=0)
    classifier.fit(x_train,y_train)
    
    cm,acscore,creport=cm_prediction(classifier,x_test,y_test)
    return cm,acscore,creport

def svm_linear(x_train,x_test,y_train,y_test):
    from sklearn.svm import SVC
    classifier=SVC(kernel='linear', random_state=0)
    classifier.fit(x_train,y_train)
    
    cm,acscore,creport=cm_prediction(classifier,x_test,y_test)
    return cm,acscore,creport

def svm_NL(x_train,x_test,y_train,y_test):
    from sklearn.svm import SVC
    classifier=SVC(kernel='rbf', random_state=0)
    classifier.fit(x_train,y_train)
    cm,acscore,creport=cm_prediction(classifier,x_test,y_test)
    return cm,acscore,creport

def Navie(x_train,x_test,y_train,y_test):
    from sklearn.naive_bayes import GaussianNB
    classifier=GaussianNB()
    classifier.fit(x_train,y_train)
    cm,acscore,creport=cm_prediction(classifier,x_test,y_test)
    return cm,acscore,creport

def knn(x_train,x_test,y_train,y_test):
    from sklearn.neighbors import KNeighborsClassifier
    classifier=KNeighborsClassifier(n_neighbors=5,metric='minkowski',p=2)
    classifier.fit(x_train,y_train)
    cm,acscore,creport=cm_prediction(classifier,x_test,y_test)
    return cm,acscore,creport


def decision(x_train,x_test,y_train,y_test):
    from sklearn.tree import DecisionTreeClassifier
    classifier=DecisionTreeClassifier(criterion='entropy',random_state=0)
    classifier.fit(x_train,y_train)
    cm,acscore,creport=cm_prediction(classifier,x_test,y_test)
    return cm,acscore,creport

def random(x_train,x_test,y_train,y_test):
    from sklearn.ensemble import RandomForestClassifier
    classifier=RandomForestClassifier(n_estimators=10,criterion='entropy',random_state=0)
    classifier.fit(x_train,y_train)
    cm,acscore,creport=cm_prediction(classifier,x_test,y_test)
    return cm,acscore,creport


In [20]:
rfeList=rfeFeature(x_indep,y_dep,4)

acc_log=[]
acc_svml=[]
acc_svmnl=[]
acc_navie=[]
acc_knn=[]
acc_dec=[]
acc_rand=[]

for i in rfeList:
    x_train,x_test,y_train,y_test = split_scaler(i,y_dep)
    
    cm,acscore,creport = logistic(x_train,x_test,y_train,y_test)
    acc_log.append(acscore)
        
    cm,acscore,creport = svm_linear(x_train,x_test,y_train,y_test)
    acc_svml.append(acscore)
    
    cm,acscore,creport = svm_NL(x_train,x_test,y_train,y_test)
    acc_svmnl.append(acscore)
    
    cm,acscore,creport = Navie(x_train,x_test,y_train,y_test)
    acc_navie.append(acscore)
    
    cm,acscore,creport = knn(x_train,x_test,y_train,y_test)
    acc_knn.append(acscore)
    
    cm,acscore,creport = decision(x_train,x_test,y_train,y_test)
    acc_dec.append(acscore)
    
    cm,acscore,creport = random(x_train,x_test,y_train,y_test)
    acc_rand.append(acscore)

def cm_selection(acc_log,acc_svml,acc_svmnl,acc_navie,acc_knn,acc_dec,acc_rand):
    df=pd.DataFrame(index=['Logistic','Random','Decision','SVM'],columns=['Logistic','svm_linear','svm_NL','Navie','Knn','Decision','Random'])
    
    for number, idex in enumerate(df.index):
        df['Logistic'][idex]=acc_log[number]
        df['svm_linear'][idex]=acc_svml[number]
        df['svm_NL'][idex]=acc_svml[number]
        df['Navie'][idex]=acc_svml[number]
        df['Knn'][idex]=acc_svml[number]
        df['Decision'][idex]=acc_svml[number]
        df['Random'][idex]=acc_svml[number]
    return df


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [21]:
df=cm_selection(acc_log,acc_svml,acc_svmnl,acc_navie,acc_knn,acc_dec,acc_rand)     
print(df)

         Logistic svm_linear svm_NL Navie   Knn Decision Random
Logistic     0.95       0.95   0.95  0.95  0.95     0.95   0.95
Random       0.97       0.97   0.97  0.97  0.97     0.97   0.97
Decision     0.98       0.98   0.98  0.98  0.98     0.98   0.98
SVM          0.96       0.96   0.96  0.96  0.96     0.96   0.96
