In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.feature_selection import RFE

In [2]:
def split_scalar(indep_X, dep_Y):
    """Split the data into train/test parts and standardize the features.

    A quarter of the rows is held out for testing (``test_size=0.25``) with a
    fixed ``random_state=0``. The scaler is fitted on the training features
    only and then applied to both feature sets; the targets are returned
    unscaled.

    Parameters:
        indep_X: feature matrix.
        dep_Y: target values aligned with ``indep_X``.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` where the two feature
        parts have been standardized.
    """
    train_features, test_features, train_target, test_target = train_test_split(
        indep_X, dep_Y, test_size=0.25, random_state=0
    )
    # Fit on the training part only so no test statistics leak into scaling.
    scaler = StandardScaler().fit(train_features)
    return (
        scaler.transform(train_features),
        scaler.transform(test_features),
        train_target,
        test_target,
    )

In [3]:
def r2_prediction(regressor, X_test, y_test):
    """Return the R² score of ``regressor`` on the held-out data.

    Parameters:
        regressor: an already fitted model exposing ``predict``.
        X_test: features to predict on.
        y_test: true target values for ``X_test``.

    Returns:
        The value of ``r2_score(y_test, predictions)``.
    """
    return r2_score(y_test, regressor.predict(X_test))

In [4]:
def Linear(X_train, y_train, X_test, y_test):
    """Fit a ``LinearRegression`` on the training split and score it on the test split.

    Returns:
        The R² score computed by ``r2_prediction`` on ``X_test`` / ``y_test``.
    """
    fitted = LinearRegression().fit(X_train, y_train)
    return r2_prediction(fitted, X_test, y_test)

In [5]:
def svm_linear(X_train, y_train, X_test, y_test):
    """Fit a support vector regressor with a linear kernel and score it on the test split.

    Returns:
        The R² score computed by ``r2_prediction`` on ``X_test`` / ``y_test``.
    """
    fitted = SVR(kernel='linear').fit(X_train, y_train)
    return r2_prediction(fitted, X_test, y_test)

In [6]:
def svm_NL(X_train, y_train, X_test, y_test):
    """Fit a support vector regressor with the RBF kernel and score it on the test split.

    Returns:
        The R² score computed by ``r2_prediction`` on ``X_test`` / ``y_test``.
    """
    fitted = SVR(kernel='rbf').fit(X_train, y_train)
    return r2_prediction(fitted, X_test, y_test)

In [7]:
def Decision(X_train, y_train, X_test, y_test):
    """Fit a ``DecisionTreeRegressor`` (``random_state=0``) and score it on the test split.

    Returns:
        The R² score computed by ``r2_prediction`` on ``X_test`` / ``y_test``.
    """
    fitted = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
    return r2_prediction(fitted, X_test, y_test)

In [8]:
def random(X_train, y_train, X_test, y_test):
    """Fit a 10-tree ``RandomForestRegressor`` (``random_state=0``) and score it on the test split.

    Note: this function is named ``random``, so it would shadow the standard
    library ``random`` module if that were imported into the same namespace.

    Returns:
        The R² score computed by ``r2_prediction`` on ``X_test`` / ``y_test``.
    """
    fitted = RandomForestRegressor(n_estimators=10, random_state=0).fit(X_train, y_train)
    return r2_prediction(fitted, X_test, y_test)

In [9]:
def rfeFeature(indep_X, dep_Y, n):
    """Reduce ``indep_X`` to ``n`` features with recursive feature elimination.

    The elimination is driven by a ``LogisticRegression`` estimator
    (``max_iter=10000``) fitted against ``dep_Y``.

    Parameters:
        indep_X: feature matrix to select from.
        dep_Y: target values used to rank the features.
        n: number of features to keep.

    Returns:
        A one-element list holding the transformed (reduced) feature matrix.
    """
    selector = RFE(
        estimator=LogisticRegression(max_iter=10000),
        n_features_to_select=n,
    )
    fitted_selector = selector.fit(indep_X, dep_Y)
    # Wrapped in a list because callers iterate over the returned value.
    return [fitted_selector.transform(indep_X)]

In [10]:
# Read the prepared CSV, dummy-encode it (dropping the first level of each
# encoded column), then separate the target column from the predictors.
dataset1 = pd.read_csv("prep.csv", index_col=None)
df2 = pd.get_dummies(data=dataset1, drop_first=True)
dep_Y = df2.loc[:, 'classification_yes']
indep_X = df2.drop(columns=['classification_yes'])

In [27]:
# Feature selection with RFE.
# N_FEATURES is the number of features RFE keeps. It is named here so that
# changing the feature count is an explicit, visible edit rather than a
# literal buried in the call.
N_FEATURES = 6
rfelist = rfeFeature(indep_X, dep_Y, N_FEATURES)

In [28]:
# One accumulator per model; each collects that model's test-set scores.
acclin = []    # Linear
accsvml = []   # svm_linear
accsvmnl = []  # svm_NL
accdes = []    # Decision
accrf = []     # random

In [29]:
# Score every model on each RFE-selected feature set.
#
# The accumulators are emptied first so that re-running this cell replaces the
# scores instead of appending duplicates. They are cleared in place (not
# rebound) so any object that already references these lists stays in sync.
for scores in (acclin, accsvml, accsvmnl, accdes, accrf):
    scores.clear()

# NOTE(review): dep_Y is the dummy-encoded 'classification_yes' column, yet the
# models below are regressors scored with R² — confirm this is intended.
for selected_X in rfelist:
    X_train, X_test, y_train, y_test = split_scalar(selected_X, dep_Y)
    acclin.append(Linear(X_train, y_train, X_test, y_test))
    accsvml.append(svm_linear(X_train, y_train, X_test, y_test))
    accsvmnl.append(svm_NL(X_train, y_train, X_test, y_test))
    accdes.append(Decision(X_train, y_train, X_test, y_test))
    accrf.append(random(X_train, y_train, X_test, y_test))

In [30]:
# Map each model's display name to its list of collected scores.
result = {}
result['Linear Regression'] = acclin
result['SVM Linear'] = accsvml
result['SVM Non-linear'] = accsvmnl
result['Decision Tree'] = accdes
result['Random Forest'] = accrf

In [26]:
# Display the R² scores collected for each model.
# NOTE(review): the "3" presumably records that this saved output came from a
# run with 3 RFE-selected features — confirm. Its execution count (26) is lower
# than that of the feature-selection cell (27), so the output is from an earlier
# run and a fresh top-to-bottom run will not reproduce it.
# 3
result

{'Linear Regression': [0.6018452016791436],
 'SVM Linear': [0.415993665811769],
 'SVM Non-linear': [0.7482495830867761],
 'Decision Tree': [0.7756440140657468],
 'Random Forest': [0.7745261062636659]}

In [21]:
# Display the R² scores collected for each model.
# NOTE(review): the "5" presumably records that this saved output came from a
# run with 5 RFE-selected features — confirm. Its execution count (21) is lower
# than that of the feature-selection cell (27), so the output is from an earlier
# run and a fresh top-to-bottom run will not reproduce it.
# 5
result

{'Linear Regression': [0.6354782339330636],
 'SVM Linear': [0.5825676210731331],
 'SVM Non-linear': [0.9017744940680847],
 'Decision Tree': [0.9177182227989892],
 'Random Forest': [0.9172744732842161]}

In [31]:
# Display the R² scores collected for each model.
# NOTE(review): the "6" presumably matches the 6 features requested from
# rfeFeature in the feature-selection cell — confirm. This cell's execution
# count (31) directly follows the selection/evaluation cells (27-30).
# 6
result

{'Linear Regression': [0.7058376937901304],
 'SVM Linear': [0.6678384748123201],
 'SVM Non-linear': [0.9314388008374008],
 'Decision Tree': [0.974312641723356],
 'Random Forest': [0.9518705772708939]}