In [1]:
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, StratifiedKFold, KFold
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, classification_report, r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif, RFE, chi2, SequentialFeatureSelector, f_regression
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from lazypredict.Supervised import LazyClassifier, LazyRegressor
from IPython.display import display
import statsmodels.api as sm
from sklearn.naive_bayes import GaussianNB
from statsmodels.api import OLS

pd.options.display.float_format = '{:,.4f}'.format

In [2]:
# Data Loader loads data from UCI-ML Repo
def load_datasets():
    # Classification Datasets
    c_cancer = pd.read_csv("./Classification.CancerMB.csv").iloc[:, :32]
    # Regression Datasets
    r_life_expectancy = pd.read_csv("./Regression.Life.Expectancy.csv")
    
    return c_cancer, r_life_expectancy

In [3]:
# This function performs a missing value analysis on each column of the dataset, helps you decide on what to do in cleaning process
def null_check(df):
    null_columns = []
    for column in df.columns:
        print("Column Name:", column)
        print("Column DataType:", df[column].dtype)
        if df[column].dtype != 'float64' and df[column].dtype != 'int64':
            print("Column unique values:", df[column].unique())
        print("Column has null:", df[column].isnull().any())

        
        if df[column].isnull().any() == True:
            print("Column Null Count:", df[column].isnull().sum())
            null_columns.append(column)
        print("\n")
    return null_columns

In [4]:
# This function drops any null columns and missing values
# This is where you decide whether to remove NULL rows (which will reduce the size of Dataset) or remove NULL columns entirely. You can also choose a combination of both.
def clean_data(df, drop_columns, missing_value = False):
    # Remove unnecessary columns
    df.drop(drop_columns, axis=1, inplace=True)
    # Drop rows with any missing values
    if missing_value == False:
        df.dropna(inplace=True)
    else:
        df.fillna(missing_value, inplace=True)
    return df

In [5]:
# Transforms categorical and numberical data into numerical data
def transform_data(df):
    # Encode categorical variables
    label_encoder = LabelEncoder()
    # print("Categorical columns:", df.select_dtypes(include=['object']).columns)
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = label_encoder.fit_transform(df[col])
    
    # Standardize numerical features
    scaler = StandardScaler()
    # print("Numerical columns:", df.select_dtypes(include=['float64', 'int64']).columns)
    numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
    if len(numerical_cols) > 0:
        df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
    return df

# Classificiation Algorithms

In [6]:
def fwpMethodsKNN(df, target_name):
    def knn_classifier(X_selected, y, k_fold):
        knn_classifier = KNeighborsClassifier()
        knn_accuracy = cross_val_score(knn_classifier, X_selected, y, cv=k_fold, scoring='accuracy').mean()
        return knn_accuracy

    # Separate features and targets
    X = df.drop(target_name, axis=1)
    y = df[target_name]
    results = []
    benchmarks = []

    # Apply MinMax scaling to ensure non-negative values
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    # Initialize k-fold cross-validation where folds = 5
    k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    metrics = ['Accuracy', 'F1 Score (Positive)', 'F1 Score (Negative)', 'AUC']

    # Apply Benchmarks
    benchmark_score = knn_classifier(X_scaled, y, k_fold)
    benchmarks.append({'Classifier': 'K-Nearest Neighbors', 'Accuracy': benchmark_score})

    # Loop through different feature selection methods
    for k in [3, 5, 10]:
        for method in ['Chi-Squared', 'Forward Feature Selection (FFS)', 'Backward Feature Elimination (BFE)', 'PCA']:
            # Apply feature selection methods
            if method == 'Chi-Squared':
                X_selected = SelectKBest(score_func=chi2, k=k).fit_transform(X_scaled, y)
            elif method == 'Forward Feature Selection (FFS)':
                selector = SequentialFeatureSelector(KNeighborsClassifier(), n_features_to_select=k, direction='forward')
                X_selected = selector.fit_transform(X_scaled, y)
            elif method == 'Backward Feature Elimination (BFE)':
                selector = SequentialFeatureSelector(KNeighborsClassifier(), n_features_to_select=k, direction='backward')
                X_selected = selector.fit_transform(X_scaled, y)
            elif method == 'PCA':
                pca = PCA(n_components=k)
                X_selected = pca.fit_transform(X_scaled)

            # Calculate evaluation metrics
            for metric in metrics:
                if metric == 'Accuracy':
                    score = knn_classifier(X_selected, y, k_fold)
                    results.append({'k': k, 'Method': method, 'Classifier': 'K-Nearest Neighbors', 'Metric': metric, 'Value': score})
                elif metric == 'F1 Score (Positive)' or metric == 'F1 Score (Negative)':
                    y_pred = cross_val_predict(KNeighborsClassifier(), X_selected, y, cv=k_fold)
                    f1 = f1_score(y, y_pred, pos_label=1 if metric == 'F1 Score (Positive)' else 0)
                    results.append({'k': k, 'Method': method, 'Classifier': 'K-Nearest Neighbors', 'Metric': metric, 'Value': f1})
                elif metric == 'AUC':
                    scores = cross_val_predict(KNeighborsClassifier(), X_selected, y, cv=k_fold, method='predict_proba')
                    auc = roc_auc_score(y, scores[:, 1])
                    results.append({'k': k, 'Method': method, 'Classifier': 'K-Nearest Neighbors', 'Metric': metric, 'Value': auc})

    df_results = pd.DataFrame(results)
    df_benchmarks = pd.DataFrame(benchmarks)
    return df_results, df_benchmarks

In [7]:
def fwpMethodsLogisticRegression(df, target_name):
    def logistic_regression_classifier(X_selected, y, k_fold):
        lr_classifier = LogisticRegression()
        lr_accuracy = cross_val_score(lr_classifier, X_selected, y, cv=k_fold, scoring='accuracy').mean()
        return lr_accuracy

    # Separate features and targets
    X = df.drop(target_name, axis=1)
    y = df[target_name]
    results = []
    benchmarks = []

    # Apply MinMax scaling to ensure non-negative values
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    # Initialize k-fold cross-validation where folds = 5
    k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    metrics = ['Accuracy', 'F1 Score (Positive)', 'F1 Score (Negative)', 'AUC']

    # Apply Benchmarks
    benchmark_score = logistic_regression_classifier(X_scaled, y, k_fold)
    benchmarks.append({'Classifier': 'Logistic Regression', 'Accuracy': benchmark_score})

    # Loop through different feature selection methods
    for k in [3, 5, 10]:
        for method in ['Chi-Squared', 'Forward Feature Selection (FFS)', 'Backward Feature Elimination (BFE)', 'PCA']:
            # Apply feature selection methods
            if method == 'Chi-Squared':
                X_selected = SelectKBest(score_func=chi2, k=k).fit_transform(X_scaled, y)
            elif method == 'Forward Feature Selection (FFS)':
                selector = SequentialFeatureSelector(LogisticRegression(), n_features_to_select=k, direction='forward')
                X_selected = selector.fit_transform(X_scaled, y)
            elif method == 'Backward Feature Elimination (BFE)':
                selector = SequentialFeatureSelector(LogisticRegression(), n_features_to_select=k, direction='backward')
                X_selected = selector.fit_transform(X_scaled, y)
            elif method == 'PCA':
                pca = PCA(n_components=k)
                X_selected = pca.fit_transform(X_scaled)

            # Calculate evaluation metrics
            for metric in metrics:
                if metric == 'Accuracy':
                    score = logistic_regression_classifier(X_selected, y, k_fold)
                    results.append({'k': k, 'Method': method, 'Classifier': 'Logistic Regression', 'Metric': metric, 'Value': score})
                elif metric == 'F1 Score (Positive)' or metric == 'F1 Score (Negative)':
                    y_pred = cross_val_predict(LogisticRegression(), X_selected, y, cv=k_fold)
                    f1 = f1_score(y, y_pred, pos_label=1 if metric == 'F1 Score (Positive)' else 0)
                    results.append({'k': k, 'Method': method, 'Classifier': 'Logistic Regression', 'Metric': metric, 'Value': f1})
                elif metric == 'AUC':
                    scores = cross_val_predict(LogisticRegression(), X_selected, y, cv=k_fold, method='predict_proba')
                    auc = roc_auc_score(y, scores[:, 1])
                    results.append({'k': k, 'Method': method, 'Classifier': 'Logistic Regression', 'Metric': metric, 'Value': auc})

    df_results = pd.DataFrame(results)
    df_benchmarks = pd.DataFrame(benchmarks)
    return df_results, df_benchmarks

In [8]:
def fwpMethodsSVM(df, target_name):
    def svm_classifier(X_selected, y, k_fold):
        svm_classifier = SVC(probability=True)
        svm_accuracy = cross_val_score(svm_classifier, X_selected, y, cv=k_fold, scoring='accuracy').mean()
        return svm_accuracy

    # Separate features and targets
    X = df.drop(target_name, axis=1)
    y = df[target_name]
    results = []
    benchmarks = []

    # Apply MinMax scaling to ensure non-negative values
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    # Initialize k-fold cross-validation where folds = 5
    k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    metrics = ['Accuracy', 'F1 Score (Positive)', 'F1 Score (Negative)', 'AUC']

    # Apply Benchmarks
    benchmark_score = svm_classifier(X_scaled, y, k_fold)
    benchmarks.append({'Classifier': 'SVM', 'Accuracy': benchmark_score})

    # Loop through different feature selection methods
    for k in [3, 5, 10]:
        for method in ['Chi-Squared', 'Forward Feature Selection (FFS)', 'Backward Feature Elimination (BFE)', 'PCA']:
            # Apply feature selection methods
            if method == 'Chi-Squared':
                X_selected = SelectKBest(score_func=chi2, k=k).fit_transform(X_scaled, y)
            elif method == 'Forward Feature Selection (FFS)':
                selector = SequentialFeatureSelector(SVC(probability=True), n_features_to_select=k, direction='forward')
                X_selected = selector.fit_transform(X_scaled, y)
            elif method == 'Backward Feature Elimination (BFE)':
                selector = SequentialFeatureSelector(SVC(probability=True), n_features_to_select=k, direction='backward')
                X_selected = selector.fit_transform(X_scaled, y)
            elif method == 'PCA':
                pca = PCA(n_components=k)
                X_selected = pca.fit_transform(X_scaled)

            # Calculate evaluation metrics
            for metric in metrics:
                if metric == 'Accuracy':
                    score = svm_classifier(X_selected, y, k_fold)
                    results.append({'k': k, 'Method': method, 'Classifier': 'SVM', 'Metric': metric, 'Value': score})
                elif metric == 'F1 Score (Positive)' or metric == 'F1 Score (Negative)':
                    y_pred = cross_val_predict(SVC(probability=True), X_selected, y, cv=k_fold)
                    f1 = f1_score(y, y_pred, pos_label=1 if metric == 'F1 Score (Positive)' else 0)
                    results.append({'k': k, 'Method': method, 'Classifier': 'SVM', 'Metric': metric, 'Value': f1})
                elif metric == 'AUC':
                    scores = cross_val_predict(SVC(probability=True), X_selected, y, cv=k_fold, method='predict_proba')
                    auc = roc_auc_score(y, scores[:, 1])
                    results.append({'k': k, 'Method': method, 'Classifier': 'SVM', 'Metric': metric, 'Value': auc})

    df_results = pd.DataFrame(results)
    df_benchmarks = pd.DataFrame(benchmarks)
    return df_results, df_benchmarks

In [31]:
def fwpMethodsNaiveBayes(df, target_name):
    def naive_bayes_classifier(X_selected, y, k_fold):
        nb_classifier = GaussianNB()
        nb_accuracy = cross_val_score(nb_classifier, X_selected, y, cv=k_fold, scoring='accuracy').mean()
        return nb_accuracy
    
    def naive_bayes_auc(X_selected, y, k_fold):
        nb_classifier = GaussianNB()
        auc_scores = []
        for train_index, test_index in k_fold.split(X_selected, y):
            X_train, X_test = X_selected[train_index], X_selected[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            nb_classifier.fit(X_train, y_train)
            y_prob = nb_classifier.predict_proba(X_test)[:, 1]  # Probability estimates for the positive class
            auc = roc_auc_score(y_test, y_prob)
            auc_scores.append(auc)
        return np.mean(auc_scores)

    # Separate features and targets
    X = df.drop(target_name, axis=1)
    y = df[target_name]
    results = []
    benchmarks = []

    # Apply MinMax scaling to ensure non-negative values
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    # Initialize k-fold cross-validation where folds = 5
    k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    metrics = ['Accuracy', 'F1 Score (Positive)', 'F1 Score (Negative)', 'AUC']

    # Apply Benchmarks
    benchmark_score = naive_bayes_classifier(X_scaled, y, k_fold)
    benchmarks.append({'Classifier': 'Naive Bayes', 'Accuracy': benchmark_score})

    # Loop through different feature selection methods
    for k in [3, 5, 10]:
        for method in ['Chi-Squared', 'Forward Feature Selection (FFS)', 'Backward Feature Elimination (BFE)', 'PCA']:
            # Apply feature selection methods
            if method == 'Chi-Squared':
                X_selected = SelectKBest(score_func=chi2, k=k).fit_transform(X_scaled, y)
            elif method == 'Forward Feature Selection (FFS)':
                selector = SequentialFeatureSelector(GaussianNB(), n_features_to_select=k, direction='forward')
                X_selected = selector.fit_transform(X_scaled, y)
            elif method == 'Backward Feature Elimination (BFE)':
                selector = SequentialFeatureSelector(GaussianNB(), n_features_to_select=k, direction='backward')
                X_selected = selector.fit_transform(X_scaled, y)
            elif method == 'PCA':
                pca = PCA(n_components=k)
                X_selected = pca.fit_transform(X_scaled)

            # Calculate evaluation metrics
            for metric in metrics:
                if metric == 'Accuracy':
                    score = naive_bayes_classifier(X_selected, y, k_fold)
                    results.append({'k': k, 'Method': method, 'Classifier': 'Naive Bayes', 'Metric': metric, 'Value': score})
                elif metric == 'F1 Score (Positive)' or metric == 'F1 Score (Negative)':
                    y_pred = cross_val_predict(GaussianNB(), X_selected, y, cv=k_fold)
                    f1 = f1_score(y, y_pred, pos_label=1 if metric == 'F1 Score (Positive)' else 0)
                    results.append({'k': k, 'Method': method, 'Classifier': 'Naive Bayes', 'Metric': metric, 'Value': f1})
                elif metric == 'AUC':
                    auc_score = naive_bayes_auc(X_selected, y, k_fold)
                    results.append({'k': k, 'Method': method, 'Classifier': 'Naive Bayes', 'Metric': metric, 'Value': auc_score})
                    continue

    df_results = pd.DataFrame(results)
    df_benchmarks = pd.DataFrame(benchmarks)
    return df_results, df_benchmarks

In [10]:
def fwpMethodsDecisionTrees(df, target_name):
    def decision_trees_classifier(X_selected, y, k_fold):
        dt_classifier = DecisionTreeClassifier()
        dt_accuracy = cross_val_score(dt_classifier, X_selected, y, cv=k_fold, scoring='accuracy').mean()
        return dt_accuracy

    # Separate features and targets
    X = df.drop(target_name, axis=1)
    y = df[target_name]
    results = []
    benchmarks = []

    # Apply MinMax scaling to ensure non-negative values
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    # Initialize k-fold cross-validation where folds = 5
    k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    metrics = ['Accuracy', 'F1 Score (Positive)', 'F1 Score (Negative)', 'AUC']

    # Apply Benchmarks
    benchmark_score = decision_trees_classifier(X_scaled, y, k_fold)
    benchmarks.append({'Classifier': 'Decision Trees', 'Accuracy': benchmark_score})

    # Loop through different feature selection methods
    for k in [3, 5, 10]:
        for method in ['Chi-Squared', 'Forward Feature Selection (FFS)', 'Backward Feature Elimination (BFE)', 'PCA']:
            # Apply feature selection methods
            if method == 'Chi-Squared':
                X_selected = SelectKBest(score_func=chi2, k=k).fit_transform(X_scaled, y)
            elif method == 'Forward Feature Selection (FFS)':
                selector = SequentialFeatureSelector(DecisionTreeClassifier(), n_features_to_select=k, direction='forward')
                X_selected = selector.fit_transform(X_scaled, y)
            elif method == 'Backward Feature Elimination (BFE)':
                selector = SequentialFeatureSelector(DecisionTreeClassifier(), n_features_to_select=k, direction='backward')
                X_selected = selector.fit_transform(X_scaled, y)
            elif method == 'PCA':
                pca = PCA(n_components=k)
                X_selected = pca.fit_transform(X_scaled)

            # Calculate evaluation metrics
            for metric in metrics:
                if metric == 'Accuracy':
                    score = decision_trees_classifier(X_selected, y, k_fold)
                    results.append({'k': k, 'Method': method, 'Classifier': 'Decision Trees', 'Metric': metric, 'Value': score})
                elif metric == 'F1 Score (Positive)' or metric == 'F1 Score (Negative)':
                    y_pred = cross_val_predict(DecisionTreeClassifier(), X_selected, y, cv=k_fold)
                    f1 = f1_score(y, y_pred, pos_label=1 if metric == 'F1 Score (Positive)' else 0)
                    results.append({'k': k, 'Method': method, 'Classifier': 'Decision Trees', 'Metric': metric, 'Value': f1})
                elif metric == 'AUC':
                    scores = cross_val_predict(DecisionTreeClassifier(), X_selected, y, cv=k_fold, method='predict_proba')
                    auc = roc_auc_score(y, scores[:, 1])
                    results.append({'k': k, 'Method': method, 'Classifier': 'Decision Trees', 'Metric': metric, 'Value': auc})

    df_results = pd.DataFrame(results)
    df_benchmarks = pd.DataFrame(benchmarks)
    return df_results, df_benchmarks

In [11]:
def fwpMethodsLogisticRegressionSMLOGIT(df, target_name):
    # Function to fit logistic regression using statsmodels and calculate evaluation metrics
    def logistic_regression_statsmodels(X_selected, y):
        logit_model = sm.Logit(y, X_selected)
        result = logit_model.fit()

        # Predict class labels
        y_pred = result.predict(X_selected)
        y_pred_binary = (y_pred > 0.5).astype(int)

        # Calculate evaluation metrics
        f1_pos = f1_score(y, y_pred_binary, pos_label=1)
        f1_neg = f1_score(y, y_pred_binary, pos_label=0)
        auc = roc_auc_score(y, y_pred)
        accuracy = accuracy_score(y, y_pred_binary)
        pseudo_rsquared = result.prsquared
        loglikelihood = result.llf

        return {'F1 Score (Positive)': f1_pos, 'F1 Score (Negative)': f1_neg, 
                'AUC': auc, 'Accuracy': accuracy, 
                'Pseudo-Rsquared': pseudo_rsquared, 'Loglikelihood': loglikelihood}

    # Separate features and targets
    X = df.drop(target_name, axis=1)
    y = df[target_name]
    results = []

    # Apply MinMax scaling to ensure non-negative values
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    # Initialize k-fold cross-validation where folds = 5
    k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Loop through different feature selection methods
    for k in [3, 5, 10]:
        for method in ['Chi-Squared', 'Forward Feature Selection (FFS)', 'Backward Feature Elimination (BFE)', 'PCA']:
            # Apply feature selection methods
            if method == 'Chi-Squared':
                X_selected = SelectKBest(score_func=chi2, k=k).fit_transform(X_scaled, y)
            elif method == 'Forward Feature Selection (FFS)':
                selector = SequentialFeatureSelector(LogisticRegression(), n_features_to_select=k, direction='forward')
                X_selected = selector.fit_transform(X_scaled, y)
            elif method == 'Backward Feature Elimination (BFE)':
                selector = SequentialFeatureSelector(LogisticRegression(), n_features_to_select=k, direction='backward')
                X_selected = selector.fit_transform(X_scaled, y)
            elif method == 'PCA':
                pca = PCA(n_components=k)
                X_selected = pca.fit_transform(X_scaled)

            # Calculate evaluation metrics
            metrics = logistic_regression_statsmodels(X_selected, y)
            results.append({'k': k, 'Method': method, 'Classifier': 'Logistic Regression', **metrics})

    df_results = pd.DataFrame(results)
    return df_results

# Regression Algorithms

In [12]:
def fwpMethodsKNNRegressor(df, target_name):
    # Separate features and targets
    X = df.drop(target_name, axis=1)
    y = df[target_name]
    results = []
    benchmarks = []

    # Apply MinMax scaling to ensure non-negative values
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    # Initialize the regressors
    knn_regressor = KNeighborsRegressor()

    # Initialize k-fold cross-validation where folds = 5
    k_fold = KFold(n_splits=5, shuffle=True, random_state=42)

    # Apply Benchmarks:
    knn_benchmark_mae = cross_val_score(knn_regressor, X_scaled, y, cv=k_fold, scoring='neg_mean_absolute_error')
    knn_benchmark_mse = cross_val_score(knn_regressor, X_scaled, y, cv=k_fold, scoring='neg_mean_squared_error')
    knn_benchmark_rmse = np.sqrt(-knn_benchmark_mse)
    knn_benchmark_r2 = cross_val_score(knn_regressor, X_scaled, y, cv=k_fold, scoring='r2')
    benchmarks.append({
        'Regressor': 'K-Nearest Neighbors', 
        'MAE': -knn_benchmark_mae.mean(),
        'MSE': -knn_benchmark_mse.mean(), 
        'RMSE': knn_benchmark_rmse.mean(), 
        'R2': knn_benchmark_r2.mean()
    })

    # Loop through different k values (assuming you want to test different values of k)
    for k in [3, 5, 10]:
        # Apply PCA
        pca = PCA(n_components=k)
        X_pca = pca.fit_transform(X_scaled)
        knn_pca_mae = cross_val_score(knn_regressor, X_pca, y, cv=k_fold, scoring='neg_mean_absolute_error')
        knn_pca_mse = cross_val_score(knn_regressor, X_pca, y, cv=k_fold, scoring='neg_mean_squared_error')
        knn_pca_rmse = np.sqrt(-knn_pca_mse)
        knn_pca_r2 = cross_val_score(knn_regressor, X_pca, y, cv=k_fold, scoring='r2')
        results.append({ 'k': k, 'Method': 'Principal Component Analysis (PCA)', 'Regressor': 'K-Nearest Neighbors', 'MAE': -knn_pca_mae.mean(), 'MSE': -knn_pca_mse.mean(), 'RMSE': knn_pca_rmse.mean(), 'R2': knn_pca_r2.mean()})

        # Apply ANOVA filter
        X_anova = SelectKBest(score_func=f_regression, k=k).fit_transform(X_scaled, y)
        knn_anova_mae = cross_val_score(knn_regressor, X_anova, y, cv=k_fold, scoring='neg_mean_absolute_error')
        knn_anova_mse = cross_val_score(knn_regressor, X_anova, y, cv=k_fold, scoring='neg_mean_squared_error')
        knn_anova_rmse = np.sqrt(-knn_anova_mse)
        knn_anova_r2 = cross_val_score(knn_regressor, X_anova, y, cv=k_fold, scoring='r2')
        results.append({ 'k': k, 'Method': 'Anova', 'Regressor': 'K-Nearest Neighbors', 'MAE': -knn_anova_mae.mean(), 'MSE': -knn_anova_mse.mean(), 'RMSE': knn_anova_rmse.mean(), 'R2': knn_anova_r2.mean()})

        # Apply forward selection wrapper
        knn_forward_selector = SequentialFeatureSelector(knn_regressor, n_features_to_select=k, direction='forward')
        X_forward = knn_forward_selector.fit_transform(X_scaled, y)
        knn_forward_mae = cross_val_score(knn_regressor, X_forward, y, cv=k_fold, scoring='neg_mean_absolute_error')
        knn_forward_mse = cross_val_score(knn_regressor, X_forward, y, cv=k_fold, scoring='neg_mean_squared_error')
        knn_forward_rmse = np.sqrt(-knn_forward_mse)
        knn_forward_r2 = cross_val_score(knn_regressor, X_forward, y, cv=k_fold, scoring='r2')
        results.append({ 'k': k, 'Method': 'Forward Feature Selection (FFS)', 'Regressor': 'K-Nearest Neighbors', 'MAE': -knn_forward_mae.mean(), 'MSE': -knn_forward_mse.mean(), 'RMSE': knn_forward_rmse.mean(), 'R2': knn_forward_r2.mean()})

        # Apply backward selection wrapper
        knn_backward_selector = SequentialFeatureSelector(knn_regressor, n_features_to_select=k, direction='backward')
        X_backward = knn_backward_selector.fit_transform(X_scaled, y)
        knn_backward_mae = cross_val_score(knn_regressor, X_backward, y, cv=k_fold, scoring='neg_mean_absolute_error')
        knn_backward_mse = cross_val_score(knn_regressor, X_backward, y, cv=k_fold, scoring='neg_mean_squared_error')
        knn_backward_rmse = np.sqrt(-knn_backward_mse)
        knn_backward_r2 = cross_val_score(knn_regressor, X_backward, y, cv=k_fold, scoring='r2')
        results.append({ 'k': k, 'Method': 'Backward Feature Elimination (BFE)', 'Regressor': 'K-Nearest Neighbors', 'MAE': -knn_backward_mae.mean(), 'MSE': -knn_backward_mse.mean(), 'RMSE': knn_backward_rmse.mean(), 'R2': knn_backward_r2.mean()})

    df_results = pd.DataFrame(results)
    df_benchmarks = pd.DataFrame(benchmarks)
    return df_results, df_benchmarks

In [13]:
def fwpMethodsLinearRegression(df, target_name):
    # Separate features and targets
    X = df.drop(target_name, axis=1)
    y = df[target_name]
    results = []
    benchmarks = []

    # Apply MinMax scaling to ensure non-negative values
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X)

    # Initialize the regressors
    lr_regressor = LinearRegression()

    # Initialize k-fold cross-validation where folds = 5
    k_fold = KFold(n_splits=5, shuffle=True, random_state=42)

    # Apply Benchmarks:
    lr_benchmark_mae = cross_val_score(lr_regressor, X_scaled, y, cv=k_fold, scoring='neg_mean_absolute_error')
    lr_benchmark_mse = cross_val_score(lr_regressor, X_scaled, y, cv=k_fold, scoring='neg_mean_squared_error')
    lr_benchmark_rmse = np.sqrt(-lr_benchmark_mse)
    lr_benchmark_r2 = cross_val_score(lr_regressor, X_scaled, y, cv=k_fold, scoring='r2')
    benchmarks.append({
        'Regressor': 'Linear Regression', 
        'MAE': -lr_benchmark_mae.mean(),
        'MSE': -lr_benchmark_mse.mean(), 
        'RMSE': lr_benchmark_rmse.mean(), 
        'R2': lr_benchmark_r2.mean()
    })

    # Loop through different k values
    for k in [3, 5, 10]:
        # Apply ANOVA filter
        X_anova = SelectKBest(score_func=f_regression, k=k).fit_transform(X_scaled, y)
        lr_anova_mae = cross_val_score(lr_regressor, X_anova, y, cv=k_fold, scoring='neg_mean_absolute_error')
        lr_anova_mse = cross_val_score(lr_regressor, X_anova, y, cv=k_fold, scoring='neg_mean_squared_error')
        lr_anova_rmse = np.sqrt(-lr_anova_mse)
        lr_anova_r2 = cross_val_score(lr_regressor, X_anova, y, cv=k_fold, scoring='r2')
        results.append({ 'k': k, 'Method': 'Anova', 'Regressor': 'Linear Regression', 
                         'MAE': -lr_anova_mae.mean(), 'MSE': -lr_anova_mse.mean(), 
                         'RMSE': lr_anova_rmse.mean(), 'R2': lr_anova_r2.mean()})
        
        # Apply forward selection wrapper
        lr_forward_selector = SequentialFeatureSelector(lr_regressor, n_features_to_select=k, direction='forward')
        X_forward = lr_forward_selector.fit_transform(X_scaled, y)
        lr_forward_mae = cross_val_score(lr_regressor, X_forward, y, cv=k_fold, scoring='neg_mean_absolute_error')
        lr_forward_mse = cross_val_score(lr_regressor, X_forward, y, cv=k_fold, scoring='neg_mean_squared_error')
        lr_forward_rmse = np.sqrt(-lr_forward_mse)
        lr_forward_r2 = cross_val_score(lr_regressor, X_forward, y, cv=k_fold, scoring='r2')
        results.append({ 'k': k, 'Method': 'Forward Feature Selection (FFS)', 'Regressor': 'Linear Regression', 
                         'MAE': -lr_forward_mae.mean(), 'MSE': -lr_forward_mse.mean(), 
                         'RMSE': lr_forward_rmse.mean(), 'R2': lr_forward_r2.mean()})

        # Apply backward selection wrapper
        lr_backward_selector = SequentialFeatureSelector(lr_regressor, n_features_to_select=k, direction='backward')
        X_backward = lr_backward_selector.fit_transform(X_scaled, y)
        lr_backward_mae = cross_val_score(lr_regressor, X_backward, y, cv=k_fold, scoring='neg_mean_absolute_error')
        lr_backward_mse = cross_val_score(lr_regressor, X_backward, y, cv=k_fold, scoring='neg_mean_squared_error')
        lr_backward_rmse = np.sqrt(-lr_backward_mse)
        lr_backward_r2 = cross_val_score(lr_regressor, X_backward, y, cv=k_fold, scoring='r2')
        results.append({ 'k': k, 'Method': 'Backward Feature Elimination (BFE)', 'Regressor': 'Linear Regression', 
                         'MAE': -lr_backward_mae.mean(), 'MSE': -lr_backward_mse.mean(), 
                         'RMSE': lr_backward_rmse.mean(), 'R2': lr_backward_r2.mean()})

        # Apply PCA
        pca = PCA(n_components=k)
        X_pca = pca.fit_transform(X_scaled)
        lr_pca_mae = cross_val_score(lr_regressor, X_pca, y, cv=k_fold, scoring='neg_mean_absolute_error')
        lr_pca_mse = cross_val_score(lr_regressor, X_pca, y, cv=k_fold, scoring='neg_mean_squared_error')
        lr_pca_rmse = np.sqrt(-lr_pca_mse)
        lr_pca_r2 = cross_val_score(lr_regressor, X_pca, y, cv=k_fold, scoring='r2')
        results.append({ 'k': k, 'Method': 'Principal Component Analysis (PCA)', 'Regressor': 'Linear Regression', 
                         'MAE': -lr_pca_mae.mean(), 'MSE': -lr_pca_mse.mean(), 
                         'RMSE': lr_pca_rmse.mean(), 'R2': lr_pca_r2.mean()})

    df_results = pd.DataFrame(results)
    df_benchmarks = pd.DataFrame(benchmarks)
    return df_results, df_benchmarks

In [28]:
def OLSRegressionMethod(data, target_col, cv_splits):
    # Split dataset into features and target variable
    X = data.drop(target_col, axis=1)
    y = data[target_col]

    results = {}

    for cv in cv_splits:
        # Initialize KFold
        k_fold = KFold(n_splits=cv, shuffle=True, random_state=42)

        # Lists to store scores
        mae_scores = []
        mse_scores = []
        rmse_scores = []
        r_squared_scores = []
        f_statistic_scores = []
        prob_f_statistic_scores = []
        log_likelihood_scores = []
        aic_scores = []
        bic_scores = []
        p_values_list = []

        for train_index, test_index in k_fold.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            # Standardize features by removing the mean and scaling to unit variance
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            # Add constant to features for OLS
            X_train_sm = sm.add_constant(X_train_scaled)

            # OLS Regression
            ols_model = sm.OLS(y_train, X_train_sm)
            ols_results = ols_model.fit()

            # Predictions
            X_test_sm = sm.add_constant(X_test_scaled)
            y_pred = ols_results.predict(X_test_sm)

            # Calculate evaluation metrics
            mae = mean_absolute_error(y_test, y_pred)
            mse = mean_squared_error(y_test, y_pred)
            rmse = mean_squared_error(y_test, y_pred, squared=False)
            r_squared = r2_score(y_test, y_pred)
            f_statistic = ols_results.fvalue
            prob_f_statistic = ols_results.f_pvalue
            log_likelihood = ols_results.llf
            aic = ols_results.aic
            bic = ols_results.bic
            p_values = ols_results.pvalues.drop('const')  # Exclude p-value of intercept

            # Append scores
            mae_scores.append(mae)
            mse_scores.append(mse)
            rmse_scores.append(rmse)
            r_squared_scores.append(r_squared)
            f_statistic_scores.append(f_statistic)
            prob_f_statistic_scores.append(prob_f_statistic)
            log_likelihood_scores.append(log_likelihood)
            aic_scores.append(aic)
            bic_scores.append(bic)
            p_values_list.append(p_values)

        # Calculate the average scores for each metric
        avg_mae = np.mean(mae_scores)
        avg_mse = np.mean(mse_scores)
        avg_rmse = np.mean(rmse_scores)
        avg_r_squared = np.mean(r_squared_scores)
        avg_f_statistic = np.mean(f_statistic_scores)
        avg_prob_f_statistic = np.mean(prob_f_statistic_scores)
        avg_log_likelihood = np.mean(log_likelihood_scores)
        avg_aic = np.mean(aic_scores)
        avg_bic = np.mean(bic_scores)

        # Flatten the nested lists of p-values
        flattened_p_values = [item for sublist in p_values_list for item in sublist]

        # Calculate the average p-value for each feature
        avg_p_values = np.mean(flattened_p_values, axis=0)

        # Store results
        results[f'CV_{cv}'] = {
            'MAE': avg_mae,
            'MSE': avg_mse,
            'RMSE': avg_rmse,
            'R-Squared': avg_r_squared,
            'F-Statistic': avg_f_statistic,
            'Prob (F-Statistic)': avg_prob_f_statistic,
            'Log-Likelihood': avg_log_likelihood,
            'AIC': avg_aic,
            'BIC': avg_bic,
            'P-Values': avg_p_values.tolist()  # Convert numpy array to list
        }

    return pd.DataFrame.from_dict(results, orient='index')

In [15]:
# Loading all Datasets into the required variables
c_cancer, r_life_expectancy = load_datasets()

In [16]:
pd.set_option('display.max_rows', None)

# Classification Results:

In [17]:
c_cancer
null_check(c_cancer)
print(c_cancer.info())
c_cancer = transform_data(c_cancer)

Column Name: id
Column DataType: int64
Column has null: False


Column Name: diagnosis
Column DataType: object
Column unique values: ['M' 'B']
Column has null: False


Column Name: radius_mean
Column DataType: float64
Column has null: False


Column Name: texture_mean
Column DataType: float64
Column has null: False


Column Name: perimeter_mean
Column DataType: float64
Column has null: False


Column Name: area_mean
Column DataType: float64
Column has null: False


Column Name: smoothness_mean
Column DataType: float64
Column has null: False


Column Name: compactness_mean
Column DataType: float64
Column has null: False


Column Name: concavity_mean
Column DataType: float64
Column has null: False


Column Name: concave points_mean
Column DataType: float64
Column has null: False


Column Name: symmetry_mean
Column DataType: float64
Column has null: False


Column Name: fractal_dimension_mean
Column DataType: float64
Column has null: False


Column Name: radius_se
Column DataType: float64

In [18]:
c_cancer_knnresults, c_cancer_knnbenchmarks = fwpMethodsKNN(c_cancer, 'diagnosis')
display(c_cancer_knnresults)

Unnamed: 0,k,Method,Classifier,Metric,Value
0,3,Chi-Squared,K-Nearest Neighbors,Accuracy,0.9192
1,3,Chi-Squared,K-Nearest Neighbors,F1 Score (Positive),0.8905
2,3,Chi-Squared,K-Nearest Neighbors,F1 Score (Negative),0.9359
3,3,Chi-Squared,K-Nearest Neighbors,AUC,0.9495
4,3,Forward Feature Selection (FFS),K-Nearest Neighbors,Accuracy,0.9666
5,3,Forward Feature Selection (FFS),K-Nearest Neighbors,F1 Score (Positive),0.9544
6,3,Forward Feature Selection (FFS),K-Nearest Neighbors,F1 Score (Negative),0.9736
7,3,Forward Feature Selection (FFS),K-Nearest Neighbors,AUC,0.9853
8,3,Backward Feature Elimination (BFE),K-Nearest Neighbors,Accuracy,0.9631
9,3,Backward Feature Elimination (BFE),K-Nearest Neighbors,F1 Score (Positive),0.9496


In [19]:
c_cancer_lrresults, c_cancer_lrbenchmarks = fwpMethodsLogisticRegression(c_cancer, 'diagnosis')
display(c_cancer_lrresults)

Unnamed: 0,k,Method,Classifier,Metric,Value
0,3,Chi-Squared,Logistic Regression,Accuracy,0.9227
1,3,Chi-Squared,Logistic Regression,F1 Score (Positive),0.8905
2,3,Chi-Squared,Logistic Regression,F1 Score (Negative),0.9402
3,3,Chi-Squared,Logistic Regression,AUC,0.9661
4,3,Forward Feature Selection (FFS),Logistic Regression,Accuracy,0.9561
5,3,Forward Feature Selection (FFS),Logistic Regression,F1 Score (Positive),0.9383
6,3,Forward Feature Selection (FFS),Logistic Regression,F1 Score (Negative),0.9659
7,3,Forward Feature Selection (FFS),Logistic Regression,AUC,0.9896
8,3,Backward Feature Elimination (BFE),Logistic Regression,Accuracy,0.9561
9,3,Backward Feature Elimination (BFE),Logistic Regression,F1 Score (Positive),0.9383


In [20]:
c_cancer_svmresults, c_cancer_svmbenchmarks = fwpMethodsSVM(c_cancer, 'diagnosis')
display(c_cancer_svmresults)

Unnamed: 0,k,Method,Classifier,Metric,Value
0,3,Chi-Squared,SVM,Accuracy,0.9227
1,3,Chi-Squared,SVM,F1 Score (Positive),0.8916
2,3,Chi-Squared,SVM,F1 Score (Negative),0.9399
3,3,Chi-Squared,SVM,AUC,0.9504
4,3,Forward Feature Selection (FFS),SVM,Accuracy,0.9614
5,3,Forward Feature Selection (FFS),SVM,F1 Score (Positive),0.9474
6,3,Forward Feature Selection (FFS),SVM,F1 Score (Negative),0.9694
7,3,Forward Feature Selection (FFS),SVM,AUC,0.9891
8,3,Backward Feature Elimination (BFE),SVM,Accuracy,0.9667
9,3,Backward Feature Elimination (BFE),SVM,F1 Score (Positive),0.9549


In [32]:
c_cancer_nbresults, c_cancer_nbbenchmarks = fwpMethodsNaiveBayes(c_cancer, 'diagnosis')
display(c_cancer_nbresults)

Unnamed: 0,k,Method,Classifier,Metric,Value
0,3,Chi-Squared,Naive Bayes,Accuracy,0.9139
1,3,Chi-Squared,Naive Bayes,F1 Score (Positive),0.8831
2,3,Chi-Squared,Naive Bayes,F1 Score (Negative),0.9318
3,3,Chi-Squared,Naive Bayes,AUC,0.9638
4,3,Forward Feature Selection (FFS),Naive Bayes,Accuracy,0.9701
5,3,Forward Feature Selection (FFS),Naive Bayes,F1 Score (Positive),0.9592
6,3,Forward Feature Selection (FFS),Naive Bayes,F1 Score (Negative),0.9764
7,3,Forward Feature Selection (FFS),Naive Bayes,AUC,0.991
8,3,Backward Feature Elimination (BFE),Naive Bayes,Accuracy,0.9702
9,3,Backward Feature Elimination (BFE),Naive Bayes,F1 Score (Positive),0.9592


In [22]:
c_cancer_dtresults, c_cancer_dtbenchmarks = fwpMethodsDecisionTrees(c_cancer, 'diagnosis')
display(c_cancer_dtresults)

Unnamed: 0,k,Method,Classifier,Metric,Value
0,3,Chi-Squared,Decision Trees,Accuracy,0.8629
1,3,Chi-Squared,Decision Trees,F1 Score (Positive),0.8222
2,3,Chi-Squared,Decision Trees,F1 Score (Negative),0.8917
3,3,Chi-Squared,Decision Trees,AUC,0.8596
4,3,Forward Feature Selection (FFS),Decision Trees,Accuracy,0.9086
5,3,Forward Feature Selection (FFS),Decision Trees,F1 Score (Positive),0.8794
6,3,Forward Feature Selection (FFS),Decision Trees,F1 Score (Negative),0.9331
7,3,Forward Feature Selection (FFS),Decision Trees,AUC,0.8966
8,3,Backward Feature Elimination (BFE),Decision Trees,Accuracy,0.942
9,3,Backward Feature Elimination (BFE),Decision Trees,F1 Score (Positive),0.9277


In [23]:
c_cancer_lrsmlogitresults = fwpMethodsLogisticRegressionSMLOGIT(c_cancer, 'diagnosis')
display(c_cancer_lrsmlogitresults)

Optimization terminated successfully.
         Current function value: 0.560781
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.590177
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.590177
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.117785
         Iterations 10
Optimization terminated successfully.
         Current function value: 0.278412
         Iterations 10
Optimization terminated successfully.
         Current function value: 0.162403
         Iterations 11
Optimization terminated successfully.
         Current function value: 0.521831
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.080252
         Iterations 11
Optimization terminated successfully.
         Current function value: 0.108051
         Iterations 12
Optimization terminated successfully.
         Current function value: 0.1141

Unnamed: 0,k,Method,Classifier,F1 Score (Positive),F1 Score (Negative),AUC,Accuracy,Pseudo-Rsquared,Loglikelihood
0,3,Chi-Squared,Logistic Regression,0.7229,0.8107,0.8105,0.775,0.1507,-319.0846
1,3,Forward Feature Selection (FFS),Logistic Regression,0.6277,0.6944,0.7671,0.6643,0.1062,-335.8107
2,3,Backward Feature Elimination (BFE),Logistic Regression,0.6277,0.6944,0.7671,0.6643,0.1062,-335.8107
3,3,PCA,Logistic Regression,0.9401,0.9631,0.9916,0.9543,0.8216,-67.0197
4,5,Chi-Squared,Logistic Regression,0.8792,0.9309,0.9418,0.9121,0.5784,-158.4165
5,5,Forward Feature Selection (FFS),Logistic Regression,0.926,0.9569,0.976,0.9455,0.7541,-92.4074
6,5,Backward Feature Elimination (BFE),Logistic Regression,0.7,0.7965,0.8212,0.7575,0.2097,-296.9216
7,5,PCA,Logistic Regression,0.9535,0.9718,0.9954,0.9649,0.8785,-45.6636
8,10,Chi-Squared,Logistic Regression,0.9594,0.9764,0.9893,0.9701,0.8364,-61.4813
9,10,Forward Feature Selection (FFS),Logistic Regression,0.9429,0.9666,0.9888,0.9578,0.8272,-64.9431


# Regression Results

In [24]:
r_life_expectancy
null_check(r_life_expectancy)
print(c_cancer.info())
clean_data(r_life_expectancy,[])
r_life_expectancy = transform_data(r_life_expectancy)

Column Name: Country
Column DataType: object
Column unique values: ['Afghanistan' 'Albania' 'Algeria' 'Angola' 'Antigua and Barbuda'
 'Argentina' 'Armenia' 'Australia' 'Austria' 'Azerbaijan' 'Bahamas'
 'Bahrain' 'Bangladesh' 'Barbados' 'Belarus' 'Belgium' 'Belize' 'Benin'
 'Bhutan' 'Bolivia (Plurinational State of)' 'Bosnia and Herzegovina'
 'Botswana' 'Brazil' 'Brunei Darussalam' 'Bulgaria' 'Burkina Faso'
 'Burundi' "Côte d'Ivoire" 'Cabo Verde' 'Cambodia' 'Cameroon' 'Canada'
 'Central African Republic' 'Chad' 'Chile' 'China' 'Colombia' 'Comoros'
 'Congo' 'Cook Islands' 'Costa Rica' 'Croatia' 'Cuba' 'Cyprus' 'Czechia'
 "Democratic People's Republic of Korea"
 'Democratic Republic of the Congo' 'Denmark' 'Djibouti' 'Dominica'
 'Dominican Republic' 'Ecuador' 'Egypt' 'El Salvador' 'Equatorial Guinea'
 'Eritrea' 'Estonia' 'Ethiopia' 'Fiji' 'Finland' 'France' 'Gabon' 'Gambia'
 'Georgia' 'Germany' 'Ghana' 'Greece' 'Grenada' 'Guatemala' 'Guinea'
 'Guinea-Bissau' 'Guyana' 'Haiti' 'Honduras' 'H

In [25]:
df_knnregresults, df_knnregbenchmarks = fwpMethodsKNNRegressor(r_life_expectancy, 'Life expectancy ')
display(df_knnregresults)

Unnamed: 0,k,Method,Regressor,MAE,MSE,RMSE,R2
0,3,Principal Component Analysis (PCA),K-Nearest Neighbors,0.4238,0.3427,0.5843,0.6549
1,3,Anova,K-Nearest Neighbors,0.184,0.0784,0.2787,0.9215
2,3,Forward Feature Selection (FFS),K-Nearest Neighbors,0.1775,0.0653,0.2553,0.9339
3,3,Backward Feature Elimination (BFE),K-Nearest Neighbors,0.1775,0.0653,0.2553,0.9339
4,5,Principal Component Analysis (PCA),K-Nearest Neighbors,0.3925,0.2923,0.54,0.7062
5,5,Anova,K-Nearest Neighbors,0.177,0.0702,0.2647,0.929
6,5,Forward Feature Selection (FFS),K-Nearest Neighbors,0.177,0.0655,0.2556,0.9337
7,5,Backward Feature Elimination (BFE),K-Nearest Neighbors,0.177,0.0655,0.2556,0.9337
8,10,Principal Component Analysis (PCA),K-Nearest Neighbors,0.2642,0.1565,0.3946,0.843
9,10,Anova,K-Nearest Neighbors,0.1808,0.0763,0.2758,0.9229


In [26]:
df_lrregresults, df_lrregbenchmarks = fwpMethodsLinearRegression(r_life_expectancy, 'Life expectancy ')
display(df_lrregresults)

Unnamed: 0,k,Method,Regressor,MAE,MSE,RMSE,R2
0,3,Anova,Linear Regression,0.3597,0.2617,0.5113,0.7363
1,3,Forward Feature Selection (FFS),Linear Regression,0.3505,0.2452,0.4939,0.7504
2,3,Backward Feature Elimination (BFE),Linear Regression,0.3538,0.2147,0.4629,0.784
3,3,Principal Component Analysis (PCA),Linear Regression,0.5586,0.4881,0.6978,0.5093
4,5,Anova,Linear Regression,0.3305,0.1902,0.4361,0.8075
5,5,Forward Feature Selection (FFS),Linear Regression,0.3305,0.1902,0.4361,0.8075
6,5,Backward Feature Elimination (BFE),Linear Regression,0.3305,0.1902,0.4361,0.8075
7,5,Principal Component Analysis (PCA),Linear Regression,0.4989,0.4189,0.6466,0.5784
8,10,Anova,Linear Regression,0.3277,0.1855,0.4307,0.8124
9,10,Forward Feature Selection (FFS),Linear Regression,0.3232,0.1791,0.4232,0.8191


In [29]:
df_olsresults = OLSRegressionMethod(r_life_expectancy, 'Life expectancy ', [3,5,10])
display(df_olsresults)

Unnamed: 0,MAE,MSE,RMSE,R-Squared,F-Statistic,Prob (F-Statistic),Log-Likelihood,AIC,BIC,P-Values
CV_3,0.3181,0.1722,0.4148,0.8268,270.4286,0.0,-550.7421,1145.4841,1255.5382,0.2243
CV_5,0.3157,0.1689,0.4109,0.8289,322.7617,0.0,-665.566,1375.1319,1489.1971,0.2145
CV_10,0.314,0.1668,0.4083,0.83,362.4719,0.0,-751.1354,1546.2709,1662.9273,0.2083
