# Functions for ML Algorithms

### Logistic Regression

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

def find_best_hyperparameters(X, y):
    """
    Finds the best C and max_iter for Logistic Regression using GridSearchCV.
    """
    # Step 1: Scale the data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Step 2: Define the hyperparameter grid
    param_grid = {
        "C": [0.1, 1, 10, 100],
        "max_iter": [500, 1000, 2000],
        "penalty": ["l1", "l2"],
        "solver": ["liblinear", "saga"]  # Use saga for better l1 and scalability
    }

    # Step 3: Perform GridSearchCV
    grid_search = GridSearchCV(
        LogisticRegression(random_state=42),
        param_grid,
        cv=5,
        scoring="accuracy"
    )
    grid_search.fit(X_scaled, y)

    # Return the best parameters
    return grid_search.best_params_

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

def logistic_regression_with_results(X, y, best_params):
    """
    Runs Logistic Regression with given hyperparameters and outputs accuracy results
    for three train-test splits (20/80, 50/50, 80/20).
    """
    # Step 1: Scale the data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Step 2: Define splits
    splits = [(0.8, 0.2), (0.5, 0.5), (0.2, 0.8)]
    results = []

    for test_size, train_size in splits:
        # Split the data
        X_train, X_test, y_train, y_test = train_test_split(
            X_scaled, y, test_size=test_size, random_state=42
        )

        # Initialize Logistic Regression with best hyperparameters
        logreg = LogisticRegression(**best_params, random_state=42)
        
        # Train the model
        logreg.fit(X_train, y_train)

        # Calculate accuracies
        train_accuracy = accuracy_score(y_train, logreg.predict(X_train))
        test_accuracy = accuracy_score(y_test, logreg.predict(X_test))
        cv_scores = cross_val_score(logreg, X_train, y_train, cv=5, scoring="accuracy")

        # Store results
        results.append({
            "Train Size": len(X_train),
            "Test Size": len(X_test),
            "Train Accuracy": train_accuracy,
            "Test Accuracy": test_accuracy,
            "CV Mean Accuracy": cv_scores.mean()
        })

    # Display results
    for i, res in enumerate(results):
        print(f"\nSplit {i + 1} (Train/Test Ratio: {splits[i][1]:.0%}/{splits[i][0]:.0%}):")
        print(f"  Train Size: {res['Train Size']}, Test Size: {res['Test Size']}")
        print(f"  Training Accuracy: {res['Train Accuracy']:.2%}")
        print(f"  Testing Accuracy: {res['Test Accuracy']:.2%}")
        print(f"  Cross-Validation Mean Accuracy: {res['CV Mean Accuracy']:.2%}")

### KNN 

In [9]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

def optimize_knn(X, y):
    """
    Optimize KNN hyperparameters using GridSearchCV.
    Parameters optimized: n_neighbors and weights.
    """
    param_grid = {
        "n_neighbors": [3, 5, 7, 9, 11],
        "weights": ["uniform", "distance"]
    }
    grid_search = GridSearchCV(
        KNeighborsClassifier(),
        param_grid,
        cv=5,
        scoring="accuracy"
    )
    grid_search.fit(X, y)
    return grid_search.best_params_

In [11]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

def knn_multiple_splits(X, y, n_neighbors, weights):
    """
    Perform KNN classification for three train-test splits (20/80, 50/50, 80/20).
    Outputs training, testing, and cross-validation accuracy for each split.
    
    Parameters:
    - X: Features
    - y: Target
    - n_neighbors: Optimized number of neighbors for KNN
    - weights: Optimized weighting strategy ("uniform" or "distance")
    """
    splits = [(0.2, 0.8), (0.5, 0.5), (0.8, 0.2)]  # 20/80, 50/50, 80/20
    results = []

    for train_size, test_size in splits:
        # Step 1: Split the data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42
        )

        # Step 2: Initialize KNN with optimized parameters
        knn = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)

        # Step 3: Train the model
        knn.fit(X_train, y_train)

        # Step 4: Calculate accuracies
        train_accuracy = accuracy_score(y_train, knn.predict(X_train))
        test_accuracy = accuracy_score(y_test, knn.predict(X_test))
        cv_scores = cross_val_score(knn, X_train, y_train, cv=5, scoring="accuracy")

        # Store the results
        results.append({
            "Train Size": len(X_train),
            "Test Size": len(X_test),
            "Train Accuracy": train_accuracy,
            "Test Accuracy": test_accuracy,
            "CV Mean Accuracy": cv_scores.mean(),
            "CV Std Dev": cv_scores.std()
        })

    # Step 5: Display results
    for i, res in enumerate(results):
        print(f"\nSplit {i + 1} (Train/Test Ratio: {splits[i][0]:.0%}/{splits[i][1]:.0%}):")
        print(f"  Train Size: {res['Train Size']}, Test Size: {res['Test Size']}")
        print(f"  Training Accuracy: {res['Train Accuracy']:.2%}")
        print(f"  Testing Accuracy: {res['Test Accuracy']:.2%}")
        print(f"  Cross-Validation Mean Accuracy: {res['CV Mean Accuracy']:.2%}")

### Random Forest

In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

def optimize_rf_hyperparameters(X, y):
    """
    Find the best hyperparameters for Random Forest using GridSearchCV.
    Returns the best n_estimators, max_depth, and other relevant parameters.
    """
    param_grid = {
        "n_estimators": [50, 100, 200],  # Number of trees
        "max_depth": [None, 10, 20, 30],  # Tree depth
        "min_samples_split": [2, 5, 10],  # Minimum samples to split a node
        "min_samples_leaf": [1, 2, 4]  # Minimum samples in a leaf node
    }
    
    grid_search = GridSearchCV(
        RandomForestClassifier(random_state=42),
        param_grid,
        cv=5,
        scoring="accuracy"
    )
    grid_search.fit(X, y)
    return grid_search.best_params_

In [16]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score

def random_forest_multiple_splits(X, y, n_estimators, max_depth, min_samples_split, min_samples_leaf):
    """
    Perform Random Forest classification for three train-test splits.
    Outputs training, testing, and cross-validation accuracy for each split.

    Parameters:
    - X: Features
    - y: Target
    - n_estimators: Number of trees in the forest
    - max_depth: Maximum depth of the tree
    - min_samples_split: Minimum samples required to split a node
    - min_samples_leaf: Minimum samples required in a leaf node
    """
    splits = [(0.2, 0.8), (0.5, 0.5), (0.8, 0.2)]  # 20/80, 50/50, 80/20
    results = []

    for train_size, test_size in splits:
        # Step 1: Split the data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42
        )

        # Step 2: Initialize Random Forest
        rf = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=42
        )

        # Step 3: Train the model
        rf.fit(X_train, y_train)

        # Step 4: Calculate accuracies
        train_accuracy = accuracy_score(y_train, rf.predict(X_train))
        test_accuracy = accuracy_score(y_test, rf.predict(X_test))
        cv_scores = cross_val_score(rf, X_train, y_train, cv=5, scoring="accuracy")

        # Store the results
        results.append({
            "Train Size": len(X_train),
            "Test Size": len(X_test),
            "Train Accuracy": train_accuracy,
            "Test Accuracy": test_accuracy,
            "CV Mean Accuracy": cv_scores.mean()
        })

    # Step 5: Display results
    for i, res in enumerate(results):
        print(f"\nSplit {i + 1} (Train/Test Ratio: {splits[i][0]:.0%}/{splits[i][1]:.0%}):")
        print(f"  Train Size: {res['Train Size']}, Test Size: {res['Test Size']}")
        print(f"  Training Accuracy: {res['Train Accuracy']:.2%}")
        print(f"  Testing Accuracy: {res['Test Accuracy']:.2%}")
        print(f"  Cross-Validation Mean Accuracy: {res['CV Mean Accuracy']:.2%}")

### Decision Tree

In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

def optimize_dt_hyperparameters(X, y):
    """
    Find the best hyperparameters for Decision Tree using GridSearchCV.
    Returns the best max_depth, criterion, and other relevant parameters.
    """
    param_grid = {
        "max_depth": [None, 5, 10, 20],
        "criterion": ["gini", "entropy"],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4]
    }
    
    grid_search = GridSearchCV(
        DecisionTreeClassifier(random_state=42),
        param_grid,
        cv=5,
        scoring="accuracy"
    )
    grid_search.fit(X, y)
    return grid_search.best_params_

In [21]:
def decision_tree_multiple_splits(X, y, max_depth=None, criterion='gini', min_samples_split=2, min_samples_leaf=1):
    """
    Perform Decision Tree classification for three train-test splits.
    Outputs training, testing, and cross-validation accuracy for each split.

    Parameters:
    - X: Features
    - y: Target
    - max_depth: Maximum depth of the tree (default = None)
    - criterion: Split quality measure ('gini' or 'entropy')
    - min_samples_split: Minimum samples required to split a node
    - min_samples_leaf: Minimum samples required in a leaf node
    """
    splits = [(0.2, 0.8), (0.5, 0.5), (0.8, 0.2)]  # Train-Test splits
    results = []

    for train_size, test_size in splits:
        # Step 1: Split the data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42
        )

        # Step 2: Initialize Decision Tree
        dt = DecisionTreeClassifier(
            max_depth=max_depth,
            criterion=criterion,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=42
        )

        # Step 3: Train the model
        dt.fit(X_train, y_train)

        # Step 4: Calculate accuracies
        train_accuracy = accuracy_score(y_train, dt.predict(X_train))
        test_accuracy = accuracy_score(y_test, dt.predict(X_test))
        cv_scores = cross_val_score(dt, X_train, y_train, cv=5, scoring="accuracy")

        # Store the results
        results.append({
            "Train Size": len(X_train),
            "Test Size": len(X_test),
            "Train Accuracy": train_accuracy,
            "Test Accuracy": test_accuracy,
            "CV Mean Accuracy": cv_scores.mean()
        })

    # Step 5: Display results
    for i, res in enumerate(results):
        print(f"\nSplit {i + 1} (Train/Test Ratio: {splits[i][0]:.0%}/{splits[i][1]:.0%}):")
        print(f"  Train Size: {res['Train Size']}, Test Size: {res['Test Size']}")
        print(f"  Training Accuracy: {res['Train Accuracy']:.2%}")
        print(f"  Testing Accuracy: {res['Test Accuracy']:.2%}")
        print(f"  Cross-Validation Mean Accuracy: {res['CV Mean Accuracy']:.2%}")

### SVM

In [24]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

def optimize_svm_hyperparameters(X, y):
    """
    Find the best hyperparameters for SVM using GridSearchCV.
    Returns the best kernel, C, and gamma parameters.
    """
    param_grid = {
        "kernel": ["linear", "poly", "rbf", "sigmoid"],
        "C": [0.1, 1, 10, 100],
        "gamma": ["scale", "auto", 0.1, 1, 10]
    }

    grid_search = GridSearchCV(
        SVC(random_state=42),
        param_grid,
        cv=5,
        scoring="accuracy"
    )
    grid_search.fit(X, y)
    return grid_search.best_params_

In [26]:
def svm_multiple_splits(X, y, kernel='rbf', C=1.0, gamma=0.1):
    """
    Perform SVM classification for three train-test splits.
    Outputs training, testing, and cross-validation accuracy for each split.

    Parameters:
    - X: Features
    - y: Target
    - kernel: SVM kernel type ('linear', 'poly', 'rbf', etc.)
    - C: Regularization parameter
    - gamma: Kernel coefficient (applicable for 'rbf', 'poly', and 'sigmoid' kernels)
    """
    splits = [(0.2, 0.8), (0.5, 0.5), (0.8, 0.2)]  # 20/80, 50/50, 80/20
    results = []

    for train_size, test_size in splits:
        # Step 1: Split the data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42
        )

        # Step 2: Initialize SVM
        svm = SVC(kernel=kernel, C=C, gamma=gamma, random_state=42)

        # Step 3: Train the model
        svm.fit(X_train, y_train)

        # Step 4: Calculate accuracies
        train_accuracy = accuracy_score(y_train, svm.predict(X_train))
        test_accuracy = accuracy_score(y_test, svm.predict(X_test))
        cv_scores = cross_val_score(svm, X_train, y_train, cv=5, scoring="accuracy")

        # Store the results
        results.append({
            "Train Size": len(X_train),
            "Test Size": len(X_test),
            "Train Accuracy": train_accuracy,
            "Test Accuracy": test_accuracy,
            "CV Mean Accuracy": cv_scores.mean()
        })

    # Step 5: Display results
    for i, res in enumerate(results):
        print(f"\nSplit {i + 1} (Train/Test Ratio: {splits[i][0]:.0%}/{splits[i][1]:.0%}):")
        print(f"  Train Size: {res['Train Size']}, Test Size: {res['Test Size']}")
        print(f"  Training Accuracy: {res['Train Accuracy']:.2%}")
        print(f"  Testing Accuracy: {res['Test Accuracy']:.2%}")
        print(f"  Cross-Validation Mean Accuracy: {res['CV Mean Accuracy']:.2%}")

# Data

# Student Performance

#### Preprocessing the Student Performance dataset from UCI ML Repository

In [31]:
import pandas as pd

# Load the dataset
student_data = pd.read_csv('student/student-por.csv', sep=';')
student_data

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,4,0,11,11
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,2,9,11,11
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,6,12,13,12
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,0,14,14,14
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,0,11,13,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644,MS,F,19,R,GT3,T,2,3,services,other,...,5,4,2,1,2,5,4,10,11,10
645,MS,F,18,U,LE3,T,3,1,teacher,services,...,4,3,4,1,1,1,4,15,15,16
646,MS,F,18,U,GT3,T,1,1,other,other,...,1,1,1,1,1,5,6,11,12,9
647,MS,M,17,U,LE3,T,3,1,services,services,...,2,4,5,3,4,2,6,10,10,10


In [33]:
# Step 1: Create G3_binary column for binary classification
median_g3 = student_data['G3'].mean()
student_data['G3_binary'] = (student_data['G3'] >= median_g3).astype(int)
student_data

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,G3_binary
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,3,4,1,1,3,4,0,11,11,0
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,3,3,1,1,3,2,9,11,11,0
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,3,2,2,3,3,6,12,13,12,1
3,GP,F,15,U,GT3,T,4,2,health,services,...,2,2,1,1,5,0,14,14,14,1
4,GP,F,16,U,GT3,T,3,3,other,other,...,3,2,1,2,5,0,11,13,13,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644,MS,F,19,R,GT3,T,2,3,services,other,...,4,2,1,2,5,4,10,11,10,0
645,MS,F,18,U,LE3,T,3,1,teacher,services,...,3,4,1,1,1,4,15,15,16,1
646,MS,F,18,U,GT3,T,1,1,other,other,...,1,1,1,1,5,6,11,12,9,0
647,MS,M,17,U,LE3,T,3,1,services,services,...,4,5,3,4,2,6,10,10,10,0


In [35]:
# Step 2: Drop non-informative or target-related columns
columns_to_drop = ['G1', 'G2', 'G3', 'school']  # Dropping identifiers and redundant predictors
if 'G3_binary' in columns_to_drop:
    columns_to_drop.remove('G3_binary')  # Ensure we don't drop the target column
student_data = student_data.drop(columns=columns_to_drop)

student_data

Unnamed: 0,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G3_binary
0,F,18,U,GT3,A,4,4,at_home,teacher,course,...,no,no,4,3,4,1,1,3,4,0
1,F,17,U,GT3,T,1,1,at_home,other,course,...,yes,no,5,3,3,1,1,3,2,0
2,F,15,U,LE3,T,1,1,at_home,other,other,...,yes,no,4,3,2,2,3,3,6,1
3,F,15,U,GT3,T,4,2,health,services,home,...,yes,yes,3,2,2,1,1,5,0,1
4,F,16,U,GT3,T,3,3,other,other,home,...,no,no,4,3,2,1,2,5,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644,F,19,R,GT3,T,2,3,services,other,course,...,yes,no,5,4,2,1,2,5,4,0
645,F,18,U,LE3,T,3,1,teacher,services,course,...,yes,no,4,3,4,1,1,1,4,1
646,F,18,U,GT3,T,1,1,other,other,course,...,no,no,1,1,1,1,1,5,6,0
647,M,17,U,LE3,T,3,1,services,services,course,...,yes,no,2,4,5,3,4,2,6,0


In [37]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Step 3: Encode categorical variables
categorical_columns = student_data.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    student_data[col] = le.fit_transform(student_data[col])
    label_encoders[col] = le

student_data

Unnamed: 0,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G3_binary
0,0,18,1,0,0,4,4,0,4,0,...,0,0,4,3,4,1,1,3,4,0
1,0,17,1,0,1,1,1,0,2,0,...,1,0,5,3,3,1,1,3,2,0
2,0,15,1,1,1,1,1,0,2,2,...,1,0,4,3,2,2,3,3,6,1
3,0,15,1,0,1,4,2,1,3,1,...,1,1,3,2,2,1,1,5,0,1
4,0,16,1,0,1,3,3,2,2,1,...,0,0,4,3,2,1,2,5,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644,0,19,0,0,1,2,3,3,2,0,...,1,0,5,4,2,1,2,5,4,0
645,0,18,1,1,1,3,1,4,3,0,...,1,0,4,3,4,1,1,1,4,1
646,0,18,1,0,1,1,1,2,2,0,...,0,0,1,1,1,1,1,5,6,0
647,1,17,1,1,1,3,1,3,3,0,...,1,0,2,4,5,3,4,2,6,0


In [39]:
# Step 4: Scale numerical features (excluding 'G3_binary')
numerical_columns = student_data.select_dtypes(include=['int64', 'float64']).columns
numerical_columns = numerical_columns.drop('G3_binary')  # Exclude target column
scaler = StandardScaler()
student_data[numerical_columns] = scaler.fit_transform(student_data[numerical_columns])

student_data

Unnamed: 0,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,...,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G3_binary
0,-0.833377,1.031695,0.660182,-0.648175,-2.666927,1.310216,1.540715,-1.556453,2.057248,-0.933974,...,-1.816043,-0.763496,0.072606,-0.171647,0.693785,-0.543555,-0.997695,-0.371042,0.073433,0
1,-0.833377,0.210137,0.660182,-0.648175,0.374963,-1.336039,-1.188832,-1.556453,-0.260728,-0.933974,...,0.550648,-0.763496,1.119748,-0.171647,-0.157380,-0.543555,-0.997695,-0.371042,-0.357863,0
2,-0.833377,-1.432980,0.660182,1.542792,0.374963,-1.336039,-1.188832,-1.556453,-0.260728,0.745109,...,0.550648,-0.763496,0.072606,-0.171647,-1.008546,0.538553,0.560678,-0.371042,0.504730,1
3,-0.833377,-1.432980,0.660182,-0.648175,0.374963,1.310216,-0.278983,-0.754756,0.898260,-0.094432,...,0.550648,1.309764,-0.974536,-1.123771,-1.008546,-0.543555,-0.997695,1.012903,-0.789159,1
4,-0.833377,-0.611422,0.660182,-0.648175,0.374963,0.428131,0.630866,0.046941,-0.260728,-0.094432,...,-1.816043,-0.763496,0.072606,-0.171647,-1.008546,-0.543555,-0.218508,1.012903,-0.789159,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
644,-0.833377,1.853254,-1.514733,-0.648175,0.374963,-0.453954,0.630866,0.848637,-0.260728,-0.933974,...,0.550648,-0.763496,1.119748,0.780478,-1.008546,-0.543555,-0.218508,1.012903,0.073433,0
645,-0.833377,1.031695,0.660182,1.542792,0.374963,0.428131,-1.188832,1.650334,0.898260,-0.933974,...,0.550648,-0.763496,0.072606,-0.171647,0.693785,-0.543555,-0.997695,-1.754987,0.073433,1
646,-0.833377,1.031695,0.660182,-0.648175,0.374963,-1.336039,-1.188832,0.046941,-0.260728,-0.933974,...,-1.816043,-0.763496,-3.068820,-2.075896,-1.859711,-0.543555,-0.997695,1.012903,0.504730,0
647,1.199937,0.210137,0.660182,1.542792,0.374963,0.428131,-1.188832,0.848637,0.898260,-0.933974,...,0.550648,-0.763496,-2.021678,0.780478,1.544950,1.620662,1.339864,-1.063015,0.504730,0


## Student Performance Results

In [42]:
# Step 5: Split into features (X) and target (y)
X = student_data.drop(columns=['G3_binary'])
y = student_data['G3_binary']  # Ensure this remains binary (0 or 1)

### Logistic Regression

In [45]:
# Step 1: Find the best hyperparameters
best_params = find_best_hyperparameters(X, y)
print(f"Best Hyperparameters: {best_params}")

# Step 2: Run logistic regression and print results
logistic_regression_with_results(X, y, best_params)

Best Hyperparameters: {'C': 0.1, 'max_iter': 500, 'penalty': 'l2', 'solver': 'liblinear'}

Split 1 (Train/Test Ratio: 20%/80%):
  Train Size: 129, Test Size: 520
  Training Accuracy: 75.97%
  Testing Accuracy: 70.19%
  Cross-Validation Mean Accuracy: 72.95%

Split 2 (Train/Test Ratio: 50%/50%):
  Train Size: 324, Test Size: 325
  Training Accuracy: 77.78%
  Testing Accuracy: 75.08%
  Cross-Validation Mean Accuracy: 72.86%

Split 3 (Train/Test Ratio: 80%/20%):
  Train Size: 519, Test Size: 130
  Training Accuracy: 77.26%
  Testing Accuracy: 71.54%
  Cross-Validation Mean Accuracy: 73.79%


### KNN 

In [48]:
# Step 1: Find the best hyperparameters
best_params = optimize_knn(X, y)
print("Best Parameters:", best_params)

# Step 2: Run KNN and print results
knn_multiple_splits(X, y, n_neighbors=best_params["n_neighbors"], weights=best_params["weights"])

Best Parameters: {'n_neighbors': 11, 'weights': 'uniform'}

Split 1 (Train/Test Ratio: 20%/80%):
  Train Size: 129, Test Size: 520
  Training Accuracy: 68.99%
  Testing Accuracy: 63.27%
  Cross-Validation Mean Accuracy: 68.25%

Split 2 (Train/Test Ratio: 50%/50%):
  Train Size: 324, Test Size: 325
  Training Accuracy: 70.99%
  Testing Accuracy: 68.92%
  Cross-Validation Mean Accuracy: 66.96%

Split 3 (Train/Test Ratio: 80%/20%):
  Train Size: 519, Test Size: 130
  Training Accuracy: 74.18%
  Testing Accuracy: 66.15%
  Cross-Validation Mean Accuracy: 70.51%


### Random Forest

In [51]:
best_params_rf = optimize_rf_hyperparameters(X, y)
print("Best Parameters for Random Forest:", best_params_rf)

random_forest_multiple_splits(
    X, y,
    n_estimators=best_params_rf['n_estimators'],
    max_depth=best_params_rf['max_depth'],
    min_samples_split=best_params_rf['min_samples_split'],
    min_samples_leaf=best_params_rf['min_samples_leaf']
)

Best Parameters for Random Forest: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}

Split 1 (Train/Test Ratio: 20%/80%):
  Train Size: 129, Test Size: 520
  Training Accuracy: 88.37%
  Testing Accuracy: 69.42%
  Cross-Validation Mean Accuracy: 62.06%

Split 2 (Train/Test Ratio: 50%/50%):
  Train Size: 324, Test Size: 325
  Training Accuracy: 91.05%
  Testing Accuracy: 75.69%
  Cross-Validation Mean Accuracy: 72.24%

Split 3 (Train/Test Ratio: 80%/20%):
  Train Size: 519, Test Size: 130
  Training Accuracy: 90.94%
  Testing Accuracy: 72.31%
  Cross-Validation Mean Accuracy: 76.49%


### Decision Tree

In [53]:
best_params_dt = optimize_dt_hyperparameters(X, y)
print("Best Parameters for Decision Tree:", best_params_dt)

decision_tree_multiple_splits(
    X, y,
    max_depth=best_params_dt['max_depth'],
    criterion=best_params_dt['criterion'],
    min_samples_split=best_params_dt['min_samples_split'],
    min_samples_leaf=best_params_dt['min_samples_leaf']
)

Best Parameters for Decision Tree: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 5}

Split 1 (Train/Test Ratio: 20%/80%):
  Train Size: 129, Test Size: 520
  Training Accuracy: 86.05%
  Testing Accuracy: 56.15%
  Cross-Validation Mean Accuracy: 61.32%

Split 2 (Train/Test Ratio: 50%/50%):
  Train Size: 324, Test Size: 325
  Training Accuracy: 81.17%
  Testing Accuracy: 66.15%
  Cross-Validation Mean Accuracy: 65.11%

Split 3 (Train/Test Ratio: 80%/20%):
  Train Size: 519, Test Size: 130
  Training Accuracy: 77.46%
  Testing Accuracy: 66.15%
  Cross-Validation Mean Accuracy: 69.75%


### SVM

In [55]:
best_params_svm = optimize_svm_hyperparameters(X, y)
print("Best Parameters for SVM:", best_params_svm)

svm_multiple_splits(
    X, y,
    kernel=best_params_svm['kernel'],
    C=best_params_svm['C'],
    gamma=best_params_svm['gamma']
)

Best Parameters for SVM: {'C': 0.1, 'gamma': 0.1, 'kernel': 'sigmoid'}

Split 1 (Train/Test Ratio: 20%/80%):
  Train Size: 129, Test Size: 520
  Training Accuracy: 68.99%
  Testing Accuracy: 69.04%
  Cross-Validation Mean Accuracy: 68.25%

Split 2 (Train/Test Ratio: 50%/50%):
  Train Size: 324, Test Size: 325
  Training Accuracy: 69.14%
  Testing Accuracy: 74.77%
  Cross-Validation Mean Accuracy: 72.84%

Split 3 (Train/Test Ratio: 80%/20%):
  Train Size: 519, Test Size: 130
  Training Accuracy: 71.87%
  Testing Accuracy: 70.00%
  Cross-Validation Mean Accuracy: 73.21%


# Wine

In [66]:
import pandas as pd

# Define the column names based on the wine.names file
columns = [
    "Class",  # Target variable
    "Alcohol",
    "Malic_acid",
    "Ash",
    "Alcalinity_of_ash",
    "Magnesium",
    "Total_phenols",
    "Flavanoids",
    "Nonflavanoid_phenols",
    "Proanthocyanins",
    "Color_intensity",
    "Hue",
    "OD280_OD315_of_diluted_wines",
    "Proline",
]

# Read the .data file
wine_data = pd.read_csv("wine/wine.data", header=None, names=columns)

wine_data

Unnamed: 0,Class,Alcohol,Malic_acid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Flavanoids,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,OD280_OD315_of_diluted_wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050
2,1,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,3,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740
174,3,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750
175,3,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835
176,3,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840


In [68]:
wine_data['Class_binary'] = (wine_data['Class'] == 1).astype(int)
wine_data = wine_data.drop(columns=['Class'])
wine_data

Unnamed: 0,Alcohol,Malic_acid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Flavanoids,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,OD280_OD315_of_diluted_wines,Proline,Class_binary
0,14.23,1.71,2.43,15.6,127,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065,1
1,13.20,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050,1
2,13.16,2.36,2.67,18.6,101,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185,1
3,14.37,1.95,2.50,16.8,113,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480,1
4,13.24,2.59,2.87,21.0,118,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740,0
174,13.40,3.91,2.48,23.0,102,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750,0
175,13.27,4.28,2.26,20.0,120,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835,0
176,13.17,2.59,2.37,20.0,120,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840,0


In [70]:
numerical_columns = wine_data.columns.difference(['Class_binary'])
scaler = StandardScaler()
wine_data[numerical_columns] = scaler.fit_transform(wine_data[numerical_columns])
wine_data

Unnamed: 0,Alcohol,Malic_acid,Ash,Alcalinity_of_ash,Magnesium,Total_phenols,Flavanoids,Nonflavanoid_phenols,Proanthocyanins,Color_intensity,Hue,OD280_OD315_of_diluted_wines,Proline,Class_binary
0,1.518613,-0.562250,0.232053,-1.169593,1.913905,0.808997,1.034819,-0.659563,1.224884,0.251717,0.362177,1.847920,1.013009,1
1,0.246290,-0.499413,-0.827996,-2.490847,0.018145,0.568648,0.733629,-0.820719,-0.544721,-0.293321,0.406051,1.113449,0.965242,1
2,0.196879,0.021231,1.109334,-0.268738,0.088358,0.808997,1.215533,-0.498407,2.135968,0.269020,0.318304,0.788587,1.395148,1
3,1.691550,-0.346811,0.487926,-0.809251,0.930918,2.491446,1.466525,-0.981875,1.032155,1.186068,-0.427544,1.184071,2.334574,1
4,0.295700,0.227694,1.840403,0.451946,1.281985,0.808997,0.663351,0.226796,0.401404,-0.319276,0.362177,0.449601,-0.037874,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,0.876275,2.974543,0.305159,0.301803,-0.332922,-0.985614,-1.424900,1.274310,-0.930179,1.142811,-1.392758,-1.231206,-0.021952,0
174,0.493343,1.412609,0.414820,1.052516,0.158572,-0.793334,-1.284344,0.549108,-0.316950,0.969783,-1.129518,-1.485445,0.009893,0
175,0.332758,1.744744,-0.389355,0.151661,1.422412,-1.129824,-1.344582,0.549108,-0.422075,2.224236,-1.612125,-1.485445,0.280575,0
176,0.209232,0.227694,0.012732,0.151661,1.422412,-1.033684,-1.354622,1.354888,-0.229346,1.834923,-1.568252,-1.400699,0.296498,0


## Wine Results

In [73]:
X = wine_data[numerical_columns]
y = wine_data['Class_binary']

### Logistic Regression

In [76]:
# Step 1: Find the best hyperparameters
best_params = find_best_hyperparameters(X, y)
print(f"Best Hyperparameters: {best_params}")

# Step 2: Run logistic regression and print results
logistic_regression_with_results(X, y, best_params)



Best Hyperparameters: {'C': 100, 'max_iter': 500, 'penalty': 'l1', 'solver': 'liblinear'}

Split 1 (Train/Test Ratio: 20%/80%):
  Train Size: 35, Test Size: 143
  Training Accuracy: 100.00%
  Testing Accuracy: 100.00%
  Cross-Validation Mean Accuracy: 94.29%

Split 2 (Train/Test Ratio: 50%/50%):
  Train Size: 89, Test Size: 89
  Training Accuracy: 100.00%
  Testing Accuracy: 97.75%
  Cross-Validation Mean Accuracy: 100.00%

Split 3 (Train/Test Ratio: 80%/20%):
  Train Size: 142, Test Size: 36
  Training Accuracy: 100.00%
  Testing Accuracy: 100.00%
  Cross-Validation Mean Accuracy: 99.31%


### KNN 

In [79]:
# Step 1: Find the best hyperparameters
best_params = optimize_knn(X, y)
print("Best Parameters:", best_params)

# Step 2: Run KNN and print results
knn_multiple_splits(X, y, n_neighbors=best_params["n_neighbors"], weights=best_params["weights"])

Best Parameters: {'n_neighbors': 3, 'weights': 'uniform'}

Split 1 (Train/Test Ratio: 20%/80%):
  Train Size: 35, Test Size: 143
  Training Accuracy: 100.00%
  Testing Accuracy: 96.50%
  Cross-Validation Mean Accuracy: 100.00%

Split 2 (Train/Test Ratio: 50%/50%):
  Train Size: 89, Test Size: 89
  Training Accuracy: 98.88%
  Testing Accuracy: 93.26%
  Cross-Validation Mean Accuracy: 98.89%

Split 3 (Train/Test Ratio: 80%/20%):
  Train Size: 142, Test Size: 36
  Training Accuracy: 97.89%
  Testing Accuracy: 97.22%
  Cross-Validation Mean Accuracy: 97.17%


### Random Forest

In [82]:
best_params_rf = optimize_rf_hyperparameters(X, y)
print("Best Parameters for Random Forest:", best_params_rf)

random_forest_multiple_splits(
    X, y,
    n_estimators=best_params_rf['n_estimators'],
    max_depth=best_params_rf['max_depth'],
    min_samples_split=best_params_rf['min_samples_split'],
    min_samples_leaf=best_params_rf['min_samples_leaf']
)

Best Parameters for Random Forest: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}

Split 1 (Train/Test Ratio: 20%/80%):
  Train Size: 35, Test Size: 143
  Training Accuracy: 100.00%
  Testing Accuracy: 95.10%
  Cross-Validation Mean Accuracy: 100.00%

Split 2 (Train/Test Ratio: 50%/50%):
  Train Size: 89, Test Size: 89
  Training Accuracy: 100.00%
  Testing Accuracy: 97.75%
  Cross-Validation Mean Accuracy: 97.78%

Split 3 (Train/Test Ratio: 80%/20%):
  Train Size: 142, Test Size: 36
  Training Accuracy: 100.00%
  Testing Accuracy: 100.00%
  Cross-Validation Mean Accuracy: 98.57%


### Decision Tree

In [86]:
best_params_dt = optimize_dt_hyperparameters(X, y)
print("Best Parameters for Decision Tree:", best_params_dt)

decision_tree_multiple_splits(
    X, y,
    max_depth=best_params_dt['max_depth'],
    criterion=best_params_dt['criterion'],
    min_samples_split=best_params_dt['min_samples_split'],
    min_samples_leaf=best_params_dt['min_samples_leaf']
)

Best Parameters for Decision Tree: {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2}

Split 1 (Train/Test Ratio: 20%/80%):
  Train Size: 35, Test Size: 143
  Training Accuracy: 94.29%
  Testing Accuracy: 91.61%
  Cross-Validation Mean Accuracy: 82.86%

Split 2 (Train/Test Ratio: 50%/50%):
  Train Size: 89, Test Size: 89
  Training Accuracy: 95.51%
  Testing Accuracy: 89.89%
  Cross-Validation Mean Accuracy: 93.27%

Split 3 (Train/Test Ratio: 80%/20%):
  Train Size: 142, Test Size: 36
  Training Accuracy: 97.18%
  Testing Accuracy: 100.00%
  Cross-Validation Mean Accuracy: 92.24%


### SVM

In [89]:
best_params_svm = optimize_svm_hyperparameters(X, y)
print("Best Parameters for SVM:", best_params_svm)

svm_multiple_splits(
    X, y,
    kernel=best_params_svm['kernel'],
    C=best_params_svm['C'],
    gamma=best_params_svm['gamma']
)

Best Parameters for SVM: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}

Split 1 (Train/Test Ratio: 20%/80%):
  Train Size: 35, Test Size: 143
  Training Accuracy: 100.00%
  Testing Accuracy: 98.60%
  Cross-Validation Mean Accuracy: 100.00%

Split 2 (Train/Test Ratio: 50%/50%):
  Train Size: 89, Test Size: 89
  Training Accuracy: 100.00%
  Testing Accuracy: 98.88%
  Cross-Validation Mean Accuracy: 100.00%

Split 3 (Train/Test Ratio: 80%/20%):
  Train Size: 142, Test Size: 36
  Training Accuracy: 100.00%
  Testing Accuracy: 100.00%
  Cross-Validation Mean Accuracy: 99.31%


# Online Shoppers

In [92]:
import pandas as pd
shoppers_data = pd.read_csv('shoppers/online_shoppers_intention.csv')
shoppers_data

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.000000,0.200000,0.200000,0.000000,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.000000,0.000000,0.100000,0.000000,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,0.0,0,0.0,1,0.000000,0.200000,0.200000,0.000000,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.050000,0.140000,0.000000,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.500000,0.020000,0.050000,0.000000,0.0,Feb,3,3,1,4,Returning_Visitor,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12325,3,145.0,0,0.0,53,1783.791667,0.007143,0.029031,12.241717,0.0,Dec,4,6,1,1,Returning_Visitor,True,False
12326,0,0.0,0,0.0,5,465.750000,0.000000,0.021333,0.000000,0.0,Nov,3,2,1,8,Returning_Visitor,True,False
12327,0,0.0,0,0.0,6,184.250000,0.083333,0.086667,0.000000,0.0,Nov,3,2,1,13,Returning_Visitor,True,False
12328,4,75.0,0,0.0,15,346.000000,0.000000,0.021053,0.000000,0.0,Nov,2,2,3,11,Returning_Visitor,False,False


In [95]:
# Step 1: Encode the target column `Weekend` (True -> 1, False -> 0)
shoppers_data['Weekend'] = shoppers_data['Weekend'].astype(int)
shoppers_data

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.000000,0.200000,0.200000,0.000000,0.0,Feb,1,1,1,1,Returning_Visitor,0,False
1,0,0.0,0,0.0,2,64.000000,0.000000,0.100000,0.000000,0.0,Feb,2,2,1,2,Returning_Visitor,0,False
2,0,0.0,0,0.0,1,0.000000,0.200000,0.200000,0.000000,0.0,Feb,4,1,9,3,Returning_Visitor,0,False
3,0,0.0,0,0.0,2,2.666667,0.050000,0.140000,0.000000,0.0,Feb,3,2,2,4,Returning_Visitor,0,False
4,0,0.0,0,0.0,10,627.500000,0.020000,0.050000,0.000000,0.0,Feb,3,3,1,4,Returning_Visitor,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12325,3,145.0,0,0.0,53,1783.791667,0.007143,0.029031,12.241717,0.0,Dec,4,6,1,1,Returning_Visitor,1,False
12326,0,0.0,0,0.0,5,465.750000,0.000000,0.021333,0.000000,0.0,Nov,3,2,1,8,Returning_Visitor,1,False
12327,0,0.0,0,0.0,6,184.250000,0.083333,0.086667,0.000000,0.0,Nov,3,2,1,13,Returning_Visitor,1,False
12328,4,75.0,0,0.0,15,346.000000,0.000000,0.021053,0.000000,0.0,Nov,2,2,3,11,Returning_Visitor,0,False


In [97]:
# Step 2: Handle categorical columns
categorical_columns = ['Month', 'VisitorType', 'Revenue']

# Label Encoding for categorical features
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    shoppers_data[col] = le.fit_transform(shoppers_data[col])
    label_encoders[col] = le

shoppers_data

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.000000,0.200000,0.200000,0.000000,0.0,2,1,1,1,1,2,0,0
1,0,0.0,0,0.0,2,64.000000,0.000000,0.100000,0.000000,0.0,2,2,2,1,2,2,0,0
2,0,0.0,0,0.0,1,0.000000,0.200000,0.200000,0.000000,0.0,2,4,1,9,3,2,0,0
3,0,0.0,0,0.0,2,2.666667,0.050000,0.140000,0.000000,0.0,2,3,2,2,4,2,0,0
4,0,0.0,0,0.0,10,627.500000,0.020000,0.050000,0.000000,0.0,2,3,3,1,4,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12325,3,145.0,0,0.0,53,1783.791667,0.007143,0.029031,12.241717,0.0,1,4,6,1,1,2,1,0
12326,0,0.0,0,0.0,5,465.750000,0.000000,0.021333,0.000000,0.0,7,3,2,1,8,2,1,0
12327,0,0.0,0,0.0,6,184.250000,0.083333,0.086667,0.000000,0.0,7,3,2,1,13,2,1,0
12328,4,75.0,0,0.0,15,346.000000,0.000000,0.021053,0.000000,0.0,7,2,2,3,11,2,0,0


### Trying without scaling - Online Shoppers Results without Scaling

In [100]:
import warnings
warnings.filterwarnings('default')
X =shoppers_data.drop(columns=['Weekend'])
y = shoppers_data['Weekend']

### Logistic Regression

In [102]:
'''# Step 1: Find the best hyperparameters
best_params = find_best_hyperparameters(X, y)
print(f"Best Hyperparameters: {best_params}")'''

best_params = {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'liblinear'}

# Step 2: Run logistic regression and print results
logistic_regression_with_results(X, y, best_params)


Split 1 (Train/Test Ratio: 20%/80%):
  Train Size: 2466, Test Size: 9864
  Training Accuracy: 76.12%
  Testing Accuracy: 76.90%
  Cross-Validation Mean Accuracy: 76.12%

Split 2 (Train/Test Ratio: 50%/50%):
  Train Size: 6165, Test Size: 6165
  Training Accuracy: 77.42%
  Testing Accuracy: 76.06%
  Cross-Validation Mean Accuracy: 77.42%

Split 3 (Train/Test Ratio: 80%/20%):
  Train Size: 9864, Test Size: 2466
  Training Accuracy: 77.21%
  Testing Accuracy: 74.86%
  Cross-Validation Mean Accuracy: 77.21%


### KNN 

In [106]:
'''# Step 1: Find the best hyperparameters
best_params = optimize_knn(X, y)
print("Best Parameters:", best_params)'''

best_params = {'n_neighbors': 11, 'weights': 'uniform'}
# Step 2: Run KNN and print results
knn_multiple_splits(X, y, n_neighbors=best_params["n_neighbors"], weights=best_params["weights"])


Split 1 (Train/Test Ratio: 20%/80%):
  Train Size: 2466, Test Size: 9864
  Training Accuracy: 76.52%
  Testing Accuracy: 75.47%
  Cross-Validation Mean Accuracy: 74.45%

Split 2 (Train/Test Ratio: 50%/50%):
  Train Size: 6165, Test Size: 6165
  Training Accuracy: 77.88%
  Testing Accuracy: 74.99%
  Cross-Validation Mean Accuracy: 76.71%

Split 3 (Train/Test Ratio: 80%/20%):
  Train Size: 9864, Test Size: 2466
  Training Accuracy: 77.30%
  Testing Accuracy: 74.01%
  Cross-Validation Mean Accuracy: 75.90%


### Random Forest

In [109]:
'''best_params_rf = optimize_rf_hyperparameters(X, y)
print("Best Parameters for Random Forest:", best_params_rf)
'''

best_params = {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 50}
random_forest_multiple_splits(
    X, y,
    n_estimators=best_params_rf['n_estimators'],
    max_depth=best_params_rf['max_depth'],
    min_samples_split=best_params_rf['min_samples_split'],
    min_samples_leaf=best_params_rf['min_samples_leaf']
)


Split 1 (Train/Test Ratio: 20%/80%):
  Train Size: 2466, Test Size: 9864
  Training Accuracy: 92.38%
  Testing Accuracy: 77.18%
  Cross-Validation Mean Accuracy: 76.32%

Split 2 (Train/Test Ratio: 50%/50%):
  Train Size: 6165, Test Size: 6165
  Training Accuracy: 91.44%
  Testing Accuracy: 77.02%
  Cross-Validation Mean Accuracy: 77.99%

Split 3 (Train/Test Ratio: 80%/20%):
  Train Size: 9864, Test Size: 2466
  Training Accuracy: 91.31%
  Testing Accuracy: 76.80%
  Cross-Validation Mean Accuracy: 78.08%


### Decision Tree

In [112]:
'''best_params_dt = optimize_dt_hyperparameters(X, y)
print("Best Parameters for Decision Tree:", best_params_dt)'''

best_params = {'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2}

decision_tree_multiple_splits(
    X, y,
    max_depth=best_params_dt['max_depth'],
    criterion=best_params_dt['criterion'],
    min_samples_split=best_params_dt['min_samples_split'],
    min_samples_leaf=best_params_dt['min_samples_leaf']
)


Split 1 (Train/Test Ratio: 20%/80%):
  Train Size: 2466, Test Size: 9864
  Training Accuracy: 89.46%
  Testing Accuracy: 68.50%
  Cross-Validation Mean Accuracy: 68.69%

Split 2 (Train/Test Ratio: 50%/50%):
  Train Size: 6165, Test Size: 6165
  Training Accuracy: 89.76%
  Testing Accuracy: 69.98%
  Cross-Validation Mean Accuracy: 70.40%

Split 3 (Train/Test Ratio: 80%/20%):
  Train Size: 9864, Test Size: 2466
  Training Accuracy: 89.46%
  Testing Accuracy: 68.78%
  Cross-Validation Mean Accuracy: 71.20%


### SVM

In [114]:
'''Commented out because it kept timing out, default parameters used
best_params_svm = optimize_svm_hyperparameters(X, y)
print("Best Parameters for SVM:", best_params_svm)'''


svm_multiple_splits(
    X, y,
    kernel='rbf',
    C=1.0,
    gamma='scale'
)


Split 1 (Train/Test Ratio: 20%/80%):
  Train Size: 2466, Test Size: 9864
  Training Accuracy: 76.12%
  Testing Accuracy: 76.90%
  Cross-Validation Mean Accuracy: 76.12%

Split 2 (Train/Test Ratio: 50%/50%):
  Train Size: 6165, Test Size: 6165
  Training Accuracy: 77.42%
  Testing Accuracy: 76.06%
  Cross-Validation Mean Accuracy: 77.42%

Split 3 (Train/Test Ratio: 80%/20%):
  Train Size: 9864, Test Size: 2466
  Training Accuracy: 77.22%
  Testing Accuracy: 74.86%
  Cross-Validation Mean Accuracy: 77.21%


### Online Shoppers With Scaling

In [116]:
# Step 4: Scale numerical features
numerical_columns = shoppers_data.select_dtypes(include=['int64', 'float64']).columns.difference(['Weekend'])
scaler = StandardScaler()
shoppers_data[numerical_columns] = scaler.fit_transform(shoppers_data[numerical_columns])
shoppers_data

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,-0.696993,-0.457191,-0.396478,-0.244931,-0.691003,-0.624348,3.667189,3.229316,-0.317178,-0.308821,-1.334959,-1.233426,-0.790293,-0.894178,-0.762629,0.407786,0,-0.427872
1,-0.696993,-0.457191,-0.396478,-0.244931,-0.668518,-0.590903,-0.457683,1.171473,-0.317178,-0.308821,-1.334959,-0.136078,-0.207952,-0.894178,-0.514182,0.407786,0,-0.427872
2,-0.696993,-0.457191,-0.396478,-0.244931,-0.691003,-0.624348,3.667189,3.229316,-0.317178,-0.308821,-1.334959,2.058618,-0.790293,2.437081,-0.265735,0.407786,0,-0.427872
3,-0.696993,-0.457191,-0.396478,-0.244931,-0.668518,-0.622954,0.573535,1.994610,-0.317178,-0.308821,-1.334959,0.961270,-0.207952,-0.477771,-0.017289,0.407786,0,-0.427872
4,-0.696993,-0.457191,-0.396478,-0.244931,-0.488636,-0.296430,-0.045196,0.142551,-0.317178,-0.308821,-1.334959,0.961270,0.374389,-0.894178,-0.017289,0.407786,1,-0.427872
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12325,0.206173,0.363075,-0.396478,-0.244931,0.478227,0.307822,-0.310366,-0.288966,0.342125,-0.308821,-1.756881,2.058618,2.121412,-0.894178,-0.762629,0.407786,1,-0.427872
12326,-0.696993,-0.457191,-0.396478,-0.244931,-0.601062,-0.380957,-0.457683,-0.447364,-0.317178,-0.308821,0.774654,0.961270,-0.207952,-0.894178,0.976499,0.407786,1,-0.427872
12327,-0.696993,-0.457191,-0.396478,-0.244931,-0.578577,-0.528063,1.261014,0.897093,-0.317178,-0.308821,0.774654,0.961270,-0.207952,-0.894178,2.218733,0.407786,1,-0.427872
12328,0.507228,-0.032916,-0.396478,-0.244931,-0.376210,-0.443536,-0.457683,-0.453140,-0.317178,-0.308821,0.774654,-0.136078,-0.207952,-0.061364,1.721839,0.407786,0,-0.427872


In [118]:
X =shoppers_data.drop(columns=['Weekend'])
y = shoppers_data['Weekend']

### Logistic Regression

In [120]:
'''# Step 1: Find the best hyperparameters
best_params = find_best_hyperparameters(X, y)
print(f"Best Hyperparameters: {best_params}")'''

best_params = {'C': 0.1, 'max_iter': 500, 'penalty': 'l1', 'solver': 'liblinear'}

# Step 2: Run logistic regression and print results
logistic_regression_with_results(X, y, best_params)


Split 1 (Train/Test Ratio: 20%/80%):
  Train Size: 2466, Test Size: 9864
  Training Accuracy: 76.12%
  Testing Accuracy: 76.90%
  Cross-Validation Mean Accuracy: 76.12%

Split 2 (Train/Test Ratio: 50%/50%):
  Train Size: 6165, Test Size: 6165
  Training Accuracy: 77.42%
  Testing Accuracy: 76.06%
  Cross-Validation Mean Accuracy: 77.42%

Split 3 (Train/Test Ratio: 80%/20%):
  Train Size: 9864, Test Size: 2466
  Training Accuracy: 77.21%
  Testing Accuracy: 74.86%
  Cross-Validation Mean Accuracy: 77.21%


### KNN 

In [122]:
'''# Step 1: Find the best hyperparameters
best_params = optimize_knn(X, y)
print("Best Parameters:", best_params)'''

best_params = {'n_neighbors': 11, 'weights': 'uniform'}
# Step 2: Run KNN and print results
knn_multiple_splits(X, y, n_neighbors=best_params["n_neighbors"], weights=best_params["weights"])


Split 1 (Train/Test Ratio: 20%/80%):
  Train Size: 2466, Test Size: 9864
  Training Accuracy: 76.48%
  Testing Accuracy: 75.05%
  Cross-Validation Mean Accuracy: 74.49%

Split 2 (Train/Test Ratio: 50%/50%):
  Train Size: 6165, Test Size: 6165
  Training Accuracy: 78.93%
  Testing Accuracy: 75.20%
  Cross-Validation Mean Accuracy: 76.59%

Split 3 (Train/Test Ratio: 80%/20%):
  Train Size: 9864, Test Size: 2466
  Training Accuracy: 78.76%
  Testing Accuracy: 74.45%
  Cross-Validation Mean Accuracy: 76.24%


### Random Forest

In [126]:
'''best_params_rf = optimize_rf_hyperparameters(X, y)
print("Best Parameters for Random Forest:", best_params_rf)'''

best_params = {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 50}

random_forest_multiple_splits(
    X, y,
    n_estimators=best_params_rf['n_estimators'],
    max_depth=best_params_rf['max_depth'],
    min_samples_split=best_params_rf['min_samples_split'],
    min_samples_leaf=best_params_rf['min_samples_leaf']
)


Split 1 (Train/Test Ratio: 20%/80%):
  Train Size: 2466, Test Size: 9864
  Training Accuracy: 92.38%
  Testing Accuracy: 77.18%
  Cross-Validation Mean Accuracy: 76.28%

Split 2 (Train/Test Ratio: 50%/50%):
  Train Size: 6165, Test Size: 6165
  Training Accuracy: 91.45%
  Testing Accuracy: 77.05%
  Cross-Validation Mean Accuracy: 77.97%

Split 3 (Train/Test Ratio: 80%/20%):
  Train Size: 9864, Test Size: 2466
  Training Accuracy: 91.32%
  Testing Accuracy: 76.80%
  Cross-Validation Mean Accuracy: 78.11%


### Decision Tree

In [128]:
'''best_params_dt = optimize_dt_hyperparameters(X, y)
print("Best Parameters for Decision Tree:", best_params_dt)'''

best_params = {'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2}

decision_tree_multiple_splits(
    X, y,
    max_depth=best_params_dt['max_depth'],
    criterion=best_params_dt['criterion'],
    min_samples_split=best_params_dt['min_samples_split'],
    min_samples_leaf=best_params_dt['min_samples_leaf']
)


Split 1 (Train/Test Ratio: 20%/80%):
  Train Size: 2466, Test Size: 9864
  Training Accuracy: 89.46%
  Testing Accuracy: 68.54%
  Cross-Validation Mean Accuracy: 68.69%

Split 2 (Train/Test Ratio: 50%/50%):
  Train Size: 6165, Test Size: 6165
  Training Accuracy: 89.76%
  Testing Accuracy: 69.99%
  Cross-Validation Mean Accuracy: 70.40%

Split 3 (Train/Test Ratio: 80%/20%):
  Train Size: 9864, Test Size: 2466
  Training Accuracy: 89.46%
  Testing Accuracy: 68.65%
  Cross-Validation Mean Accuracy: 71.21%


### SVM

In [130]:
'''Commented out because it kept timing out, default parameters used
best_params_svm = optimize_svm_hyperparameters(X, y)
print("Best Parameters for SVM:", best_params_svm)'''


svm_multiple_splits(
    X, y,
    kernel='rbf',
    C=1.0,
    gamma='scale'
)


Split 1 (Train/Test Ratio: 20%/80%):
  Train Size: 2466, Test Size: 9864
  Training Accuracy: 76.97%
  Testing Accuracy: 77.31%
  Cross-Validation Mean Accuracy: 76.03%

Split 2 (Train/Test Ratio: 50%/50%):
  Train Size: 6165, Test Size: 6165
  Training Accuracy: 78.65%
  Testing Accuracy: 77.36%
  Cross-Validation Mean Accuracy: 78.30%

Split 3 (Train/Test Ratio: 80%/20%):
  Train Size: 9864, Test Size: 2466
  Training Accuracy: 78.54%
  Testing Accuracy: 76.32%
  Cross-Validation Mean Accuracy: 78.20%


# Dropout for Students

In [85]:
import pandas as pd

dropout_data = pd.read_csv('dropout/student_dropout_data.csv',sep=';')
dropout_data

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.000000,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.000000,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.400000,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.000000,0,13.9,-0.3,0.79,Graduate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,1,1,6,9773,1,1,125.0,1,1,1,...,0,6,8,5,12.666667,0,15.5,2.8,-4.06,Graduate
4420,1,1,2,9773,1,1,120.0,105,1,1,...,0,6,6,2,11.000000,0,11.1,0.6,2.02,Dropout
4421,1,1,1,9500,1,1,154.0,1,37,37,...,0,8,9,1,13.500000,0,13.9,-0.3,0.79,Dropout
4422,1,1,1,9147,1,1,180.0,1,37,37,...,0,5,6,5,12.000000,0,9.4,-0.8,-3.12,Graduate


In [87]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

dropout_data['Target'] = dropout_data['Target'].map({'Graduate': 1, 'Enrolled': 1, 'Dropout': 0})
dropout_data

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.000000,0,10.8,1.4,1.74,0
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,1
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.000000,0,10.8,1.4,1.74,0
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.400000,0,9.4,-0.8,-3.12,1
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.000000,0,13.9,-0.3,0.79,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,1,1,6,9773,1,1,125.0,1,1,1,...,0,6,8,5,12.666667,0,15.5,2.8,-4.06,1
4420,1,1,2,9773,1,1,120.0,105,1,1,...,0,6,6,2,11.000000,0,11.1,0.6,2.02,0
4421,1,1,1,9500,1,1,154.0,1,37,37,...,0,8,9,1,13.500000,0,13.9,-0.3,0.79,0
4422,1,1,1,9147,1,1,180.0,1,37,37,...,0,5,6,5,12.000000,0,9.4,-0.8,-3.12,1


In [89]:
# Step 3: Handle categorical features
categorical_columns = [
    'Marital status', 
    'Application mode', 
    'Daytime/evening attendance\t', 
    'Nacionality', 
    'Mother\'s qualification', 
    'Father\'s qualification'
]

# Use Label Encoding for simplicity
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    dropout_data[col] = le.fit_transform(dropout_data[col])
    label_encoders[col] = le

dropout_data

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,0,7,5,171,1,1,122.0,0,12,9,...,0,0,0,0,0.000000,0,10.8,1.4,1.74,0
1,0,5,1,9254,1,1,160.0,0,0,2,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,1
2,0,0,5,9070,1,1,122.0,0,21,26,...,0,6,0,0,0.000000,0,10.8,1.4,1.74,0
3,0,7,2,9773,1,1,122.0,0,22,26,...,0,6,10,5,12.400000,0,9.4,-0.8,-3.12,1
4,1,11,1,8014,0,1,100.0,0,21,27,...,0,6,6,6,13.000000,0,13.9,-0.3,0.79,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,0,0,6,9773,1,1,125.0,0,0,0,...,0,6,8,5,12.666667,0,15.5,2.8,-4.06,1
4420,0,0,2,9773,1,1,120.0,18,0,0,...,0,6,6,2,11.000000,0,11.1,0.6,2.02,0
4421,0,0,1,9500,1,1,154.0,0,21,26,...,0,8,9,1,13.500000,0,13.9,-0.3,0.79,0
4422,0,0,1,9147,1,1,180.0,0,21,26,...,0,5,6,5,12.000000,0,9.4,-0.8,-3.12,1


In [91]:
numerical_columns = dropout_data.select_dtypes(include=['int64', 'float64']).columns.difference(['Target'])
scaler = StandardScaler()
dropout_data[numerical_columns] = scaler.fit_transform(dropout_data[numerical_columns])
dropout_data

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,-0.294829,0.210069,2.490896,-4.209520,0.350082,-0.35023,-0.804841,-0.145586,0.075111,-0.584526,...,-0.282442,-2.838337,-2.042630,-1.471527,-1.963489,-0.199441,-0.287638,0.124386,0.765761,0
1,-0.294829,-0.167406,-0.554068,0.192580,0.350082,-0.35023,2.076819,-0.145586,-1.254495,-1.218380,...,-0.282442,-0.105726,-0.522682,0.518904,0.659562,-0.199441,0.876222,-1.105222,0.347199,1
2,-0.294829,-1.111094,2.490896,0.103404,0.350082,-0.35023,-0.804841,-0.145586,1.072315,0.954834,...,-0.282442,-0.105726,-2.042630,-1.471527,-1.963489,-0.199441,-0.287638,0.124386,0.765761,0
3,-0.294829,0.210069,0.207173,0.444115,0.350082,-0.35023,-0.804841,-0.145586,1.183116,0.954834,...,-0.282442,-0.105726,0.490616,0.187165,0.416450,-0.199441,-0.813253,-1.466871,-1.375511,1
4,1.356212,0.965018,-0.554068,-0.408389,-2.856470,-0.35023,-2.473171,-0.145586,1.072315,1.045384,...,-0.282442,-0.105726,-0.522682,0.518904,0.531608,-0.199441,0.876222,-1.105222,0.347199,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,-0.294829,-1.111094,3.252137,0.444115,0.350082,-0.35023,-0.577342,-0.145586,-1.254495,-1.399481,...,-0.282442,-0.105726,-0.016033,0.187165,0.467631,-0.199441,1.476924,1.137005,-1.789667,1
4420,-0.294829,-1.111094,0.207173,0.444115,0.350082,-0.35023,-0.956508,10.150427,-1.254495,-1.399481,...,-0.282442,-0.105726,-0.522682,-0.808050,0.147747,-0.199441,-0.175007,-0.454253,0.889126,0
4421,-0.294829,-1.111094,-0.554068,0.311805,0.350082,-0.35023,1.621820,-0.145586,1.072315,0.954834,...,-0.282442,0.805144,0.237291,-1.139788,0.627573,-0.199441,0.876222,-1.105222,0.347199,0
4422,-0.294829,-1.111094,-0.554068,0.140722,0.350082,-0.35023,3.593483,-0.145586,1.072315,0.954834,...,-0.282442,-0.561161,-0.522682,0.187165,0.339678,-0.199441,-0.813253,-1.466871,-1.375511,1


### Dropout for Students Results

In [98]:
X = dropout_data[numerical_columns]
y = dropout_data['Target']

### Logistic Regression

In [101]:
# Step 1: Find the best hyperparameters
best_params = find_best_hyperparameters(X, y)
print(f"Best Hyperparameters: {best_params}")

# Step 2: Run logistic regression and print results
logistic_regression_with_results(X, y, best_params)

Best Hyperparameters: {'C': 0.1, 'max_iter': 500, 'penalty': 'l2', 'solver': 'liblinear'}

Split 1 (Train/Test Ratio: 20%/80%):
  Train Size: 884, Test Size: 3540
  Training Accuracy: 89.14%
  Testing Accuracy: 87.23%
  Cross-Validation Mean Accuracy: 87.78%

Split 2 (Train/Test Ratio: 50%/50%):
  Train Size: 2212, Test Size: 2212
  Training Accuracy: 88.70%
  Testing Accuracy: 86.71%
  Cross-Validation Mean Accuracy: 88.29%

Split 3 (Train/Test Ratio: 80%/20%):
  Train Size: 3539, Test Size: 885
  Training Accuracy: 88.36%
  Testing Accuracy: 86.33%
  Cross-Validation Mean Accuracy: 87.88%


### KNN 

In [104]:
# Step 1: Find the best hyperparameters
best_params = optimize_knn(X, y)
print("Best Parameters:", best_params)

# Step 2: Run KNN and print results
knn_multiple_splits(X, y, n_neighbors=best_params["n_neighbors"], weights=best_params["weights"])

Best Parameters: {'n_neighbors': 7, 'weights': 'uniform'}

Split 1 (Train/Test Ratio: 20%/80%):
  Train Size: 884, Test Size: 3540
  Training Accuracy: 85.75%
  Testing Accuracy: 82.18%
  Cross-Validation Mean Accuracy: 83.26%

Split 2 (Train/Test Ratio: 50%/50%):
  Train Size: 2212, Test Size: 2212
  Training Accuracy: 86.75%
  Testing Accuracy: 82.59%
  Cross-Validation Mean Accuracy: 84.13%

Split 3 (Train/Test Ratio: 80%/20%):
  Train Size: 3539, Test Size: 885
  Training Accuracy: 87.37%
  Testing Accuracy: 81.92%
  Cross-Validation Mean Accuracy: 84.06%


### Random Forest

In [107]:
best_params_rf = optimize_rf_hyperparameters(X, y)
print("Best Parameters for Random Forest:", best_params_rf)

random_forest_multiple_splits(
    X, y,
    n_estimators=best_params_rf['n_estimators'],
    max_depth=best_params_rf['max_depth'],
    min_samples_split=best_params_rf['min_samples_split'],
    min_samples_leaf=best_params_rf['min_samples_leaf']
)

Best Parameters for Random Forest: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}

Split 1 (Train/Test Ratio: 20%/80%):
  Train Size: 884, Test Size: 3540
  Training Accuracy: 95.25%
  Testing Accuracy: 86.13%
  Cross-Validation Mean Accuracy: 87.79%

Split 2 (Train/Test Ratio: 50%/50%):
  Train Size: 2212, Test Size: 2212
  Training Accuracy: 95.25%
  Testing Accuracy: 86.30%
  Cross-Validation Mean Accuracy: 87.66%

Split 3 (Train/Test Ratio: 80%/20%):
  Train Size: 3539, Test Size: 885
  Training Accuracy: 95.59%
  Testing Accuracy: 85.42%
  Cross-Validation Mean Accuracy: 87.60%


### Decision Tree

In [110]:
best_params_dt = optimize_dt_hyperparameters(X, y)
print("Best Parameters for Decision Tree:", best_params_dt)

decision_tree_multiple_splits(
    X, y,
    max_depth=best_params_dt['max_depth'],
    criterion=best_params_dt['criterion'],
    min_samples_split=best_params_dt['min_samples_split'],
    min_samples_leaf=best_params_dt['min_samples_leaf']
)

Best Parameters for Decision Tree: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2}

Split 1 (Train/Test Ratio: 20%/80%):
  Train Size: 884, Test Size: 3540
  Training Accuracy: 89.59%
  Testing Accuracy: 81.16%
  Cross-Validation Mean Accuracy: 84.17%

Split 2 (Train/Test Ratio: 50%/50%):
  Train Size: 2212, Test Size: 2212
  Training Accuracy: 89.01%
  Testing Accuracy: 84.36%
  Cross-Validation Mean Accuracy: 85.80%

Split 3 (Train/Test Ratio: 80%/20%):
  Train Size: 3539, Test Size: 885
  Training Accuracy: 88.10%
  Testing Accuracy: 81.92%
  Cross-Validation Mean Accuracy: 87.26%


### SVM

In [113]:
best_params_svm = optimize_svm_hyperparameters(X, y)
print("Best Parameters for SVM:", best_params_svm)

svm_multiple_splits(
    X, y,
    kernel=best_params_svm['kernel'],
    C=best_params_svm['C'],
    gamma=best_params_svm['gamma']
)

Best Parameters for SVM: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}

Split 1 (Train/Test Ratio: 20%/80%):
  Train Size: 884, Test Size: 3540
  Training Accuracy: 90.16%
  Testing Accuracy: 85.06%
  Cross-Validation Mean Accuracy: 84.96%

Split 2 (Train/Test Ratio: 50%/50%):
  Train Size: 2212, Test Size: 2212
  Training Accuracy: 90.24%
  Testing Accuracy: 85.17%
  Cross-Validation Mean Accuracy: 86.89%

Split 3 (Train/Test Ratio: 80%/20%):
  Train Size: 3539, Test Size: 885
  Training Accuracy: 89.97%
  Testing Accuracy: 85.65%
  Cross-Validation Mean Accuracy: 87.17%
