In [10]:
!pip install cupy-cuda12x



In [11]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import cupy as cp
import numpy as np_cpu
from sklearn.metrics import accuracy_score
import time

In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split # Import train_test_split

# Read the cleaned F1 results table from disk
path = "f1_cleaned.csv"
df = pd.read_csv(path)

# Target is the podium flag; the feature matrix is everything except the
# target itself and positionOrder.
# NOTE(review): positionOrder is presumably dropped because the finishing
# position would reveal the target — confirm.
y = df["podium"]
X = df.drop(columns=["podium", "positionOrder"])

# One-hot encode the categorical columns (first level dropped per column)
X = pd.get_dummies(
    X,
    columns=["raceName", "driverName", "constructorName"],
    drop_first=True,
)

# Stratified split: 80% train / 20% test, preserving the podium class ratio
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# From here on the splits are plain NumPy arrays: float32 features, int32 labels
X_train, X_test = (frame.astype(np.float32).values for frame in (X_train, X_test))
y_train, y_test = (labels.astype(np.int32).values for labels in (y_train, y_test))

# Standardize the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Training set shape:", X_train_scaled.shape)
print("Test set shape:", X_test_scaled.shape)

Training set shape: (21407, 1126)
Test set shape: (5352, 1126)


In [14]:
import numpy as np
import numpy.ma as ma # Used for masked array log operations in objective function (optional, but robust)
from scipy.special import expit

class CPULogisticRegression:
    """One-vs-rest logistic regression trained by batch steepest ascent (NumPy, CPU).

    Parameters
    ----------
    eta : float
        Step size applied to the gradient at every iteration.
    iterations : int
        Number of full-batch gradient steps taken per binary classifier.
    C : float
        L2 penalty strength. The intercept weight is never penalised.

    Attributes
    ----------
    w_ : ndarray or None
        After ``fit``: array of shape (n_classes, n_features + 1) with the
        intercept in column 0. After a direct ``SAfit`` call: column vector of
        shape (n_features + 1, 1). ``None`` before training.
    classifiers_ : list
        One binary ``CPULogisticRegression`` per class, in the order of ``unique_``.
    unique_ : ndarray or None
        Sorted unique class labels seen by ``fit``.
    """

    def __init__(self, eta=0.01, iterations=20, C=0.0):
        self.eta = eta
        self.iters = iterations
        self.C = C
        # Weights are stored internally as self.w_ (see class docstring for shapes)
        self.w_ = None
        self.classifiers_ = []
        self.unique_ = None

    def __str__(self):
        if getattr(self, 'w_', None) is not None:
            return 'MultiClass Steepest Ascent Logistic Regression Object (Trained)'
        return 'Untrained MultiClass Steepest Ascent Logistic Regression Object'

    @property
    def coef_(self):
        """Feature weights (every column of ``w_`` except the intercept), or None if untrained."""
        if getattr(self, 'w_', None) is not None:
            return self.w_[:, 1:]
        return None

    @property
    def intercept_(self):
        """Intercept weights (column 0 of ``w_``), or None if untrained."""
        if getattr(self, 'w_', None) is not None:
            return self.w_[:, 0]
        return None

    # ----------------------------------------------------
    # Core binary solver: steepest ascent
    # ----------------------------------------------------
    def SAfit(self, X, y):
        """Fit a single binary classifier with steepest (gradient) ascent.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Features WITHOUT an intercept column; one is prepended here.
        y : array-like with n_samples elements
            Binary 0/1 targets.

        The learned column vector is stored in ``self.w_``.
        """
        # Coerce up front so lists / DataFrames / Series behave like ndarrays below
        Xb = self._add_intercept(np.asarray(X))
        y = np.asarray(y)

        # Start from the zero vector; one weight per column of Xb (intercept first)
        w_binary = np.zeros((Xb.shape[1], 1))

        for _ in range(self.iters):
            # Ascent: move WITH the gradient of the penalised log-likelihood
            w_binary += self.eta * self._calculate_gradient(Xb, y, w_binary, self.C)

        self.w_ = w_binary

    # ----------------------------------------------------
    # Multi-class training
    # ----------------------------------------------------
    def fit(self, X, y):
        """Train one binary classifier per class label (one-vs-rest).

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like with n_samples elements (ndarray, list or pandas Series)

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or ``y`` does not hold one label per row of ``X``.
        """
        X = np.asarray(X)
        # A pandas Series has no .reshape, so convert before any array maths
        y = np.asarray(y)

        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional (n_samples, n_features).")
        if y.size != X.shape[0]:
            raise ValueError("X and y must contain the same number of samples.")

        self.unique_ = np.unique(y)
        self.classifiers_ = []

        for yval in self.unique_:
            # 1 for the current class, 0 for every other class
            y_binary = (y == yval).astype(int)

            # Fresh instance per class; SAfit adds the intercept column itself
            blr_instance = CPULogisticRegression(eta=self.eta, iterations=self.iters, C=self.C)
            blr_instance.SAfit(X, y_binary)
            self.classifiers_.append(blr_instance)

        # Each classifier holds a column vector; stack them side by side and
        # transpose so that every ROW of w_ belongs to one class
        self.w_ = np.hstack([clf.w_ for clf in self.classifiers_]).T

    # ----------------------------------------------------
    # Gradient
    # ----------------------------------------------------
    @staticmethod
    def _calculate_gradient(Xb, y, w, C):
        """Return the L2-regularised gradient used by the ascent step.

        Computes ``Xb.T @ (y - sigmoid(Xb @ w)) / n_samples`` and subtracts
        ``2 * C * w`` from every entry except the intercept (row 0).
        ``Xb`` must already contain the intercept column.
        """
        g = expit(Xb @ w)

        # Align y with the (n_samples, 1) prediction column before subtracting
        ydiff = np.asarray(y).reshape(g.shape) - g

        gradient = Xb.T @ ydiff / Xb.shape[0]

        # L2 penalty on the feature weights only; the intercept stays unpenalised
        gradient[1:] -= 2 * C * w[1:]

        return gradient

    # ----------------------------------------------------
    # Convenience and prediction
    # ----------------------------------------------------
    @staticmethod
    def _sigmoid(theta):
        """Logistic function, evaluated with ``scipy.special.expit``."""
        return expit(theta)

    @staticmethod
    def _add_intercept(X):
        """Prepend a column of ones (bias term) to ``X``."""
        return np.hstack((np.ones((X.shape[0], 1)), X))

    def predict_proba(self, X):
        """Return an (n_samples, n_classes) array of per-class sigmoid scores.

        Columns follow the order of ``unique_``. Each column comes from an
        independent binary classifier, so rows are not normalised to sum to 1.

        Raises
        ------
        RuntimeError
            If the model has not been fitted.
        """
        # Fail fast, before doing any work on X
        if not self.classifiers_:
            raise RuntimeError("Model must be fitted before calling predict_proba.")

        Xb = self._add_intercept(np.asarray(X))
        probs = [self._sigmoid(Xb @ blr.w_) for blr in self.classifiers_]

        return np.hstack(probs)

    def predict(self, X):
        """Return, for every sample, the class label with the highest score."""
        probs = self.predict_proba(X)
        max_indices = np.argmax(probs, axis=1)

        # Map column indices back to the original class labels
        return self.unique_[max_indices]

In [15]:
# I used Google Gemini to help with the GPU implementation.
# The main difference is that the GPU implementation uses CuPy, which is like NumPy but runs on GPUs.

class GPULogisticRegression:
    """One-vs-rest logistic regression trained by batch steepest ascent on the GPU (CuPy).

    Parameters
    ----------
    eta : float
        Step size applied to the gradient at every iteration.
    iterations : int
        Number of full-batch gradient steps taken per binary classifier.
    C : float
        L2 penalty strength. The intercept weight is never penalised.

    Attributes
    ----------
    w_ : cupy.ndarray or None
        After ``fit``: device array of shape (n_classes, n_features + 1) with
        the intercept in column 0. After a direct ``SAfit`` call: column vector
        of shape (n_features + 1, 1). ``None`` before training.
    classifiers_ : list
        One binary ``GPULogisticRegression`` per class, in the order of ``unique_``.
    unique_ : cupy.ndarray or None
        Unique class labels seen by ``fit`` (kept on the device).
    """

    def __init__(self, eta=0.01, iterations=20, C=0.0):
        self.eta = eta
        self.iters = iterations
        self.C = C
        self.w_ = None
        self.classifiers_ = []
        self.unique_ = None

    def __str__(self):
        if(hasattr(self,'w_') and isinstance(self.w_, cp.ndarray)):
            return 'MultiClass Logistic Regression Object (CuPy-Accelerated) with coefficients:\n' + str(self.w_.get())
        else:
            return 'Untrained MultiClass Logistic Regression Object (CuPy-Accelerated)'

    @property
    def coef_(self):
        """Feature weights as a host (NumPy) array, or None if untrained."""
        if hasattr(self, 'w_') and isinstance(self.w_, cp.ndarray):
            return self.w_[:, 1:].get()
        return None

    @property
    def intercept_(self):
        """Intercept weights as a host (NumPy) array, or None if untrained."""
        if hasattr(self, 'w_') and isinstance(self.w_, cp.ndarray):
            return self.w_[:, 0].get()
        return None

    @staticmethod
    def _as_device_array(values, floating=False):
        """Return ``values`` as a CuPy array, copying from the host only if needed.

        With ``floating=True`` a non-floating dtype is cast to float64 so that
        the weight vector created from it can hold fractional updates;
        floating inputs (e.g. float32) keep their dtype.
        """
        arr = values if isinstance(values, cp.ndarray) else cp.asarray(values)
        if floating and arr.dtype.kind != 'f':
            arr = arr.astype(cp.float64)
        return arr

    def SAfit(self, X, y):
        """Fit a single binary classifier with steepest (gradient) ascent.

        ``X`` is passed WITHOUT an intercept column (one is prepended here) and
        ``y`` holds binary 0/1 targets. Host arrays are moved to the device.
        The learned column vector is stored in ``self.w_``.
        """
        Xb = self._add_intercept(self._as_device_array(X, floating=True))
        y = self._as_device_array(y)

        # Weights share the feature dtype so the whole loop stays in one precision
        self.w_ = cp.zeros((Xb.shape[1], 1), dtype=Xb.dtype)

        for _ in range(self.iters):
            # Ascent: move WITH the gradient of the penalised log-likelihood
            self.w_ += self.eta * self._calculate_gradient(Xb, y, self.w_, self.C)

    def fit(self, X, y):
        """Train one binary classifier per class label (one-vs-rest).

        ``X`` and ``y`` may be NumPy or CuPy arrays; host data is copied to the
        device once, before the per-class loop.
        """
        X_gpu = self._as_device_array(X, floating=True)
        y_gpu = self._as_device_array(y)

        self.unique_ = cp.unique(y_gpu)
        self.classifiers_ = []

        for yval in self.unique_:
            # 1 for the current class, 0 otherwise, in the same dtype as the features
            y_binary = (y_gpu == yval).astype(X_gpu.dtype)
            blr = GPULogisticRegression(eta=self.eta, iterations=self.iters, C=self.C)
            blr.SAfit(X_gpu, y_binary)

            self.classifiers_.append(blr)

        # Each classifier holds a column vector; transpose after stacking so
        # every ROW of w_ belongs to one class. coef_ and intercept_ slice
        # columns and rely on this layout.
        self.w_ = cp.hstack([clf.w_ for clf in self.classifiers_]).T

    @staticmethod
    def _sigmoid(theta):
        """Logistic function evaluated on the device."""
        # Imported explicitly so the class does not depend on SciPy's expit
        # being imported elsewhere and dispatched onto CuPy arrays.
        from cupyx.scipy.special import expit as gpu_expit
        return gpu_expit(theta)

    @staticmethod
    def _calculate_gradient(Xb, y, w, C):
        """Return the L2-regularised gradient used by the ascent step.

        Computes ``Xb.T @ (y - sigmoid(Xb @ w)) / n_samples`` and subtracts
        ``2 * C * w`` from every entry except the intercept (row 0).
        ``Xb`` must already contain the intercept column.
        """
        g = GPULogisticRegression._sigmoid(Xb @ w)

        # Align y with the (n_samples, 1) prediction column before subtracting
        ydiff = y.reshape(g.shape) - g

        gradient = Xb.T @ ydiff / Xb.shape[0]

        # L2 penalty on the feature weights only; the intercept stays unpenalised
        gradient[1:] -= 2 * C * w[1:]

        return gradient

    @staticmethod
    def _add_intercept(X):
        """Prepend a column of ones (bias term) with the same dtype as ``X``."""
        return cp.hstack((cp.ones((X.shape[0], 1), dtype=X.dtype), X))

    def predict_proba(self, X):
        """Return an (n_samples, n_classes) host array of per-class sigmoid scores.

        Columns follow the order of ``unique_``. Each column comes from an
        independent binary classifier, so rows are not normalised to sum to 1.

        Raises
        ------
        RuntimeError
            If the model has not been fitted.
        """
        if not self.classifiers_:
            raise RuntimeError("Model must be fitted before calling predict_proba.")

        Xb = self._add_intercept(self._as_device_array(X, floating=True))
        probs = [self._sigmoid(Xb @ blr.w_) for blr in self.classifiers_]

        # Single device-to-host copy of the final score matrix
        return cp.hstack(probs).get()

    def predict(self, X):
        """Return, for every sample, the class label with the highest score (host array)."""
        probs_cpu = self.predict_proba(X)

        # The scores are already on the host, so take the argmax there
        # rather than copying them back to the GPU first
        max_indices = np.argmax(probs_cpu, axis=1)

        return self.unique_.get()[max_indices]

In [23]:
%%time
# CPU time
cpu_lr = CPULogisticRegression(eta=0.1389,iterations=50,C=0.1389)
cpu_lr.fit(X_train_scaled,y_train)

cpu_yhat = cpu_lr.predict(X_test_scaled)
cpu_acc = accuracy_score(y_test,cpu_yhat)
print("CPU Accuracy:", cpu_acc)


CPU Accuracy: 0.8729446935724963
CPU times: user 11.5 s, sys: 163 ms, total: 11.6 s
Wall time: 6.46 s


In [21]:
%%time
# GPU time
gpu_lr = GPULogisticRegression(eta=0.1389,iterations=50,C=0.1389)
gpu_lr.fit(X_train_scaled,y_train)

gpu_yhat = gpu_lr.predict(X_test_scaled)
gpu_acc = accuracy_score(y_test,gpu_yhat)
print("GPU Accuracy:", gpu_acc)



GPU Accuracy: 0.8729446935724963
CPU times: user 220 ms, sys: 16 ms, total: 236 ms
Wall time: 236 ms


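As we can see, the GPU implementation of steepest ascent with L2 regularization is much faster than the CPU version: about 27 times faster in wall time (236 ms vs. 6.46 s). Both achieve the same accuracy because the algorithm is identical; the main difference is that one runs on the CPU with NumPy while the other exploits the GPU through CuPy. (One smaller difference: the CPU version promotes the data to float64 when it adds the intercept column, whereas the GPU version keeps the float32 input dtype, which may also contribute to the speedup.) Notice that the GPU run not only has a shorter wall time but also much lower CPU times: user, sys, and total are in milliseconds, whereas for the CPU run they are in seconds.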
