In [9]:
import numpy as np
import pandas as pd
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [7]:
def generate_data(seed=None):
    """
    [DO NOT CHANGE THIS METHOD]
    Generates synthetic dataset with Gaussian mixtures and saves it as CSV files.

    Input:
    - No input required for this method. It generates synthetic data based on pre-defined parameters.

    Process:
    - Creates a dataset with 3 classes where each class is represented by samples drawn from a multivariate normal distribution.
    - The data is shuffled and split into training (60%), validation (20%), and test (20%) sets.
    - The data and labels for each split are saved as CSV files:
        - 'train_features.csv', 'train_labels.csv'
        - 'val_features.csv', 'val_labels.csv'
        - 'test_features.csv', 'test_labels.csv'

    Output:
    - No direct output is returned by this method. The generated data is saved as CSV files.
    """
    np.random.seed(seed)
    
    # Parameters for Gaussian mixtures
    means = [np.random.rand(10) * 2 - 1 for _ in range(3)]
    covs = [np.eye(10) * 0.5 for _ in range(3)]
    
    # Generate data for each class
    X_class0 = np.random.multivariate_normal(means[0], covs[0], 500)
    X_class1 = np.random.multivariate_normal(means[1], covs[1], 500)
    X_class2 = np.random.multivariate_normal(means[2], covs[2], 500)
    
    # Create labels
    y_class0 = np.zeros(500)
    y_class1 = np.ones(500)
    y_class2 = np.ones(500) * 2
    
    # Concatenate the data
    X = np.vstack([X_class0, X_class1, X_class2])
    y = np.hstack([y_class0, y_class1, y_class2])
    
    # Shuffle the data
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    X = X[indices]
    y = y[indices]
    
    # Split the data into training, validation, and test sets
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
    
    # Save to CSV
    pd.DataFrame(X_train).to_csv('train_features.csv', index=False)
    pd.DataFrame(y_train).to_csv('train_labels.csv', index=False)
    pd.DataFrame(X_val).to_csv('val_features.csv', index=False)
    pd.DataFrame(y_val).to_csv('val_labels.csv', index=False)
    pd.DataFrame(X_test).to_csv('test_features.csv', index=False)
    pd.DataFrame(y_test).to_csv('test_labels.csv', index=False)

In [12]:
def load_and_preprocess_data():
    """
    [DO NOT CHANGE THIS METHOD]
    Loads data from CSV files and standardizes the features using StandardScaler.

    Input:
    - The method expects CSV files generated by the 'generate_data' method:
        - 'train_features.csv', 'train_labels.csv'
        - 'val_features.csv', 'val_labels.csv'
        - 'test_features.csv', 'test_labels.csv'

    Process:
    - Reads the CSV files containing the training, validation, and test features and labels.
    - Standardizes the features (i.e., scales them to have zero mean and unit variance) using `StandardScaler`.
    - Applies the scaling transformation to the training, validation, and test sets.

    Output:
    - Returns six NumPy arrays:
        1. `X_train`: The standardized training feature set.
        2. `y_train`: The labels for the training set.
        3. `X_val`: The standardized validation feature set.
        4. `y_val`: The labels for the validation set.
        5. `X_test`: The standardized test feature set.
        6. `y_test`: The labels for the test set.
    """
    # Load the data
    X_train = pd.read_csv('train_features.csv').values
    y_train = pd.read_csv('train_labels.csv').values.flatten()
    X_val = pd.read_csv('val_features.csv').values
    y_val = pd.read_csv('val_labels.csv').values.flatten()
    X_test = pd.read_csv('test_features.csv').values
    y_test = pd.read_csv('test_labels.csv').values.flatten()

    # Standardize the features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    return X_train, y_train, X_val, y_val, X_test, y_test

In [14]:
X_train, y_train, X_val, y_val, X_test, y_test = load_and_preprocess_data()

In [36]:
def make_one_versus_all_labels(y, num_classes):
    """
    Converts the given labels into a one-vs-all format for multi-class classification.

    Input:
    - y: Array of shape (n_samples,) containing the original class labels.
    - num_classes: Integer representing the total number of classes.

    Process:
    - Creates a binary label array where each row corresponds to one sample, and the columns
        represent one-vs-all labels for each class.

    Output:
    - Returns a label array of shape (n_samples, num_classes) with -1 for non-class columns
        and 1 for the true class column.
    """
    out = np.zeros((y.shape[0], num_classes))
    for c in  np.unique(y): 
        out[:,int(c)] = np.where(y==c, 1, -1)
    return out

In [42]:
y = make_one_versus_all_labels(y_train, len(np.unique(y_train)))
y

array([[-1.,  1., -1.],
       [ 1., -1., -1.],
       [ 1., -1., -1.],
       ...,
       [-1., -1.,  1.],
       [ 1., -1., -1.],
       [ 1., -1., -1.]])

In [41]:
for c in y.T :
    print(c.shape)

(900,)
(900,)
(900,)


In [69]:
def compute_loss(X, y, w, C):
    """
    Computes the logistic loss for multi-class classification.

    Input:
    - X: Feature matrix of shape (n_samples, n_features).
    - y: Label matrix of shape (n_samples, n_classes), formatted for one-vs-all classification.
    - w: Weight matrix of shape (n_features, n_classes).
    - C: Regularization parameter (float).

    Process:
    - Computes the loss using a logistic regression formulation.
    - Adds L2 regularization term based on the weight matrix.

    Output:
    - Returns the scalar loss value (float).
    """
    n_classes = y.shape[1]
    n_samples = X.shape[0]

    total_loss = 0.0

    for i, x in enumerate(X):
        loss = np.zeros_like(w)
        for j in range(n_classes):
            a = 2 - y[i, j]*(np.dot(x, w[:, j]))

            if a > 0:
                # print(a, y[i, j], x.shape, C, w[:, j].shape)
                loss[:, j] += -2 * a * y[i, j] *x + C*w[:, j]
        
        total_loss += np.sum(loss)

    total_loss/= n_samples
    return total_loss

In [50]:
X_train.shape
y.shape

(900, 3)

In [49]:
w = np.array([[1,2,3],[1,2,3], [1,2,3], [1,2,3], [1,2,3], [1,2,3], [1,2,3], [1,2,3], [1,2,3], [1,2,3]])
w.shape

(10, 3)

In [70]:
compute_loss(X=X_train, y=y, w=w, C=0.2)

UFuncTypeError: Cannot cast ufunc 'add' output from dtype('float64') to dtype('int32') with casting rule 'same_kind'

In [None]:
def compute_gradient(self, X, y, w, C):
    """
    Computes the gradient of the logistic loss for multi-class classification.

    Input:
    - X: Feature matrix of shape (n_samples, n_features).
    - y: Label matrix of shape (n_samples, n_classes), formatted for one-vs-all classification.
    - w: Weight matrix of shape (n_features, n_classes).
    - C: Regularization parameter (float).

    Process:
    - Computes the gradient of the logistic loss with respect to the weights.
    - Adds the gradient of the L2 regularization term.

    Output:
    - Returns the gradient matrix of shape (n_features, n_classes).
    """
    raise NotImplementedError

In [None]:
def infer(self, X, w):
    """
    Predicts the class labels for a given feature matrix.

    Input:
    - X: Feature matrix of shape (n_samples, n_features).
    - w: Weight matrix of shape (n_features, n_classes).

    Process:
    - Computes the predicted class probabilities for each sample.
    - Assigns the class with the highest probability as the predicted class label.

    Output:
    - Returns an array of predicted class labels of shape (n_samples,).
    """
    raise NotImplementedError

In [None]:
def compute_accuracy(self, X, y, w):
    """
    Computes the accuracy of predictions using the given weight matrix.

    Input:
    - X: Feature matrix of shape (n_samples, n_features).
    - y: True label vector of shape (n_samples,).
    - w: Weight matrix of shape (n_features, n_classes).

    Process:
    - Predicts the labels for the given feature matrix using the weights.
    - Compares predicted labels with true labels to compute accuracy.

    Output:
    - Returns the accuracy score (float) as a percentage of correct predictions.
    """
    raise NotImplementedError