# Week 11 Problem Set


In [None]:
%load_ext nb_mypy
%nb_mypy On

In [None]:
from typing import TypeAlias
from typing import Optional, Any    

Number: TypeAlias = int | float

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.axes as axes
from IPython.display import display


## Cohort Session

**CS0.** Do the following tasks before you start with the first cohort session.

**Task 1.** Paste the following functions from your previous work:
- `get_features_targets()`
- `normalize_z()`
- `prepare_feature()`
- `prepare_target()`
- `split_data()`


In [None]:
def normalize_z(array: np.ndarray, columns_means: Optional[np.ndarray]=None, 
                columns_stds: Optional[np.ndarray]=None) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Standardize every column of ``array`` with z normalization.

    Args:
        array: 2-D array with one row per record and one column per feature.
        columns_means: optional ``(1, n_columns)`` array of means to subtract.
            When ``None`` the means are computed from ``array``.
        columns_stds: optional ``(1, n_columns)`` array of standard deviations
            to divide by. When ``None`` they are computed from ``array``.

    Returns:
        A tuple ``(out, columns_means, columns_stds)`` where ``out`` has the
        same shape as ``array`` and the two statistics are the ones used.
    """
    assert columns_means is None or columns_means.shape == (1, array.shape[1])
    assert columns_stds is None or columns_stds.shape == (1, array.shape[1])
    ### BEGIN SOLUTION
    if columns_means is None:
        columns_means = array.mean(axis=0).reshape(1, -1)
    if columns_stds is None:
        # NOTE(review): uses numpy's default population std (ddof=0) -- confirm
        # this matches the reference values pinned in the notebook tests.
        columns_stds = array.std(axis=0).reshape(1, -1)
    # The (1, n_columns) statistics broadcast across every row.
    out: np.ndarray = (array - columns_means) / columns_stds
    ### END SOLUTION
    assert out.shape == array.shape
    assert columns_means.shape == (1, array.shape[1])
    assert columns_stds.shape == (1, array.shape[1])
    return out, columns_means, columns_stds

def get_features_targets(df: pd.DataFrame, 
                         feature_names: list[str], 
                         target_names: list[str]) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Select the feature columns and the target columns from ``df``.

    Args:
        df: source data frame.
        feature_names: column names to return as the feature frame.
        target_names: column names to return as the target frame.

    Returns:
        ``(df_feature, df_target)``, both selected with a list of names so
        each one is a DataFrame even when a single column is requested.
    """
    ### BEGIN SOLUTION
    df_feature, df_target = (df[names] for names in (feature_names, target_names))
    ### END SOLUTION
    return df_feature, df_target

def prepare_feature(np_feature: np.ndarray) -> np.ndarray:
    """Prepend a column of ones (the intercept term) to a 2-D feature array.

    Args:
        np_feature: array of shape ``(n_rows, n_features)``.

    Returns:
        Array of shape ``(n_rows, n_features + 1)`` whose first column is 1.
    """
    ### BEGIN SOLUTION
    intercept: np.ndarray = np.ones((np_feature.shape[0], 1))
    return np.concatenate((intercept, np_feature), axis=1)
    ### END SOLUTION

def split_data(df_feature: pd.DataFrame, 
               df_target: pd.DataFrame, 
               random_state: Optional[Number]=None, 
               test_size: float=0.5) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Randomly split features and targets into training and test frames.

    Args:
        df_feature: feature frame; its index labels are the ones sampled.
        df_target: target frame, selected with the same index labels.
        random_state: when not ``None``, passed to ``np.random.seed`` before
            sampling (this reseeds numpy's global generator).
        test_size: fraction of rows placed in the test set; the row count is
            truncated with ``int()``.

    Returns:
        ``(df_feature_train, df_feature_test, df_target_train, df_target_test)``.
    """
    ### BEGIN SOLUTION
    all_labels: pd.Index = df_feature.index
    if random_state is not None:
        np.random.seed(random_state)
    n_test: int = int(test_size * len(all_labels))
    # Sample test labels without replacement; the remaining labels are the training set.
    test_labels: np.ndarray = np.random.choice(all_labels, n_test, replace=False)
    train_labels: pd.Index = all_labels.drop(test_labels)
    feature_train, target_train = (frame.loc[train_labels, :] for frame in (df_feature, df_target))
    feature_test, target_test = (frame.loc[test_labels, :] for frame in (df_feature, df_target))
    ### END SOLUTION
    return feature_train, feature_test, target_train, target_test
 

**Task 2.** Load the breast cancer data from `breast_cancer_data.csv` into a Data Frame. 

In [None]:
# read breast_cancer_data.csv
# df: pd.DataFrame = None

### BEGIN SOLUTION
# Load the CSV (path is relative to the working directory) into a DataFrame.
df: pd.DataFrame = pd.read_csv('breast_cancer_data.csv')
### END SOLUTION

# Bare expression so the notebook displays the loaded frame.
df

**Task 3.** Do the following tasks.

- Read the following columns
    - feature: `radius_mean`
    - target: `diagnosis`
- Normalize the feature column using z normalization.

In [None]:
# extract the feature and the target
# df_feature, df_target = None, None

# normalize the feature
# array_feature,_,_ = None, None, None

### BEGIN SOLUTION
# One feature column and one target column, each selected with a list of names.
df_feature, df_target = get_features_targets(df, ["radius_mean"], ["diagnosis"])
# Only the normalized array is kept; the returned means and stds are discarded.
array_feature,_,_ = normalize_z(df_feature.to_numpy())
### END SOLUTION


**Task 4.** Write a function `replace_target()` to replace the `diagnosis` column with the following mapping:
    - `M`: `1`, this means that malignant cells are indicated as `1` in our new column.
    - `B`: `0`, this means that benign cells are indicated as `0` in our new column.
    
The function should take in the following:

- `df_target`: the target data frame
- `target_name`: which is the column name of the target data frame
- `map_vals`: which is a dictionary containing the map
    
It should return a new data frame with the same column name but with its values changed according to the mapping.

In [None]:
def replace_target(df_target: pd.DataFrame, 
                   target_name: str, map_vals: dict[Any, Any]) -> pd.DataFrame:
    """Return a copy of ``df_target`` with one column recoded through a mapping.

    Args:
        df_target: target data frame; it is not modified.
        target_name: name of the column whose values are replaced.
        map_vals: dictionary from original value to replacement value.

    Returns:
        A new data frame with the same column names, where every value of
        ``target_name`` has been replaced by ``map_vals[value]``.

    Raises:
        KeyError: if a value in the column has no entry in ``map_vals``.
    """
    ### BEGIN SOLUTION
    df_out: pd.DataFrame = df_target.copy()
    # Assign the whole column instead of writing through ``.loc[:, name]``:
    # the ``.loc`` write sets values in place and can keep the original column
    # dtype (object for string labels), whereas plain assignment lets the
    # column take the dtype of the mapped values (e.g. integers).
    df_out[target_name] = df_target[target_name].apply(lambda value: map_vals[value])
    ### END SOLUTION
    return df_out

In [None]:
# Recode the diagnosis labels: 'M' becomes 1 and 'B' becomes 0.
df_target: pd.DataFrame = replace_target(df_target, "diagnosis", {'M': 1, 'B': 0})
df_target

**Task 5.** Do the following tasks.
- Change feature to Numpy array and append constant 1 column.
- Change target to Numpy array

In [None]:
# change feature data frame to numpy array and append column 1
# feature: np.ndarray = None

# change target data frame to numpy array
# target: np.ndarray = None

### BEGIN SOLUTION
# array_feature is the normalized array from Task 3; prepare_feature adds the column of ones.
feature: np.ndarray = prepare_feature(array_feature)
# df_target is a one-column frame, so this is a 2-D array with a single column.
target: np.ndarray = df_target.to_numpy()
### END SOLUTION

**CS1.** *Logistic function:* Write a function to calculate the hypothesis using a logistic function. Recall that the hypothesis for a logistic regression model is written as:

$$\mathbf{p}(x) = \frac{1}{1 + e^{-\mathbf{X}\mathbf{b}}}$$

The shape of the input is as follows:
- $\mathbf{b}$: is a column vector for the parameters
- $\mathbf{X}$: is a matrix where the number of rows is the number of data points and the number of columns must be the same as the number of parameters in $\mathbf{b}$.

Note that you need to ensure that the output is a **column vector**. 

You can use the following functions:
- `np.matmul(array1, array2)`: which is to perform matrix multiplication on the two numpy arrays.
- `np.exp()`: which is to calculate the function $e^x$

In [None]:
def calc_logreg(X: np.ndarray, beta: np.ndarray) -> np.ndarray:
    """Evaluate the logistic hypothesis ``1 / (1 + exp(-X @ beta))``.

    Args:
        X: feature array; it is cast to float before use.
        beta: parameter array; it is cast to float before use.

    Returns:
        The logistic function applied to the matrix product of ``X`` and
        ``beta``; its shape is whatever that product yields.
    """
    X = X.astype(float)
    beta = beta.astype(float)
    ### BEGIN SOLUTION
    linear_term = X @ beta
    return 1.0 / (np.exp(-linear_term) + 1.0)
    ### END SOLUTION

In [None]:
# Zero parameters: the hypothesis is exactly 0.5.
beta: np.ndarray = np.array([0])
x: np.ndarray = np.array([0])
ans: np.ndarray = calc_logreg(x, beta)
assert ans == 0.5

# Large positive linear term (80): probability saturates at 1.
beta: np.ndarray  = np.array([2])
x: np.ndarray  = np.array([40])
ans: np.ndarray  = calc_logreg(x, beta)
assert np.isclose(ans, 1.0)

# Large negative linear term (-80): probability saturates at 0.
beta: np.ndarray  = np.array([2])
x: np.ndarray  = np.array([-40])
ans: np.ndarray  = calc_logreg(x, beta)
assert np.isclose(ans, 0.0)

# Linear term is 3*1 + 2*2 + 1*3 = 10, so every output is sigmoid(10).
# Compare the values themselves: `ans.all()` is only a boolean and would
# accept any non-zero probability.
beta: np.ndarray  = np.array([[1, 2, 3]])
x: np.ndarray  = np.array([[3, 2, 1]])
ans: np.ndarray  = calc_logreg(x, beta.T)
assert np.allclose(ans, 0.9999546)

# Two identical rows must give a (2, 1) column of sigmoid(10).
beta: np.ndarray  = np.array([[1, 2, 3]])
x: np.ndarray  = np.array([[3, 2, 1], [3, 2, 1]])
ans: np.ndarray  = calc_logreg(x, beta.T)
assert ans.shape == (2, 1)
assert np.allclose(ans, 0.9999546)

In [None]:
### BEGIN HIDDEN TESTS
# Linear term is 1*3 + 2*2 + 3*1 = 10, so every output is sigmoid(10).
# Compare the values themselves: `ans.all()` is only a boolean and would
# accept any non-zero probability.
beta: np.ndarray = np.array([[3, 2, 1]])
x: np.ndarray  = np.array([[1, 2, 3]])
ans: np.ndarray  = calc_logreg(x, beta.T)
assert np.allclose(ans, 0.9999546), "failed h11,1"

beta: np.ndarray  = np.array([[3, 2, 1]])
x: np.ndarray  = np.array([[1, 2, 3], [1, 2, 3]])
ans: np.ndarray  = calc_logreg(x, beta.T)
assert ans.shape == (2, 1), "failed h11,2"
assert np.allclose(ans, 0.9999546), "failed h11,3"
### END HIDDEN TESTS

**CS2.** *Cost Function:* Write a function to calculate the cost function for logistic regression. Recall that the cost function for logistic regression is given by:

$$J(\beta) = -\frac{1}{m}\sum_{i=1}^m \left[ y^i \log(p(x^i)) + (1 - y^i) \log(1 - p(x^i))\right]$$

You can use the following function in your code:
- `np.where(condition, then_expression, else_expression)`

In [None]:
def compute_cost_logreg(X: np.ndarray, 
                        y: np.ndarray,
                        beta: np.ndarray) -> Number:
    """Compute the logistic-regression cost for parameters ``beta``.

    Args:
        X: feature array passed to ``calc_logreg``.
        y: target array; entries equal to 1 use ``log(p)``, every other entry
            uses ``log(1 - p)``.
        beta: parameter array passed to ``calc_logreg``.

    Returns:
        ``-(1 / m)`` times the sum of the selected log terms, where ``m`` is
        ``len(y)``.
    """
    ### BEGIN SOLUTION
    m: int = len(y)
    p: np.ndarray = calc_logreg(X, beta)
    # Both log branches are evaluated before np.where selects one, so log(0)
    # can occur in the discarded branch. Silence only that warning, and let the
    # context manager restore the caller's error settings even on an exception.
    with np.errstate(divide='ignore'):
        log_terms: np.ndarray = np.where(y == 1, np.log(p), np.log(1 - p))
    J: Number = -(1 / m) * np.sum(log_terms)
    ### END SOLUTION
    return J

In [None]:
# y = 1 with a linear term of 10 + 40 = 50: the cost is expected to be ~0.
y: np.ndarray = np.array([[1]])
X: np.ndarray  = np.array([[10, 40]])
beta: np.ndarray  = np.array([[1, 1]]).T
ans: Number = compute_cost_logreg(X, y, beta)
print(ans)
assert np.isclose(ans, 0)

# y = 0 with a linear term of -50: the cost is again expected to be ~0.
y: np.ndarray  = np.array([[0]])
X: np.ndarray  = np.array([[10, 40]])
beta: np.ndarray  = np.array([[-1, -1]]).T
ans: Number = compute_cost_logreg(X, y, beta)
print(ans)
assert np.isclose(ans, 0)

In [None]:
### BEGIN HIDDEN TESTS
# Same checks as the visible tests with different features (20 + 30 = 50).
y: np.ndarray  = np.array([[1]])
X: np.ndarray  = np.array([[20, 30]])
beta: np.ndarray  = np.array([[1, 1]]).T
ans: Number = compute_cost_logreg(X, y, beta)
assert np.isclose(ans, 0), "failed h21,1"

# y = 0 with a linear term of -50.
y: np.ndarray  = np.array([[0]])
X: np.ndarray  = np.array([[20, 30]])
beta: np.ndarray  = np.array([[-1, -1]]).T
ans: Number = compute_cost_logreg(X, y, beta)
assert np.isclose(ans, 0), "failed h21,2"
### END HIDDEN TESTS

**CS3.** *Gradient Descent:* Recall that the update functions can be written as a matrix multiplication.

$$\mathbf{b} = \mathbf{b} - \alpha\frac{1}{m}\mathbf{X}^T(\mathbf{p} - \mathbf{y}) $$

Write a function called `gradient_descent_logreg()` that takes in five parameters:
- `X`: is a 2-D numpy array for the features
- `y`: is a vector array for the target
- `beta`: is a column vector for the initial guess of the parameters
- `alpha`: is the learning rate
- `num_iters`: is the number of iteration to perform

The function should return two arrays:
- `beta`: is coefficient at the end of the iteration
- `J_storage`: is the array that stores the cost value at each iteration

The solution is similar to Linear Regression gradient descent function with two differences:
- you need to use `calc_logreg()` to calculate the hypothesis
- you need to use `compute_cost_logreg()` to calculate the cost

In [None]:
def gradient_descent_logreg(X: np.ndarray, 
                            y: np.ndarray, 
                            beta: np.ndarray, 
                            alpha: float,
                            num_iters: int) -> tuple[np.ndarray, np.ndarray]:
    """Fit logistic-regression parameters with batch gradient descent.

    Each iteration applies ``beta = beta - alpha * (1 / m) * X.T @ (p - y)``
    with ``p = calc_logreg(X, beta)``, then records the cost of the updated
    ``beta``.

    Args:
        X: 2-D feature array of shape ``(m, n)``.
        y: target values; reshaped to an ``(m, 1)`` column before use.
        beta: initial parameters, shape ``(n, 1)``.
        alpha: learning rate.
        num_iters: number of iterations to perform.

    Returns:
        ``(beta, J_storage)``: the final parameters as a float array of shape
        ``(n, 1)`` and the ``(num_iters, 1)`` array of per-iteration costs.
    """
    ### BEGIN SOLUTION
    # Force a column vector: a 1-D y would broadcast (m, 1) - (m,) into (m, m).
    y = y.reshape(-1, 1)
    m: int = X.shape[0]
    J_storage: np.ndarray = np.zeros((num_iters, 1))
    for n in range(num_iters):
        deriv: np.ndarray = np.matmul(X.T, (calc_logreg(X, beta) - y))
        beta = beta - alpha * (1 / m) * deriv
        # The cost is recorded after the update, i.e. for the new beta.
        J_storage[n] = compute_cost_logreg(X, y, beta)
    ### END SOLUTION
    assert beta.shape == (X.shape[1], 1)
    assert J_storage.shape == (num_iters, 1)
    return beta.astype(float), J_storage

In [None]:
# Training settings; beta has two rows (intercept plus one feature column).
iterations: int = 1500
alpha: float = 0.01
beta: np.ndarray = np.zeros((2,1))
# `feature` and `target` are the arrays prepared in Task 5.
beta, J_storage = gradient_descent_logreg(feature, target, beta, alpha, iterations)

print(beta)
assert beta.shape == (2, 1)
# Reference coefficients, compared with a relative tolerance of 1e-3.
assert np.isclose(beta[0][0], -0.56631, rtol=1e-3)
assert np.isclose(beta[1][0], 1.93742, rtol=1e-3)

In [None]:
### BEGIN HIDDEN TESTS
def prepare_beta() -> tuple[np.ndarray, np.ndarray]:
    """Refit on the notebook globals `feature` and `target` with 500 iterations and alpha 0.02.

    Returns the fitted parameters and the cost history; locals carry a `1`
    suffix so the notebook-level `beta`/`alpha`/`iterations` are not rebound.
    """
    iterations1: int = 500
    alpha1: float = 0.02
    beta1: np.ndarray = np.zeros((2,1))
    beta1, J1_storage = gradient_descent_logreg(feature, target, beta1, alpha1, iterations1)
    return beta1, J1_storage


beta1, J1_storage = prepare_beta()
# print(beta1)
assert beta1.shape == (2, 1), "failed h31,1"
# Reference coefficients for these settings, relative tolerance 1e-3.
assert np.isclose(beta1[0][0], -0.505984, rtol=1e-3), "failed h31,2"
assert np.isclose(beta1[1][0], 1.60153, rtol=1e-3), "failed h31,3"
### END HIDDEN TESTS

In [None]:
# Plot the cost recorded at each gradient-descent iteration.
plt.plot(J_storage)

**CS4.** *Predict:* Write two functions `predict_logreg()` and `predict_norm()` that predict the class (0 or 1) given the features and their coefficients.
- `predict_logreg()`: this function should standardize the feature using z normalization, change it to a Numpy array, and add a column of constant 1s. You should use `prepare_feature()` for this purpose. Lastly, this function should also call `predict_norm()` to get the predicted y values.
- `predict_norm()`: this function should calculate the hypothesis or its probability using `calc_logreg()` and categorize it to either 0 or 1 based on its probability. If the probability is greater or equal to 0.5, it should be classified as class 1. Otherwise, it is classified as 0.

You can use the following function in your code:
- `np.where()`

In [None]:
def predict_norm(X: np.ndarray, beta: np.ndarray) -> np.ndarray:
    """Classify each row of an already-prepared feature matrix.

    Args:
        X: feature array passed unchanged to ``calc_logreg``.
        beta: parameter array passed unchanged to ``calc_logreg``.

    Returns:
        Array holding 1 where the computed probability is at least 0.5 and
        0 elsewhere.
    """
    ### BEGIN SOLUTION
    probabilities: np.ndarray = calc_logreg(X, beta)
    is_positive = probabilities >= 0.5
    labels: np.ndarray = np.where(is_positive, 1, 0)
    return labels
    ### END SOLUTION

In [None]:
def predict_logreg(array_feature: np.ndarray, 
                   beta: np.ndarray, 
                   means: Optional[np.ndarray]=None, 
                   stds: Optional[np.ndarray]=None) -> np.ndarray:
    """Predict class labels for raw (un-normalized) features.

    The features are passed through ``normalize_z`` with the given ``means``
    and ``stds``, extended by ``prepare_feature`` and classified by
    ``predict_norm``.

    Args:
        array_feature: 2-D array of raw feature values, one row per record.
        beta: fitted parameters.
        means: optional statistics forwarded to ``normalize_z``.
        stds: optional statistics forwarded to ``normalize_z``.

    Returns:
        Array of shape ``(n_rows, 1)`` with the predicted labels.
    """
    ### BEGIN SOLUTION
    standardized = normalize_z(array_feature, means, stds)[0]
    design_matrix: np.ndarray = prepare_feature(standardized)
    result = predict_norm(design_matrix, beta)
    ### END SOLUTION
    assert result.shape == (array_feature.shape[0], 1)
    return result

In [None]:
# Rebuild the single-feature frames and predict from the raw radius_mean values.
# No means/stds are passed, so predict_logreg forwards None to normalize_z.
df_feature, df_target = get_features_targets(df, ["radius_mean"], ["diagnosis"])
df_target: pd.DataFrame = replace_target(df_target, "diagnosis", {'M': 1, 'B': 0})
# NOTE(review): `beta` is the notebook global; presumably the CS3 fit -- confirm cell order.
pred: np.ndarray = predict_logreg(df_feature.to_numpy(), beta)
print(pred.mean(), pred.std())
assert isinstance(pred, np.ndarray)
# Reference mean and standard deviation of the predicted 0/1 labels.
assert np.isclose(pred.mean(), 0.28998)
assert np.isclose(pred.std(), 0.45375)

In [None]:
# Single-record prediction with explicitly supplied statistics (mean 0, std 1).
# This cell rebinds the notebook-level names `means`, `stds` and `beta`.
means: np.ndarray = np.array([0]).reshape(1,1)
stds: np.ndarray = np.array([1]).reshape(1,1)
beta: np.ndarray =np.array([[-0.56630289], [ 1.93763591]])
input_1row: np.ndarray = np.array([[2.109139]])
pred_1row: np.ndarray = predict_logreg(input_1row, beta, means, stds)
# The record is expected to be classified as class 1.
assert pred_1row[0][0] == 1

In [None]:
### BEGIN HIDDEN TESTS
# Refit with the hidden-test settings; the cost history (`buf`) is not used.
beta1, buf = prepare_beta()
# array_feature is the normalized array from Task 3; no means/stds are passed.
pred1: np.ndarray = predict_logreg(array_feature, beta1)
#print(pred1.mean(), pred1.std())
assert isinstance(pred1, np.ndarray), "failed h41,1"
assert np.isclose(pred1.mean(), 0.286465), "failed h41,2"
assert np.isclose(pred1.std(), 0.45211), "failed h41,3"
### END HIDDEN TESTS


In [None]:
# Scatter the true labels, then the predicted labels, against the feature on the same axes.
plt.scatter(df_feature, df_target)
plt.scatter(df_feature, pred)

**CS5.** *Multiple features and splitting of data set:* 

Do the following task in the code below:
- Read the following column names as the features: `"radius_mean", "texture_mean", "perimeter_mean", "area_mean", "smoothness_mean", "compactness_mean", "concavity_mean"`
- Read the column `diagnosis` as the target. Change the value from `M` and `B` to `1` and `0` respectively.
- Split the data set with 30% test size and `random_state = 100`.
- Normalize the training feature data set using `normalize_z()` function.
- Convert to numpy array both the target and the features using `prepare_feature()` and `prepare_target()` functions.
- Call `gradient_descent_logreg()` function to get the parameters using the training data set.
- Call `predict_logreg()` function on the test data set to get the predicted values.

In [None]:
# Names of the seven feature columns used for the multi-feature model.
columns: list[str] = ["radius_mean", "texture_mean", "perimeter_mean", "area_mean", "smoothness_mean", "compactness_mean", "concavity_mean"]

# extract the features and the target columns
# df_features, df_target = None, None

# replace the target values using from string to integer 0 and 1
# df_target: pd.DataFrame = None

# split the data with random_state = 100 and 30% test size
# df_features_train, df_features_test, df_target_train, df_target_test = None, None, None, None

# normalize the features
# array_features_train_z, means, stds = None, None, None

# change the feature columns to numpy array and append column of 1s
# features: np.ndarray = None

# change the target column to numpy array
# target: np.ndarray = None

# iterations: int = 1500
# alpha: float = 0.01

# provide initial guess for beta
# beta: np.ndarray = None

# call the gradient descent method
# beta, J_storage = None, None

### BEGIN SOLUTION
# Same list as above, reassigned inside the solution region.
columns = ["radius_mean", "texture_mean", "perimeter_mean", "area_mean", "smoothness_mean", "compactness_mean", "concavity_mean"]
df_features, df_target = get_features_targets(df, columns, ["diagnosis"])
df_target: pd.DataFrame = replace_target(df_target, "diagnosis", {'M': 1, 'B': 0})

df_features_train, df_features_test, df_target_train, df_target_test = split_data(df_features, df_target, random_state=100, test_size=0.3)
# Keep the training means/stds: later cells pass them to predict_logreg for the test set.
array_features_train_z, means, stds = normalize_z(df_features_train.to_numpy())

features: np.ndarray = prepare_feature(array_features_train_z)
target: np.ndarray = df_target_train.to_numpy()

iterations: int = 1500
alpha: float = 0.01
# One parameter per column of `features` (intercept plus the seven features).
beta: np.ndarray = np.zeros((features.shape[1],1))

beta, J_storage = gradient_descent_logreg(features, target, beta, alpha, iterations)
### END SOLUTION
print(beta)



In [None]:
# Eight parameters: the intercept plus one coefficient per feature.
assert beta.shape == (8, 1)
# Reference coefficients for the CS5 fit (default np.isclose tolerances).
ans: np.ndarray = np.array([[-0.6138507], 
                [ 0.8249164],
                [ 0.7274362],
                [ 0.8232587],
                [ 0.8161879],
                [ 0.5057594],
                [ 0.4411595],
                [ 0.7870175]])
assert np.isclose(beta, ans).all()

In [None]:
### BEGIN HIDDEN TESTS
# Redefines prepare_beta() from the CS3 hidden tests for the multi-feature data.
def prepare_beta() -> tuple[np.ndarray, np.ndarray]:
    """Refit on the notebook globals `features` and `target` with 500 iterations and alpha 0.02."""
    iterations1: int = 500
    alpha1: float = 0.02
    beta1: np.ndarray = np.zeros((features.shape[1],1))

    beta1, J1_storage = gradient_descent_logreg(features, target, beta1, alpha1, iterations1)
    return beta1, J1_storage

beta1, J1_storage = prepare_beta()
# print(beta1)
assert beta1.shape == (8, 1), "failed h51,1"
# Reference coefficients for the hidden-test settings.
ans1: np.ndarray = np.array([[-0.5250625],
                [ 0.72218777],
                [ 0.59154237],
                [ 0.7239136],
                [ 0.71079841],
                [ 0.4117684],
                [ 0.43302351],
                [ 0.69142309]])
assert np.isclose(beta1, ans1).all(), "failed h51,2"
### END HIDDEN TESTS

In [None]:
# Plot the cost history of the multi-feature fit.
plt.plot(J_storage)

**CS6.** Create a function `build_model_logreg()` that performs the following steps:
- change all data to numpy array.
- normalize the training feature data set using `normalize_z()` function.
- create X matrix.
- use `reshape(-1, 1)` on the target array to make sure it is a column vector. 
- run gradient descent by calling `gradient_descent_logreg()` function.

This function should output `model` and `J_storage` where `model` is a dictionary containing `beta`, `means` and `stds`. 

In [None]:
def build_model_logreg(df_feature_train: pd.DataFrame,
                       df_target_train: pd.DataFrame,
                       beta: Optional[np.ndarray] = None,
                       alpha: float = 0.01,
                       iterations: int = 1500) -> tuple[dict[str, Any], np.ndarray]:
    """Train a logistic-regression model from training data frames.

    Args:
        df_feature_train: training features, one column per feature.
        df_target_train: training targets; must have exactly one column.
        beta: initial parameters of shape ``(n_features + 1, 1)``; zeros are
            used when ``None``.
        alpha: learning rate for gradient descent.
        iterations: number of gradient-descent iterations.

    Returns:
        ``(model, J_storage)`` where ``model`` is a dictionary with the keys
        ``"beta"``, ``"means"`` and ``"stds"`` and ``J_storage`` is the
        ``(iterations, 1)`` cost history.
    """
    n_rows, n_features = df_feature_train.shape
    if beta is None:
        beta = np.zeros((n_features + 1, 1))
    assert beta.shape == (n_features + 1, 1)
    assert df_target_train.shape == (n_rows, 1)
    ### BEGIN SOLUTION
    # Normalize with statistics computed from the training data and keep them.
    z_features, means, stds = normalize_z(df_feature_train.to_numpy())
    y_column: np.ndarray = df_target_train.to_numpy().reshape(-1, 1)
    fitted_beta, J_storage = gradient_descent_logreg(
        prepare_feature(z_features), y_column, beta, alpha, iterations
    )
    model: dict[str, Any] = {"beta": fitted_beta, "means": means, "stds": stds}
    ### END SOLUTION
    assert model["beta"].shape == (n_features + 1, 1)
    assert model["means"].shape == (1, n_features)
    assert model["stds"].shape == (1, n_features)
    assert J_storage.shape == (iterations, 1)
    return model, J_storage

In [None]:
# Rebuild the CS5 data split, then train through build_model_logreg() with its defaults.
columns = ["radius_mean", "texture_mean", "perimeter_mean", "area_mean", "smoothness_mean", "compactness_mean", "concavity_mean"]
df_features, df_target = get_features_targets(df, columns, ["diagnosis"])
df_target: pd.DataFrame = replace_target(df_target, "diagnosis", {'M': 1, 'B': 0})

df_features_train, df_features_test, df_target_train, df_target_test = split_data(df_features, df_target, random_state=100, test_size=0.3)
model, J_storage = build_model_logreg(df_features_train, df_target_train)

assert isinstance(model, dict)
assert "beta" in model
assert "means" in model
assert "stds" in model
assert model["beta"].shape == (8, 1)
# Same reference coefficients as the CS5 check.
ans: np.ndarray = np.array([[-0.6138507], 
                [ 0.8249164],
                [ 0.7274362],
                [ 0.8232587],
                [ 0.8161879],
                [ 0.5057594],
                [ 0.4411595],
                [ 0.7870175]])
assert np.isclose(model["beta"], ans).all()
# Spot-check the first stored mean and the last stored standard deviation.
assert np.isclose(model['means'][0, 0], 1.40347594e+01)
assert np.isclose(model['stds'][-1, -1], 7.54400405e-02)

In [None]:
###
### AUTOGRADER TEST - DO NOT REMOVE
###

In [None]:

# call predict() on one record to get the predicted values
# use the variable 'means' and 'stds' to normalize
# One record with seven values, in the same order as `columns`.
input_1row: np.ndarray = np.array([[12.22, 20.04, 79.47, 453.1, 0.10960, 0.11520, 0.08175]])

# replace the None
# pred_1row: np.ndarray = None

### BEGIN SOLUTION
# Normalize with the statistics stored in the trained model.
pred_1row: np.ndarray = predict_logreg(input_1row, model["beta"], model["means"], model["stds"])
### END SOLUTION

In [None]:
# The single record is expected to be classified as class 0.
assert pred_1row[0][0] == 0

In [None]:
# call predict() on df_features test dataset to get the predicted values
# pred: np.ndarray = None

### BEGIN SOLUTION
# Use the coefficients and training statistics stored in `model` (as the
# single-record prediction does) rather than the loose notebook globals
# `beta`, `means` and `stds`, which other cells rebind.
pred: np.ndarray = predict_logreg(df_features_test.to_numpy(), model["beta"], model["means"], model["stds"])
### END SOLUTION

In [None]:
# True test labels, then predicted labels, against radius_mean.
plt.scatter(df_features_test["radius_mean"], df_target_test)
plt.scatter(df_features_test["radius_mean"], pred)

In [None]:
# True test labels, then predicted labels, against texture_mean.
plt.scatter(df_features_test["texture_mean"], df_target_test)
plt.scatter(df_features_test["texture_mean"], pred)

In [None]:
# True test labels, then predicted labels, against perimeter_mean.
plt.scatter(df_features_test["perimeter_mean"], df_target_test)
plt.scatter(df_features_test["perimeter_mean"], pred)

**CS7.** *Confusion Matrix:* Write a function `confusion_matrix()` that takes in:
- `ytrue`: which is the true target values
- `ypred`: which is the predicted target values
- `labels`: which is a list of the category. In the above case it will be `[1, 0]`. Put the positive case as the first element of the list. 

The function should return a dictionary containing the matrix with the following format.

|                 | predicted positive (1) | predicted negative (0) |
|-----------------|--------------------|--------------------|
| actual positive (1) | correct positive  (1, 1) | false negative (1, 0)    |
| actual negative (0) | false positive (0, 1)   | correct negative (0, 0)   |

The keys to the dictionary are the indices: `(0, 0), (0, 1), (1, 0), (1, 1)`.

You can use the following function in your code:
- `itertools.product()`: this is to create a combination of all the labels. 

In [None]:
import itertools
def confusion_matrix(ytrue: np.ndarray, 
                     ypred: np.ndarray, 
                     labels: list[int]) -> dict[tuple[int, int], int]:
    """Count (actual, predicted) label pairs.

    Args:
        ytrue: 2-D array whose first column holds the true labels.
        ypred: 2-D array whose first column holds the predicted labels.
        labels: the categories; every ordered pair of them becomes a key.

    Returns:
        Dictionary mapping ``(actual, predicted)`` to the number of rows with
        that combination; pairs that never occur keep a count of 0.

    Raises:
        KeyError: if a row contains a value that is not in ``labels``.
    """
    ### BEGIN SOLUTION
    # Start every label pair at zero, in itertools.product order.
    output: dict[tuple[int, int], int] = dict.fromkeys(itertools.product(labels, repeat=2), 0)
    for row, actual in enumerate(ytrue[:, 0]):
        output[(actual, ypred[row, 0])] += 1
    ### END SOLUTION
    return output


In [None]:
# Confusion matrix of the test-set predictions; keys are (actual, predicted).
result: dict[tuple[int, int], int] = confusion_matrix(df_target_test.values, pred, [1,0])
print(result)
# Reference counts for the test split.
assert result == {(0, 0): 99, (0, 1): 2, (1, 0): 10, (1, 1): 59}

In [None]:
### BEGIN HIDDEN TESTS
# Refit with the hidden-test settings and predict with the notebook-level `means`/`stds`.
beta1, J1_storage = prepare_beta()
pred1: np.ndarray = predict_logreg(df_features_test.to_numpy(), beta1, means, stds)
result1: dict[tuple[int, int], int] = confusion_matrix(df_target_test.values, pred1, [1,0])
# print(result1)
assert result1 == {(1, 1): 59, (1, 0): 10, (0, 1): 3, (0, 0): 98}, "failed h71,1"
### END HIDDEN TESTS

**CS8.** *Metrics:* Write a function `calc_accuracy()` that takes in a confusion matrix dictionary and outputs a dictionary with the following keys and values:
- `accuracy`: total number of correct predictions / total number of records
- `sensitivity`: total correct positive cases / total positive cases
- `specificity`: total true negatives / total negative cases
- `precision`: total  of correct positive cases / total predicted positive cases

In [None]:
def calc_accuracy(cm: dict[tuple[int, int], int]) -> dict[str, float]:
    """Derive classification metrics from a confusion-matrix dictionary.

    Args:
        cm: counts keyed by ``(actual, predicted)`` with labels 0 and 1.

    Returns:
        Dictionary with ``accuracy`` (correct / all counts in ``cm``),
        ``sensitivity`` (true positives / actual positives), ``specificity``
        (true negatives / actual negatives) and ``precision`` (true positives
        / predicted positives).
    """
    ### BEGIN SOLUTION
    true_neg, true_pos, false_pos, false_neg = (
        cm[pair] for pair in ((0, 0), (1, 1), (0, 1), (1, 0))
    )
    n_records = np.sum(list(cm.values()))
    ### END SOLUTION
    return {
        'accuracy': (true_neg + true_pos) / n_records,
        'sensitivity': true_pos / (true_pos + false_neg),
        'specificity': true_neg / (true_neg + false_pos),
        'precision': true_pos / (true_pos + false_pos),
    }


In [None]:
# Metrics derived from the CS7 confusion matrix `result`.
ans: dict[str, float] = calc_accuracy(result)
# print(ans)
# Reference values rounded to four decimals, hence the relative tolerance of 1e-3.
expected = {'accuracy': 0.9294, 'sensitivity': 0.8551, 'specificity': 0.9802, 'precision': 0.9672}
assert np.isclose(ans['accuracy'], expected['accuracy'], rtol=1e-3)
assert np.isclose(ans['sensitivity'], expected['sensitivity'], rtol=1e-3)
assert np.isclose(ans['specificity'], expected['specificity'], rtol=1e-3)
assert np.isclose(ans['precision'], expected['precision'], rtol=1e-3)
 

In [None]:
### BEGIN HIDDEN TESTS
beta1, J1_storage = prepare_beta()
# NOTE(review): no means/stds are passed here, unlike the CS7 hidden test, so
# predict_logreg forwards None to normalize_z. The expected values below appear
# to be pinned to this call -- confirm before changing either.
pred1: np.ndarray = predict_logreg(df_features_test.to_numpy(), beta1)
result1: dict[tuple[int, int], int] = confusion_matrix(df_target_test.values, pred1, [1,0])
# print(result)
ans: dict[str, float] = calc_accuracy(result1)
# print(ans)
expected: dict[str, float] = {'accuracy': 0.9294117647058824, 'sensitivity': 0.8260869565217391, 'specificity': 1.0, 'precision': 1.0}
assert np.isclose(ans['accuracy'], expected['accuracy']), "failed h81,1"
assert np.isclose(ans['sensitivity'], expected['sensitivity']), "failed h81,2"
assert np.isclose(ans['specificity'], expected['specificity']), "failed h81,3"
assert np.isclose(ans['precision'], expected['precision']), "failed h81,4"
### END HIDDEN TESTS

**CS9.** *Optional:* Redo the above tasks using Scikit Learn libraries. You will need to use the following:
- [LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)
- [train_test_split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)
- [confusion_matrix](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix as cm_sk

In [None]:
# Same seven feature columns as in the earlier multi-feature tasks.
columns: list[str] = ["radius_mean", "texture_mean", "perimeter_mean", "area_mean", "smoothness_mean", "compactness_mean", "concavity_mean"]
# get the features and the columns
# df_features, df_target = None

# replace target values with 0 and 1
# df_target: pd.DataFrame = None

### BEGIN SOLUTION
# get the features and the targets
df_features, df_target = get_features_targets(df, columns, ["diagnosis"])

# replace target values with 0 and 1
df_target: pd.DataFrame = replace_target(df_target, "diagnosis", {'M':1, 'B':0})
### END SOLUTION

In [None]:
# split data set using random_state = 100 and 30% test size
# df_features_train, df_features_test, df_target_train, df_target_test = None, None, None, None

# change feature to numpy array and append column of 1s
# feature: np.ndarray = None

# change target to numpy array
# target: np.ndarray = None

# this is to ensure it is either 0 or 1

### BEGIN SOLUTION
# scikit-learn's train_test_split is used here in place of split_data().
df_features_train, df_features_test, df_target_train, df_target_test = train_test_split(df_features, df_target, random_state=100, test_size=0.3)

target: np.ndarray = df_target_train.to_numpy()
### END SOLUTION
# Cast the labels to integers (the "either 0 or 1" step mentioned above).
target: np.ndarray = target.astype(int) 
print(target.shape)

In [None]:
# create LogisticRegression object instance, use newton-cg solver
# model: LogisticRegression = None

# build model
# pass

# get predicted value
# pred: np.ndarray = None

### BEGIN SOLUTION
# Named `skmodel` so the dictionary `model` from CS6 is not overwritten.
skmodel: LogisticRegression = LogisticRegression(solver='newton-cg')

# The target is flattened to 1-D for fit(); the features are used without normalization.
skmodel.fit(df_features_train.to_numpy(), target.flatten())
pred: np.ndarray = skmodel.predict(df_features_test.to_numpy())
### END SOLUTION 

In [None]:
# calculate confusion matrix
# cm: np.ndarray = None

### BEGIN SOLUTION
# scikit-learn's confusion_matrix (imported as cm_sk); the next cell compares
# the result elementwise with a 2x2 array, so it is annotated as an array here.
cm: np.ndarray = cm_sk(df_target_test.to_numpy().astype(int), pred, labels=[1, 0])
### END SOLUTION


In [None]:
# Reference matrix; presumably rows are actual and columns are predicted, both ordered as labels=[1, 0].
expected: np.ndarray = np.array([[58,  11], [6, 96]])
assert np.array(cm == expected).all()

In [None]:
# True test labels, then scikit-learn predictions, against radius_mean.
plt.scatter(df_features_test["radius_mean"], df_target_test)
plt.scatter(df_features_test["radius_mean"], pred)

In [None]:
# True test labels, then scikit-learn predictions, against texture_mean.
plt.scatter(df_features_test["texture_mean"], df_target_test)
plt.scatter(df_features_test["texture_mean"], pred)

In [None]:
# True test labels, then scikit-learn predictions, against perimeter_mean.
plt.scatter(df_features_test["perimeter_mean"], df_target_test)
plt.scatter(df_features_test["perimeter_mean"], pred)