In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
from pprint import pprint

from pandas.core.interchange.dataframe_protocol import DataFrame

In [None]:
# Column names for the data files are stored one per line in a separate labels file.
with open("dataset/breast-cancer.labels", "r") as f:
    # Iterate the file directly and skip blank lines so a stray empty line
    # cannot turn into an empty column name.
    labels = [line.strip() for line in f if line.strip()]

# `names=labels` supplies the column names for both data files.
# pd.read_csv is the documented public entry point for the CSV reader.
train_data = pd.read_csv("dataset/breast-cancer-train.dat", names=labels)
validate_data = pd.read_csv("dataset/breast-cancer-validate.dat", names=labels)

# Split the training rows by the value of the "Malignant/Benign" column ("M" / "B").
train_data_malignant = train_data[train_data["Malignant/Benign"] == "M"]
train_data_benign = train_data[train_data["Malignant/Benign"] == "B"]

In [None]:
# Overlaid histograms of a single feature column, one per diagnosis group.
test_column = 2
test_column_name = labels[test_column]

fig, ax = plt.subplots(dpi=300)
ax.hist(train_data_malignant[test_column_name], alpha=0.8, label='Malignant')
ax.hist(train_data_benign[test_column_name], alpha=0.8, label='Benign')
ax.set(
    xlabel=test_column_name,
    ylabel="Ilość",
    title=f"Histogram dla kolumny {test_column_name}",
)
ax.legend()
plt.show()

In [None]:
# Values of the currently selected column, sorted ascending and re-indexed from 0,
# kept separately for the malignant and benign training rows.
sorted_train_data_malignant = train_data_malignant[labels[test_column]].sort_values(ignore_index=True)
sorted_train_data_benign = train_data_benign[labels[test_column]].sort_values(ignore_index=True)

In [None]:
test_column = 10
test_column_name = labels[test_column]

# Sort inside this cell: the sorted_train_data_* series from the previous cell were
# computed before test_column was changed here, so they hold a different column
# than the one named in the title.
malignant_sorted = train_data_malignant[test_column_name].sort_values().to_numpy()
benign_sorted = train_data_benign[test_column_name].sort_values().to_numpy()

plt.plot(range(len(malignant_sorted)), malignant_sorted, 'o', label='Malignant')
plt.plot(range(len(benign_sorted)), benign_sorted, 'o', label='Benign')
# The x axis is the position in sorted order and the y axis is the feature value.
plt.xlabel("Indeks po sortowaniu")
plt.ylabel(test_column_name)
plt.title(f"Wykres dla posortowanej kolumny {test_column_name}")
plt.legend()
plt.show()

In [None]:
# Design matrices for the linear model: every column except the patient
# identifier and the diagnosis label, as plain NumPy arrays.
linear_train = train_data.drop(columns=["patient ID", "Malignant/Benign"]).to_numpy()
linear_validate = validate_data.drop(columns=["patient ID", "Malignant/Benign"]).to_numpy()

def create_quadratic_representation(data, columns=None):
    """Extend a feature table with squared and pairwise-product terms.

    Args:
        data: DataFrame holding the base features; it is copied, not modified.
        columns: Names of the columns of ``data`` to build quadratic terms
            from. Defaults to the module-level ``quad_columns`` list, which
            keeps the original single-argument call working.

    Returns:
        2-D NumPy array whose columns are, in order: every column of ``data``,
        the square of each selected column, and the product of each pair of
        selected columns (i < j).
    """
    if columns is None:
        # Looked up at call time, so quad_columns only has to exist before the call.
        columns = quad_columns
    columns = list(columns)

    df = data.copy()
    for i, name in enumerate(columns):
        df[f"{i}^2"] = data[name] ** 2
    for i, first in enumerate(columns):
        for j in range(i + 1, len(columns)):
            df[f"{i}_{j}"] = data[first] * data[columns[j]]
    return df.values

# Base features used to build the quadratic model.
quad_columns = [
    "radius (mean)",
    "perimeter (mean)",
    "area (mean)",
    "symmetry (mean)",
]
# Build the quadratic design matrix for the training set, then the validation set.
quadratic_train, quadratic_validate = (
    create_quadratic_representation(frame[quad_columns])
    for frame in (train_data, validate_data)
)

In [None]:
# Encode the diagnosis as +1 for "M" and -1 otherwise. Selecting with a list of
# one column keeps the result two-dimensional (one column).
b_training, b_validate = (
    np.where(frame[['Malignant/Benign']] == "M", 1, -1)
    for frame in (train_data, validate_data)
)

In [None]:
# Left-hand side of the normal equations (X^T X) for each feature representation.
cov_mat_lin = np.matmul(linear_train.T, linear_train)
cov_mat_quad = np.matmul(quadratic_train.T, quadratic_train)

In [None]:
# Solve the normal equations (X^T X) w = X^T b for the linear model, then the quadratic one.
weights_linear, weights_quadratic = (
    np.linalg.solve(gram, design.T @ b_training)
    for gram, design in (
        (cov_mat_lin, linear_train),
        (cov_mat_quad, quadratic_train),
    )
)

In [None]:
# Regularised normal equations for the linear features: (X^T X + λI) w = X^T b.
λ = 0.01
# lstsq returns a tuple (solution, residues, rank, singular values). Keep the
# solution instead of discarding the whole result.
weights_linear_reg = scipy.linalg.lstsq(
    cov_mat_lin + λ * np.eye(cov_mat_lin.shape[0]),
    linear_train.T @ b_training,
)[0]

In [None]:
# Condition numbers of the two normal-equation matrices.
cond_lin, cond_quad = map(np.linalg.cond, (cov_mat_lin, cov_mat_quad))

In [None]:
# Raw model scores on the validation set; their sign is used as the predicted class.
p_lin = np.matmul(linear_validate, weights_linear)
p_quad = np.matmul(quadratic_validate, weights_quadratic)

In [None]:
def calc_acc(p_vec, b_vec):
    """Compute confusion-matrix counts and accuracy for sign-based predictions.

    A score ``p > 0`` is a positive prediction and ``p <= 0`` a negative one.
    A label ``b > 0`` is an actual positive and any other label an actual
    negative, so the four counts partition the scored samples.

    Args:
        p_vec: Predicted scores; any array-like, flattened before comparison
            (so both ``(n,)`` and ``(n, 1)`` inputs work).
        b_vec: True labels with the same number of elements as ``p_vec``.

    Returns:
        Tuple ``(tp, tn, fp, fn, accuracy)`` with integer counts and a float
        accuracy ``(tp + tn) / (tp + tn + fp + fn)``. The accuracy is ``nan``
        when no sample was counted (e.g. empty input).

    Raises:
        ValueError: If ``p_vec`` and ``b_vec`` differ in number of elements.
    """
    scores = np.asarray(p_vec).ravel()
    truth = np.asarray(b_vec).ravel()
    if scores.size != truth.size:
        raise ValueError(
            f"p_vec and b_vec must have the same number of elements, "
            f"got {scores.size} and {truth.size}"
        )

    predicted_pos = scores > 0
    predicted_neg = scores <= 0
    actual_pos = truth > 0
    actual_neg = ~actual_pos

    tp = int(np.sum(predicted_pos & actual_pos))
    tn = int(np.sum(predicted_neg & actual_neg))
    fp = int(np.sum(predicted_pos & actual_neg))
    fn = int(np.sum(predicted_neg & actual_pos))

    total = tp + tn + fp + fn
    # Avoid a 0/0 division (and its RuntimeWarning) when nothing was counted.
    accuracy = (tp + tn) / total if total else float("nan")
    return tp, tn, fp, fn, float(accuracy)

# Score both models against the validation labels (linear first, then quadratic)
# and print the confusion-matrix counts with the accuracy.
(tp_lin, tn_lin, fp_lin, fn_lin, acc_lin), (tp_quad, tn_quad, fp_quad, fn_quad, acc_quad) = (
    calc_acc(scores, b_validate) for scores in (p_lin, p_quad)
)
print(
    f"{tp_lin=} {tn_lin=} {fp_lin=} {fn_lin=} {acc_lin=}",
    f"{tp_quad=} {tn_quad=} {fp_quad=} {fn_quad=} {acc_quad=}",
    sep="\n",
)

$$ x=10 $$