### **Differential Scorecards**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# datasets
from sklearn.datasets import load_iris
from ucimlrepo import fetch_ucirepo

# discretization
from libraries.caimcaim import CAIMD # https://github.com/airysen/caimcaim/blob/master/caimcaim/caimcaim.py

# objective function
from scipy.optimize import least_squares
from scipy.optimize import minimize

# regularization
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

# model evaluation
from sklearn.model_selection import train_test_split

#### **binary data**

##### synthetic data

In [None]:
# Seed NumPy's global RNG so the synthetic sample is the same on every run.
np.random.seed(0)
# 100 feature values drawn uniformly from [0, 1).
synth_X = np.random.rand(100)
# Line with slope 2 plus standard-normal noise.
# NOTE(review): this target is continuous although the section is titled "binary data".
synth_y = 2*synth_X + np.random.randn(100)

##### benchmark datasets

**iris**:

In [None]:
# Load iris and wrap the feature matrix and the target vector as DataFrames
# (both get default integer column labels).
iris_data = load_iris()
iris_X, iris_y = pd.DataFrame(iris_data.data), pd.DataFrame(iris_data.target)

# Non-null count and class frequencies of the target.
print("num observations: ", iris_y.count())
print("target distribution: ", iris_y.value_counts())

**adult**: predict whether annual income of an individual exceeds $50K/yr based on census data. 

In [None]:
# fetch dataset
adult_data = fetch_ucirepo(id=2)

# data (as pandas dataframes)
adult_X = adult_data.data.features
# Copy the targets so the frame owned by `adult_data` is not modified in place
# (assigning through .loc on the fetched frame can warn and keeps the column object-typed).
adult_y = adult_data.data.targets.copy()
# The UCI Adult labels appear both with and without a trailing period
# ('>50K' and '>50K.'); normalise them first so no row is mapped to NaN.
# NOTE(review): confirm with adult_data.data.targets['income'].unique().
adult_income = adult_y['income'].str.strip().str.rstrip('.')
adult_y['income'] = adult_income.map({'>50K': 1, '<=50K': 0})

# metadata
# print(adult_data.metadata)

# variable information
# print(adult_data.variables)

print("num observations: ", adult_y.count())
print("target distribution: ", adult_y.value_counts())

**mammo**: discrimination of benign and malignant mammographic masses based on BI-RADS attributes and the patient's age.

In [None]:
# fetch dataset
mammo_data = fetch_ucirepo(id=161)

# data (as pandas dataframes)
mammo_X = mammo_data.data.features
mammo_y = mammo_data.data.targets

# drop rows with nulls
# Features and target are joined first so a row is removed from both together.
mammo_combined = pd.concat([mammo_X, mammo_y], axis=1)
# Count ROWS that contain at least one null; isnull().sum().sum() would count null
# cells instead, which over-reports when a row has several missing values.
print("num rows with nulls: ", mammo_combined.isnull().any(axis=1).sum())
mammo_combined = mammo_combined.dropna()
mammo_combined = mammo_combined.reset_index(drop=True)
# The target is the last column of the joined frame.
mammo_X = mammo_combined.iloc[:, :-1]
mammo_y = mammo_combined.iloc[:, -1]


# metadata
# print(mammo_data.metadata)

# variable information
# print(mammo_data.variables)

print("num observations: ", mammo_y.count())
print("target distribution: ", mammo_y.value_counts())

**mushroom**: mushrooms described in terms of physical characteristics; classification: poisonous or edible

In [None]:
# fetch dataset
mushroom_data = fetch_ucirepo(id=73)

# data (as pandas dataframes)
mushroom_X = mushroom_data.data.features
# Copy the targets so the frame owned by `mushroom_data` is not modified in place;
# a plain column assignment on the copy also lets the column take an integer dtype
# instead of staying object-typed as it does with an in-place .loc write.
mushroom_y = mushroom_data.data.targets.copy()
# Encode the class: poisonous -> 1, edible -> 0.
mushroom_y['poisonous'] = mushroom_y['poisonous'].map({'p': 1, 'e': 0})

# metadata
# print(mushroom_data.metadata)

# variable information
# print(mushroom_data.variables)

print("num observations: ", mushroom_y.count())
print("target distribution: ", mushroom_y.value_counts())

**spambase**: classifying Email as Spam or Non-Spam

In [None]:
# fetch dataset 
spambase_data = fetch_ucirepo(id=94) 
  
# data (as pandas dataframes) 
spambase_X = spambase_data.data.features 
spambase_y = spambase_data.data.targets 
  
# metadata 
# print(spambase_data.metadata) 
  
# variable information 
# print(spambase_data.variables) 

print("num observations: ", spambase_y.count())
print("target distribution: ", spambase_y.value_counts())

**telemarketing** (Internet advertisements data): images found on Internet pages, each labelled as an advertisement (`ad.`) or not (`nonad.`)

In [None]:
# load dataset from file. target is last column
# The raw UCI `ad.data` file has no header row, so header=None keeps the first
# record as data instead of turning it into column names.
# NOTE(review): confirm the local copy is the unmodified UCI file.
telemarketing_data = pd.read_csv('datasets/internet+advertisements/ad.data', dtype=str, header=None)
telemarketing_X = telemarketing_data.iloc[:, :-1]
telemarketing_y = telemarketing_data.iloc[:, -1]

# Map target values to binary (labels not in the mapping become NaN)
telemarketing_y = telemarketing_y.map({'nonad.': 0, 'ad.': 1})

# count() ignores NaN, so it reports only the rows whose label was recognised.
print("num observations: ", telemarketing_y.count())
print("target distribution: ", telemarketing_y.value_counts())

In [None]:
# Load the sleep-apnea master table from CSV with pandas' default parsing options.
# NOTE(review): this frame is not referenced again in the visible part of the notebook.
sleep_apnea_data = pd.read_csv('datasets/bdsp_psg_master_20231101.csv')

#### **discretization thresholds**

CAIM

In [None]:
# discretize using CAIM
def discretize_caim_df(data, X, y):
    """Discretize ``X`` with CAIM and return the result as an integer DataFrame.

    Also prints the fitted cut-off points and, for every entry of the fitted
    ``split_scheme``, the number of bins (number of cut points + 1).

    Args:
        data: Not used by the function; kept so existing calls stay valid.
        X: Feature table; its ``.columns`` label the returned frame.
        y: Target values handed to ``CAIMD.fit_transform``.

    Returns:
        DataFrame with the columns of ``X`` holding the transformed values cast to int.
    """
    discretizer = CAIMD()
    transformed = discretizer.fit_transform(X, y)  # fit and transform in one call
    scheme = discretizer.split_scheme

    print("\nCut-off points: ", scheme)
    print("Number of bins: ", end="")
    # One " <key>: <bins>" entry per feature, comma-separated, then the line break.
    print(",".join(f" {key}: {len(cuts)+1}" for key, cuts in scheme.items()))

    return pd.DataFrame(transformed, columns=X.columns).astype(int)

def discretize_caim(X, cols, y):
    """Fit CAIM on ``X``/``y`` and return its cut points keyed by column name.

    ``caim.split_scheme`` is a dict of ``column index -> thresholds``. The index
    stored in each key is used to look up the column name. Pairing entries with
    ``cols`` by their position in the dict would attach thresholds to the wrong
    column whenever CAIM leaves a column out of the scheme.

    Args:
        X: Feature table passed to ``CAIMD.fit_transform``.
        cols: Column names, indexable by column position (e.g. ``X.columns``).
        y: Target values passed to ``CAIMD.fit_transform``.

    Returns:
        Dict mapping column name to a list of float thresholds. Columns that are
        absent from ``split_scheme`` have no entry.

    NOTE(review): the referenced caimcaim implementation appears to skip columns it
    treats as categorical and to include each feature's min/max in the scheme.
    Confirm against the library before relying on ``len(thresholds) + 1`` as the
    number of bins.
    """
    caim = CAIMD()
    caim.fit_transform(X, y)  # only the fitted split_scheme is needed here
    return {
        cols[col_idx]: [float(cut) for cut in cuts]
        for col_idx, cuts in caim.split_scheme.items()
    }


In [None]:
# CAIM cut points for the mammographic-mass features, keyed by column name.
mammo_thresholds_caim = discretize_caim(mammo_X, mammo_X.columns, mammo_y)
print("thresholds ", mammo_thresholds_caim)

# A feature with k thresholds is reported as k + 1 bins.
print("num of bins: ")
for key, value in mammo_thresholds_caim.items():
    print(f"  {key}: {len(value)+1}")

infinitesimal bins

In [None]:
# discretize using infinitesimal bins:
# thresholds are the points in between 2 consecutive values in the sorted list

def discretize_infbins(X, cols):
    """Return "infinitesimal bin" thresholds for the given columns.

    For each column the distinct values are sorted and one threshold is placed
    halfway between every pair of consecutive distinct values.

    Args:
        X: Mapping-like table indexed by column name (e.g. a DataFrame).
        cols: Names of the columns to process.

    Returns:
        Dict mapping column name to its list of thresholds; a column with a
        single distinct value gets an empty list.
    """
    def _midpoints(column):
        distinct = np.unique(column)  # sorted unique values
        lower, upper = distinct[:-1], distinct[1:]
        return ((lower + upper) / 2).tolist()

    return {col: _midpoints(X[col]) for col in cols}

In [None]:
# Midpoint thresholds between consecutive distinct values of every mammo feature.
mammo_thresholds_infbins = discretize_infbins(mammo_X, mammo_X.columns)
print("thresholds ", mammo_thresholds_infbins)

# A feature with k thresholds is reported as k + 1 bins.
print("num of bins: ")
for key, value in mammo_thresholds_infbins.items():
    print(f"  {key}: {len(value)+1}")

#### discretized version

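a feature with k bins has k - 1 thresholds and contributes one column per threshold (the first bin is the reference level and gets no column), so num of columns in the new df = total number of thresholds across all features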

2 methods
- 1 out of k: 1 if the value is in the bin, 0 otherwise
- differential coding: 1 for every bin from bin 1 up to and including the bin that contains the value, 0 for the bins above it

In [None]:
def get_bins(thresholds, values):
    """Return the bin number of every entry in ``values``.

    Args:
        thresholds: Bin edges, passed to ``np.digitize`` as its ``bins`` argument.
        values: Data to be binned.

    Returns:
        Array with one bin number per value; 0 means "below the first threshold".
    """
    # np.digitize takes the data first and the bin edges second.
    return np.digitize(values, thresholds)

def disc_1_out_of_k(X, cols, thresholds):
    """One-hot ("1 out of k") encode the bin each value falls in.

    Every feature yields one indicator column per bin ``1 .. k-1`` named
    ``feat{col}-bin_{i}``; bin 0 is the reference level and gets no column.
    The column set depends only on ``thresholds``, not on which bins occur in
    ``X``, so:

    * a bin with no rows still gets an (all-zero) column,
    * a feature with no value in bin 0 no longer raises ``KeyError``,
    * the layout and row index match ``disc_diff_coding``.

    Args:
        X: Feature table indexed by column name; its index is kept on the result.
        cols: Names of the columns to encode.
        thresholds: Dict mapping column name to its list of thresholds.

    Returns:
        DataFrame of 0/1 integers with the per-feature indicator columns
        concatenated in the order of ``cols``.
    """
    encoded = []
    for col in cols:
        bins = get_bins(thresholds[col], X[col])  # bin number of each row
        num_bins = len(thresholds[col]) + 1
        names = [f'feat{col}-bin_{i}' for i in range(1, num_bins)]
        indicators = {
            name: (bins == i).astype(int)
            for i, name in enumerate(names, start=1)
        }
        # Passing columns explicitly keeps the frame well-formed when there are no thresholds.
        encoded.append(pd.DataFrame(indicators, index=X.index, columns=names))
    return pd.concat(encoded, axis=1)

def disc_diff_coding(X, cols, thresholds):
    """Differential (cumulative) coding of the bin each value falls in.

    For a value in bin ``b`` the columns ``feat{col}-bin_1 .. feat{col}-bin_b``
    are 1 and the remaining ones are 0. There is no column for bin 0.

    Args:
        X: Feature table indexed by column name; its index is kept on the result.
        cols: Names of the columns to encode.
        thresholds: Dict mapping column name to its list of thresholds.

    Returns:
        DataFrame of 0/1 integers with the per-feature columns concatenated in
        the order of ``cols``.
    """
    encoded = []
    for col in cols:
        row_bins = np.asarray(get_bins(thresholds[col], X[col]))  # bin number of each row
        levels = np.arange(1, len(thresholds[col]) + 1)
        # Compare every row's bin against every level at once: (rows, levels) matrix.
        flags = (row_bins[:, None] >= levels).astype(int)
        names = [f'feat{col}-bin_{level}' for level in levels]
        encoded.append(pd.DataFrame(flags, index=X.index, columns=names))
    return pd.concat(encoded, axis=1)



In [None]:
# 1-out-of-k encoding of the mammo features using the infinitesimal-bin thresholds.
mammo_X_disc_infbins_1outofk = disc_1_out_of_k(mammo_X, mammo_X.columns, mammo_thresholds_infbins)
# Preview the first rows of the encoded frame.
mammo_X_disc_infbins_1outofk.head()

In [None]:
# Differential coding of the mammo features using the infinitesimal-bin thresholds.
mammo_X_disc_infbins_diffcod = disc_diff_coding(mammo_X, mammo_X.columns, mammo_thresholds_infbins)
# Preview the first rows of the encoded frame.
mammo_X_disc_infbins_diffcod.head()

#### objective function

- Least Squares (RSS)
- Maximum Likelihood (GLM with binomial response and logit link function)
- margin maximization (linear SVM).

In [None]:
# RSS
# call least_squares(fun, x0)

# maximum likelihood
def max_lik(parameters, x_obs=None, y_obs=None):
    """Negative Gaussian log-likelihood of the line ``y = m * x + b``.

    Despite the name, the value is meant to be *minimized*: it equals
    ``n/2 * log(2*pi) + n/2 * log(sigma**2) + sum((y - (m*x + b))**2) / (2*sigma**2)``.

    Args:
        parameters: Sequence ``(m, b, sigma)`` - slope, intercept, noise scale.
        x_obs: Optional inputs. When omitted, the module-level ``x`` is used.
        y_obs: Optional targets. When omitted, the module-level ``y`` is used.

    Returns:
        The negative log-likelihood as a float.
    """
    m, b, sigma = parameters[0], parameters[1], parameters[2]
    # atleast_1d lets scalar data work too; len() on a scalar would raise.
    x_arr = np.atleast_1d(np.asarray(x if x_obs is None else x_obs, dtype=float))
    y_arr = np.atleast_1d(np.asarray(y if y_obs is None else y_obs, dtype=float))
    n = x_arr.size
    # The prediction is already vectorized, so no per-sample loop is needed.
    residuals = y_arr - (m * x_arr + b)
    return (n / 2 * np.log(2 * np.pi) + n / 2 * np.log(sigma ** 2)
            + np.sum(residuals ** 2) / (2 * sigma ** 2))

# max_lik reads its data from the module-level `x` and `y`; point them at the
# synthetic sample built earlier in the notebook (scalars cannot be fitted).
x = synth_X
y = synth_y
# max_lik indexes three parameters (m, b, sigma), so the start vector needs three
# entries; the lower bound keeps sigma away from 0, where the objective divides by zero.
lik_model = minimize(max_lik, x0=[1.0, 0.0, 1.0], method='L-BFGS-B',
                     bounds=[(None, None), (None, None), (1e-6, None)])


#### regularization

In [None]:
# Hold out part of iris so the "Test set score" is measured on rows the model
# was not fitted on; scoring the training data twice made both numbers identical.
X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, random_state=0)
ridge = Ridge(alpha=0.7).fit(X_train, y_train)
print(f"Ridge Regression-Training set score: {ridge.score(X_train, y_train):.2f}")
print(f"Ridge Regression-Test set score: {ridge.score(X_test, y_test):.2f}")

In [None]:
# Hold out part of iris so the "Test set score" is measured on rows the model
# was not fitted on; scoring the training data twice made both numbers identical.
X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, random_state=0)
lasso = Lasso(alpha=0.01).fit(X_train, y_train)
print(f"Lasso Regression-Training set score: {lasso.score(X_train, y_train):.2f}")
print(f"Lasso Regression-Test set score: {lasso.score(X_test, y_test):.2f}")
# count_nonzero returns a single number whatever the shape of coef_.
print(f"Number of features Lasso: {np.count_nonzero(lasso.coef_)}")

In [None]:
# Hold out part of iris so the "Test set score" is measured on rows the model
# was not fitted on; scoring the training data twice made both numbers identical.
X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, random_state=0)
elastic_net = ElasticNet(alpha=0.01, l1_ratio=0.01).fit(X_train, y_train)
print(f"Elastic Net-Training set score: {elastic_net.score(X_train, y_train):.2f}")
print(f"Elastic Net-Test set score: {elastic_net.score(X_test, y_test):.2f}")

#### **Ordinal data**

In [None]:
# Load the aesthetic-evaluation table from CSV.
aesthetic_evaluation_data = pd.read_csv('datasets/aesthetic_evaluation_data.csv')
# Remove three columns that are used neither as features nor as the target below.
aesthetic_evaluation_data = aesthetic_evaluation_data.drop(columns=['Image Filename','Author','Objective Evaluation'])
# Target is 'Subjective Evaluation'; every remaining column is a feature.
aesthetic_evaluation_X = aesthetic_evaluation_data.drop(columns='Subjective Evaluation')
aesthetic_evaluation_y = aesthetic_evaluation_data['Subjective Evaluation']

# Preview the first rows (features and target together).
aesthetic_evaluation_data.head()

In [None]:
# Select the eight sX2* / sEMD* columns and show their column-wise sums.
a = aesthetic_evaluation_data[['sX2L Value','sX2a Value','sX2b Value','sX2Lab Value','sEMDL Value','sEMDa Value','sEMDb Value','sEMDLab Value']]
a.sum()

In [None]:
# Frequency of each distinct value of the target.
aesthetic_evaluation_y.value_counts()

In [None]:
# Histogram of the target, drawn on the current matplotlib axes and then labelled.
aesthetic_evaluation_y.hist()
plt.xlabel('Subjective Evaluation')
plt.ylabel('Frequency')
plt.title('Distribution of Subjective Evaluation')