### **Differential Scorecards**

In [None]:
from bisect import bisect_right

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# datasets
from sklearn.datasets import load_iris
from ucimlrepo import fetch_ucirepo 

# discretization
from libraries.caimcaim import CAIMD # https://github.com/airysen/caimcaim/blob/master/caimcaim/caimcaim.py

# objective function
from scipy.optimize import least_squares
from scipy.optimize import minimize

# regularization
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

# evaluation
from sklearn.model_selection import train_test_split

#### **binary data**

##### synthetic data

In [None]:
# Reproducible toy regression data: 100 uniform samples in [0, 1) and a
# target equal to twice the input plus standard-normal noise.
np.random.seed(0)
synth_X = np.random.random_sample(100)
synth_y = synth_X * 2 + np.random.standard_normal(100)

##### benchmark datasets

**iris**:

In [None]:
# Load the iris benchmark and wrap its feature matrix and target vector in
# DataFrames (columns are left with their default integer labels).
iris_data = load_iris()
iris_X, iris_y = pd.DataFrame(iris_data.data), pd.DataFrame(iris_data.target)

# Quick sanity check of the size and class balance of the target.
print("num observations: ", iris_y.count())
print("target distribution: ", iris_y.value_counts())

**adult**: predict whether annual income of an individual exceeds $50K/yr based on census data. 

In [None]:
# fetch dataset (UCI repository id 2)
adult_data = fetch_ucirepo(id=2)

# data (as pandas dataframes)
adult_X = adult_data.data.features
# Copy the targets so the encoding below does not write into the frame owned
# by `adult_data` (which can also trigger pandas' SettingWithCopyWarning).
adult_y = adult_data.data.targets.copy()

# Encode the target as 1 (>50K) / 0 (<=50K).
# NOTE(review): part of this dataset is reported to spell the labels with a
# trailing '.' ('>50K.' / '<=50K.'); those rows would map to NaN, so the
# labels are normalised before mapping. Confirm against value_counts().
adult_y['income'] = (
    adult_y['income'].str.strip().str.rstrip('.').map({'>50K': 1, '<=50K': 0})
)

# metadata
# print(adult_data.metadata)

# variable information
# print(adult_data.variables)

print("num observations: ", adult_y.count())
print("target distribution: ", adult_y.value_counts())

**mammo**: discrimination of benign and malignant mammographic masses based on BI-RADS attributes and the patient's age.

In [None]:
# Download the mammographic-mass dataset (UCI repository id 161) and split it
# into the feature and target DataFrames exposed by ucimlrepo.
mammo_data = fetch_ucirepo(id=161)
mammo_X, mammo_y = mammo_data.data.features, mammo_data.data.targets

# Uncomment to inspect the dataset metadata / variable descriptions.
# print(mammo_data.metadata)
# print(mammo_data.variables)

# Quick sanity check of the size and class balance of the target.
print("num observations: ", mammo_y.count())
print("target distribution: ", mammo_y.value_counts())

**mushroom**: mushrooms described in terms of physical characteristics; classification: poisonous or edible

In [None]:
# fetch dataset (UCI repository id 73)
mushroom_data = fetch_ucirepo(id=73)

# data (as pandas dataframes)
mushroom_X = mushroom_data.data.features
# Copy the targets so the encoding below does not write into the frame owned
# by `mushroom_data` (which can also trigger pandas' SettingWithCopyWarning).
mushroom_y = mushroom_data.data.targets.copy()
# Encode the target as 1 (poisonous, 'p') / 0 (edible, 'e').
mushroom_y['poisonous'] = mushroom_y['poisonous'].map({'p': 1, 'e': 0})

# metadata
# print(mushroom_data.metadata)

# variable information
# print(mushroom_data.variables)

print("num observations: ", mushroom_y.count())
print("target distribution: ", mushroom_y.value_counts())

**spambase**: classifying Email as Spam or Non-Spam

In [None]:
# Download the spambase dataset (UCI repository id 94) and split it into the
# feature and target DataFrames exposed by ucimlrepo.
spambase_data = fetch_ucirepo(id=94)
spambase_X, spambase_y = spambase_data.data.features, spambase_data.data.targets

# Uncomment to inspect the dataset metadata / variable descriptions.
# print(spambase_data.metadata)
# print(spambase_data.variables)

# Quick sanity check of the size and class balance of the target.
print("num observations: ", spambase_y.count())
print("target distribution: ", spambase_y.value_counts())

**telemarketing**: set of possible advertisements on Internet pages

In [None]:
# Load dataset from file; the target is the last column.
# NOTE(review): ad.data is distributed without a header row (column names
# live in a separate .names file), so header=None keeps the first sample as
# data instead of consuming it as column labels. Confirm against the file.
telemarketing_data = pd.read_csv('datasets/internet+advertisements/ad.data', header=None)
telemarketing_X = telemarketing_data.iloc[:, :-1]
telemarketing_y = telemarketing_data.iloc[:, -1]

# NOTE: the target is kept as the raw labels. telemarketing_y is a Series, so
# a 0/1 encoding would map the Series itself, e.g.
#   telemarketing_y = telemarketing_y.map({'nonad.': 0, 'ad.': 1})


print("num observations: ", telemarketing_y.count())
print("target distribution: ", telemarketing_y.value_counts())

In [None]:
# Load the sleep-apnea master table from a local CSV.
# NOTE(review): the frame is not used anywhere else in the visible notebook.
sleep_apnea_data = pd.read_csv('datasets/bdsp_psg_master_20231101.csv')

#### **discretization thresholds**

CAIM

In [None]:
# CAIM discretization returning the binned data
def discretize_caim_df(data, X, y):
    """Discretize X with CAIM and return the bin index of every value.

    Prints the fitted cut-off points and the number of bins per feature.

    Args:
        data: dataset object; only its ``feature_names`` attribute is read,
            to label the columns of the result.
        X: feature matrix passed to ``CAIMD.fit_transform``.
        y: target passed to ``CAIMD.fit_transform``.

    Returns:
        DataFrame of integers with one column per entry of
        ``data.feature_names``.
    """
    discretizer = CAIMD()
    binned = discretizer.fit_transform(X, y)
    scheme = discretizer.split_scheme

    print("\nCut-off points: ", scheme)
    # One " key: n_bins" entry per feature, comma separated; a feature with
    # k cut-off points has k + 1 bins.
    bin_counts = ",".join(
        f" {key}: {len(value)+1}" for key, value in scheme.items()
    )
    print("Number of bins: " + bin_counts)

    return pd.DataFrame(binned, columns=data.feature_names).astype(int)

def discretize_caim(data, X, y):
    """Fit CAIM on (X, y) and return its cut-off points.

    Args:
        data: unused; kept so the signature matches ``discretize_caim_df``.
        X: feature matrix passed to ``CAIMD.fit_transform``.
        y: target passed to ``CAIMD.fit_transform``.

    Returns:
        The ``split_scheme`` attribute of the fitted ``CAIMD`` instance.
    """
    discretizer = CAIMD()
    # The transformed data is not needed here; only the fitted thresholds are.
    discretizer.fit_transform(X, y)
    return discretizer.split_scheme

# Free-standing string used as a block comment: it records what CAIMD's fit()
# prints and how the returned discretized dataframe is laid out
# (presumably the one built by discretize_caim_df).
''' prints of fit() method:
Categorical list_of_(indicies)_categorical_features
# feature_index  GLOBAL CAIM  best_caim_value 

in the returning dataframe:
    - columns represent the original features
    - rows represent each instance
    - values are the bin number each instance belongs to (starting from 0) 
'''


In [None]:
# Compute the CAIM cut-off points for the iris data and print them.
iris_thresholds_caim = discretize_caim(iris_data, iris_X, iris_y)
print(iris_thresholds_caim)

infinitesimal bins

In [None]:
# discretize using infinitesimal bins:
# thresholds are the points in between 2 consecutive values in the sorted list

def discretize_infbins(X):
    """Return, per feature, the midpoints between consecutive distinct values.

    Args:
        X: 2-D array-like of shape (n_samples, n_features). It is converted
            with ``np.asarray`` so that columns are always selected by
            position, for numpy arrays and DataFrames alike.

    Returns:
        dict mapping the positional feature index to a sorted list of
        thresholds. A feature with a single distinct value gets an empty
        list.
    """
    values = np.asarray(X)
    infbins_thresholds = {}
    for col in range(values.shape[1]):
        # np.unique returns the distinct values already sorted.
        distinct = np.unique(values[:, col])
        # Midpoint of every pair of neighbouring distinct values.
        infbins_thresholds[col] = ((distinct[:-1] + distinct[1:]) / 2).tolist()

    return infbins_thresholds

In [None]:
# Compute the infinitesimal-bin thresholds for the iris features and print them.
iris_thresholds_infbins = discretize_infbins(iris_X)
print(iris_thresholds_infbins)

#### discretized version

new columns = sum over features of (number of thresholds + 1), i.e. one column per bin of each feature

In [None]:
# create new df with one column per bin of every feature
# (a feature with k thresholds has k + 1 bins), filled with 0

def get_discretized_cols(X, thresholds):
    """Build an all-zero frame with one 'feat<i>-bin<j>' column per bin.

    Args:
        X: 2-D data with ``shape`` and ``len``; only its dimensions are used.
        thresholds: mapping from positional feature index to the sequence of
            thresholds of that feature.

    Returns:
        DataFrame of integer zeros with ``len(X)`` rows (index 0..n-1) and
        ``len(thresholds[i]) + 1`` columns for every feature ``i``.
    """
    col_names = [
        f'feat{col}-bin{bin_idx}'
        for col in range(X.shape[1])
        for bin_idx in range(len(thresholds[col]) + 1)
    ]
    # Allocate the whole frame at once instead of appending it row by row.
    return pd.DataFrame(0, index=range(len(X)), columns=col_names)
    

In [None]:
# Empty (all-zero) discretized frame for iris, using the infinitesimal-bin thresholds.
iris_X_disc = get_discretized_cols(iris_X, iris_thresholds_infbins)
iris_X_disc.head()

- 1 out of k: fill with booleans (1 only in the bin the value belongs to)
- differential coding: 1 in every bin up to and including the value's bin

In [None]:
# given thresholds of a feature and a value
# return index of bin the value belongs to
def get_bin(thresholds, value):
    """Return the index of the bin that `value` falls into.

    Bin 0 is ``value < thresholds[0]``, bin ``i`` is
    ``thresholds[i-1] <= value < thresholds[i]`` and the last bin,
    ``len(thresholds)``, is ``value >= thresholds[-1]``.

    Args:
        thresholds: sorted sequence of cut-off points; may be empty, in which
            case there is a single bin 0.
        value: the value to locate.

    Returns:
        int in ``0..len(thresholds)``.

    Raises:
        ValueError: if `value` is NaN, which belongs to no bin.
    """
    if value != value:  # NaN is the only value that is unequal to itself
        raise ValueError("cannot assign NaN to a bin")
    # The number of thresholds <= value is exactly the bin index.
    return bisect_right(thresholds, value)


# 1 out of k
def disc_1_out_of_k(X, thresholds):
    """One-hot ("1 out of k") encode every feature over its bins.

    Args:
        X: 2-D array-like of shape (n_samples, n_features). Values are read
            positionally via ``np.asarray`` so DataFrames and arrays behave
            the same.
        thresholds: mapping from positional feature index to that feature's
            thresholds.

    Returns:
        DataFrame with the columns of ``get_discretized_cols``; for each
        sample and feature, the column of the bin returned by ``get_bin`` is
        1 and all others are 0.
    """
    new_df = get_discretized_cols(X, thresholds)
    values = np.asarray(X)
    col_pos = {name: pos for pos, name in enumerate(new_df.columns)}
    # Fill a plain array and wrap it once: chained `df[col][row] = 1`
    # assignment is not guaranteed by pandas to write into the frame.
    encoded = np.zeros(new_df.shape, dtype=int)
    for instance in range(values.shape[0]):
        for col in range(values.shape[1]):
            bin_idx = get_bin(thresholds[col], values[instance, col])
            encoded[instance, col_pos[f'feat{col}-bin{bin_idx}']] = 1
    return pd.DataFrame(encoded, index=new_df.index, columns=new_df.columns)

# 1 out of k
# but take out 1st bin
def disc_1_out_of_k_V2(X, thresholds):
    """One-hot encode every feature over its bins, omitting bin 0.

    Bin 0 acts as the reference level: a value that falls in it is encoded
    as all zeros for that feature.

    Args:
        X: 2-D array-like of shape (n_samples, n_features). Values are read
            positionally via ``np.asarray`` so DataFrames and arrays behave
            the same.
        thresholds: mapping from positional feature index to that feature's
            thresholds.

    Returns:
        DataFrame with the columns of ``get_discretized_cols`` minus every
        'feat<i>-bin0' column.
    """
    values = np.asarray(X)
    # delete columns of 1st bin for each feature
    first_bins = [f'feat{col}-bin0' for col in range(values.shape[1])]
    new_df = get_discretized_cols(X, thresholds).drop(columns=first_bins)

    col_pos = {name: pos for pos, name in enumerate(new_df.columns)}
    # Fill a plain array and wrap it once: chained `df[col][row] = 1`
    # assignment is not guaranteed by pandas to write into the frame.
    encoded = np.zeros(new_df.shape, dtype=int)
    for instance in range(values.shape[0]):
        for col in range(values.shape[1]):
            bin_idx = get_bin(thresholds[col], values[instance, col])
            if bin_idx == 0:
                continue  # reference bin has no column
            encoded[instance, col_pos[f'feat{col}-bin{bin_idx}']] = 1
    return pd.DataFrame(encoded, index=new_df.index, columns=new_df.columns)


# differential coding
def disc_diff_coding(X, thresholds):
    """Differentially (cumulatively) encode every feature over its bins.

    For each sample and feature, every bin from 0 up to and including the
    bin returned by ``get_bin`` is set to 1.

    Args:
        X: 2-D array-like of shape (n_samples, n_features). Values are read
            positionally via ``np.asarray`` so DataFrames and arrays behave
            the same.
        thresholds: mapping from positional feature index to that feature's
            thresholds.

    Returns:
        DataFrame with the columns of ``get_discretized_cols``.
    """
    new_df = get_discretized_cols(X, thresholds)
    values = np.asarray(X)
    col_pos = {name: pos for pos, name in enumerate(new_df.columns)}
    # Fill a plain array and wrap it once: chained `df[col][row] = 1`
    # assignment is not guaranteed by pandas to write into the frame.
    encoded = np.zeros(new_df.shape, dtype=int)
    for instance in range(values.shape[0]):
        for col in range(values.shape[1]):
            bin_idx = get_bin(thresholds[col], values[instance, col])
            for i in range(bin_idx + 1):
                encoded[instance, col_pos[f'feat{col}-bin{i}']] = 1
    return pd.DataFrame(encoded, index=new_df.index, columns=new_df.columns)
            
def disc_diff_coding_V2(X, thresholds):
    """Differentially encode every feature over its bins, omitting bin 0.

    For each sample and feature, every bin from 1 up to and including the
    bin returned by ``get_bin`` is set to 1; a value in bin 0 is encoded as
    all zeros for that feature.

    Args:
        X: 2-D array-like of shape (n_samples, n_features). Values are read
            positionally via ``np.asarray`` so DataFrames and arrays behave
            the same.
        thresholds: mapping from positional feature index to that feature's
            thresholds.

    Returns:
        DataFrame with the columns of ``get_discretized_cols`` minus every
        'feat<i>-bin0' column.
    """
    values = np.asarray(X)
    # delete columns of 1st bin for each feature
    first_bins = [f'feat{col}-bin0' for col in range(values.shape[1])]
    new_df = get_discretized_cols(X, thresholds).drop(columns=first_bins)

    col_pos = {name: pos for pos, name in enumerate(new_df.columns)}
    # Fill a plain array and wrap it once: chained `df[col][row] = 1`
    # assignment is not guaranteed by pandas to write into the frame.
    encoded = np.zeros(new_df.shape, dtype=int)
    for instance in range(values.shape[0]):
        for col in range(values.shape[1]):
            bin_idx = get_bin(thresholds[col], values[instance, col])
            # range(1, 1) is empty, so bin 0 naturally sets nothing.
            for i in range(1, bin_idx + 1):
                encoded[instance, col_pos[f'feat{col}-bin{i}']] = 1
    return pd.DataFrame(encoded, index=new_df.index, columns=new_df.columns)

In [None]:
# 1-out-of-k encoding of iris over the infinitesimal-bin thresholds.
iris_X_disc_infbins_1outofk = disc_1_out_of_k(iris_X, iris_thresholds_infbins)
iris_X_disc_infbins_1outofk.head()

In [None]:
# 1-out-of-k encoding of iris without the first-bin columns.
iris_X_disc_infbins_1outofk_V2 = disc_1_out_of_k_V2(iris_X, iris_thresholds_infbins)
iris_X_disc_infbins_1outofk_V2.head()

In [None]:
# Differential coding of iris over the infinitesimal-bin thresholds.
iris_X_disc_infbins_diff_coding = disc_diff_coding(iris_X, iris_thresholds_infbins)
iris_X_disc_infbins_diff_coding.head()

In [None]:
# Differential coding of iris without the first-bin columns.
iris_X_disc_infbins_diff_coding_V2 = disc_diff_coding_V2(iris_X, iris_thresholds_infbins)
iris_X_disc_infbins_diff_coding_V2.head()

#### objective function

- Least Squares (RSS)
- Maximum Likelihood (GLM with binomial response and logit link function)
- margin maximization (linear SVM).

In [None]:
# RSS
# call least_squares(fun, x0)

# maximum likelihood
def max_lik(parameters):
    """Negative Gaussian log-likelihood of the line ``y = m * x + b``.

    Reads the data from the module-level names ``x`` and ``y``. The value is
    meant to be minimised (e.g. with ``scipy.optimize.minimize``), which
    maximises the likelihood.

    Args:
        parameters: sequence ``(m, b, sigma)`` with the slope, the intercept
            and the standard deviation of the noise.

    Returns:
        ``n/2 * log(2*pi) + n/2 * log(sigma**2) + RSS / (2 * sigma**2)``
        where ``n`` is the number of observations in ``x`` and ``RSS`` the
        residual sum of squares.
    """
    m, b, sigma = parameters[0], parameters[1], parameters[2]
    n = np.size(x)  # also valid for a scalar x
    # The prediction is vectorised, so it is computed once for all samples.
    residuals = np.asarray(y) - (m * np.asarray(x) + b)
    L = (n / 2 * np.log(2 * np.pi) + n / 2 * np.log(sigma ** 2)
         + np.sum(residuals ** 2) / (2 * sigma ** 2))
    return L

# Data read by max_lik through the module-level names x and y: the synthetic
# regression problem defined earlier in this notebook.
x = synth_X
y = synth_y
# max_lik reads three parameters (m, b, sigma), so the starting point needs
# three entries; sigma is bounded away from 0 to keep log(sigma**2) and the
# division by sigma**2 finite.
lik_model = minimize(
    max_lik,
    x0=np.array([1.0, 0.0, 1.0]),
    method='L-BFGS-B',
    bounds=[(None, None), (None, None), (1e-6, None)],
)


#### regularization

In [None]:
# Hold out part of the data so that the "test" score is measured on samples
# the model was not fitted on (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, random_state=0)
ridge = Ridge(alpha=0.7).fit(X_train, y_train)
print(f"Ridge Regression-Training set score: {ridge.score(X_train, y_train):.2f}")
print(f"Ridge Regression-Test set score: {ridge.score(X_test, y_test):.2f}")

In [None]:
# Hold out part of the data so that the "test" score is measured on samples
# the model was not fitted on (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, random_state=0)
lasso = Lasso(alpha=0.01).fit(X_train, y_train)
print(f"Lasso Regression-Training set score: {lasso.score(X_train, y_train):.2f}")
print(f"Lasso Regression-Test set score: {lasso.score(X_test, y_test):.2f}")
# Features whose coefficient was not shrunk to exactly zero.
print(f"Number of features Lasso: {np.count_nonzero(lasso.coef_)}")

In [None]:
# Hold out part of the data so that the "test" score is measured on samples
# the model was not fitted on (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, random_state=0)
elastic_net = ElasticNet(alpha=0.01, l1_ratio=0.01).fit(X_train, y_train)
print(f"Elastic Net-Training set score: {elastic_net.score(X_train, y_train):.2f}")
print(f"Elastic Net-Test set score: {elastic_net.score(X_test, y_test):.2f}")

#### **Ordinal data**

In [None]:
# Load the aesthetic-evaluation table and discard the columns that are not
# used here (file name, author and the objective evaluation).
aesthetic_evaluation_data = pd.read_csv('datasets/aesthetic_evaluation_data.csv').drop(
    columns=['Image Filename', 'Author', 'Objective Evaluation']
)

# The subjective evaluation is the target; every remaining column is a feature.
aesthetic_evaluation_y = aesthetic_evaluation_data['Subjective Evaluation']
aesthetic_evaluation_X = aesthetic_evaluation_data.drop(columns=['Subjective Evaluation'])

aesthetic_evaluation_data.head()

In [None]:
# Column-wise totals of the sX2* and sEMD* value columns.
a = aesthetic_evaluation_data[['sX2L Value','sX2a Value','sX2b Value','sX2Lab Value','sEMDL Value','sEMDa Value','sEMDb Value','sEMDLab Value']]
a.sum()

In [None]:
# Number of samples per subjective-evaluation level.
aesthetic_evaluation_y.value_counts()

In [None]:
# Histogram of the target; the pyplot calls below label the axes that
# hist() has just drawn on.
aesthetic_evaluation_y.hist()
plt.xlabel('Subjective Evaluation')
plt.ylabel('Frequency')
plt.title('Distribution of Subjective Evaluation')