### **Differential Scorecards**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# datasets
from sklearn.datasets import load_iris
from ucimlrepo import fetch_ucirepo 
from io import StringIO

# discretization
from libraries.caimcaim import CAIMD # https://github.com/airysen/caimcaim/blob/master/caimcaim/caimcaim.py

# cv
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# grid search
from sklearn.model_selection import GridSearchCV

# objective function
from scipy.optimize import least_squares
from scipy.optimize import minimize
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# regularization
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

#### **binary data**

##### synthetic data

##### benchmark datasets

**iris**:

In [None]:
# Load the iris data bundled with scikit-learn and wrap features / target
# in DataFrames (columns are left with their default integer labels).
iris_data = load_iris()
iris_X = pd.DataFrame(iris_data.data)
iris_y = pd.DataFrame(iris_data.target)
# NOTE(review): this cell sits under the "binary data" heading - confirm
# whether the iris target is meant to be binarized before use.

# Quick sanity check: non-null count and class frequencies of the target.
print("num observations: ", iris_y.count())
print("target distribution: ", iris_y.value_counts())

**adult**: predict whether annual income of an individual exceeds $50K/yr based on census data. 

In [None]:
# fetch dataset (UCI repository id=2)
adult_data = fetch_ucirepo(id=2)

# data (as pandas dataframes)
adult_X = adult_data.data.features
# Work on a copy so the relabelling below neither writes into the frame owned
# by the fetched dataset object nor triggers chained-assignment warnings.
adult_y = adult_data.data.targets.copy()

# Part of the raw file spells the labels with a trailing period
# ('>50K.' / '<=50K.'); normalise them first so those rows are not mapped to
# NaN. Plain column assignment (instead of `.loc[:, col] = ...`) also lets the
# column take a numeric dtype rather than staying `object`.
adult_y['income'] = (
    adult_y['income']
    .str.strip()
    .str.rstrip('.')
    .map({'>50K': 1, '<=50K': 0})
)

# metadata
# print(adult_data.metadata)

# variable information
# print(adult_data.variables)

# Sanity check: non-null count and class frequencies of the target.
print("num observations: ", adult_y.count())
print("target distribution: ", adult_y.value_counts())

**mammo**: discrimination of benign and malignant mammographic masses based on BI-RADS attributes and the patient's age.

In [None]:
# fetch dataset (UCI repository id=161)
mammo_data = fetch_ucirepo(id=161)

# data (as pandas dataframes)
mammo_X = mammo_data.data.features
mammo_y = mammo_data.data.targets

# drop rows with nulls
# Features and target are concatenated first so that a row is removed from
# both at once and they stay aligned.
mammo_combined = pd.concat([mammo_X, mammo_y], axis=1)
# Count ROWS that contain at least one null. `isnull().sum().sum()` would
# count null cells and over-report whenever a row has several missing values.
print("num rows with nulls: ", mammo_combined.isnull().any(axis=1).sum())
mammo_combined = mammo_combined.dropna()
mammo_combined = mammo_combined.reset_index(drop=True)
# The target was appended last, so it is the final column.
mammo_X = mammo_combined.iloc[:, :-1]
mammo_y = mammo_combined.iloc[:, -1]


# metadata
# print(mammo_data.metadata)

# variable information
# print(mammo_data.variables)

# Sanity check: non-null count and class frequencies of the target.
print("num observations: ", mammo_y.count())
print("target distribution: ", mammo_y.value_counts())

**mushroom**: mushrooms described in terms of physical characteristics; classification: poisonous or edible

In [None]:
# fetch dataset (UCI repository id=73)
mushroom_data = fetch_ucirepo(id=73)

# data (as pandas dataframes)
mushroom_X = mushroom_data.data.features
# Copy before relabelling: the frame belongs to the fetched dataset object.
mushroom_y = mushroom_data.data.targets.copy()
# Encode poisonous -> 1, edible -> 0. Plain column assignment replaces the
# column, so it gets a numeric dtype; `.loc[:, col] = ...` writes into the
# existing object-dtype column and leaves the labels as Python objects.
mushroom_y['poisonous'] = mushroom_y['poisonous'].map({'p': 1, 'e': 0})

# metadata
# print(mushroom_data.metadata)

# variable information
# print(mushroom_data.variables)

# Sanity check: non-null count and class frequencies of the target.
print("num observations: ", mushroom_y.count())
print("target distribution: ", mushroom_y.value_counts())

**spambase**: classifying Email as Spam or Non-Spam

In [None]:
# fetch dataset 
spambase_data = fetch_ucirepo(id=94) 
  
# data (as pandas dataframes) 
spambase_X = spambase_data.data.features 
spambase_y = spambase_data.data.targets 
  
# metadata 
# print(spambase_data.metadata) 
  
# variable information 
# print(spambase_data.variables) 

print("num observations: ", spambase_y.count())
print("target distribution: ", spambase_y.value_counts())

**telemarketing** (loaded from the *Internet Advertisements* data file): classify whether an image on an Internet page is an advertisement ("ad.") or not ("nonad.")

In [None]:
# load dataset from file. target is last column
telemarketing_data = pd.read_csv('datasets/internet+advertisements/ad.data', dtype=str)
telemarketing_X = telemarketing_data.iloc[:, :-1]
telemarketing_y = telemarketing_data.iloc[:, -1]

telemarketing_y = telemarketing_y.map({'nonad.': 0, 'ad.': 1})

print("num observations: ", telemarketing_y.count())
print("target distribution: ", telemarketing_y.value_counts())

**sleep apnea**:

In [None]:
# Load the sleep apnea master table from a local CSV. No feature/target split
# is made in this cell.
sleep_apnea_data = pd.read_csv('datasets/bdsp_psg_master_20231101.csv')

**appendicitis**: https://sci2s.ugr.es/keel/dataset.php?cod=183#sub2

In [None]:
# The file starts with metadata lines; the actual rows begin after the
# "@data" marker line.
with open('datasets/appendicitis.dat', "r") as file:
    lines = file.readlines()

# Index of the first row after the "@data" marker (raises ValueError if the
# marker line is absent).
data_start_ind = lines.index("@data\n") + 1
app_data = lines[data_start_ind:]
# Re-join the remaining lines and parse them as header-less CSV.
app_data = pd.read_csv(StringIO("".join(app_data)), header=None)

# Seven attribute columns followed by the class label.
app_data.columns = ["At1", "At2", "At3", "At4", "At5", "At6", "At7", "Class"]

# Features are all columns but the last; the target is the last column.
app_X = app_data.iloc[:, :-1]
app_y = app_data.iloc[:, -1]

# Sanity check: non-null count and class frequencies of the target.
print("num observations: ", app_y.count())
print("target distribution: ", app_y.value_counts())

#### **discretization thresholds**

CAIM

In [None]:
# discretize using CAIM
def discretize_caim_df(data, X, y):
    """Discretize ``X`` with CAIM and return the per-row bin numbers.

    ``data`` is accepted but not used.

    While fitting, the CAIM library prints a line
    ``Categorical <list of indices of categorical features>`` followed by one
    ``# <feature_index>  GLOBAL CAIM  <best_caim_value>`` line per feature.
    This function then prints the cut-off points and the bin count per feature.

    In the returned dataframe:
        - columns represent the original features
        - rows represent each instance
        - values are the bin number each instance belongs to (starting from 0)
    """
    discretizer = CAIMD()
    binned = discretizer.fit_transform(X, y)  # fit() followed by transform()

    scheme = discretizer.split_scheme
    print("\nCut-off points: ", scheme)
    # One " key: n" entry per feature, comma-separated, on a single line.
    per_feature = ",".join(f" {key}: {len(value)+1}" for key, value in scheme.items())
    print("Number of bins: " + per_feature)

    # Back to a DataFrame with the original column labels, values cast to int.
    return pd.DataFrame(binned, columns=X.columns).astype(int)

def discretize_caim(X, cols, y):
    """Compute CAIM thresholds for each feature, keyed by column name.

    Parameters
    ----------
    X : features handed to ``CAIMD.fit_transform``.
    cols : column names, indexable by column position (e.g. ``X.columns``).
    y : target handed to ``CAIMD.fit_transform``.

    Returns
    -------
    dict mapping column name -> list of thresholds (as floats).
    """
    caim = CAIMD()
    caim.fit_transform(X, y)  # fit() and transform(); only split_scheme is used
    # split_scheme is a dict {column index: thresholds}. Look names up by that
    # index (the key), not by the position of the entry in the dict: if the
    # scheme has no entry for some column, positions and column indexes
    # diverge and thresholds would be attached to the wrong column names.
    return {
        cols[key]: [float(val) for val in value]
        for key, value in caim.split_scheme.items()
    }


In [None]:
# CAIM thresholds for the appendicitis features, keyed by column name.
app_thresholds_caim = discretize_caim(app_X, app_X.columns, app_y)
print("\nthresholds ", app_thresholds_caim)

# Report len(thresholds) + 1 per feature as its number of bins.
print("num of bins: ")
for i, (key, value) in enumerate(app_thresholds_caim.items()):
        print(f"  {key}: {len(value)+1}")

infinitesimal bins

In [None]:
# discretize using infinitesimal bins:
# thresholds are the points in between 2 consecutive values in the sorted list

def discretize_infbins(X, cols):
    """Return "infinitesimal bin" thresholds for each column in ``cols``.

    For every column the distinct values are sorted and a threshold is placed
    halfway between each pair of consecutive values, so a column with ``k``
    distinct values gets ``k - 1`` thresholds.

    Returns a dict mapping column name -> list of thresholds.
    """
    def _midpoints(column):
        # np.unique returns the distinct values already sorted.
        distinct = np.unique(X[column])
        lower, upper = distinct[:-1], distinct[1:]
        return ((lower + upper) / 2).tolist()

    return {column: _midpoints(column) for column in cols}

In [None]:
# Infinitesimal-bin thresholds for the appendicitis features, keyed by column.
app_thresholds_infbins = discretize_infbins(app_X, app_X.columns)
print("thresholds ", app_thresholds_infbins)
# Report len(thresholds) + 1 per feature as its number of bins.
print("num of bins: ")
for i, (key, value) in enumerate(app_thresholds_infbins.items()):
        print(f"  {key}: {len(value)+1}")

#### discretized version

num of columns in the new df ≤ Σ over features of (num thresholds of that feature) = Σ over features of (num bins − 1): bin 0 is the reference bin and never gets a column

2 methods
- 1 out of k: 1 if the value is in the bin, 0 otherwise
- differential coding: 1 from bin 1 until bin where the value is in, 0 otherwise

In [None]:
def get_bins(thresholds, values):
    """Return the bin number of every entry in ``values``.

    The result has one bin number per row; bin 0 holds the values below the
    first threshold.
    """
    return np.digitize(values, thresholds)


def disc_1_out_of_k(X, cols, thresholds):
    """One-hot ("1 out of k") encode the bins of every column in ``cols``.

    Parameters
    ----------
    X : pandas.DataFrame holding the raw feature values.
    cols : iterable of column names to encode.
    thresholds : dict mapping column name -> list of thresholds.

    Returns
    -------
    pandas.DataFrame indexed like ``X`` with one 0/1 column
    ``feat{col}-bin{i}`` for every bin ``i = 1 .. len(thresholds[col])``.
    Bin 0 is the reference bin and gets no column.

    A column is emitted for every bin, even one that contains no rows, so the
    layout depends only on ``thresholds`` and not on which bins happen to be
    populated in ``X`` (``pd.get_dummies`` would skip empty bins and produce
    different columns for different subsets of the data).
    """
    encoded = []
    for col in cols:
        # Bin number of each row (same computation as get_bins).
        bins = np.asarray(np.digitize(X[col], thresholds[col]))
        names = [f'feat{col}-bin{i}' for i in range(1, len(thresholds[col]) + 1)]
        indicators = {
            name: (bins == i).astype(int)
            for i, name in enumerate(names, start=1)
        }
        # Keep X's index so the result stays aligned with X and y.
        encoded.append(pd.DataFrame(indicators, index=X.index, columns=names))
    return pd.concat(encoded, axis=1)


def disc_diff_coding(X, cols, thresholds):
    """Differential-code the bins of every column in ``cols``.

    For each feature, column ``feat{col}-bin{i}`` is 1 for rows whose bin
    number is at least ``i`` and 0 otherwise, for ``i = 1 .. len(thresholds)``.
    The result is indexed like ``X``.
    """
    frames = []
    for feature in cols:
        cut_points = thresholds[feature]
        bin_ids = get_bins(cut_points, X[feature])  # bin number of each row
        names = [f'feat{feature}-bin{level}' for level in range(1, len(cut_points) + 1)]
        # "Reached at least this bin" indicator for every level.
        flags = {
            name: (bin_ids >= level).astype(int)
            for level, name in enumerate(names, start=1)
        }
        frames.append(pd.DataFrame(flags, index=X.index, columns=names))
    return pd.concat(frames, axis=1)


In [None]:
# CAIM thresholds + 1-out-of-k encoding of the appendicitis features.
app_X_disc_caim_1outofk = disc_1_out_of_k(app_X, app_X.columns, app_thresholds_caim)
# This encoding is the one the model-fitting cells below use as `disc_app_X`.
disc_app_X = app_X_disc_caim_1outofk
app_X_disc_caim_1outofk.head()

In [None]:
# CAIM thresholds + differential coding of the appendicitis features.
app_X_disc_caim_diffcod = disc_diff_coding(app_X, app_X.columns, app_thresholds_caim)
app_X_disc_caim_diffcod.head()

In [None]:
# Infinitesimal-bin thresholds + 1-out-of-k encoding of the appendicitis features.
app_X_disc_infbins_1outofk = disc_1_out_of_k(app_X, app_X.columns, app_thresholds_infbins)
app_X_disc_infbins_1outofk.head()

In [None]:
# Infinitesimal-bin thresholds + differential coding of the appendicitis features.
app_X_disc_infbins_diffcod = disc_diff_coding(app_X, app_X.columns, app_thresholds_infbins)
app_X_disc_infbins_diffcod.head()

#### 10 fold CV

In [None]:
def cross_val_score(model, X, y, n_splits=10):
    """Return the mean squared error of ``model`` averaged over K folds.

    ``X`` and ``y`` are sliced positionally (``.iloc``), so both must be
    pandas objects. ``model`` is refitted on every fold and is left fitted on
    the last training split. ``KFold`` is built with ``n_splits`` only; every
    other setting is the library default.
    """
    fold_errors = []
    splitter = KFold(n_splits=n_splits)
    for fit_rows, eval_rows in splitter.split(X):
        model.fit(X.iloc[fit_rows], y.iloc[fit_rows])
        predictions = model.predict(X.iloc[eval_rows])
        fold_errors.append(mean_squared_error(y.iloc[eval_rows], predictions))
    return np.mean(fold_errors)

#### regularization

In [None]:
# Candidate values for the regularizers' `alpha` parameter.
alpha = [0.01, 0.1, 0.4, 0.6, 0.9, 0.99]
# One estimator per regularization flavour, all with default settings.
# Only `elastic_net` is referenced by the cells visible in this notebook.
ridge = Ridge()
lasso = Lasso()
elastic_net = ElasticNet()

# Grid handed to GridSearchCV: tune `alpha` only.
param_grid = {'alpha': alpha}

def grid_search(model, X, y, param_grid, cv=10):
    """Run a cross-validated grid search and return the fitted search object.

    ``model`` is tuned over ``param_grid`` with ``cv`` folds. The returned
    ``GridSearchCV`` has already been fitted on ``X`` and ``y``.
    """
    # A distinct local name avoids shadowing this function inside its own body.
    searcher = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv)
    searcher.fit(X, y)
    return searcher

#### objective function

- Least Squares (RSS)
- Maximum Likelihood (GLM with binomial response and logit link function)
- margin maximization (linear SVM)

LS and ML objective functions were regularized by means of an elastic net, with the α parameter
being determined by grid search over the range {0.01, 0.1, 0.4, 0.6, 0.9, 0.99}.

The linear SVM parameter, C, was determined by grid-search over the range {2^−10, 2^−9, . . . , 2^9, 2^10}.
All results involving SVM were obtained through the use of the library LIBSVM

In [None]:
def get_weights(model, disc_X, y):
    """Fit ``model`` on the encoded features and return one weight per column.

    Parameters
    ----------
    model : estimator with ``fit(X, y)`` that exposes ``coef_`` afterwards.
    disc_X : pandas.DataFrame of encoded (binary) features.
    y : target passed to ``model.fit``.

    Returns
    -------
    pandas.DataFrame with columns ``Feature`` (the column names of ``disc_X``)
    and ``Weight`` (the fitted coefficient of each column).
    """
    model.fit(disc_X, y)
    coef = model.coef_
    # Some estimators expose a 1-D coef_ (one entry per feature), others a 2-D
    # matrix with one row per output. Indexing a 1-D vector with [0] returns a
    # single scalar, which pandas would broadcast to every feature; take the
    # first row only when there actually is a row dimension.
    weights = coef if getattr(coef, 'ndim', None) == 1 else coef[0]
    return pd.DataFrame({'Feature': disc_X.columns, 'Weight': weights})

In [None]:
# RSS
# RSS
# Tune the elastic net over the module-level `param_grid`, then refit the
# best estimator on the full encoded data to read out its weights.
grid_search_rss = grid_search(elastic_net, disc_app_X, app_y, param_grid)
best_model_rss = grid_search_rss.best_estimator_
weights_rss = get_weights(best_model_rss, disc_app_X, app_y)
print("RSS weights:\n", weights_rss)

In [None]:
# maximum likelihood (GLM with binomial response and logit link function)
# maximum likelihood (GLM with binomial response and logit link function)
# Fitted with the estimator's default settings; no grid search in this cell.
logistic = LogisticRegression()
weights_ml = get_weights(logistic, disc_app_X, app_y)
print("ML weights:\n", weights_ml)

In [None]:
# margin maximization (liner SVM)
# margin maximization (linear SVM)
# C is searched over 2^-10 ... 2^10.
# NOTE: this rebinds the module-level `param_grid` that the RSS cell reads, so
# re-running the RSS cell after this one would pass the C grid to the
# elastic net.
param_grid = {
    'C': [2**i for i in range(-10, 11)]
}
svm = SVC(kernel='linear')
grid_search_svm = grid_search(svm, disc_app_X, app_y, param_grid)
best_model_svm = grid_search_svm.best_estimator_
weights_svm = get_weights(best_model_svm, disc_app_X, app_y)
print("SVM weights:\n", weights_svm)

In [None]:
def scorecard(data, X, y, disc_scheme_method, disc_version_method, obj_function, regularization):
    """Build a scorecard: discretize ``X``, encode the bins, fit a linear model.

    Parameters
    ----------
    data : not used; kept so existing calls keep working.
    X : pandas.DataFrame of raw features (``X.columns`` names the features).
    y : target, passed to the discretizer and to the estimator.
    disc_scheme_method : ``'caim'`` or ``'infbins'`` - how thresholds are chosen.
    disc_version_method : ``'1outofk'`` or ``'diffcod'`` - how bins are encoded.
    obj_function : ``'RSS'`` (elastic net), ``'ML'`` (logistic regression) or
        ``'SVM'`` (linear SVM).
    regularization : not used at present; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame with one ``Feature`` / ``Weight`` row per encoded column,
    as produced by ``get_weights`` for the best grid-search estimator.

    Raises
    ------
    ValueError
        If one of the three method names is not recognised. Previously such a
        value fell through to an empty scheme / frame or a ``None`` model and
        failed later with an unrelated error.
    """
    # 1) thresholds per feature
    if disc_scheme_method == 'caim':
        disc_scheme = discretize_caim(X, X.columns, y)
    elif disc_scheme_method == 'infbins':
        disc_scheme = discretize_infbins(X, X.columns)
    else:
        raise ValueError(
            f"unknown disc_scheme_method {disc_scheme_method!r}; expected 'caim' or 'infbins'"
        )

    # 2) binary encoding of the bins
    if disc_version_method == '1outofk':
        disc_version = disc_1_out_of_k(X, X.columns, disc_scheme)
    elif disc_version_method == 'diffcod':
        disc_version = disc_diff_coding(X, X.columns, disc_scheme)
    else:
        raise ValueError(
            f"unknown disc_version_method {disc_version_method!r}; expected '1outofk' or 'diffcod'"
        )

    # 3) estimator and hyper-parameter grid for the chosen objective
    alphas = [0.01, 0.1, 0.4, 0.6, 0.9, 0.99]
    if obj_function == 'RSS':
        model = ElasticNet()
        param_grid = {'alpha': alphas}
    elif obj_function == 'ML':
        # LogisticRegression has no `alpha` parameter, so a grid over 'alpha'
        # is rejected by GridSearchCV. Use an elastic-net penalty with the
        # same 0.5 L1/L2 mix that ElasticNet() defaults to, and search the
        # same strengths through C, which is the inverse of the
        # regularization strength. 'saga' is the solver that supports the
        # elastic-net penalty.
        model = LogisticRegression(
            penalty='elasticnet', solver='saga', l1_ratio=0.5, max_iter=5000
        )
        param_grid = {'C': [1 / a for a in alphas]}
    elif obj_function == 'SVM':
        model = SVC(kernel='linear')
        param_grid = {'C': [2**i for i in range(-10, 11)]}
    else:
        raise ValueError(
            f"unknown obj_function {obj_function!r}; expected 'RSS', 'ML' or 'SVM'"
        )

    # 4) tune, then read the weights off the best estimator
    grid_search_model = grid_search(model, disc_version, y, param_grid)
    best_model = grid_search_model.best_estimator_
    return get_weights(best_model, disc_version, y)


In [None]:
# End-to-end run on appendicitis: CAIM thresholds, 1-out-of-k encoding,
# linear SVM objective. The last argument is passed through to `scorecard`,
# whose visible body does not read it.
app_weights = scorecard(app_data, app_X, app_y, 'caim', '1outofk', 'SVM', 'ridge')
print("app weights: ", app_weights)

#### **Ordinal data**

In [None]:
# Load the aesthetic evaluation table and drop the columns that are not used
# as features or target.
aesthetic_evaluation_data = pd.read_csv('datasets/aesthetic_evaluation_data.csv')
aesthetic_evaluation_data = aesthetic_evaluation_data.drop(columns=['Image Filename','Author','Objective Evaluation'])
# Target is 'Subjective Evaluation'; every remaining column is a feature.
aesthetic_evaluation_X = aesthetic_evaluation_data.drop(columns='Subjective Evaluation')
aesthetic_evaluation_y = aesthetic_evaluation_data['Subjective Evaluation']

aesthetic_evaluation_data.head()

In [None]:
# Column totals of the sX2* and sEMD* value columns (exploratory check).
a = aesthetic_evaluation_data[['sX2L Value','sX2a Value','sX2b Value','sX2Lab Value','sEMDL Value','sEMDa Value','sEMDb Value','sEMDLab Value']]
a.sum()

In [None]:
# Frequency of each 'Subjective Evaluation' value.
aesthetic_evaluation_y.value_counts()

In [None]:
# Histogram of the target, with axis labels and a title.
aesthetic_evaluation_y.hist()
plt.xlabel('Subjective Evaluation')
plt.ylabel('Frequency')
plt.title('Distribution of Subjective Evaluation')

#### SBC (single binary classifier) reduction