In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np
from collections import Counter

from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.utils import plot_model
from tensorflow.keras.metrics import AUC,Accuracy

from src.tensors.metrics import UtilMetric,F1Score
from src.tensors.losses import mean_regret
from src.tensors.utils import train
from src.tensors.models import create_ann_utadis_model,create_nn_model
from src.viz import show_class_counts,visualize_data,show_combinations,show_stats,show_history,show_monotone_blocks,show_criteria_weights
from src.utils import undersample

# Car evaluation

#### Łukasz Andryszewski 151930

The dataset used is the Car Evaluation dataset, which can be found [here](https://en.cs.uni-paderborn.de/is/research/research-projects/software/monotone-learning-datasets).

It consists of six criteria and four classes. The criteria are:
- price
- price of the maintenance
- number of doors
- capacity 
- size of luggage boot
- estimated safety

The criteria are normalized between 0 and 1.

Based on them the alternatives are assigned to four sorted classes, which are:
1. unacceptable 
2. acceptable
3. good
4. very good

However, here they will be binarized by splitting between the second and third class, which yields two classes:

1. Bad
2. Good

In [None]:
# Names of the six criteria, in the column order of the CSV file.
crits = ["price","maintaince price","doors","capacity","size of luggage", "safety"]

# The file has no header row; the last column holds the class label.
data = pd.read_csv("./data/monodata/car evaluation.csv", header=None)
data.columns = [*crits, "class"]

# Number of criteria (every column except the class label).
features = data.shape[1] - 1
data

In [None]:
# Binarize the four ordered classes: classes 1-2 become 0 (bad), classes 3-4 become 1 (good).
# Both masks are taken from the original labels before anything is overwritten.
# NOTE: `data` is modified in place, so re-running this cell without reloading
# the CSV would map every label (now 0 or 1) to 0.
is_bad = data["class"] <= 2
is_good = data["class"] >= 3
data.loc[is_bad, "class"] = 0
data.loc[is_good, "class"] = 1

classes = 2
# Criteria only, without the label column.
data_classless = data.drop(columns=["class"])
data

In [None]:
# Summary statistics of every column, including the binarized class label.
data.describe()

In [None]:
# Visualize the criteria values together with the binary class labels (full dataset).
visualize_data(data_classless.to_numpy(),data["class"].to_numpy())

From the names of the criteria and the distribution of their values across the classes, it can be inferred that the price and maintenance price criteria are of cost type, while the remaining criteria are of gain type.

In [None]:
# Class distribution after binarization.
show_class_counts(data["class"])

The data is highly imbalanced, so there is a need for undersampling.

In [None]:
# Project helper; presumably reports how many value combinations of the criteria are
# possible compared with the number of alternatives (see the discussion below).
show_combinations(data_classless)

The number of possible combinations of all values of the criteria is the same as the number of alternatives. Judging by that suspicious fact, it is safe to assume that the dataset is composed of all possible alternatives or that there is quite a number of repeated alternatives.

For the sake of performance, and to avoid learning from most of the space of alternatives, the first class needs to be heavily undersampled.

In [None]:
# Undersample the dataset to counter the class imbalance shown above.
# NOTE(review): `undersample` is a project helper; if it samples randomly,
# confirm that it is seeded so the notebook is reproducible.
new_X,new_y = undersample(data_classless.to_numpy(),data["class"].to_numpy())

In [None]:
# Same visualization as before, now for the undersampled data.
visualize_data(new_X,new_y)

Split into train, test and validation data

In [None]:
# Hold out 40% of the undersampled data, then halve the hold-out into a test and
# a validation part (60/20/20 overall).
# A fixed seed makes the split, and therefore every score reported below,
# reproducible after Restart & Run All. Stratifying keeps the class ratio
# the same in all three parts, which matters for a dataset this small.
SPLIT_SEED = 42
X_train, X_rest, y_train, y_rest = train_test_split(
    new_X, new_y, test_size=0.40, random_state=SPLIT_SEED, stratify=new_y)
X_test, X_val, y_test, y_val = train_test_split(
    X_rest, y_rest, test_size=0.50, random_state=SPLIT_SEED, stratify=y_rest)

## RankSVM method

### Calculate differences between rows of different classes.

In [None]:
def calculate_2d_differences(array: np.ndarray):
    """Return all pairwise row differences of a 2-D array.

    For an input of shape ``(n, d)`` the result has shape ``(n * n, d)``;
    row ``i * n + j`` holds ``array[i] - array[j]``. Pairs of a row with
    itself are included and are all zeros.
    """
    minuend = array[:, None, :]     # shape (n, 1, d)
    subtrahend = array[None, :, :]  # shape (1, n, d)
    pairwise = minuend - subtrahend  # broadcasts to (n, n, d)
    n_columns = array.shape[1]
    return pairwise.reshape(-1, n_columns)

def calculate_1d_differences(vector: np.ndarray):
    """Return all pairwise element differences of a 1-D array.

    For an input of length ``n`` the result is a flat array of length
    ``n * n`` whose element ``i * n + j`` equals ``vector[i] - vector[j]``,
    i.e. the same pair ordering as ``calculate_2d_differences``.
    """
    as_column = vector[:, None]  # shape (n, 1)
    as_row = vector[None, :]     # shape (1, n)
    pairwise = as_column - as_row  # broadcasts to (n, n)
    return pairwise.reshape(-1)

In [None]:
# Pairwise differences between all alternatives, separately for the train and test parts.
X_train_difs, X_test_difs = (
    calculate_2d_differences(split) for split in (X_train, X_test)
)

# Matching label differences; the pair order is the same as for the feature differences.
y_train_difs, y_test_difs = (
    calculate_1d_differences(labels) for labels in (y_train, y_test)
)

In [None]:
# Keep only the pairs whose two alternatives come from different classes.
# With binary 0/1 labels the label difference is then either -1 or +1.
train_pair_mask = y_train_difs != 0
test_pair_mask = y_test_difs != 0

X_train_filtered = X_train_difs[train_pair_mask]
y_train_filtered = y_train_difs[train_pair_mask]

X_test_filtered = X_test_difs[test_pair_mask]
y_test_filtered = y_test_difs[test_pair_mask]

In [None]:
# RankSVM as a reduction to binary classification: a linear SVM is trained on the
# pairwise feature differences, with the sign of the label difference as target.
# random_state pins the solver's internal shuffling so that the fitted weights
# are reproducible between runs.
rank_svm = LinearSVC(random_state=42)

rank_svm.fit(X_train_filtered,y_train_filtered)

In [None]:
# Report the pairwise classifier's statistics on the train and test difference sets.
train_heading = "Performance on train set:\n"
test_heading = "\nPerformance on test set:\n"

print(train_heading)
show_stats(rank_svm, X_train_filtered, y_train_filtered)

print(test_heading)
show_stats(rank_svm, X_test_filtered, y_test_filtered)

## Tensorflow solutions

### ANN-UTADIS

In [None]:
from src.tensors.layers import MonotoneBlock

In [None]:
# Passed to create_ann_utadis_model below; presumably the utility cut-off that
# separates the two classes — TODO confirm against the model implementation.
threshold = 0.9
# Reference alternatives in the column order of `crits`. The two cost criteria
# (price, maintenance price) are best at 0 and worst at 1; the four gain
# criteria are best at 1 and worst at 0 (criteria are normalized to [0, 1]).
ideal_alt = [0,0,1,1,1,1]
antiideal_alt = [1,1,0,0,0,0]

In [None]:
# Create the ANN-UTADIS model from the threshold and the two reference alternatives.
# NOTE(review): the meaning of L=7 is defined by create_ann_utadis_model; confirm there.
uta_model = create_ann_utadis_model(threshold,ideal_alt,antiideal_alt,classes,features,L=7)
# Build with a free batch dimension and `features` inputs so summary() can list the weights.
uta_model.build(input_shape=(None,features))
uta_model.summary()

In [None]:
# Train the ANN-UTADIS model with the mean_regret loss, validating on the validation split.
# batch=len(X_train) presumably means the whole training set forms a single batch,
# and patience presumably controls early stopping — TODO confirm in src.tensors.utils.train.
# Each Keras metric is wrapped in the project's UtilMetric adapter.
history = train(uta_model,X_train,y_train,mean_regret,
                val_data=(X_val,y_val),
                batch=len(X_train),
                epochs=20,
                patience=5,
                metrics=[UtilMetric(Accuracy()),UtilMetric(AUC(name="auc")),UtilMetric(F1Score())])

In [None]:
# Show the training history returned by the ANN-UTADIS training run above.
show_history(history)

In [None]:
# Project helper; presumably visualizes the monotone block learned for each of the
# `features` criteria — TODO confirm in src.viz.
show_monotone_blocks(uta_model.uta,features)

In [None]:
# Project helper; presumably displays the criteria weights of the trained model.
show_criteria_weights(uta_model)

### Conventional Neural Network

In [None]:
# Conventional neural network baseline that takes the same `features` inputs.
nn_model = create_nn_model(features)

nn_model.summary()

In [None]:
# Draw the architecture diagram of the baseline network.
plot_model(nn_model)

In [None]:
# Train the baseline with binary cross-entropy on the same train/validation split.
# NOTE: this overwrites the `history` of the ANN-UTADIS run above.
history = train(nn_model,X_train,y_train,"binary_crossentropy",val_data=(X_val,y_val),patience=10)

In [None]:
# Show the training history of the baseline network trained above.
show_history(history)

## References


<!--bibtex 

@Article{Tehrani2011/08,
  title={Choquistic Regression: Generalizing Logistic Regression using the Choquet Integral},
  author={Ali Fallah Tehrani and Weiwei Cheng and Eyke Hüllermeier},
  year={2011/08},
  booktitle={Proceedings of the 7th conference of the European Society for Fuzzy Logic and Technology (EUSFLAT-11)},
  pages={868-875},
  issn={1951-6851},
  isbn={978-90-78677-00-0},
  url={https://doi.org/10.2991/eusflat.2011.86},
  doi={10.2991/eusflat.2011.86},
  publisher={Atlantis Press}
}
-->