# AutoML project 2: ciu

In [92]:
from ciu import determine_ciu
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
import numpy as np
import os
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

In [2]:
# Define the path to the datasets folder
datasets_folder = "../datasets"

# Initialize empty lists to store dataframes for each file
attribute_names_list = []
categorical_indicator_list = []
X_list = []
y_list = []

# Loop through each folder in the datasets folder
for folder_name in os.listdir(datasets_folder):
    folder_path = os.path.join(datasets_folder, folder_name)
    
    # Check if it's a directory
    if os.path.isdir(folder_path):
        # Construct file paths for each CSV file in the folder
        attribute_names_path = os.path.join(folder_path, "attribute_names.csv")
        categorical_indicator_path = os.path.join(folder_path, "categorical_indicator.csv")
        X_path = os.path.join(folder_path, "X.csv")
        y_path = os.path.join(folder_path, "y.csv")
        
        # Read each CSV file into a pandas dataframe
        attribute_names_df = pd.read_csv(attribute_names_path)
        categorical_indicator_df = pd.read_csv(categorical_indicator_path)
        X_df = pd.read_csv(X_path)
        y_df = pd.read_csv(y_path)
        
        # Append dataframes to the lists
        attribute_names_list.append(attribute_names_df)
        categorical_indicator_list.append(categorical_indicator_df)
        X_list.append(X_df)
        y_list.append(y_df)

In [85]:
# Example: Accessing the first dataset
first_attribute_names_df = attribute_names_list[0]
first_categorical_indicator_df = categorical_indicator_list[0]
first_X_df = X_list[0]
first_y_df = y_list[0]

In [84]:
for i in range(len(X_list)):
    print(len(X_list[i]))

15545
1458
1563
10885
522
2109
1109
625
19020
10218
1372
748
2126
540
14980
583
5404
5456
699
1941
569
2000
6430
736
1473
2796
500
8124
690
3196
990
1000
10992
556
601
554
2310
768
9961
3772
500
683
500
11055
9873
797
672
958
846
20000


In [7]:
first_X_df

Unnamed: 0,id,start,end,event,size
0,1.0,0.0,913439.0,0,14.0
1,1.0,913439.0,1292405.0,1,14.0
2,1.0,1292405.0,1540749.0,0,14.0
3,2.0,0.0,50170.0,0,14.0
4,3.0,0.0,913439.0,0,18.0
...,...,...,...,...,...
15540,879.0,97360.0,130256.0,0,1511.0
15541,879.0,130256.0,132045.0,1,1513.0
15542,879.0,132045.0,132393.0,0,1502.0
15543,879.0,132393.0,207343.0,0,1517.0


In [8]:
first_y_df

Unnamed: 0,state
0,0
1,0
2,1
3,0
4,0
...,...
15540,1
15541,1
15542,1
15543,1


In [13]:
first_attribute_names_df

Unnamed: 0,0
0,id
1,start
2,end
3,event
4,size


In [14]:
first_categorical_indicator_df

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False


In [34]:
X_train, X_test, y_train, y_test = train_test_split(first_X_df, first_y_df, test_size = 0.2, random_state = 42)

In [38]:
le = LabelEncoder()
for col in X_train.columns:
    if X_train[col].dtype == 'object':
        le.fit(X_train[col])
        X_train[col] = le.transform(X_train[col])
        X_test[col] = le.transform(X_test[col])

In [53]:
n_classes = len(np.unique(y_train))
feat_list = X_train.columns.tolist()

In [54]:
type(feat_list)

list

In [41]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

In [46]:
def exp_fn_blk(xtest):
    exp1 = []
    for i in range(len(xtest)):
        exp = determine_ciu(X_test.iloc[i:i+1], model.predict_proba, X_train.to_dict('list'), samples = 1000, prediction_index = 1)
        exp_list = [[feat_list.index(i), exp.ci[i]] for i in exp.ci]
        exp1.append(exp_list)
    return np.array(exp1)

In [78]:
exp1 = exp_fn_blk(X_test[:100])
exp2 = exp_fn_blk(X_test[:100])

In [88]:
def calc_identity(exp1, exp2):
    dis = np.array([np.array_equal(exp1[i],exp2[i]) for i in range(len(exp1))])
    total = dis.shape[0]
    true = np.sum(dis)
    score = (total-true)/total
    return score*100, true, total

def calc_separability(exp):
    wrong = 0
    for i in range(exp.shape[0]):
        for j in range(exp.shape[0]):
            if i == j:
                continue
            eq = np.array_equal(exp[i],exp[j])
            if eq:
                wrong = wrong + 1
    total = exp.shape[0]
    score = 100*abs(wrong)/total**2
    return wrong,total,total**2,score

def calc_stability(exp, labels):
    total = labels.shape[0]
    label_values = np.unique(labels)
    n_clusters = label_values.shape[0]
    init = np.array([[np.average(exp[np.where(labels == i)], axis = 0)] for i in label_values]).squeeze()
    ct = KMeans(n_clusters = n_clusters, random_state=1, n_init=10, init = init)
    ct.fit(exp)
    error = np.sum(np.abs(labels-ct.labels_))
    if error/total > 0.5:
        error = total-error
    return error, total

In [90]:
i = calc_identity(exp1,exp2)
print(i)

s = calc_separability(X_test.values[:100])
print(s)

def enc_exp(exp, feature_num):
    enc_exp = np.zeros((len(exp),feature_num))
    for i in range(len(exp)):
        for j in range(len(exp[i])):
            enc_exp[i][int(exp[i,j,0])] = exp[i,j,1]
    return enc_exp

enc1 = enc_exp(exp1, len(feat_list))
sb = calc_stability(enc1, y_test[:100])
print(sb)

(22.0, 78, 100)
(0, 100, 10000, 0.0)


ValueError: Expected 2D array, got 1D array instead:
array=[0.27586207 0.97183099].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [74]:
y_test[:100]

Unnamed: 0,state
4398,1
5987,1
12644,1
11864,0
9899,1
...,...
14424,0
10761,1
5838,0
2848,1


In [70]:
enc1[0]

array([1., 1., 1., 0., 0.])

In [82]:
def normalize_test(X_train, X_test):
    X_test_norm = X_test.copy()
    for i in X_train.columns:
        scaler = MinMaxScaler()
        scaler.fit(X_train[i].values.reshape(-1,1))
        X_test_norm[i] = scaler.transform(X_test[i].values.reshape(-1,1))

    return X_test_norm

def calc_similarity(exp, X_test_norm):
    dbscan = DBSCAN(eps=0.5, min_samples=10)
    dbscan.fit(X_test_norm[:400])
    labels = dbscan.labels_
    mean_dist = []
    for i in np.unique(labels):
        mean_dist.append(np.mean(pairwise_distances(exp[np.where(labels == i), :, 1].squeeze(), metric='euclidean')))
    return np.min(mean_dist)

In [83]:
X_test_norm = normalize_test(X_train, X_test)
sim = calc_similarity(exp1, X_test_norm[:100])

print(sim)

0.5
