In [1]:
from datasets import load_dataset
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import dill
# Silence every warning for the rest of the kernel session.
# NOTE(review): with no category/module filter this also hides any warnings
# raised by the model fits below — consider narrowing the filter.
import warnings

warnings.filterwarnings("ignore")
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from sklearn import metrics
from pathlib import Path


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Serialized explanation records stored under artifacts/dutch/explanations.
# NOTE(review): absolute, machine-specific path — this cell only runs where
# that file exists.
explanations_path = Path("/home/lcorbucci/fire360/artifacts/dutch/explanations/dt_tvae_100000_2500_1.pkl")

# dill (like pickle) can execute arbitrary code while deserialising, so only
# load artifacts from a trusted source.
with explanations_path.open("rb") as f:
    explanations = dill.load(f)

In [10]:
# Inspect one stored record (index 300) to see which fields an entry carries.
explanations[300]

{'sample':        age  household_position  household_size  prev_residence_place  \
 59976    9                1122             114                     1   
 
        citizenship  country_birth  edu_level  economic_status  \
 59976            1              1          2              111   
 
        cur_eco_activity  Marital_status  sex_binary  occupation_binary  
 59976               134               2           0                  1  ,
 'explanation': ['(edu_level = 2) <= 3.5',
  '(cur_eco_activity = 134) > 132.5',
  '(age = 9) <= 10.5',
  '(prev_residence_place = 1) <= 1.5',
  '(Marital_status = 2) <= 3.0',
  'Leaf node 8 reached, prediction: 1'],
 'prediction_bb': 1,
 'confidence_bb': 0.9448537230491638,
 'fidelity_neighbours': 1.0,
 'fidelity': 1,
 'robustness_top_3': array([0.5]),
 'robustness_top_5': array([0.5]),
 'robustness_top_8': array([0.5]),
 'robustness_top_10': array([0.5]),
 'robustness_top_20': array([0.58944444]),
 'stability': 1.0}

# Dataset size

- Letter: (20000, 17)
- Covertype: (581012, 55)
- House16: (22784, 17)
- Shuttle: (43500, 10)

# Covertype

In [None]:
# Load the "covertype" configuration of mstz/covertype through the Hugging Face
# `datasets` library and keep only its "train" split.
dataset = load_dataset("mstz/covertype", "covertype")["train"]

In [None]:
# Convert the Hugging Face dataset to a DataFrame. Dataset.to_pandas() is the
# library's own conversion, whereas pd.DataFrame(dataset) walks the dataset
# one Python dict per row, which is slow for ~581k rows.
df = dataset.to_pandas()

In [None]:
# (rows, columns); the "Dataset size" notes at the top list (581012, 55) for Covertype.
df.shape

In [None]:
# Preview the first five rows.
df.head()

In [None]:
# Number of rows per target label, most frequent first — useful for judging class balance.
df["cover_type"].value_counts()

In [None]:
# Train a logistic-regression baseline on Covertype and compare it with a
# majority-class predictor.

# Fixed seed so the split, the solvers and every metric computed from them
# are reproducible under Restart & Run All.
RANDOM_STATE = 42

X = df.drop(columns=["cover_type"])
y = df["cover_type"]

X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Hyper-parameter grid: inverse regularisation strength x solver.
param_grid = {"C": [0.1, 1, 10, 100], "solver": ["liblinear", "saga"]}

# 5-fold cross-validated search, selecting on accuracy.
grid_search = GridSearchCV(
    LogisticRegression(random_state=RANDOM_STATE),
    param_grid,
    cv=5,
    scoring="accuracy",
)
grid_search.fit(X_train, Y_train)

# With the default refit=True, GridSearchCV has already retrained the winning
# configuration on the whole training split, so best_estimator_ is ready to
# use; fitting it again would only repeat that work.
model = grid_search.best_estimator_

# Predict on the held-out split.
predictions = model.predict(X_test)

# Test accuracy of the tuned model.
accuracy = (predictions == Y_test).mean()

# Baseline: always predict the most frequent training label.
majority_class = Y_train.mode()[0]
majority_accuracy = (Y_test == majority_class).mean()

In [None]:
# Report test accuracy next to the majority-class baseline for comparison.
print(f"Accuracy: {accuracy}")
print(f"Majority class accuracy: {majority_accuracy}")

In [None]:
# Per-class F1: average=None returns one score per label instead of a single number.
f1_per_class = f1_score(Y_test, predictions, average=None)
print(f"F1 Score : {f1_per_class}")

In [None]:
# Macro F1: unweighted mean of the per-class scores.
f1_macro = f1_score(Y_test, predictions, average="macro")
print(f"F1 Score : {f1_macro}")

In [None]:
# Micro F1: computed from true/false positives and negatives pooled over all classes.
f1_micro = f1_score(Y_test, predictions, average="micro")
print(f"F1 Score : {f1_micro}")

In [None]:
# Weighted F1: per-class scores averaged with each class weighted by its support.
f1_weighted = f1_score(Y_test, predictions, average="weighted")
print(f"F1 Score : {f1_weighted}")

In [None]:
# Text table of precision / recall / F1 / support for every class.
report = classification_report(Y_test, predictions)
print(f"Classification Report : {report}")

In [None]:
# One-vs-rest ROC / AUC for class 2.
# roc_curve sweeps a threshold over continuous scores. Passing the hard labels
# from predict() makes it rank samples by label value, which gives a
# degenerate curve, so use the predicted probability of the positive class.
pos_label = 2
# Column of predict_proba that belongs to the positive class; raises
# ValueError if that label was never seen during training.
pos_column = list(model.classes_).index(pos_label)
pos_scores = model.predict_proba(X_test)[:, pos_column]

fpr, tpr, thresholds = metrics.roc_curve(Y_test, pos_scores, pos_label=pos_label)
metrics.auc(fpr, tpr)

# House 16

In [None]:
# Load the "house16" configuration of mstz/house16 through the Hugging Face
# `datasets` library and keep only its "train" split.
dataset = load_dataset("mstz/house16", "house16")["train"]

In [None]:
# Convert the Hugging Face dataset to a DataFrame. Dataset.to_pandas() is the
# library's own conversion, whereas pd.DataFrame(dataset) walks the dataset
# one Python dict per row.
df = dataset.to_pandas()

In [None]:
# (rows, columns); the "Dataset size" notes at the top list (22784, 17) for House16.
df.shape

In [None]:
# Preview the first five rows.
df.head()

In [None]:
# Train a logistic-regression baseline on House16 and compare it with a
# majority-class predictor.

# Fixed seed so the split, the solvers and every metric computed from them
# are reproducible under Restart & Run All.
RANDOM_STATE = 42

X = df.drop(columns=["class"])
y = df["class"]

X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Hyper-parameter grid: inverse regularisation strength x solver.
param_grid = {"C": [0.1, 1, 10, 100], "solver": ["liblinear", "saga"]}

# 5-fold cross-validated search, selecting on accuracy.
grid_search = GridSearchCV(
    LogisticRegression(random_state=RANDOM_STATE),
    param_grid,
    cv=5,
    scoring="accuracy",
)
grid_search.fit(X_train, Y_train)

# With the default refit=True, GridSearchCV has already retrained the winning
# configuration on the whole training split, so best_estimator_ is ready to
# use; fitting it again would only repeat that work.
model = grid_search.best_estimator_

# Predict on the held-out split.
predictions = model.predict(X_test)

# Test accuracy of the tuned model.
accuracy = (predictions == Y_test).mean()
print(f"Accuracy: {accuracy}")

# Baseline: always predict the most frequent training label.
majority_class = Y_train.mode()[0]
majority_accuracy = (Y_test == majority_class).mean()
print(f"Majority class accuracy: {majority_accuracy}")

# Shuttle

In [None]:
# Load the "shuttle" configuration of mstz/shuttle through the Hugging Face
# `datasets` library and keep only its "train" split.
dataset = load_dataset("mstz/shuttle", "shuttle")["train"]

In [None]:
# Convert the Hugging Face dataset to a DataFrame. Dataset.to_pandas() is the
# library's own conversion, whereas pd.DataFrame(dataset) walks the dataset
# one Python dict per row.
df = dataset.to_pandas()

In [None]:
# (rows, columns); the "Dataset size" notes at the top list (43500, 10) for this dataset.
df.shape

In [None]:
# Preview the first five rows.
df.head()

In [None]:
# Number of rows per target label, most frequent first — useful for judging class balance.
df["class"].value_counts()

In [None]:
# Train a logistic-regression baseline on Shuttle and compare it with a
# majority-class predictor.

# Fixed seed so the split, the solvers and every metric computed from them
# are reproducible under Restart & Run All.
RANDOM_STATE = 42

X = df.drop(columns=["class"])
y = df["class"]

X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Hyper-parameter grid: inverse regularisation strength x solver.
param_grid = {"C": [0.1, 1, 10, 100], "solver": ["liblinear", "saga"]}

# 5-fold cross-validated search, selecting on accuracy.
grid_search = GridSearchCV(
    LogisticRegression(random_state=RANDOM_STATE),
    param_grid,
    cv=5,
    scoring="accuracy",
)
grid_search.fit(X_train, Y_train)

# With the default refit=True, GridSearchCV has already retrained the winning
# configuration on the whole training split, so best_estimator_ is ready to
# use; fitting it again would only repeat that work.
model = grid_search.best_estimator_

# Predict on the held-out split.
predictions = model.predict(X_test)

# Test accuracy of the tuned model.
accuracy = (predictions == Y_test).mean()

# Baseline: always predict the most frequent training label.
majority_class = Y_train.mode()[0]
majority_accuracy = (Y_test == majority_class).mean()

In [None]:
# Report test accuracy next to the majority-class baseline for comparison.
print(f"Accuracy: {accuracy}")
print(f"Majority class accuracy: {majority_accuracy}")

In [None]:
# Per-class F1: average=None returns one score per label instead of a single number.
f1_per_class = f1_score(Y_test, predictions, average=None)
print(f"F1 Score : {f1_per_class}")

In [None]:
# Macro F1: unweighted mean of the per-class scores.
f1_macro = f1_score(Y_test, predictions, average="macro")
print(f"F1 Score : {f1_macro}")

In [None]:
# Micro F1: computed from true/false positives and negatives pooled over all classes.
f1_micro = f1_score(Y_test, predictions, average="micro")
print(f"F1 Score : {f1_micro}")

In [None]:
# Weighted F1: per-class scores averaged with each class weighted by its support.
f1_weighted = f1_score(Y_test, predictions, average="weighted")
print(f"F1 Score : {f1_weighted}")

In [None]:
# Text table of precision / recall / F1 / support for every class.
report = classification_report(Y_test, predictions)
print(f"Classification Report : {report}")

In [None]:
# One-vs-rest ROC / AUC for class 2.
# roc_curve sweeps a threshold over continuous scores. Passing the hard labels
# from predict() makes it rank samples by label value, which gives a
# degenerate curve, so use the predicted probability of the positive class.
pos_label = 2
# Column of predict_proba that belongs to the positive class; raises
# ValueError if that label was never seen during training.
pos_column = list(model.classes_).index(pos_label)
pos_scores = model.predict_proba(X_test)[:, pos_column]

fpr, tpr, thresholds = metrics.roc_curve(Y_test, pos_scores, pos_label=pos_label)
metrics.auc(fpr, tpr)

# Letter

In [None]:
# Load the "letter" configuration of mstz/letter through the Hugging Face
# `datasets` library and keep only its "train" split.
dataset = load_dataset("mstz/letter", "letter")["train"]

In [None]:
# Convert the Hugging Face dataset to a DataFrame. Dataset.to_pandas() is the
# library's own conversion, whereas pd.DataFrame(dataset) walks the dataset
# one Python dict per row.
df = dataset.to_pandas()

In [None]:
# (rows, columns); the "Dataset size" notes at the top list (20000, 17) for Letter.
df.shape

In [None]:
# Preview the first five rows.
df.head()

In [None]:
# Number of rows per target label, most frequent first — useful for judging class balance.
df["letter"].value_counts()

In [None]:
# Train a logistic-regression baseline on Letter and compare it with a
# majority-class predictor.

# Fixed seed so the split, the solvers and every metric computed from them
# are reproducible under Restart & Run All.
RANDOM_STATE = 42

X = df.drop(columns=["letter"])
y = df["letter"]

X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Hyper-parameter grid: inverse regularisation strength x solver.
param_grid = {"C": [0.1, 1, 10, 100], "solver": ["liblinear", "saga"]}

# 5-fold cross-validated search, selecting on accuracy.
grid_search = GridSearchCV(
    LogisticRegression(random_state=RANDOM_STATE),
    param_grid,
    cv=5,
    scoring="accuracy",
)
grid_search.fit(X_train, Y_train)

# With the default refit=True, GridSearchCV has already retrained the winning
# configuration on the whole training split, so best_estimator_ is ready to
# use; fitting it again would only repeat that work.
model = grid_search.best_estimator_

# Predict on the held-out split.
predictions = model.predict(X_test)

# Test accuracy of the tuned model.
accuracy = (predictions == Y_test).mean()

# Baseline: always predict the most frequent training label.
majority_class = Y_train.mode()[0]
majority_accuracy = (Y_test == majority_class).mean()

In [None]:
# Report test accuracy next to the majority-class baseline for comparison.
print(f"Accuracy: {accuracy}")
print(f"Majority class accuracy: {majority_accuracy}")

In [None]:
# Per-class F1: average=None returns one score per label instead of a single number.
f1_per_class = f1_score(Y_test, predictions, average=None)
print(f"F1 Score : {f1_per_class}")

In [None]:
# Macro F1: unweighted mean of the per-class scores.
f1_macro = f1_score(Y_test, predictions, average="macro")
print(f"F1 Score : {f1_macro}")

In [None]:
# Micro F1: computed from true/false positives and negatives pooled over all classes.
f1_micro = f1_score(Y_test, predictions, average="micro")
print(f"F1 Score : {f1_micro}")

In [None]:
# Weighted F1: per-class scores averaged with each class weighted by its support.
f1_weighted = f1_score(Y_test, predictions, average="weighted")
print(f"F1 Score : {f1_weighted}")

In [None]:
# Text table of precision / recall / F1 / support for every class.
report = classification_report(Y_test, predictions)
print(f"Classification Report : {report}")

In [None]:
# One-vs-rest ROC / AUC for the class labelled 2.
# roc_curve sweeps a threshold over continuous scores. Passing the hard labels
# from predict() gives a degenerate curve, so use the predicted probability of
# the positive class.
# NOTE(review): confirm that 2 is a valid value of the `letter` target; the
# lookup below raises ValueError if the model never saw that label.
pos_label = 2
pos_column = list(model.classes_).index(pos_label)
pos_scores = model.predict_proba(X_test)[:, pos_column]

fpr, tpr, thresholds = metrics.roc_curve(Y_test, pos_scores, pos_label=pos_label)
metrics.auc(fpr, tpr)

In [None]:
# NOTE(review): scratch cell — nothing else in this notebook uses torch or
# these tensors, and the import belongs in the import cell at the top.
import torch

# Two float tensors that record operations for autograd.
a = torch.tensor([2.0, 3.0], requires_grad=True)
b = torch.tensor([6.0, 4.0], requires_grad=True)
