In [60]:
import functions_ml as fml
import pandas as pd
import numpy as np
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain
from sklearn.ensemble import RandomForestClassifier
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
from sklearn.metrics import hamming_loss, f1_score, jaccard_score, accuracy_score, multilabel_confusion_matrix
from sklearn.model_selection import GridSearchCV

In [61]:
# PARAMETERS
# NOTE(review): this flag is not read by any cell visible here, and its name is
# missing an "I" ("CLASSIFICATON"); check for usages elsewhere before renaming.
BINARY_CLASSIFICATON = False
# Tolerance passed to fml.create_multi_label_target when building the training
# labels. Presumably a percentage (0.01 %) expressed as a fraction — confirm.
TRAIN_TOLERANCE_LIMIT = 0.01 / 100

In [62]:
# 1. Get dataset
# Both CSV paths are relative to the notebook's working directory.
results = pd.read_csv('datasets/instances_results.csv')
features = pd.read_csv('datasets/multi_plant_instance_features.csv')
# fml.create_dataset builds the working frame from the two tables; its
# implementation is not visible here. Later cells expect the result to expose
# an 'instance' column.
dataset = fml.create_dataset(features, results)

In [48]:
# 2. Create target columns for multi label classification
# NOTE(review): `dataset` is rebound to the output of a call that takes
# `dataset` as its input, so re-running this cell on its own feeds the already
# augmented frame back in — confirm fml.create_multi_label_target tolerates that,
# or re-run the loading cell first.
dataset = fml.create_multi_label_target(dataset, TRAIN_TOLERANCE_LIMIT)

In [49]:
# Label columns are recognised by their 'RF' name prefix.
target_col = [name for name in dataset.columns if name.startswith('RF')]

# Column sum per label (for 0/1 labels this is the number of positive rows).
for t in target_col:
    print(f"{t} : {dataset[t].sum()}")

RF_1_0 : 51
RF_2_0 : 99
RF_2_1 : 93
RF_3_0 : 122
RF_3_1 : 125
RF_3_2 : 119
RF_4_0 : 123
RF_4_1 : 162
RF_4_2 : 168
RF_4_3 : 166
RF_6_0 : 182
RF_6_1 : 180
RF_6_2 : 219
RF_6_3 : 235
RF_6_4 : 235
RF_6_5 : 215
RF_T_0 : 267


In [50]:
# Extract features (X) and labels (y)
# Features: every column that is neither a label nor the 'instance' column,
# passed through the project's binary feature selection.
# NOTE(review): the selection runs on the full dataset, before the train/test
# split — if it is data-driven this may leak test information; confirm.
non_feature_cols = [*target_col, 'instance']
X = fml.binary_feature_selection(dataset.drop(columns=non_feature_cols))

# Labels: the multi-label target columns, in `target_col` order.
y = dataset.loc[:, target_col]

In [51]:
# One multilabel-stratified shuffle split that holds out 10 % of the rows.
msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)

# n_splits=1, so the splitter yields exactly one (train, test) pair of
# positional indices; unpack it directly instead of looping.
train_index, test_index = next(iter(msss.split(X, y)))
X_train, y_train = X.iloc[train_index], y.iloc[train_index]
X_test, y_test = X.iloc[test_index], y.iloc[test_index]

In [52]:
# Train ClassifierChain with RandomForest
# Base learner handed to the chain; hyper-parameters are fixed here.
base_forest = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    max_features='log2',
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=2112,
)

# Chain the labels in reverse column order: n-1, n-2, ..., 0.
reversed_order = list(range(len(y.columns) - 1, -1, -1))

model = ClassifierChain(base_forest, order=reversed_order)
model.fit(X_train, y_train)

In [53]:
# Predict on the test set
# Both outputs are wrapped in frames that reuse y_test's column labels and row
# index, so they can be compared with the ground truth label by label.
frame_axes = {'columns': y_test.columns, 'index': y_test.index}
y_pred = pd.DataFrame(model.predict(X_test), **frame_axes)
y_pred_proba = pd.DataFrame(model.predict_proba(X_test), **frame_axes)

# Solution quality

## Model predictions

In [54]:
# Preview the predicted per-label probabilities for the first test rows.
y_pred_proba.head()

Unnamed: 0,RF_1_0,RF_2_0,RF_2_1,RF_3_0,RF_3_1,RF_3_2,RF_4_0,RF_4_1,RF_4_2,RF_4_3,RF_6_0,RF_6_1,RF_6_2,RF_6_3,RF_6_4,RF_6_5,RF_T_0
114,0.0,0.06,0.09,0.07,0.06,0.51,0.38,0.18,0.95,0.88,0.92,0.82,1.0,0.98,0.99,0.98,0.99
460,0.61,0.09,0.02,0.08,0.01,0.0,0.02,0.0,0.02,0.01,0.02,0.01,0.0,0.01,0.1,0.0,0.02
199,0.01,0.02,0.07,0.01,0.34,0.08,0.02,0.1,0.02,0.0,0.0,0.0,0.03,0.03,0.04,0.03,0.03
112,0.0,0.14,0.05,0.64,0.03,0.63,0.24,0.39,0.98,0.89,1.0,0.99,0.97,1.0,0.96,0.98,0.99
349,0.0,0.03,0.0,0.0,0.35,0.01,0.0,0.03,0.22,0.0,0.02,0.0,0.02,0.0,0.0,0.01,0.03


Select the top 3 methods per instance, ranked by predicted probability.

In [55]:
# Top-K selection: for every test instance, flag the K labels with the highest
# predicted probability.
TOP_K = 3

# Work on a plain ndarray. Calling np.argsort directly on a DataFrame depends on
# how pandas/numpy wrap the result, and row-wise positional indexing below is
# only valid when that result is an array.
proba_values = y_pred_proba.to_numpy()

# Column positions sorted by descending probability. A stable sort makes ties
# resolve to the left-most column deterministically.
ranked_methods = np.argsort(-proba_values, axis=1, kind="stable")

# Binary indicator matrix: 1 in the K best-ranked columns of each row.
# Slicing with [:, :TOP_K] also copes with TOP_K larger than the label count.
top_k_methods = np.zeros(proba_values.shape, dtype=int)
np.put_along_axis(top_k_methods, ranked_methods[:, :TOP_K], 1, axis=1)

# Convert to DataFrame for comparison
top_k_methods_df = pd.DataFrame(top_k_methods, columns=y_test.columns, index=y_test.index)

# Display the top-K selected methods
top_k_methods_df.head()

Unnamed: 0,RF_1_0,RF_2_0,RF_2_1,RF_3_0,RF_3_1,RF_3_2,RF_4_0,RF_4_1,RF_4_2,RF_4_3,RF_6_0,RF_6_1,RF_6_2,RF_6_3,RF_6_4,RF_6_5,RF_T_0
114,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1
460,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
199,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0
112,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0
349,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0


## Validation set

Adapt the actual values (ground-truth labels) so that they reflect the tolerance $\tau = 0.01\%$.

In [56]:
# Tolerance used to rebuild the ground-truth labels for evaluation; kept
# separate from the training tolerance so the two can be varied independently.
VALIDATION_TOLERANCE_LIMIT = 0.01 / 100

# Reload the raw tables so the labels are rebuilt from unmodified inputs.
results = pd.read_csv('datasets/instances_results.csv')
features = pd.read_csv('datasets/multi_plant_instance_features.csv')

validation_set = fml.create_multi_label_target(
    fml.create_dataset(features, results),
    VALIDATION_TOLERANCE_LIMIT,
)

# Keep only the label columns, restricted to the rows of the held-out test split.
validation_set = validation_set.loc[y_test.index, target_col]

In [57]:
# Preview the ground-truth labels for the first test rows.
validation_set.head()

Unnamed: 0,RF_1_0,RF_2_0,RF_2_1,RF_3_0,RF_3_1,RF_3_2,RF_4_0,RF_4_1,RF_4_2,RF_4_3,RF_6_0,RF_6_1,RF_6_2,RF_6_3,RF_6_4,RF_6_5,RF_T_0
114,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1
460,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
199,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
112,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1
349,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [58]:
# A cell is 1 only where a label is both selected (top-K) and truly positive.
correct_predictions = top_k_methods_df.mul(validation_set)

# Hits per instance, and their grand total over the whole test split.
correct_per_instance = correct_predictions.sum(axis=1)
total_correct = correct_predictions.sum().sum()
print(f"Total Correct Predictions: {total_correct}")

print("\nCorrect Predictions Per Instance:")
print(correct_per_instance)

# The figure reported as "accuracy" divides the hits by the number of actual
# positive labels, i.e. it is the share of true labels captured by the
# selection. An instance can never score more hits than labels were selected
# for it, so instances with many positive labels cap this value below 100 %.
total_possible = validation_set.sum().sum()
accuracy = (total_correct / total_possible) * 100
print(f"\nAccuracy: {accuracy:.2f}%")

Total Correct Predictions: 111

Correct Predictions Per Instance:
114    3
460    1
199    1
112    3
349    1
      ..
16     1
20     1
376    3
116    3
523    1
Length: 69, dtype: int64

Accuracy: 39.36%


In [59]:
# Hits per instance: row-wise sum of the correct-prediction indicator matrix.
correct_per_instance = correct_predictions.sum(axis=1)

# Instances for which none of the selected labels is a true positive.
total_instances = correct_predictions.shape[0]
missed_all = correct_per_instance.eq(0).sum()

# Share of such instances, as a percentage of all evaluated rows.
missed_accuracy = (missed_all / total_instances) * 100

print(f"Number of Rows with All Incorrect Predictions: {missed_all}")
print(f"Percentage of Rows with All Incorrect Predictions: {missed_accuracy:.2f}%")

Number of Rows with All Incorrect Predictions: 9
Percentage of Rows with All Incorrect Predictions: 13.04%
