In [292]:
import functions_ml as fml
import pandas as pd
import numpy as np
import pickle
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain
from sklearn.ensemble import RandomForestClassifier
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
from sklearn.metrics import hamming_loss, f1_score, jaccard_score, accuracy_score, multilabel_confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV

In [293]:
# PARAMETERS
# Tolerance handed to fml.create_multi_label_target when the target columns are
# built: 0.05 percent, written as a fraction (0.05 / 100).
# NOTE(review): presumably the gap to the best result within which a method still
# counts as a positive label — confirm against functions_ml.
TRAIN_TOLERANCE_LIMIT = 0.05 / 100

In [294]:
# 1. Get dataset
# Read the feature table and the results table from disk, then let the project
# helper combine them into the frame used for the rest of the notebook.
features = pd.read_csv('datasets/multi_plant_instance_features.csv')
results = pd.read_csv('datasets/instances_results.csv')
dataset = fml.create_dataset(features, results)

In [295]:
# 2. Create target columns for multi label classification
# NOTE(review): this rebinds `dataset` to the helper's output, so running the cell
# twice feeds the already-processed frame back into the helper; re-run the
# dataset-loading cell first when iterating. The columns it adds are presumably the
# 'RF'-prefixed labels selected in the following cell — confirm in functions_ml.
dataset = fml.create_multi_label_target(dataset, TRAIN_TOLERANCE_LIMIT)

In [296]:
# Label columns are identified by their 'RF' name prefix.
target_col = [col for col in dataset.columns if col.startswith('RF')]

# Print the column sum of every label (number of positives for 0/1 labels).
for t in target_col:
    print(t, ':', dataset[t].sum())

RF_1_0 : 160
RF_2_0 : 244
RF_2_1 : 239
RF_3_0 : 248
RF_3_1 : 255
RF_3_2 : 228
RF_4_0 : 246
RF_4_1 : 275
RF_4_2 : 258
RF_4_3 : 243
RF_6_0 : 255
RF_6_1 : 284
RF_6_2 : 294
RF_6_3 : 302
RF_6_4 : 286
RF_6_5 : 257
RF_T_0 : 308


In [297]:
# Extract features (X) and labels (y)
# y: the label columns exactly as they appear in the dataset.
y = dataset[target_col]
# X: everything except the labels and the 'instance' column, passed through the
# project's feature-selection helper.
X = fml.binary_feature_selection(
    dataset.drop(columns=target_col + ['instance'])
)

In [298]:
# Multilabel-stratified shuffle splitter: one split, 10% held out, fixed seed
msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)

# n_splits=1, so the splitter produces exactly one (train, validation) pair of
# positional index arrays; take it directly.
train_index, validation_index = next(iter(msss.split(X, y)))

# Slice features and labels with the same positional indices
X_train, y_train = X.iloc[train_index], y.iloc[train_index]
X_test, y_test = X.iloc[validation_index], y.iloc[validation_index]

In [299]:
# Base learner: random forest with fixed hyper-parameters and balanced class weights
base_rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    max_features='log2',
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight='balanced',
    random_state=2112,
)

# Probability calibration around the forest: sigmoid method, 5-fold cross-validation
calibrated_rf_model = CalibratedClassifierCV(
    estimator=base_rf_model,
    method='sigmoid',
    cv=5,
)

# Classifier chain over the label columns, visited from the last column to the first
model = ClassifierChain(
    base_estimator=calibrated_rf_model,
    order=list(range(len(y.columns)))[::-1],
)

# Fit on the training split only
model.fit(X_train, y_train)

In [300]:
# Train classifier on full data and save
# oracle = ClassifierChain(RandomForestClassifier(random_state=2112, n_estimators=100, max_depth=20, max_features='log2', min_samples_leaf=1, min_samples_split=2), order=[i for i in range(len(y.columns))][::-1])
# oracle.fit(X, y)
# with open('trained_models/oracle.pkl','wb') as f:
#     pickle.dump(oracle,f)

# Predictions on validation set

In [301]:
# Predict on the validation set
# Class predictions and per-label probabilities, each wrapped in a frame that
# carries y_test's row index and column names so it lines up with the truth.
y_pred = pd.DataFrame(
    model.predict(X_test),
    index=y_test.index,
    columns=y_test.columns,
)
y_pred_proba = pd.DataFrame(
    model.predict_proba(X_test),
    index=y_test.index,
    columns=y_test.columns,
)

In [302]:
# Aggregate multi-label metrics on the validation split: compute first, report after.
hamming = hamming_loss(y_test, y_pred)                       # fraction of wrong label cells
f1 = f1_score(y_test, y_pred, average='micro')               # pooled over all label cells
jaccard = jaccard_score(y_test, y_pred, average='samples')   # averaged per row
subset_accuracy = accuracy_score(y_test, y_pred)             # rows with every label correct

print("Hamming Loss:", hamming)
print("Micro-Averaged F1 Score:", f1)
print("Jaccard Similarity Score:", jaccard)
print("Subset Accuracy:", subset_accuracy)

Hamming Loss: 0.09056956115779645
Micro-Averaged F1 Score: 0.8870779976717112
Jaccard Similarity Score: 0.5208063372979339
Subset Accuracy: 0.30158730158730157


In [303]:
# Per-method confusion counts and derived scores on the validation split.
# One record (dict) per label column; the records are turned into a table in the
# next cell.
method_evaluation_results = []
for method in y_test.columns:
    # Compare only this method's column instead of the whole frame on every pass
    predicted_col = y_pred[method]
    actual_col = y_test[method]

    # Calculate true positives (correct assignments)
    true_positives = ((predicted_col == 1) & (actual_col == 1)).sum()

    # Calculate false positives (predicted as 1 but actual is 0)
    false_positives = ((predicted_col == 1) & (actual_col == 0)).sum()

    # Calculate false negatives (predicted as 0 but actual is 1)
    false_negatives = ((predicted_col == 0) & (actual_col == 1)).sum()

    # Precision: share of predicted positives that are correct (0 when nothing was predicted)
    predicted_total = true_positives + false_positives
    precision = true_positives / predicted_total if predicted_total > 0 else 0.0

    # Recall: share of actual positives that were found (0 when there are none)
    actual_total = true_positives + false_negatives
    recall = true_positives / actual_total if actual_total > 0 else 0.0

    # F1 needs the same zero guard as precision and recall: without it a method
    # with no true positives yields NaN or raises ZeroDivisionError.
    score_sum = precision + recall
    method_f1 = (2 * precision * recall) / score_sum if score_sum > 0 else 0.0

    # Append results to the list
    method_evaluation_results.append({
        'Method': method,
        'True Positives': true_positives,
        'False Positives': false_positives,
        'False Negatives': false_negatives,
        'Precision (%)': precision * 100,
        'Recall (%)': recall * 100,
        'F1-Score': method_f1
    })

**TODO:** These per-method results could be improved if we manage to rebalance the dataset.

In [304]:
# Turn the list of per-method records into a table (one row per method); the bare
# name on the last line makes the notebook render it as the cell output.
method_evaluation_results_df = pd.DataFrame(method_evaluation_results)
method_evaluation_results_df

Unnamed: 0,Method,True Positives,False Positives,False Negatives,Precision (%),Recall (%),F1-Score
0,RF_1_0,8,4,8,66.666667,50.0,0.571429
1,RF_2_0,16,2,8,88.888889,66.666667,0.761905
2,RF_2_1,20,3,4,86.956522,83.333333,0.851064
3,RF_3_0,22,1,3,95.652174,88.0,0.916667
4,RF_3_1,21,3,5,87.5,80.769231,0.84
5,RF_3_2,22,2,1,91.666667,95.652174,0.93617
6,RF_4_0,20,2,5,90.909091,80.0,0.851064
7,RF_4_1,24,0,4,100.0,85.714286,0.923077
8,RF_4_2,24,0,2,100.0,92.307692,0.96
9,RF_4_3,24,2,1,92.307692,96.0,0.941176


## Model probabilities

In [305]:
# Preview the first rows of the predicted per-label probabilities (one column per method).
y_pred_proba.head()

Unnamed: 0,RF_1_0,RF_2_0,RF_2_1,RF_3_0,RF_3_1,RF_3_2,RF_4_0,RF_4_1,RF_4_2,RF_4_3,RF_6_0,RF_6_1,RF_6_2,RF_6_3,RF_6_4,RF_6_5,RF_T_0
452,0.039149,0.052969,0.054675,0.04397,0.042644,0.033138,0.042407,0.05499,0.043568,0.031971,0.036875,0.036411,0.037957,0.056628,0.067337,0.058512,0.873199
295,0.180274,0.135183,0.065129,0.647152,0.0771,0.031997,0.648667,0.181846,0.06354,0.031583,0.042085,0.0543,0.038781,0.03965,0.066201,0.037723,0.047659
369,0.243648,0.742389,0.226871,0.375644,0.121584,0.063517,0.049261,0.053497,0.050865,0.034599,0.042586,0.059218,0.046116,0.045255,0.049107,0.038803,0.055366
83,0.042352,0.385198,0.965559,0.94957,0.965115,0.974979,0.966994,0.95408,0.968681,0.971087,0.957921,0.966014,0.967468,0.971092,0.964141,0.959011,0.954009
347,0.732472,0.113393,0.053417,0.047416,0.040659,0.031371,0.04419,0.074205,0.043044,0.031162,0.043538,0.036397,0.036849,0.036913,0.075599,0.036957,0.045867


### Select top K

In [306]:
# Top-K Selection: Select top-K methods based on ranking
TOP_K = 3

# Rank on a plain ndarray. Calling np.argsort on the DataFrame itself can hand
# back a DataFrame (depending on how numpy/pandas wrap the result), and iterating
# a DataFrame yields column labels rather than rows.
proba_matrix = y_pred_proba.to_numpy()

# For every row: column positions ordered from highest to lowest probability
ranked_methods = np.argsort(-proba_matrix, axis=1)

# Binary selection matrix: 1 at the TOP_K best-ranked positions of each row, 0 elsewhere
top_k_methods = np.zeros(proba_matrix.shape, dtype=int)
np.put_along_axis(top_k_methods, ranked_methods[:, :TOP_K], 1, axis=1)

# Convert to DataFrame for comparison (same rows and columns as y_test)
top_k_methods_df = pd.DataFrame(top_k_methods, columns=y_test.columns, index=y_test.index)

# Display the top-K selected methods
top_k_methods_df.head()

Unnamed: 0,RF_1_0,RF_2_0,RF_2_1,RF_3_0,RF_3_1,RF_3_2,RF_4_0,RF_4_1,RF_4_2,RF_4_3,RF_6_0,RF_6_1,RF_6_2,RF_6_3,RF_6_4,RF_6_5,RF_T_0
452,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1
295,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0
369,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
83,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0
347,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [307]:
# Element-wise multiplication to find correct predictions
correct_predictions = top_k_methods_df * y_test

# Sum the total number of correct predictions
total_correct = correct_predictions.sum().sum()  # Sum across all rows and columns
print(f"Total Correct Predictions: {total_correct}")

# Optionally, calculate the number of correct predictions per instance
correct_per_instance = correct_predictions.sum(axis=1)
print("\nCorrect Predictions Per Instance:")
print(correct_per_instance)

# Optionally, calculate the accuracy as a percentage
total_possible = y_test.sum().sum()  # Total number of actual positive labels
accuracy = (total_correct / total_possible) * 100
print(f"\nAccuracy: {accuracy:.2f}%")

# Precision
if TOP_K == 1:
    precision = total_correct / len(y_test) * 100
    print(f"Precision: {precision:.2f}%")

Total Correct Predictions: 119

Correct Predictions Per Instance:
452    1
295    0
369    1
83     3
347    1
      ..
16     1
20     1
116    3
573    3
385    3
Length: 63, dtype: int64

Accuracy: 26.86%


In [308]:
# Step 1: Calculate the number of correct predictions per row (instance)
correct_per_instance = correct_predictions.sum(axis=1)

# Step 2: Identify rows with zero correct predictions
missed_all = (correct_per_instance == 0).sum()  # Count rows with no correct predictions
total_instances = len(correct_predictions)  # Total number of rows

# Step 3: Calculate the accuracy for missed rows
missed_accuracy = (missed_all / total_instances) * 100

# Output the results
print(f"Number of Rows with All Incorrect Predictions: {missed_all}")
print(f"Percentage of Rows with All Incorrect Predictions: {missed_accuracy:.2f}%")

Number of Rows with All Incorrect Predictions: 5
Percentage of Rows with All Incorrect Predictions: 7.94%


### Baseline: performance if only one method is always selected

In [309]:
# Step 1: Initialize a DataFrame to store results
method_evaluation_results = []

# Step 2: Loop through each method
for method in y_test.columns:
    # Create a binary matrix where the current method is always 1 and others are 0
    single_method_matrix = np.zeros_like(y_test, dtype=int)
    single_method_matrix[:, y_test.columns.get_loc(method)] = 1  # Set current method column to 1
    
    # Convert to DataFrame
    single_method_df = pd.DataFrame(single_method_matrix, columns=y_test.columns, index=y_test.index)

    # Calculate true positives (correct assignments)
    true_positives = ((single_method_df == 1) & (y_test == 1))[method].sum()

    # Calculate false positives (predicted as 1 but actual is 0)
    false_positives = ((single_method_df == 1) & (y_test == 0))[method].sum()

    # Calculate false negatives (predicted as 0 but actual is 1)
    false_negatives = ((single_method_df == 0) & (y_test == 1))[method].sum()

    # Calculate Precision
    if (true_positives + false_positives) > 0:
        precision = true_positives / (true_positives + false_positives)
    else:
        precision = 0.0  # Handle division by zero

    # Calculate Recall
    if (true_positives + false_negatives) > 0:
        recall = true_positives / (true_positives + false_negatives)
    else:
        recall = 0.0  # Handle division by zero

    # Append results to the list
    method_evaluation_results.append({
        'Method': method,
        'True Positives': true_positives,
        'False Positives': false_positives,
        'False Negatives': false_negatives,
        'Precision (%)': precision * 100,
        'Recall (%)': recall * 100
    })

In [310]:
# Tabulate the single-method baseline records (one row per method) and display them.
# NOTE(review): this reuses the name of the earlier per-method model table, so after
# this cell runs that table is no longer available under this name.
method_evaluation_results_df = pd.DataFrame(method_evaluation_results)
method_evaluation_results_df

Unnamed: 0,Method,True Positives,False Positives,False Negatives,Precision (%),Recall (%)
0,RF_1_0,16,47,0,25.396825,100.0
1,RF_2_0,24,39,0,38.095238,100.0
2,RF_2_1,24,39,0,38.095238,100.0
3,RF_3_0,25,38,0,39.68254,100.0
4,RF_3_1,26,37,0,41.269841,100.0
5,RF_3_2,23,40,0,36.507937,100.0
6,RF_4_0,25,38,0,39.68254,100.0
7,RF_4_1,28,35,0,44.444444,100.0
8,RF_4_2,26,37,0,41.269841,100.0
9,RF_4_3,25,38,0,39.68254,100.0
