In [20]:
import functions_ml as fml
import pandas as pd
import numpy as np
import pickle
from sklearn.multioutput import MultiOutputClassifier, ClassifierChain
from sklearn.ensemble import RandomForestClassifier
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
from sklearn.metrics import hamming_loss, f1_score, jaccard_score, accuracy_score, multilabel_confusion_matrix
from sklearn.model_selection import GridSearchCV

In [21]:
# PARAMETERS
# NOTE(review): "CLASSIFICATON" is missing an "I". The name is left unchanged because
# code outside the visible cells may reference it; no visible cell reads it.
BINARY_CLASSIFICATON = False
# Tolerance handed to fml.create_multi_label_target when the target columns are built.
# Presumably 0.01 % expressed as a fraction -- confirm against the helper.
TRAIN_TOLERANCE_LIMIT = 0.01 / 100

In [22]:
# 1. Get dataset
# Read the instance-features CSV and the results CSV, then hand both to the
# project helper that assembles the modelling dataset.
features = pd.read_csv('datasets/multi_plant_instance_features.csv')
results = pd.read_csv('datasets/instances_results.csv')
dataset = fml.create_dataset(features, results)

In [23]:
# 2. Create target columns for multi label classification
# NOTE(review): `dataset` is rebound to the helper's return value, so re-running this
# cell feeds the already-processed frame back into the helper. Confirm the helper is
# safe to apply twice, or re-run the loading cell first.
dataset = fml.create_multi_label_target(dataset, TRAIN_TOLERANCE_LIMIT)

In [24]:
# Target (label) columns are recognised by their 'RF' name prefix.
target_col = [name for name in dataset.columns if name.startswith('RF')]

# Print the column sum of every target column, one per line.
for label in target_col:
    label_total = dataset[label].sum()
    print(label, ':', label_total)

RF_1_0 : 51
RF_2_0 : 99
RF_2_1 : 93
RF_3_0 : 122
RF_3_1 : 125
RF_3_2 : 119
RF_4_0 : 123
RF_4_1 : 162
RF_4_2 : 168
RF_4_3 : 166
RF_6_0 : 182
RF_6_1 : 180
RF_6_2 : 219
RF_6_3 : 235
RF_6_4 : 235
RF_6_5 : 215
RF_T_0 : 267


In [25]:
# Extract features (X) and labels (y)
# Labels: all target columns found above.
y = dataset[target_col]

# Features: everything except the targets and the 'instance' column, then passed
# through the project's binary_feature_selection helper.
non_feature_cols = [*target_col, 'instance']
X = fml.binary_feature_selection(dataset.drop(columns=non_feature_cols))

In [26]:
# Multilabel-stratified hold-out: one split, 10 % held out, fixed seed.
msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)

# n_splits=1, so the splitter yields exactly one (train, validation) index pair.
train_index, validation_index = next(msss.split(X, y))

X_train, y_train = X.iloc[train_index], y.iloc[train_index]
X_test, y_test = X.iloc[validation_index], y.iloc[validation_index]

In [27]:
# Train ClassifierChain with RandomForest
# Base estimator that the chain clones for each label.
base_forest = RandomForestClassifier(
    n_estimators=100,
    max_depth=20,
    max_features='log2',
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=2112,
)

# Chain the labels in reverse column order (last target column first).
chain_order = list(reversed(range(len(y.columns))))

model = ClassifierChain(base_forest, order=chain_order)
model.fit(X_train, y_train)

In [28]:
# # Train classifier on full data and save
# oracle = ClassifierChain(RandomForestClassifier(random_state=2112, n_estimators=100, max_depth=20, max_features='log2', min_samples_leaf=1, min_samples_split=2), order=[i for i in range(len(y.columns))][::-1])
# oracle.fit(X, y)
# with open('trained_models/oracle.pkl','wb') as f:
#     pickle.dump(oracle,f)

# Predictions on validation set

In [29]:
# Predict on the validation set
# Both outputs are wrapped as DataFrames carrying y_test's column labels and row index.
frame_labels = dict(columns=y_test.columns, index=y_test.index)

y_pred = pd.DataFrame(model.predict(X_test), **frame_labels)
y_pred_proba = pd.DataFrame(model.predict_proba(X_test), **frame_labels)

In [30]:
# Compute the four multilabel metrics on the validation split ...
hamming = hamming_loss(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='micro')
jaccard = jaccard_score(y_test, y_pred, average='samples')
subset_accuracy = accuracy_score(y_test, y_pred)

# ... then report them in a fixed order, one per line.
metric_report = (
    ("Hamming Loss:", hamming),
    ("Micro-Averaged F1 Score:", f1),
    ("Jaccard Similarity Score:", jaccard),
    ("Subset Accuracy:", subset_accuracy),
)
for metric_label, metric_value in metric_report:
    print(metric_label, metric_value)

Hamming Loss: 0.09548167092924126
Micro-Averaged F1 Score: 0.7886792452830189
Jaccard Similarity Score: 0.4336441939702809
Subset Accuracy: 0.21739130434782608


In [31]:
# Per-method (per-label) evaluation of the model's binary predictions.
# For every label column, count true positives, false positives and false negatives
# on the validation split, then derive precision, recall and F1.
# Precision and recall are stored as percentages; F1 is stored as a fraction.
method_evaluation_results = []
for method in y_test.columns:
    predicted = y_pred[method]
    actual = y_test[method]

    # Calculate true positives (correct assignments)
    true_positives = ((predicted == 1) & (actual == 1)).sum()

    # Calculate false positives (predicted as 1 but actual is 0)
    false_positives = ((predicted == 1) & (actual == 0)).sum()

    # Calculate false negatives (predicted as 0 but actual is 1)
    false_negatives = ((predicted == 0) & (actual == 1)).sum()

    # Precision: 0.0 when the method was never predicted (avoids division by zero)
    predicted_positives = true_positives + false_positives
    precision = true_positives / predicted_positives if predicted_positives > 0 else 0.0

    # Recall: 0.0 when the method never occurs in the validation labels
    actual_positives = true_positives + false_negatives
    recall = true_positives / actual_positives if actual_positives > 0 else 0.0

    # F1 is undefined when precision + recall == 0 (no true positives at all).
    # Report 0.0 in that case instead of raising ZeroDivisionError or producing nan.
    if (precision + recall) > 0:
        method_f1 = (2 * precision * recall) / (precision + recall)
    else:
        method_f1 = 0.0

    # Append results to the list
    method_evaluation_results.append({
        'Method': method,
        'True Positives': true_positives,
        'False Positives': false_positives,
        'False Negatives': false_negatives,
        'Precision (%)': precision * 100,
        'Recall (%)': recall * 100,
        'F1-Score': method_f1
    })

**TODO:** These results could be improved if we manage to rebalance the dataset.

In [32]:
# Collect the per-method records into a table; the bare last expression renders it.
method_evaluation_results_df = pd.DataFrame(method_evaluation_results)
method_evaluation_results_df

Unnamed: 0,Method,True Positives,False Positives,False Negatives,Precision (%),Recall (%),F1-Score
0,RF_1_0,5,3,0,62.5,100.0,0.769231
1,RF_2_0,3,2,7,60.0,30.0,0.4
2,RF_2_1,4,1,5,80.0,44.444444,0.571429
3,RF_3_0,6,2,6,75.0,50.0,0.6
4,RF_3_1,5,1,8,83.333333,38.461538,0.526316
5,RF_3_2,10,3,2,76.923077,83.333333,0.8
6,RF_4_0,6,2,6,75.0,50.0,0.6
7,RF_4_1,9,2,9,81.818182,50.0,0.62069
8,RF_4_2,12,1,5,92.307692,70.588235,0.8
9,RF_4_3,13,4,4,76.470588,76.470588,0.764706


## Model probabilities

In [33]:
# Preview the first rows of the predicted per-label probabilities.
y_pred_proba.head()

Unnamed: 0,RF_1_0,RF_2_0,RF_2_1,RF_3_0,RF_3_1,RF_3_2,RF_4_0,RF_4_1,RF_4_2,RF_4_3,RF_6_0,RF_6_1,RF_6_2,RF_6_3,RF_6_4,RF_6_5,RF_T_0
114,0.0,0.06,0.09,0.07,0.06,0.51,0.38,0.18,0.95,0.88,0.92,0.82,1.0,0.98,0.99,0.98,0.99
460,0.61,0.09,0.02,0.08,0.01,0.0,0.02,0.0,0.02,0.01,0.02,0.01,0.0,0.01,0.1,0.0,0.02
199,0.01,0.02,0.07,0.01,0.34,0.08,0.02,0.1,0.02,0.0,0.0,0.0,0.03,0.03,0.04,0.03,0.03
112,0.0,0.14,0.05,0.64,0.03,0.63,0.24,0.39,0.98,0.89,1.0,0.99,0.97,1.0,0.96,0.98,0.99
349,0.0,0.03,0.0,0.0,0.35,0.01,0.0,0.03,0.22,0.0,0.02,0.0,0.02,0.0,0.0,0.01,0.03


### Select top K

In [34]:
# Top-K Selection: for each instance, flag the K methods with the highest
# predicted probability.
TOP_K = 3

# Work on a plain ndarray. Applied directly to a DataFrame, np.argsort / np.zeros_like
# may return a DataFrame on some numpy/pandas versions, and iterating a DataFrame
# walks its column labels rather than its rows.
proba_values = y_pred_proba.to_numpy()

# Column positions per row, most probable first. Equal probabilities within a row do
# occur, so a stable sort is used: ties are always resolved in favour of the
# left-most column, making the selection reproducible.
ranked_methods = np.argsort(-proba_values, axis=1, kind='stable')

# Set top-K methods as selected (binary): write a 1 at each row's first K ranked positions.
top_k_methods = np.zeros(proba_values.shape, dtype=int)
np.put_along_axis(top_k_methods, ranked_methods[:, :TOP_K], 1, axis=1)

# Convert to DataFrame for comparison
top_k_methods_df = pd.DataFrame(top_k_methods, columns=y_test.columns, index=y_test.index)

# Display the top-K selected methods
top_k_methods_df.head()

Unnamed: 0,RF_1_0,RF_2_0,RF_2_1,RF_3_0,RF_3_1,RF_3_2,RF_4_0,RF_4_1,RF_4_2,RF_4_3,RF_6_0,RF_6_1,RF_6_2,RF_6_3,RF_6_4,RF_6_5,RF_T_0
114,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1
460,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
199,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0
112,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0
349,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0


In [None]:
# Element-wise multiplication to find correct predictions:
# a cell is 1 only where a selected (top-K) method is also a true label.
correct_predictions = top_k_methods_df * y_test

# Sum the total number of correct predictions
total_correct = correct_predictions.sum().sum()  # Sum across all rows and columns
print(f"Total Correct Predictions: {total_correct}")

# Number of correct predictions per instance
correct_per_instance = correct_predictions.sum(axis=1)
print("\nCorrect Predictions Per Instance:")
print(correct_per_instance)

# "Accuracy" as reported here is the share of all true labels recovered by the
# top-K selection (i.e. a micro-averaged recall), as a percentage.
total_possible = y_test.sum().sum()  # Total number of actual positive labels
# Guard against a validation split without any positive label.
accuracy = (total_correct / total_possible) * 100 if total_possible > 0 else 0.0
print(f"\nAccuracy: {accuracy:.2f}%")

# Precision: share of selected methods that are true labels, as a percentage.
# Counting the selections directly makes this valid for any TOP_K; with TOP_K == 1
# the denominator equals len(y_test), the formula previously used for that case only.
total_selected = top_k_methods_df.sum().sum()
precision = (total_correct / total_selected) * 100 if total_selected > 0 else 0.0
print(f"Precision: {precision:.2f}")

Total Correct Predictions: 111

Correct Predictions Per Instance:
114    3
460    1
199    1
112    3
349    1
      ..
16     1
20     1
376    3
116    3
523    1
Length: 69, dtype: int64

Accuracy: 39.36%


In [36]:
# Step 1: Calculate the number of correct predictions per row (instance)
correct_per_instance = correct_predictions.sum(axis=1)

# Step 2: Identify rows with zero correct predictions
missed_all = (correct_per_instance == 0).sum()  # Count rows with no correct predictions
total_instances = len(correct_predictions)  # Total number of rows

# Step 3: Calculate the accuracy for missed rows
missed_accuracy = (missed_all / total_instances) * 100

# Output the results
print(f"Number of Rows with All Incorrect Predictions: {missed_all}")
print(f"Percentage of Rows with All Incorrect Predictions: {missed_accuracy:.2f}%")

Number of Rows with All Incorrect Predictions: 9
Percentage of Rows with All Incorrect Predictions: 13.04%


### Baseline: performance if only one method were always selected

In [37]:
# Step 1: Initialize a DataFrame to store results
method_evaluation_results = []

# Step 2: Loop through each method
for method in y_test.columns:
    # Create a binary matrix where the current method is always 1 and others are 0
    single_method_matrix = np.zeros_like(y_test, dtype=int)
    single_method_matrix[:, y_test.columns.get_loc(method)] = 1  # Set current method column to 1
    
    # Convert to DataFrame
    single_method_df = pd.DataFrame(single_method_matrix, columns=y_test.columns, index=y_test.index)

    # Calculate true positives (correct assignments)
    true_positives = ((single_method_df == 1) & (y_test == 1))[method].sum()

    # Calculate false positives (predicted as 1 but actual is 0)
    false_positives = ((single_method_df == 1) & (y_test == 0))[method].sum()

    # Calculate false negatives (predicted as 0 but actual is 1)
    false_negatives = ((single_method_df == 0) & (y_test == 1))[method].sum()

    # Calculate Precision
    if (true_positives + false_positives) > 0:
        precision = true_positives / (true_positives + false_positives)
    else:
        precision = 0.0  # Handle division by zero

    # Calculate Recall
    if (true_positives + false_negatives) > 0:
        recall = true_positives / (true_positives + false_negatives)
    else:
        recall = 0.0  # Handle division by zero

    # Append results to the list
    method_evaluation_results.append({
        'Method': method,
        'True Positives': true_positives,
        'False Positives': false_positives,
        'False Negatives': false_negatives,
        'Precision (%)': precision * 100,
        'Recall (%)': recall * 100
    })

In [38]:
# Tabulate the single-method baseline records; the bare last expression renders the table.
# NOTE(review): this rebinds `method_evaluation_results_df`, replacing the table that an
# earlier cell built from the model's own predictions.
method_evaluation_results_df = pd.DataFrame(method_evaluation_results)
method_evaluation_results_df

Unnamed: 0,Method,True Positives,False Positives,False Negatives,Precision (%),Recall (%)
0,RF_1_0,5,64,0,7.246377,100.0
1,RF_2_0,10,59,0,14.492754,100.0
2,RF_2_1,9,60,0,13.043478,100.0
3,RF_3_0,12,57,0,17.391304,100.0
4,RF_3_1,13,56,0,18.84058,100.0
5,RF_3_2,12,57,0,17.391304,100.0
6,RF_4_0,12,57,0,17.391304,100.0
7,RF_4_1,18,51,0,26.086957,100.0
8,RF_4_2,17,52,0,24.637681,100.0
9,RF_4_3,17,52,0,24.637681,100.0
