<a href="https://colab.research.google.com/github/lucevito/image/blob/main/Split_100_random_forest_GridSearch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# IPython magic (not plain Python): change the working directory so that the
# relative paths used later in the notebook resolve inside drive/MyDrive.
%cd drive/MyDrive

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive


In [2]:
import glob
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
import joblib
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score, roc_curve, auc
from imblearn.metrics import geometric_mean_score
import os
import pandas as pd
from openpyxl import Workbook, load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows


def loaddataset(directory):
  """Load all image/mask ``.npy`` files under *directory* as per-pixel arrays.

  Files are read from ``<directory>/images/*.npy`` and
  ``<directory>/masks/*.npy``. Both file lists are sorted before loading:
  ``glob.glob`` returns paths in arbitrary order, so without sorting the
  i-th image is not guaranteed to be paired with the i-th mask.
  NOTE(review): this assumes an image and its mask share the same file
  name in the two folders -- confirm against the dataset layout.

  Args:
    directory: Dataset root containing the ``images`` and ``masks`` folders.

  Returns:
    A tuple ``(x, y)`` where ``x`` has shape ``(n_pixels, 10)`` (one row of
    10 feature values per pixel) and ``y`` is the flattened 1-D array of the
    corresponding mask values.

  Raises:
    FileNotFoundError: If either folder contains no ``.npy`` file.
  """
  images_files = sorted(glob.glob(directory + '/images' + '/*.npy'))
  masks_files = sorted(glob.glob(directory + '/masks' + '/*.npy'))
  if not images_files or not masks_files:
    raise FileNotFoundError(
        "No .npy files found under '%s/images' and/or '%s/masks'"
        % (directory, directory))

  x = np.array([np.load(file) for file in images_files])
  y = np.array([np.load(file) for file in masks_files])

  # Collapse (file, row, column) into a single pixel axis; every pixel keeps
  # its 10 feature values.
  x = x.reshape(x.shape[0] * x.shape[1] * x.shape[2], 10)
  y = y.reshape(y.shape[0] * y.shape[1] * y.shape[2], 1)
  y = np.ravel(y)
  return x, y

def rflearn(X,Y,filename):
  """Train a default random forest on (X, Y) and persist it with joblib.

  Args:
    X: Training feature matrix.
    Y: Training labels.
    filename: Destination path of the serialized model.
  """
  # ``fit`` returns the estimator itself, so it can be dumped directly.
  forest = RandomForestClassifier(random_state=42).fit(X, Y)
  joblib.dump(forest, filename)

def rftest(test,filename):
  """Load the model stored at *filename* and return its predictions for *test*.

  Args:
    test: Feature matrix to classify.
    filename: Path of a model previously written with ``joblib.dump``.

  Returns:
    The array returned by the loaded model's ``predict``.
  """
  return joblib.load(filename).predict(test)

def grindsearch(param_grid,X,Y,filename):
  """Tune a random forest with a 5-fold grid search and save the best model.

  Every combination in *param_grid* is cross-validated (``cv=5``) and scored
  with ROC AUC. The estimator refitted on the whole of (X, Y) with the
  winning parameters is then written to *filename* with joblib.

  Args:
    param_grid: Mapping of ``RandomForestClassifier`` parameter names to the
      lists of values to try.
    X: Training feature matrix.
    Y: Training labels.
    filename: Destination path of the serialized best model.
  """
  grid_search = GridSearchCV(
      estimator=RandomForestClassifier(random_state=42),
      param_grid=param_grid,
      cv=5,
      scoring='roc_auc',
  )
  grid_search.fit(X, Y)

  # With the default ``refit=True`` GridSearchCV already retrains a clone of
  # the base estimator (random_state=42) with the best parameters on all of
  # (X, Y). Persist that fitted model -- dumping the untouched base
  # estimator would store an unfitted forest that cannot predict.
  joblib.dump(grid_search.best_estimator_, filename)

def selectSet(X, Y, target_class):
  """Keep only the samples whose label equals *target_class*.

  Args:
    X: Feature array indexed along its first axis by sample.
    Y: Label array aligned with X.
    target_class: Label value to select.

  Returns:
    A tuple ``(X_selected, Y_selected)`` restricted to the matching samples.
  """
  keep = Y == target_class  # boolean mask, one entry per sample
  return X[keep], Y[keep]

def sampling(X, Y, n):
    """Draw *n* distinct samples at random from the aligned arrays X and Y.

    The indices come from ``np.random.choice`` without replacement, so the
    same positions are taken from both arrays and no sample repeats.

    Args:
        X: Feature array indexed along its first axis by sample.
        Y: Label array aligned with X.
        n: Number of samples to draw; must not exceed ``len(X)``.

    Returns:
        A tuple ``(sampleX, sampleY)`` with *n* entries each.
    """
    picked = np.random.choice(len(X), n, replace=False)
    return X[picked], Y[picked]

def concatenate(X1, X2, Y1, Y2):
    """Stack two feature sets and their label sets along the first axis.

    Args:
        X1: First feature array.
        X2: Second feature array, appended after X1.
        Y1: Labels for X1.
        Y2: Labels for X2, appended after Y1.

    Returns:
        A tuple ``(X, Y)`` holding the joined features and joined labels.
    """
    # axis=0 is np.concatenate's default.
    return np.concatenate([X1, X2]), np.concatenate([Y1, Y2])

def print_metrics(y_true, y_pred):
    """Print the binary-classification metrics of *y_pred* against *y_true*.

    Class 0 is treated as the negative class and class 1 as the positive
    class. The report contains the confusion-matrix counts, per-class
    precision / recall / F-score, average and overall accuracy, G-mean and
    ROC AUC (computed from the hard predictions).

    Args:
        y_true: Ground-truth labels (0 or 1).
        y_pred: Predicted labels (0 or 1).
    """
    # Pin the label order so the matrix is always 2x2 and the 4-way unpack
    # below works even if one class is missing from the data.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    precision_negative = precision_score(y_true, y_pred, pos_label=0)
    recall_negative = recall_score(y_true, y_pred, pos_label=0)
    fscore_negative = f1_score(y_true, y_pred, pos_label=0)
    precision_positive = precision_score(y_true, y_pred, pos_label=1)
    recall_positive = recall_score(y_true, y_pred, pos_label=1)
    fscore_positive = f1_score(y_true, y_pred, pos_label=1)

    # Average accuracy = mean of the per-class accuracies (recalls). The
    # previous formula averaged the accuracy ratio with the raw count of
    # correct samples, which is not a meaningful quantity.
    average_accuracy = (recall_negative + recall_positive) / 2
    overall_accuracy = accuracy_score(y_true, y_pred)
    gmean = geometric_mean_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_pred)

    print("True Negative (TN):", tn)
    print("False Positive (FP):", fp)
    print("False Negative (FN):", fn)
    print("True Positive (TP):", tp)
    print("Precision (Negative Class):", precision_negative)
    print("Recall (Negative Class):", recall_negative)
    print("F-score (Negative Class):", fscore_negative)
    print("Precision (Positive Class):", precision_positive)
    print("Recall (Positive Class):", recall_positive)
    print("F-score (Positive Class):", fscore_positive)
    print("Average Accuracy:", average_accuracy)
    print("Overall Accuracy:", overall_accuracy)
    print("G-Mean:", gmean)
    print("AUC (Area Under the Curve):", roc_auc)
    print("\n")

def save_csv(model_name, dataset_name, param, y_true, y_pred):
  """Append one row of evaluation metrics to ``risultati_modelli.xlsx``.

  Despite its name, this function writes an Excel workbook, not a CSV file.
  If the workbook already exists its rows are read back and the new row is
  added after them; otherwise a new workbook is created. Class 0 is treated
  as the negative class and class 1 as the positive class.

  Args:
    model_name: Value stored in the 'Modello' column.
    dataset_name: Value stored in the 'Dataset' column.
    param: Value stored in the 'Parametri della configurazione' column.
    y_true: Ground-truth labels (0 or 1).
    y_pred: Predicted labels (0 or 1).
  """
  file_name = 'risultati_modelli.xlsx'

  # Pin the label order so the matrix is always 2x2 and the 4-way unpack
  # works even if one class is missing from the data.
  cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
  tn, fp, fn, tp = cm.ravel()

  precision_negative = precision_score(y_true, y_pred, pos_label=0)
  recall_negative = recall_score(y_true, y_pred, pos_label=0)
  fscore_negative = f1_score(y_true, y_pred, pos_label=0)
  precision_positive = precision_score(y_true, y_pred, pos_label=1)
  recall_positive = recall_score(y_true, y_pred, pos_label=1)
  fscore_positive = f1_score(y_true, y_pred, pos_label=1)

  # Average accuracy = mean of the per-class accuracies (recalls). The
  # previous formula averaged the accuracy ratio with the raw count of
  # correct samples, which is not a meaningful quantity.
  average_accuracy = (recall_negative + recall_positive) / 2
  overall_accuracy = accuracy_score(y_true, y_pred)
  gmean = geometric_mean_score(y_true, y_pred)
  roc_auc = roc_auc_score(y_true, y_pred)

  results = [
      {
          'Modello': model_name,
          'Dataset': dataset_name,
          'Parametri della configurazione': param,
          'True Negative': tn,
          'False Negative': fn,
          'False Positive': fp,
          'True Positive': tp,
          'Precision Negative': precision_negative,
          'Recall Negative': recall_negative,
          'Fscore Negative': fscore_negative,
          'Precision Positive': precision_positive,
          'Recall Positive': recall_positive,
          'Fscore Positive': fscore_positive,
          'Average Accuracy': average_accuracy,
          'Overall Accuracy': overall_accuracy,
          'GMean': gmean,
          'AUC': roc_auc,
      },
  ]

  if os.path.exists(file_name):
      # Keep the rows already stored and add the new one after them.
      existing_df = pd.read_excel(file_name)
      df = pd.concat([existing_df, pd.DataFrame(results)])
  else:
      df = pd.DataFrame(results)

  # Rewrite the whole workbook: header row first, then every data row.
  wb = Workbook()
  ws = wb.active
  for r in dataframe_to_rows(df, index=False, header=True):
      ws.append(r)
  wb.save(file_name)

In [None]:
# Experiment: grid-searched random forest trained on a balanced sample of
# 100 pixels per class, evaluated on the full test and train sets.
train_path = 'Immagini_satellitari/Train'
test_path = 'Immagini_satellitari/Test/'
model_name = "rf_split100_GridSearch_model.h"
param = 'rf split100 GridSearch'
param_grid = {
    "class_weight": [{0: 1, 1: 1}, {0: 1, 1: 2}, {0: 1, 1: 3}, {0: 1, 1: 4}, {0: 1, 1: 5}, "balanced"],
    "max_depth": [7, 8, 9],
    "max_samples": [0.8, 0.9, 1.0],
    # "gini" used to be listed twice, which cross-validated every gini
    # configuration a second time without adding any new candidate.
    "criterion": ["entropy", "gini"],
    "max_features": ["sqrt", "log2"]
}

trainX,trainY = loaddataset(train_path)
testX,testY = loaddataset(test_path)

# Build the balanced training subset: 100 random pixels of class 1 plus
# 100 random pixels of class 0.
X1,Y1 = selectSet(trainX, trainY, 1)
X0,Y0 = selectSet(trainX, trainY, 0)
X1,Y1 = sampling(X1, Y1, 100)
X0,Y0 = sampling(X0, Y0, 100)
X,Y = concatenate(X1, X0, Y1, Y0)
grindsearch(param_grid,X,Y,model_name)

# Evaluate the saved model on the full test set.
test_predictions = rftest(testX,model_name)
print("TEST : ")
print_metrics(testY,test_predictions)
save_csv(model_name, 'Test Set', param, testY, test_predictions)

# Evaluate the saved model on the full (unsampled) training set.
train_predictions = rftest(trainX,model_name)
print("TRAIN : ")
print_metrics(trainY,train_predictions)
save_csv(model_name, 'Train Set', param, trainY, train_predictions)