# Parameters and Model Evaluation

## Principal Component Analysis

In [2]:
import time
import joblib
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold, cross_val_score, GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, f1_score, recall_score, precision_score
from scipy.stats import randint as sp_randint

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler


### Gray Level Co-occurance Matrix (GLCM)

In [2]:
data = pd.read_csv('./features/glcm_features_split.csv')

In [3]:
labels = data['Label']
features = data.drop('Label', axis=1)

# Standardize the features (optional but recommended)
scaler = StandardScaler()
features_standardized = scaler.fit_transform(features)

# Apply PCA
n_components = 10
pca = PCA(n_components=n_components)
pca.fit(features_standardized)

# Transform the features
features_pca = pca.transform(features_standardized)

In [4]:
# Extract features and labels
y = data['Label']

# Encode labels (corrosion, no corrosion) to numerical values
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_pca, y, test_size=0.2, random_state=42)

In [5]:
params_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.05],
    'hidden_layer_sizes': [(100,50,10), (200,100,50), (500,250,50), (100,), (200,), (300,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'learning_rate': ['constant','adaptive'],
}

# Create a KFold object with 5 splits.
folds = KFold(n_splits=5, shuffle=True, random_state=0)

# Set the parameters by cross-validation.
# Set up GridSearchCV() funcion.
mlp_model = GridSearchCV(MLPClassifier(), params_grid, scoring='accuracy', cv=folds)
mlp_model.fit(X_train, y_train)



In [6]:
# Print the accuracy score.
print('Best score for the training data:', mlp_model.best_score_, '\nusing', mlp_model.best_params_)

Best score for the training data: 0.7765575887527107 
using {'activation': 'relu', 'alpha': 0.05, 'hidden_layer_sizes': (100,), 'learning_rate': 'adaptive', 'solver': 'adam'}


In [7]:
import joblib

# Save the trained MLP classifier to a file
model_filename = './model/finetune_v10__mlp_classifier_model.joblib'
joblib.dump(mlp_model, model_filename)

['./model/finetune_v10__mlp_classifier_model.joblib']

### GLRLM

In [8]:
data = pd.read_csv('./features/glrlm_features_split.csv')

In [9]:
labels = data['Label']
features = data.drop('Label', axis=1)

# Standardize the features (optional but recommended)
scaler = StandardScaler()
features_standardized = scaler.fit_transform(features)


# Apply PCA
n_components = 10
pca = PCA(n_components=n_components)
pca.fit(features_standardized)

# Transform the features
features_pca = pca.transform(features_standardized)

In [10]:
# Extract features and labels
y = data['Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_pca, y, test_size=0.2, random_state=42)

In [11]:
params_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.05],
    'hidden_layer_sizes': [(100,50,10), (200,100,50), (500,250,50), (100,), (200,), (300,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'learning_rate': ['constant','adaptive'],
}

# Create a KFold object with 5 splits.
folds = KFold(n_splits=5, shuffle=True, random_state=0)

# Set the parameters by cross-validation.
# Set up GridSearchCV() funcion.
mlp_model = GridSearchCV(MLPClassifier(), params_grid, scoring='accuracy', cv=folds)
mlp_model.fit(X_train, y_train)



In [12]:
# Print the accuracy score.
print('Best score for the training data:', mlp_model.best_score_, '\nusing', mlp_model.best_params_)

Best score for the training data: 0.736753490412027 
using {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (300,), 'learning_rate': 'adaptive', 'solver': 'adam'}


In [13]:
# Save the trained MLP classifier to a file
model_filename = './model/finetune_v10__mlp_glrlm_model.joblib'
joblib.dump(mlp_model, model_filename)

['./model/finetune_v10__mlp_glrlm_model.joblib']

### Local Binary Pattern

In [37]:
def read_images_from_folder(folder_path):
    images = []
    labels = []

    for subfolder in os.listdir(folder_path):
        subfolder_path = os.path.join(folder_path, subfolder)
        if os.path.isdir(subfolder_path):
            label = subfolder  # Use the subfolder name as the label
            for fn in os.listdir(subfolder_path):
                if fn.endswith('.jpg'):
                    img_path = os.path.join(subfolder_path, fn)
                    im = Image.open(img_path).convert('L')
                    data = np.array(im)
                    images.append(data)
                    labels.append(label)

    return images, labels

# Load images and labels from the 'resize_data' folder structure
data_folder = './resize_data'
images, labels = read_images_from_folder(data_folder)
print('Load data success!')

X = np.array(images)
print(X.shape)

# Encode labels (CORROSION, NOCORROSION) to numerical values
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

radius = 2
n_point = radius * 8

Load data success!
(1819, 256, 256)
(1455, 256, 256)
(364, 256, 256)
(1455,)
(364,)


In [38]:
def lbp_texture(train_data, test_data):
    max_bins_train = 0
    max_bins_test = 0

    for i in range(len(train_data)):
        lbp = feature.local_binary_pattern(train_data[i], n_point, radius, 'default')
        max_bins_train = max(max_bins_train, int(lbp.max()) + 1)

    for i in range(len(test_data)):
        lbp = feature.local_binary_pattern(test_data[i], n_point, radius, 'default')
        max_bins_test = max(max_bins_test, int(lbp.max()) + 1)

    train_hist = np.zeros((len(train_data), max_bins_train))
    test_hist = np.zeros((len(test_data), max_bins_test))

    for i in range(len(train_data)):
        lbp = feature.local_binary_pattern(train_data[i], n_point, radius, 'default')
        train_hist[i], _ = np.histogram(lbp, bins=max_bins_train, density=True)

    for i in range(len(test_data)):
        lbp = feature.local_binary_pattern(test_data[i], n_point, radius, 'default')
        test_hist[i], _ = np.histogram(lbp, bins=max_bins_test, density=True)

    return train_hist, test_hist


In [39]:
X_train, X_test = lbp_texture(X_train, X_test)

params_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.05],
    'hidden_layer_sizes': [(100,50,10), (200,100,50), (500,250,50), (100,), (200,), (300,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'learning_rate': ['constant','adaptive'],
}

# Create a KFold object with 5 splits.
folds = KFold(n_splits=5, shuffle=True, random_state=0)

# Set the parameters by cross-validation.
# Set up GridSearchCV() funcion.
mlp_model = GridSearchCV(MLPClassifier(), params_grid, scoring='accuracy', cv=folds)
mlp_model.fit(X_train, y_train)

# Predict on the test set
y_pred = mlp_model.predict(X_test)

# Evaluate the mlp_model classifier
train_accuracy = mlp_model.score(X_train, y_train)
test_accuracy = mlp_model.score(X_test, y_test)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')
accuracy = accuracy_score(y_test, y_pred)

print(f"Training Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"Overall Accuracy: {accuracy}")



In [None]:
import joblib

# Save the trained MLP model to a file
model_filename = './model/mlp_lbp_model.pkl'
joblib.dump(mlp, model_filename)

print(f"MLP model saved as {model_filename}")

MLP model saved as ./model/mlp_lbp_model.pkl
