# Parameters and Model Evaluation

## Principal Component Analysis

In [1]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
from sklearn.metrics import make_scorer,accuracy_score
from scipy.stats import randint as sp_randint
import time

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, GridSearchCV


In [2]:
def rf_parameter_estimation(xEst, yEst):
    """Grid-search ``MLPClassifier`` hyper-parameters on the supplied data.

    Despite the ``rf_`` prefix in its name, this function tunes an
    ``MLPClassifier``. Every combination in the grid below is scored by
    accuracy using a shuffled 5-fold split with a fixed seed. The
    best-ranked candidates are printed through ``report`` and the winning
    parameter set is returned.

    Parameters
    ----------
    xEst : array-like
        Feature matrix handed to ``GridSearchCV.fit``.
    yEst : array-like
        Target labels matching the rows of ``xEst``.

    Returns
    -------
    dict
        ``best_params_`` of the fitted search, suitable for
        ``MLPClassifier(**params)``.
    """
    # Candidate values explored exhaustively by the grid search.
    hyperparameters = {
                'alpha': [0.0001, 0.001, 0.01, 0.05],
                'hidden_layer_sizes': [(100,50,10), (200,100,50), (500,250,50), (100,), (200,), (300,)],
                'activation': ['tanh', 'relu'],
                'solver': ['sgd', 'adam'],
                'learning_rate': ['constant','adaptive'],
            }

    # Shuffled 5-fold split; the fixed seed keeps the folds identical across calls.
    folds = KFold(n_splits=5, shuffle=True, random_state=0)

    mlp_model = GridSearchCV(MLPClassifier(), hyperparameters, scoring='accuracy', cv=folds)

    # Fit on the caller's data only: the search must not depend on notebook
    # globals such as X_train / y_train, and must not run twice.
    mlp_model.fit(xEst, yEst)

    report(mlp_model.cv_results_)
    return mlp_model.best_params_

def report(results, n_top=3):
    """Print a short summary of the best-ranked search candidates.

    Parameters
    ----------
    results : mapping
        Search results indexed by 'rank_test_score', 'mean_test_score',
        'std_test_score' and 'params' (the keys read below).
    n_top : int, default 3
        Ranks 1..n_top are reported; candidates tied on a rank are all printed.
    """
    for rank in range(1, n_top + 1):
        # Indices of every candidate holding this rank (ties included).
        for idx in np.flatnonzero(results['rank_test_score'] == rank):
            print("Model with rank: {0}".format(rank))
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(mean_score, std_score))
            print("Parameters: {0}".format(results['params'][idx]))

In [3]:
def pca(X_train, X_test,y_train, n_comp):
    """Project ``X_test`` onto principal components learned from ``X_train``.

    A new ``PCA`` with ``n_comp`` components is fitted on ``X_train`` on
    every call and then applied to ``X_test``. ``y_train`` is forwarded to
    ``PCA.fit`` only; PCA is unsupervised, so the labels are not expected to
    influence the projection.

    Returns the transformed ``X_test``.
    """
    reducer = PCA(n_components=n_comp)
    reducer.fit(X_train, y_train)
    return reducer.transform(X_test)

In [4]:
# Numbers of principal components to try; each value is passed to pca() as
# n_comp by the sweep cell below.
components = [4,8,12,16]

In [None]:
# Test accuracy per setting: one slot for each entry of `components`, plus a
# final slot for the baseline trained on the features without this extra
# PCA projection.
# NOTE(review): X_train / X_test / y_train / y_test are not created above this
# cell; they come from the train/test split cells further down, so this cell
# only runs after one of those has been executed.
results_rf = np.zeros(len(components)+1)

start = time.time()
for id_comp,comp in enumerate(components):

    print('-------------','n comp. = ',comp,'-------------')

    # Fit PCA on the training split and project both splits with it.
    X_train_pca = pca(X_train,X_train,y_train,comp)
    X_test_pca = pca(X_train,X_test,y_train,comp)

    # Tune the MLP on the projected training data, then refit with the best
    # parameters and score on the projected test data.
    parameters = rf_parameter_estimation(X_train_pca, y_train)
    c_rf = MLPClassifier(**parameters)
    c_rf.fit(X_train_pca,y_train)
    pred = c_rf.predict(X_test_pca)
    acc = accuracy_score(y_test, pred)
    results_rf[id_comp] = acc

# Baseline for the last slot: same procedure on the un-projected features.
# Using the loop's leftover X_train_pca here would only repeat the final PCA
# setting (and fail outright when `components` is empty).
parameters = rf_parameter_estimation(X_train, y_train)
c_rf = MLPClassifier(**parameters)
c_rf.fit(X_train,y_train)
pred = c_rf.predict(X_test)
acc = accuracy_score(y_test, pred)
results_rf[len(components)] = acc

end = time.time()
# Wall-clock duration of the whole sweep, in seconds.
print(end - start)

------------- n comp. =  4 -------------


TypeError: MLPClassifier.__init__() got an unexpected keyword argument 'n_estimators'

### Gray Level Co-occurrence Matrix (GLCM)

In [16]:
# Load the GLCM feature table; later cells read its 'Label' column as the
# target and treat every other column as a feature.
data = pd.read_csv('./features/glcm_features_split.csv')

In [20]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Split the loaded `data` frame into the target column and the feature columns.
features = data.drop(columns='Label')
labels = data['Label']

# Standardize the features so that each one contributes on a comparable
# scale to the principal components.
# NOTE(review): the scaler and the PCA below are fitted on all rows, i.e.
# before the train/test split performed in the next cell.
scaler = StandardScaler().fit(features)
features_standardized = scaler.transform(features)

# Number of principal components to keep.
n_components = 7

# NOTE(review): this rebinds the name `pca`, which an earlier cell defines as
# a helper function; after this cell runs, `pca(...)` calls no longer work.
pca = PCA(n_components=n_components).fit(features_standardized)

# Principal-component scores used as model inputs in the following cells.
features_pca = pca.transform(features_standardized)

In [21]:
# Turn the label column into integer class ids
# (original note: the classes are corrosion / no corrosion).
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['Label'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features_pca,
    y,
    test_size=0.2,
    random_state=42,
)

In [26]:
# Shuffled 5-fold cross-validation with a fixed seed.
folds = KFold(n_splits=5, shuffle=True, random_state=0)

# Hyper-parameter values to combine exhaustively.
params_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.05],
    'hidden_layer_sizes': [
        (100,50,10), (200,100,50), (500,250,50),
        (100,), (200,), (300,),
    ],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'learning_rate': ['constant','adaptive'],
}

# Accuracy-scored grid search over MLPClassifier, fitted on the training split.
mlp_model = GridSearchCV(
    estimator=MLPClassifier(),
    param_grid=params_grid,
    scoring='accuracy',
    cv=folds,
)
mlp_model.fit(X_train, y_train)



In [27]:
# Report the winning configuration of the grid search.
# best_score_ is the mean cross-validated accuracy of the best parameter
# combination over the folds of the training split; it is not an accuracy
# measured on X_test.
print('Best score for the training data:', mlp_model.best_score_, '\nusing', mlp_model.best_params_)

Best score for the training data: 0.7752577319587629 
using {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (300,), 'learning_rate': 'constant', 'solver': 'adam'}


In [28]:
import joblib

# Persist the fitted search object to disk. `mlp_model` is the GridSearchCV
# wrapper, not a bare MLPClassifier. The StandardScaler and PCA fitted
# earlier are not part of this file.
# NOTE(review): presumably the ./model directory must already exist — confirm.
model_filename = './model/finetune__mlp_classifier_model.joblib'
joblib.dump(mlp_model, model_filename)

['./model/finetune__mlp_classifier_model.joblib']

### Gray Level Run Length Matrix (GLRLM)

In [30]:
# Load the GLRLM feature table. This rebinds `data`, replacing the frame
# loaded for the previous section; later cells read its 'Label' column as
# the target and treat every other column as a feature.
data = pd.read_csv('./features/glrlm_features_split.csv')

In [31]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Split the loaded `data` frame into the target column and the feature columns.
features = data.drop(columns='Label')
labels = data['Label']

# Standardize the features so that each one contributes on a comparable
# scale to the principal components.
# NOTE(review): the scaler and the PCA below are fitted on all rows, i.e.
# before the train/test split performed in the next cell.
scaler = StandardScaler().fit(features)
features_standardized = scaler.transform(features)

# Number of principal components to keep.
n_components = 7

# NOTE(review): this rebinds the name `pca`, which an earlier cell defines as
# a helper function; after this cell runs, `pca(...)` calls no longer work.
pca = PCA(n_components=n_components).fit(features_standardized)

# Principal-component scores used as model inputs in the following cells.
features_pca = pca.transform(features_standardized)

In [32]:
# Target column, used as-is (no LabelEncoder step in this section).
y = data['Label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features_pca,
    y,
    test_size=0.2,
    random_state=42,
)

In [33]:
# Shuffled 5-fold cross-validation with a fixed seed.
folds = KFold(n_splits=5, shuffle=True, random_state=0)

# Hyper-parameter values to combine exhaustively.
params_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.05],
    'hidden_layer_sizes': [
        (100,50,10), (200,100,50), (500,250,50),
        (100,), (200,), (300,),
    ],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'learning_rate': ['constant','adaptive'],
}

# Accuracy-scored grid search over MLPClassifier, fitted on the training split.
mlp_model = GridSearchCV(
    estimator=MLPClassifier(),
    param_grid=params_grid,
    scoring='accuracy',
    cv=folds,
)
mlp_model.fit(X_train, y_train)



In [35]:
# Report the winning configuration of the grid search.
# best_score_ is the mean cross-validated accuracy of the best parameter
# combination over the folds of the training split; it is not an accuracy
# measured on X_test.
print('Best score for the training data:', mlp_model.best_score_, '\nusing', mlp_model.best_params_)

Best score for the training data: 0.7319587628865979 
using {'activation': 'tanh', 'alpha': 0.001, 'hidden_layer_sizes': (300,), 'learning_rate': 'constant', 'solver': 'adam'}


In [34]:
import joblib

# Persist the fitted search object to disk. `mlp_model` is the GridSearchCV
# wrapper, not a bare MLPClassifier. The StandardScaler and PCA fitted
# earlier are not part of this file.
# NOTE(review): presumably the ./model directory must already exist — confirm.
model_filename = './model/finetune__mlp_glrlm_model.joblib'
joblib.dump(mlp_model, model_filename)

['./model/finetune__mlp_glrlm_model.joblib']