### Conversion of GridSearch log data to a df

The code below parses the logs generated while running a grid search over a complex model architecture and extracts the important information from those logs into a dataframe

In [18]:
import os
import re
import ast
import pickle
import pandas as pd

Merge the ```results_BiLSTM.txt``` files from all the log folders of the selected script into a single .txt file

In [19]:
files_to_ignore = ['.DS_Store']

# Specify the LSTM script whose logs are to be considered
logs_path = '../../model_results/bilstm_results/script_3/'

# Name of the sub-folder (inside logs_path) that receives the merged output
merged_results_folder = 'final_merged_results'

# Obtain all the log folders.
# Only real directories are kept, and the merged-output folder is skipped: it is
# created inside logs_path further down, so when this cell is run again
# os.listdir() would otherwise return it as if it were a log folder.
log_folders = [
    folder for folder in os.listdir(logs_path)
    if folder not in files_to_ignore
    and folder != merged_results_folder
    and os.path.isdir(os.path.join(logs_path, folder))
]

# Specify the final merged results.txt file path
final_results_file_path = logs_path + merged_results_folder

# exist_ok=True creates the folder only when it is missing, without a separate
# existence check, so the cell can be re-run safely
os.makedirs(final_results_file_path, exist_ok=True)

final_results_txt_file_path = final_results_file_path + '/final_results.txt'

# Merge all the results to a single results.txt file
def merge_results(source_path, destination_path):
    """Append the text of ``source_path`` to ``destination_path``.

    The source is read completely before the destination is opened, so a missing
    or unreadable source never creates or modifies the destination file. A newline
    is written after the copied content to separate consecutive file contents.

    Failures are reported on stdout rather than raised, so one bad log folder does
    not abort the whole merge. The success message is printed only when the copy
    actually happened.

    Args:
        source_path: Path of the UTF-8 text file to read.
        destination_path: Path of the UTF-8 text file to append to; it is created
            when it does not exist yet.

    Returns:
        None.
    """
    try:
        with open(source_path, 'r', encoding='utf-8') as source_file:
            content = source_file.read()
        with open(destination_path, 'a', encoding='utf-8') as destination_file:
            destination_file.write(content)
            destination_file.write('\n')  # Newline between file contents
    except Exception as e:
        print(f'Exception {e} thrown for {source_path}')
    else:
        # Reached only when no exception was raised above
        print("Text files combined successfully!")

# Start from an empty merged file: merge_results() opens the destination in append
# mode, so without this every re-run of the cell would add another copy of all the
# logs to final_results.txt
with open(final_results_txt_file_path, 'w', encoding='utf-8'):
    pass

# Iterate over all the log folders in the model script folder (in sorted order) and
# append each folder's results_BiLSTM.txt to the merged results file
for folder in sorted(log_folders):
    folder_path = logs_path + folder
    source_result_path = folder_path + '/results_BiLSTM.txt'
    merge_results(source_result_path, final_results_txt_file_path)


Text files combined successfully!


The code below identifies the beginning and the end of each search combination in the merged log file and groups the lines between them into a single search instance

In [20]:
# Read with the same encoding the merged file was written with (utf-8); relying on
# the platform default could mis-decode or fail on non-ASCII log content
with open(final_results_txt_file_path, 'r', encoding='utf-8') as results_file:
    results_content = results_file.read()

cleaned_content = results_content.split('\n')

search_combinations = []  # one list of log lines per search instance
current_instance = []     # lines of the instance being collected; empty outside an instance

for line in cleaned_content:
    if re.search(r'BEGIN SEARCH : for Script', line):
        # A BEGIN marker while an instance is still open: the open instance has no
        # END marker, but the lines collected so far are kept as an instance
        if current_instance:
            search_combinations.append(current_instance)
        current_instance = [line]
    elif re.search(r'END SEARCH', line):
        # An END marker outside an instance is ignored
        if current_instance:
            current_instance.append(line)
            search_combinations.append(current_instance)
        current_instance = []
    elif current_instance:
        current_instance.append(line)

# Note: an instance that is still open when the file ends is not appended

# # Print the search_combinations
# for idx, instance in enumerate(search_combinations, start=1):
#     print(f"Instance {idx}:")
#     for line in instance:
#         print(line)
#     print("\n")
#     break

The code below iterates through all the search combination instances, extracts the relevant information from each one, and stores it in the form of a dictionary

In [21]:
# Fields whose captured text is a Python literal (number or dict) and is converted
# with ast.literal_eval. Maps the record key to the regex searched in every log line.
_LITERAL_FIELD_PATTERNS = {
    'script_num': r'Script (\d+)',
    'search_count': r'BEGIN SEARCH : (\d+)',
    'sequence_length': r'Sequence Length = (\d+)',
    'units': r'Units = (\d+)',
    'dropout_rate': r'Dropout = (\d+\.?\d*)',
    'num_epochs': r'Epochs = (\d+)',
    'batch_size': r'Batch Size = (\d+)',
    'seed_value': r'Seed Value = (\d+)',
    'training_loss': r'Training Loss:\s+(\d+\.\d+)',
    'training_accuracy': r'Training Accuracy:\s+(\d+\.\d+)',
    'validation_loss': r'Validation Loss:\s+(\d+\.\d+)',
    'validation_accuracy': r'Validation Accuracy:\s+(\d+\.\d+)',
    'test_loss': r'Test Loss:\s+(\d+\.\d+)',
    'test_accuracy': r'Test Accuracy:\s+(\d+\.\d+)',
    'model_parameters': r"Model Parameters: (.+)",
}

# Fields whose captured text is stored as-is (plain string)
_TEXT_FIELD_PATTERNS = {
    'activation_function': r'Activation = (.+)',
    'loss_function': r'Loss Function = (.+)',
    'optimizer': r'Optimizer = (.+)',
    'model_keys': r"Model Keys: (.+)",
}


def _parse_classification_report(classification_report):
    """Extract per-class and averaged scores from a captured classification report.

    Rows are only interpreted after the header row has been seen, and interpretation
    stops once the 'weighted avg' row has been parsed.

    Args:
        classification_report: The report text, one table row per line.

    Returns:
        A dict with the keys 'precision_values', 'recall_values' and 'f1_scores'
        (each a dict mapping class label to score) and 'macro_averages' and
        'weighted_averages' (each a dict with the keys 'precision', 'recall' and
        'f1_score'). A dict stays empty when its rows are not found.
    """
    # RE pattern for obtaining classification reports values
    class_values_pattern = r'^\s*(\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+'
    macro_avg_pattern = r'\s*macro avg\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)'
    weighted_avg_pattern = r'\s*weighted avg\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)'

    precision_values = {}
    recall_values = {}
    f1_scores = {}
    macro_avgs = {}
    weighted_avgs = {}

    inside_table = False
    for row in classification_report.split('\n'):
        if re.search("precision    recall  f1-score   support", row):
            inside_table = True
            continue
        if not inside_table:
            continue

        if re.search('macro avg', row):
            macro_avg_match = re.search(macro_avg_pattern, row)
            if macro_avg_match:
                macro_avgs['precision'] = ast.literal_eval(macro_avg_match.group(1))
                macro_avgs['recall'] = ast.literal_eval(macro_avg_match.group(2))
                macro_avgs['f1_score'] = ast.literal_eval(macro_avg_match.group(3))
        elif re.search('weighted avg', row):
            weighted_avg_match = re.search(weighted_avg_pattern, row)
            if weighted_avg_match:
                weighted_avgs['precision'] = ast.literal_eval(weighted_avg_match.group(1))
                weighted_avgs['recall'] = ast.literal_eval(weighted_avg_match.group(2))
                weighted_avgs['f1_score'] = ast.literal_eval(weighted_avg_match.group(3))
                # Rows after the weighted average are not interpreted any more
                inside_table = False
        elif re.search(r'\d+', row):
            # Only rows that start with an integer class label match the pattern
            class_values_match = re.search(class_values_pattern, row)
            if class_values_match:
                class_label = ast.literal_eval(class_values_match.group(1))
                precision_values[class_label] = ast.literal_eval(class_values_match.group(2))
                recall_values[class_label] = ast.literal_eval(class_values_match.group(3))
                f1_scores[class_label] = ast.literal_eval(class_values_match.group(4))

    return {
        'precision_values': precision_values,
        'recall_values': recall_values,
        'f1_scores': f1_scores,
        'macro_averages': macro_avgs,
        'weighted_averages': weighted_avgs,
    }


def _parse_search_instance(search):
    """Turn the log lines of one search instance into a flat record.

    Every field starts from a default (0 for numbers, '' for text, {} for the score
    dicts), so a field that is absent from the log can neither raise a NameError nor
    inherit the value parsed for another instance.

    Args:
        search: List of log lines belonging to a single search instance.

    Returns:
        A dict with one entry per extracted field.
    """
    record = {
        'script_num': 0,
        'search_count': 0,
        'sequence_length': 0,
        'units': 0,
        'dropout_rate': 0,
        'activation_function': '',
        'loss_function': '',
        'optimizer': '',
        'num_epochs': 0,
        'batch_size': 0,
        'seed_value': 0,
        'training_loss': 0,
        'training_accuracy': 0,
        'validation_loss': 0,
        'validation_accuracy': 0,
        'test_loss': 0,
        'test_accuracy': 0,
        'precision_values': {},
        'recall_values': {},
        'f1_scores': {},
        'macro_averages': {},
        'weighted_averages': {},
        'model_parameters': '',
        'model_keys': '',
        'confusion_matrix': '',
        'classification_report': '',
    }

    classification_report = ''
    capture_classification = False  # Flag to capture classification report content

    for line in search:
        # Every pattern is tried on every line; a later match overwrites an earlier one
        for field, pattern in _LITERAL_FIELD_PATTERNS.items():
            field_match = re.search(pattern, line)
            if field_match:
                record[field] = ast.literal_eval(field_match.group(1))
        for field, pattern in _TEXT_FIELD_PATTERNS.items():
            field_match = re.search(pattern, line)
            if field_match:
                record[field] = field_match.group(1)

        if re.search(r'------------ CLASSIFICATION REPORT ------------', line):
            capture_classification = True  # Start capturing content
            continue  # Skip this line as it's just a marker
        elif capture_classification and re.search(r'------------ CONFUSION MATRIX ------------', line):
            capture_classification = False  # Stop capturing on "Confusion Matrix" marker
        elif capture_classification:
            if line.strip() != '':
                classification_report += line + '\n'

        if re.search(r'Confusion matrix saved for', line):
            # Keep the text after the marker (the saved file name)
            record['confusion_matrix'] = line.split('Confusion matrix saved for ')[-1]
            break  # Lines after the confusion matrix file name are not parsed

    record['classification_report'] = classification_report
    # Once we have the classification report, extract the scores contained in it
    record.update(_parse_classification_report(classification_report))
    return record


# One record (dict) per search instance
search_info = [_parse_search_instance(search) for search in search_combinations]

The code below converts the `search_info` list of dictionaries to a dataframe

In [22]:
# Column order of the results table, grouped by the kind of information held
df_columns = (
    # identification of the search and its hyper-parameters
    ['script_num', 'search_count', 'sequence_length', 'units', 'dropout_rate']
    + ['activation_function', 'loss_function', 'optimizer']
    + ['num_epochs', 'batch_size', 'seed_value']
    # losses and accuracies
    + ['training_loss', 'training_accuracy']
    + ['validation_loss', 'validation_accuracy']
    + ['test_loss', 'test_accuracy']
    # scores taken from the classification report
    + ['precision_values', 'recall_values', 'f1_scores']
    + ['macro_averages', 'weighted_averages']
    # model details and raw report / confusion matrix reference
    + ['model_parameters', 'model_keys']
    + ['classification_report', 'confusion_matrix']
)

# One row per parsed search instance, columns in the order given above
df = pd.DataFrame(data=search_info, columns=df_columns)
df

Unnamed: 0,script_num,search_count,sequence_length,units,dropout_rate,activation_function,loss_function,optimizer,num_epochs,batch_size,...,test_accuracy,precision_values,recall_values,f1_scores,macro_averages,weighted_averages,model_parameters,model_keys,classification_report,confusion_matrix
0,3,1,15,32,0.0,sigmoid,categorical_crossentropy,SGD,250,512,...,0.3947,"{0: 0.65, 1: 0.3, 2: 0.23}","{0: 0.34, 1: 0.6, 2: 0.25}","{0: 0.45, 1: 0.4, 2: 0.24}","{'precision': 0.39, 'recall': 0.4, 'f1_score':...","{'precision': 0.49, 'recall': 0.39, 'f1_score'...","{'verbose': '2', 'epochs': 250, 'steps': 59}","dict_keys(['loss', 'accuracy', 'val_loss', 'va...",precision recall f1-score ...,confusion_matrix_15_32_0_sigmoid_categorical_c...
1,3,2,15,32,0.0,sigmoid,categorical_crossentropy,SGD,250,1024,...,0.4087,"{0: 0.64, 1: 0.3, 2: 0.22}","{0: 0.38, 1: 0.62, 2: 0.16}","{0: 0.48, 1: 0.41, 2: 0.18}","{'precision': 0.39, 'recall': 0.39, 'f1_score'...","{'precision': 0.48, 'recall': 0.41, 'f1_score'...","{'verbose': '2', 'epochs': 250, 'steps': 30}","dict_keys(['loss', 'accuracy', 'val_loss', 'va...",precision recall f1-score ...,confusion_matrix_15_32_0_sigmoid_categorical_c...
2,3,3,15,32,0.0,sigmoid,categorical_crossentropy,SGD,250,2048,...,0.4222,"{0: 0.61, 1: 0.31, 2: 0.2}","{0: 0.43, 1: 0.6, 2: 0.1}","{0: 0.51, 1: 0.4, 2: 0.13}","{'precision': 0.37, 'recall': 0.38, 'f1_score'...","{'precision': 0.46, 'recall': 0.42, 'f1_score'...","{'verbose': '2', 'epochs': 250, 'steps': 15}","dict_keys(['loss', 'accuracy', 'val_loss', 'va...",precision recall f1-score ...,confusion_matrix_15_32_0_sigmoid_categorical_c...
3,3,4,15,32,0.0,sigmoid,categorical_crossentropy,SGD,250,4096,...,0.4737,"{0: 0.6, 1: 0.29, 2: 0.3}","{0: 0.62, 1: 0.4, 2: 0.07}","{0: 0.61, 1: 0.34, 2: 0.12}","{'precision': 0.4, 'recall': 0.37, 'f1_score':...","{'precision': 0.47, 'recall': 0.47, 'f1_score'...","{'verbose': '2', 'epochs': 250, 'steps': 8}","dict_keys(['loss', 'accuracy', 'val_loss', 'va...",precision recall f1-score ...,confusion_matrix_15_32_0_sigmoid_categorical_c...
4,3,5,15,32,0.0,sigmoid,categorical_crossentropy,SGD,500,512,...,0.3660,"{0: 0.64, 1: 0.28, 2: 0.21}","{0: 0.3, 1: 0.58, 2: 0.23}","{0: 0.41, 1: 0.38, 2: 0.22}","{'precision': 0.38, 'recall': 0.37, 'f1_score'...","{'precision': 0.47, 'recall': 0.37, 'f1_score'...","{'verbose': '2', 'epochs': 500, 'steps': 59}","dict_keys(['loss', 'accuracy', 'val_loss', 'va...",precision recall f1-score ...,confusion_matrix_15_32_0_sigmoid_categorical_c...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1018,3,1019,15,64,0.6,sigmoid,categorical_crossentropy,Adam,250,2048,...,0.3617,"{0: 0.61, 1: 0.27, 2: 0.29}","{0: 0.28, 1: 0.58, 2: 0.28}","{0: 0.39, 1: 0.37, 2: 0.28}","{'precision': 0.39, 'recall': 0.38, 'f1_score'...","{'precision': 0.46, 'recall': 0.36, 'f1_score'...","{'verbose': '2', 'epochs': 250, 'steps': 15}","dict_keys(['loss', 'accuracy', 'val_loss', 'va...",precision recall f1-score ...,confusion_matrix_15_64_0.6_sigmoid_categorical...
1019,3,1020,15,64,0.6,sigmoid,categorical_crossentropy,Adam,250,4096,...,0.3522,"{0: 0.65, 1: 0.27, 2: 0.19}","{0: 0.28, 1: 0.61, 2: 0.18}","{0: 0.39, 1: 0.38, 2: 0.18}","{'precision': 0.37, 'recall': 0.36, 'f1_score'...","{'precision': 0.47, 'recall': 0.35, 'f1_score'...","{'verbose': '2', 'epochs': 250, 'steps': 8}","dict_keys(['loss', 'accuracy', 'val_loss', 'va...",precision recall f1-score ...,confusion_matrix_15_64_0.6_sigmoid_categorical...
1020,3,1021,15,64,0.6,sigmoid,categorical_crossentropy,Adam,500,512,...,0.2759,"{0: 0.53, 1: 0.26, 2: 0.18}","{0: 0.11, 1: 0.69, 2: 0.18}","{0: 0.18, 1: 0.37, 2: 0.18}","{'precision': 0.32, 'recall': 0.33, 'f1_score'...","{'precision': 0.4, 'recall': 0.28, 'f1_score':...","{'verbose': '2', 'epochs': 500, 'steps': 59}","dict_keys(['loss', 'accuracy', 'val_loss', 'va...",precision recall f1-score ...,confusion_matrix_15_64_0.6_sigmoid_categorical...
1021,3,1022,15,64,0.6,sigmoid,categorical_crossentropy,Adam,500,1024,...,0.5058,"{0: 0.6, 1: 0.32, 2: 0.0}","{0: 0.69, 1: 0.42, 2: 0.0}","{0: 0.64, 1: 0.36, 2: 0.0}","{'precision': 0.31, 'recall': 0.37, 'f1_score'...","{'precision': 0.43, 'recall': 0.51, 'f1_score'...","{'verbose': '2', 'epochs': 500, 'steps': 30}","dict_keys(['loss', 'accuracy', 'val_loss', 'va...",precision recall f1-score ...,confusion_matrix_15_64_0.6_sigmoid_categorical...


In [23]:
# The CSV is written into the same folder as the merged log file
final_results_df_path = final_results_file_path + '/bilstm_script_3_gridSearch_results.csv'

# Keep only the first row seen for every search_count value
df_no_duplicates = df[~df.duplicated(subset=['search_count'])]

# Write the table without the index column
df_no_duplicates.to_csv(final_results_df_path, index=False)