In [1]:
#%pip install numpy pandas matplotlib seaborn scikit-learn tensorflow obspy distutils obspy

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from obspy import read
import os
import sys


# Define el directorio de datos
path_to_dataset = r'C:\Users\Kono\Desktop\space_apps_2024_seismic_detection\data'
sys.path.append(path_to_dataset)
data_directory = path_to_dataset + r'\lunar\training\data\S12_GradeA'
data_files = os.listdir(data_directory)
data_files = [data_directory + '\\' + file for file in data_files if file.endswith('.mseed')]
abstract_dfs = pd.read_csv(r'C:\Users\Kono\Desktop\space_apps_2024_seismic_detection\data\lunar\training\catalogs\apollo12_catalog_GradeA_final.csv')

# Inicializa un DataFrame vacío con columnas especificadas
df = pd.DataFrame(columns=['file_name', 'start', 'id', 'cant_measurements', 'st'])

for index, file in enumerate(data_files):
    print(f'Processing file {index+1} of {len(data_files)}')
    
    # Lee el archivo .mseed
    temp_df = read(file)
    
    # Extrae el ID del evento del nombre del archivo
    evid_id = file.split('\\')[-1].split('_')[-1].split('evid')[1].split('.')[0]
    
    # Extrae el nombre del archivo sin la extensión .mseed
    file_name = file.split('\\')[-1].rstrip('.mseed')
    
    # Verifica si el nombre del archivo existe en el DataFrame abstract_dfs
    if len(abstract_dfs[abstract_dfs['filename'] == file_name]['time_rel(sec)']) == 0:
        continue  # Salta este archivo si no existe en abstract_dfs
    
    # Obtiene el tiempo de inicio del DataFrame abstract_dfs
    start = abstract_dfs[abstract_dfs['filename'] == file_name]['time_rel(sec)'].iloc[0]
    
    # Extrae la traza y los datos
    tr = temp_df.traces[0].copy()
    tr_data = tr.data  # Velocidades
    tr_times = tr.times()  # Tiempos relativos

    # Crea un diccionario temporal con la información requerida
    temp_dict = {
        'file_name': file_name, 
        'start': start,  
        'id': evid_id, 
        'cant_measurements': temp_df[0].stats.npts,
        'st': temp_df
    }

    # Agrega el diccionario temporal al DataFrame principal
    df = pd.concat([df, pd.DataFrame([temp_dict])], ignore_index=True)

df.head()

Processing file 1 of 76
Processing file 2 of 76
Processing file 3 of 76


  df = pd.concat([df, pd.DataFrame([temp_dict])], ignore_index=True)


Processing file 4 of 76
Processing file 5 of 76
Processing file 6 of 76
Processing file 7 of 76
Processing file 8 of 76
Processing file 9 of 76
Processing file 10 of 76
Processing file 11 of 76
Processing file 12 of 76
Processing file 13 of 76
Processing file 14 of 76
Processing file 15 of 76
Processing file 16 of 76
Processing file 17 of 76
Processing file 18 of 76
Processing file 19 of 76
Processing file 20 of 76
Processing file 21 of 76
Processing file 22 of 76
Processing file 23 of 76
Processing file 24 of 76
Processing file 25 of 76
Processing file 26 of 76
Processing file 27 of 76
Processing file 28 of 76
Processing file 29 of 76
Processing file 30 of 76
Processing file 31 of 76
Processing file 32 of 76
Processing file 33 of 76
Processing file 34 of 76
Processing file 35 of 76
Processing file 36 of 76
Processing file 37 of 76
Processing file 38 of 76
Processing file 39 of 76
Processing file 40 of 76
Processing file 41 of 76
Processing file 42 of 76
Processing file 43 of 76
Proces

Unnamed: 0,file_name,start,id,cant_measurements,st
0,xa.s12.00.mhz.1970-01-19HR00_evid00002,73500.0,2,572415,"[(-6.153278962788711e-14, -7.70128843364098e-1..."
1,xa.s12.00.mhz.1970-03-25HR00_evid00003,12720.0,3,572411,"[(-5.481780117043957e-15, -6.8786525555433944e..."
2,xa.s12.00.mhz.1970-03-26HR00_evid00004,73020.0,4,572411,"[(-2.8212463353274306e-14, -3.523317065258157e..."
3,xa.s12.00.mhz.1970-04-25HR00_evid00006,4440.0,6,572415,"[(9.01642264710853e-15, 1.1305708384819468e-14..."
4,xa.s12.00.mhz.1970-04-26HR00_evid00007,52140.0,7,572411,"[(-1.5835653822406575e-16, -1.8729952083938931..."


In [40]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from deap import base, creator, tools, algorithms
import numpy as np

# Step 1: Load the data
data = df  # Use the DataFrame 'df' directly

# Step 2: Feature engineering on 'st'
# Example: Extracting the number of points and sampling rate as features
data['npts'] = data['st'].apply(lambda x: x[0].stats.npts if isinstance(x, list) and len(x) > 0 else 0)
data['sampling_rate'] = data['st'].apply(lambda x: x[0].stats.sampling_rate if isinstance(x, list) and len(x) > 0 else 0)

# Drop the original 'st' column after feature extraction
data = data.drop(columns=['st'])

# Step 3: Identify categorical columns
categorical_cols = data.select_dtypes(include=['object']).columns

# Step 4: Preprocess the data
# Assuming 'start' is the column to predict and the rest are features
X = data.drop(columns=['start'])
y = data['start']

# Handle missing values if any
#X.fillna(X.mean(), inplace=True)

# Step 5: Apply one-hot encoding to categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), X.select_dtypes(include=['int64', 'float64']).columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# Step 6: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Create a preprocessing pipeline
preprocessing_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Preprocess the data
X_train_preprocessed = preprocessing_pipeline.fit_transform(X_train)
X_test_preprocessed = preprocessing_pipeline.transform(X_test)

# Convert sparse matrix to dense matrix
X_train_preprocessed = X_train_preprocessed.toarray()
X_test_preprocessed = X_test_preprocessed.toarray()

# Step 8: Define the genetic algorithm for optimizing Random Forest
def evaluate_model(params):
    n_estimators, max_depth, min_samples_split, min_samples_leaf = params
    model = RandomForestRegressor(
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        min_samples_split=int(min_samples_split),
        min_samples_leaf=int(min_samples_leaf),
        random_state=42
    )
    model.fit(X_train_preprocessed, y_train)
    y_pred = model.predict(X_test_preprocessed)
    mse = mean_squared_error(y_test, y_pred)
    return (mse,)

# Define the parameter bounds
param_bounds = {
    'n_estimators': (10, 200),
    'max_depth': (1, 20),
    'min_samples_split': (2, 20),
    'min_samples_leaf': (1, 20)
}

# Create the DEAP creator
creator.create('FitnessMin', base.Fitness, weights=(-1.0,))
creator.create('Individual', list, fitness=creator.FitnessMin)

# Create the DEAP toolbox
toolbox = base.Toolbox()
toolbox.register('attr_n_estimators', np.random.randint, param_bounds['n_estimators'][0], param_bounds['n_estimators'][1])
toolbox.register('attr_max_depth', np.random.randint, param_bounds['max_depth'][0], param_bounds['max_depth'][1])
toolbox.register('attr_min_samples_split', np.random.randint, param_bounds['min_samples_split'][0], param_bounds['min_samples_split'][1])
toolbox.register('attr_min_samples_leaf', np.random.randint, param_bounds['min_samples_leaf'][0], param_bounds['min_samples_leaf'][1])
toolbox.register('individual', tools.initCycle, creator.Individual, 
                 (toolbox.attr_n_estimators, toolbox.attr_max_depth, toolbox.attr_min_samples_split, toolbox.attr_min_samples_leaf), n=1)
toolbox.register('population', tools.initRepeat, list, toolbox.individual)
toolbox.register('mate', tools.cxBlend, alpha=0.5)
toolbox.register('mutate', tools.mutPolynomialBounded, low=[param_bounds['n_estimators'][0], param_bounds['max_depth'][0], param_bounds['min_samples_split'][0], param_bounds['min_samples_leaf'][0]], 
                 up=[param_bounds['n_estimators'][1], param_bounds['max_depth'][1], param_bounds['min_samples_split'][1], param_bounds['min_samples_leaf'][1]], eta=0.1, indpb=0.2)
toolbox.register('select', tools.selTournament, tournsize=3)
toolbox.register('evaluate', evaluate_model)

# Run the genetic algorithm
population = toolbox.population(n=50)
ngen = 10
cxpb = 0.5
mutpb = 0.2

result, log = algorithms.eaSimple(population, toolbox, cxpb, mutpb, ngen, verbose=True)

# Get the best individual
best_individual = tools.selBest(population, k=1)[0]
best_params = [int(param) for param in best_individual]
print(f'Best Parameters: {best_params}')

# Train the final model with the best parameters
best_model = RandomForestRegressor(
    n_estimators=best_params[0],
    max_depth=best_params[1],
    min_samples_split=best_params[2],
    min_samples_leaf=best_params[3],
    random_state=42
)
best_model.fit(X_train_preprocessed, y_train)

# Evaluate the final model
y_pred = best_model.predict(X_test_preprocessed)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error of Predictions: {mse}')



gen	nevals
0  	50    


InvalidParameterError: The 'max_depth' parameter of RandomForestRegressor must be an int in the range [1, inf) or None. Got 0 instead.