In [1]:
#!/usr/bin/env python
# coding: utf-8

import pandas as pd
import numpy as np
import re
from shapely.geometry import Point
from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from collections import Counter
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split, GridSearchCV
import joblib


# Directory holding the raw station files; read_file() and organize_files() read from here.
DATA_DIR = '/cluster/home/nteutschm/eqdetection/data/'
# Alternative relative data directory (currently disabled).
#DATA_DIR = 'Data/'
# Seed passed to both train_test_split and IsolationForest.
RANDOM_STATE = 86
# Output location used by train_isolation_forest() when persisting the fitted model.
MODEL_PATH = '/cluster/scratch/nteutschm/eqdetection/models/'
# Output location used by save_predictions() when writing the predictions CSV.
SAVE_PATH = '/cluster/scratch/nteutschm/eqdetection/predictions/'

# Loading the Data

In [2]:
def get_offsets(header_lines):
    """
    Parse offset and postseismic-decay entries from the '#' header lines of a station file.

    Header lines of the form ``# n component`` / ``# e component`` / ``# u component``
    select the component that subsequent offset and decay lines are attributed to.
    Offset or decay lines that appear before any component line are ignored.

    Parameters:
    header_lines (iterable of str): Header lines, each starting with '#'.

    Returns:
    dict: ``{'n': {...}, 'e': {...}, 'u': {...}}`` where every component maps to
          ``{'offsets': [...], 'ps_decays': [...]}``.
          Each offset is ``{'value': float, 'error': float, 'date': 'YYYY-MM-DD', 'coseismic': bool}``
          ('coseismic' is True when the line carries a '*' marker).
          Each decay is ``{'value': float, 'error': float, 'tau': int, 'date': 'YYYY-MM-DD', 'type': str}``
          with type 'logarithmic' for lines prefixed ``#!`` and 'exponential' otherwise.
    """
    offset_pattern = re.compile(r"#\s*(\*?)\s*offset\s+\d+:?\s+([-\d.]+)\s+\+/\-\s+([-\d.]+)\s+mm.*?\((\d{4}-\d{2}-\d{2}).*?\)")
    # `\d+` (not `\d`) so that decay number 10 and above is still recognised; the
    # '!' prefix is captured explicitly instead of searching the whole line for '!'.
    ps_decay_pattern = re.compile(
        r'#(?P<flag>!?)\s*ps decay\s+\d+:\s*(?P<value>-?\d+\.\d+)\s+\+/-\s+(?P<error>\d+\.\d+)\s+mm'
        r'\s+\((?P<date>\d{4}-\d{2}-\d{2})\s+\[(\d{4}\.\d+)\]\);\s*tau:\s*(?P<tau>\d+)\s+days'
    )
    component_pattern = re.compile(r"#\s+([neu])\s+component")

    components = {comp: {'offsets': [], 'ps_decays': []} for comp in ('n', 'e', 'u')}
    current_component = None

    for line in header_lines:
        comp_match = component_pattern.match(line)
        if comp_match:
            current_component = comp_match.group(1)
            continue

        # Entries are only meaningful once a component section has started.
        if current_component is None:
            continue

        offset_match = offset_pattern.match(line)
        if offset_match:
            components[current_component]['offsets'].append({
                'value': float(offset_match.group(2)),
                'error': float(offset_match.group(3)),
                'date': offset_match.group(4),
                'coseismic': bool(offset_match.group(1)),  # True if '*' present
            })
            continue

        ps_decay_match = ps_decay_pattern.match(line)
        if ps_decay_match:
            components[current_component]['ps_decays'].append({
                'value': float(ps_decay_match.group('value')),
                'error': float(ps_decay_match.group('error')),
                'tau': int(ps_decay_match.group('tau')),
                'date': ps_decay_match.group('date'),
                # Decay type is decided by the '#!' prefix only.
                'type': 'logarithmic' if ps_decay_match.group('flag') else 'exponential',
            })

    return components

def _first_header_float(pattern, header_text, label, filename):
    """Return the first float captured by `pattern` in the header text, or raise ValueError."""
    found = re.search(pattern, header_text)
    if found is None:
        raise ValueError(f"{filename}: header field '{label}' not found")
    return float(found.group(1))


def read_file(filename):
    """
    Read one station file from DATA_DIR into a date-indexed DataFrame.

    The file is expected to consist of '#'-prefixed header lines followed by
    whitespace-separated data rows. The last header line supplies the column
    names (split on runs of two or more spaces).

    Parameters:
    filename (str): File name relative to DATA_DIR.

    Returns:
    DataFrame: Float columns N, E, U, their sigmas and correlations, indexed by date
               (built from the 'Yr' and 'DayOfYr' columns). Station metadata is attached as
               ``attrs['geometry']`` (Point(longitude, latitude)), ``attrs['height']`` and
               ``attrs['offsets']`` (see get_offsets); ``.name`` holds the file name with
               the "RawTrend.neu" suffix removed.

    Raises:
    ValueError: If the file has no header lines or the latitude / longitude / height
                header fields cannot be found.
    """
    with open(Path(DATA_DIR) / filename, 'r') as file:
        lines = file.readlines()

    header_lines = [line for line in lines if line.startswith('#')]
    if not header_lines:
        raise ValueError(f"{filename}: no '#' header lines found; cannot determine column names")
    column_names = re.split(r'\s{2,}', header_lines[-1].lstrip('#').strip())

    data_lines = []
    for line in lines:
        if line.startswith('#'):
            continue
        parts = line.split()
        if not parts:
            # Blank lines carry no observation and would otherwise yield an all-None row
            # that breaks the date parsing below.
            continue
        if len(parts) < len(column_names):
            # Pad short rows so every row has one entry per column.
            parts.extend([None] * (len(column_names) - len(parts)))
        data_lines.append(parts)

    data = pd.DataFrame(data_lines, columns=column_names)

    # Look each field up on its own so the result does not depend on the order in
    # which latitude, longitude and height appear in the header.
    # (The Reference_X / Reference_Y / Reference_Z header fields are not extracted.)
    header_text = ' '.join(header_lines)
    latitude = _first_header_float(r'Latitude\(DD\)\s*:\s*(-?\d+\.\d+)', header_text, 'Latitude(DD)', filename)
    longitude = _first_header_float(r'East Longitude\(DD\)\s*:\s*(-?\d+\.\d+)', header_text, 'East Longitude(DD)', filename)
    height = _first_header_float(r'Height\s*\(M\)\s*:\s*(-?\d+\.\d+)', header_text, 'Height (M)', filename)
    geom = Point(longitude, latitude)

    offsets = get_offsets(header_lines)

    # Year + day-of-year are concatenated and parsed with '%Y%j'.
    data['Date'] = pd.to_datetime(data['Yr'].astype(str) + data['DayOfYr'].astype(str), format='%Y%j')
    data = data.set_index('Date').drop(columns=['Dec Yr', 'Yr', 'DayOfYr', 'Chi-Squared'])
    cols = ['N', 'E', 'U', 'N sig', 'E sig', 'U sig', 'CorrNE', 'CorrNU', 'CorrEU']
    data[cols] = data[cols].astype(float)

    data.name = filename.replace("RawTrend.neu", "")
    data.attrs['geometry'] = geom
    data.attrs['height'] = height
    data.attrs['offsets'] = offsets

    return data

In [3]:
def organize_files():
    """
    Load every regular file in DATA_DIR with read_file.

    Files are processed in sorted name order: Path.iterdir() yields entries in
    arbitrary filesystem order, which would make the later seeded train/test
    split differ between machines or runs. Hidden files (names starting with
    '.') are skipped because they are not station files.

    Returns:
    list: One DataFrame per station file, in sorted file-name order.
    """
    data_dir = Path(DATA_DIR)
    station_files = sorted(
        entry for entry in data_dir.iterdir()
        if entry.is_file() and not entry.name.startswith('.')
    )
    return [read_file(entry.name) for entry in station_files]

# Cleaning Data

In [5]:
def add_missing_dates(df):
    """
    Return a copy of `df` reindexed to a gap-free daily DatetimeIndex.

    The new index runs from the earliest to the latest date present in `df`;
    days without an observation become all-NaN rows. The input frame is left
    untouched (its index is not converted in place).

    Parameters:
    df (DataFrame): Frame whose index holds dates (or values convertible by pd.to_datetime).

    Returns:
    DataFrame: Daily-indexed frame carrying an independent deep copy of ``df.attrs``
               and, when the input has one, the same ``.name`` attribute.
    """
    from copy import deepcopy

    date_index = pd.to_datetime(df.index)
    dated = df.copy()
    dated.index = date_index

    if len(dated) == 0:
        # min()/max() of an empty index are NaT, which pd.date_range rejects.
        df_full = dated
    else:
        full_date_range = pd.date_range(start=date_index.min(), end=date_index.max(), freq='D')
        df_full = dated.reindex(full_date_range)

    # Copy metadata explicitly: whether reindex propagates attrs (and whether it
    # shares nested objects with the source) depends on the pandas version.
    df_full.attrs = deepcopy(df.attrs)

    station_name = getattr(df, 'name', None)
    if station_name is not None:
        df_full.name = station_name
    return df_full

In [6]:
def clean_dataframes(dfs, missing_value_threshold=None):
    """
    Cleans the dataframes by:
    1. Filling date gaps with NaN rows (see add_missing_dates).
    2. Removing non-coseismic offsets from all components (n, e, u).
    3. Removing dataframes without any coseismic offset in any of the 3 components.
    4. Optionally removing dataframes with excessive missing values across N, E and U combined.

    The filtered offsets are stored as a new structure on the returned frames;
    the ``attrs`` of the input frames are not modified.

    Parameters:
    dfs (list): List of dataframes with GNSS data.
    missing_value_threshold (float, optional): Fraction (0 to 1) of allowed missing values.
                                               If exceeded, the dataframe is removed.
                                               A frame with no rows at all is also removed
                                               when a threshold is given.

    Returns:
    list: Cleaned list of dataframes.
    """
    cleaned_dfs = []
    components = ['N', 'E', 'U']
    components_offsets = ['n', 'e', 'u']

    for org_df in dfs:
        df = add_missing_dates(org_df)

        # Build a fresh offsets mapping rather than editing the nested lists in place:
        # depending on the pandas version, attrs values may be shared with org_df.
        station_offsets = dict(df.attrs['offsets'])
        has_coseismic = False
        for comp in components_offsets:
            comp_info = station_offsets[comp]
            coseismic_only = [offset for offset in comp_info['offsets'] if offset['coseismic']]
            has_coseismic = has_coseismic or bool(coseismic_only)
            station_offsets[comp] = {**comp_info, 'offsets': coseismic_only}
        df.attrs['offsets'] = station_offsets

        # Skip dataframe if no coseismic offsets in any component
        if not has_coseismic:
            continue

        # Check missing values for all components combined, if threshold is provided
        if missing_value_threshold is not None:
            missing_mask = df[components].isna().to_numpy()
            if missing_mask.size == 0:
                continue  # No observations at all; avoids a 0/0 division
            missing_fraction = missing_mask.mean()
            if missing_fraction > missing_value_threshold:
                continue  # Skip the dataframe if missing values exceed the threshold

        cleaned_dfs.append(df)

    return cleaned_dfs

# Isolation Forest

In [68]:
def _event_feature(index, events, field, fill, encode=None):
    """Build a series equal to `fill` everywhere except on event dates, where it holds event[field]."""
    series = pd.Series(fill, index=index)
    for event in events:
        stamp = pd.Timestamp(event['date'])
        if stamp not in index:
            # An event outside the observed span has no row to mark; assigning it
            # would enlarge the series with a label the feature frame does not have.
            continue
        value = event[field]
        series.loc[stamp] = encode(value) if encode is not None else value
    return series


def extract_features(dfs):
    """
    Extracts per-station feature frames: displacement values, errors, offsets,
    decay information, station location and height.

    For every station the N/E/U columns, their sigmas and correlations are copied and
    gaps are filled by time interpolation. Station latitude/longitude (when
    ``attrs['geometry']`` exposes ``x``/``y``, e.g. a shapely Point) and height are
    added as constant columns. Offsets and postseismic decays are encoded as columns
    that are zero on every day except the event date.

    Offset and decay value/error columns are always float so that every station
    yields the same dtypes; decay tau and decay type stay integer
    (type: 1 = logarithmic, 2 = exponential, 0 = none/unknown).

    Parameters:
    dfs (list): List of date-indexed dataframes with GNSS data and ``attrs['offsets']``.

    Returns:
    list: One feature DataFrame per input dataframe, in the same order.
    """
    feature_dfs = []
    components_offsets = ['n', 'e', 'u']
    decay_type_codes = {'logarithmic': 1, 'exponential': 2}

    for df in dfs:
        features = df[['N', 'E', 'U', 'N sig', 'E sig', 'U sig', 'CorrNE', 'CorrNU', 'CorrEU']].copy()
        features = features.interpolate(method='time')

        location = df.attrs.get('geometry')
        if hasattr(location, 'x') and hasattr(location, 'y'):
            features['latitude'] = location.y
            features['longitude'] = location.x

        features['height'] = df.attrs.get('height')

        for comp in components_offsets:
            offsets = df.attrs['offsets'][comp]['offsets']
            # 0.0 (not 0): the stored values are floats, and assigning a float into an
            # int series is a deprecated implicit upcast in recent pandas.
            features[f'{comp}_offset_value'] = _event_feature(df.index, offsets, 'value', 0.0)
            features[f'{comp}_offset_error'] = _event_feature(df.index, offsets, 'error', 0.0)

        for comp in components_offsets:
            decays = df.attrs['offsets'][comp]['ps_decays']
            features[f'{comp}_decay_value'] = _event_feature(df.index, decays, 'value', 0.0)
            features[f'{comp}_decay_error'] = _event_feature(df.index, decays, 'error', 0.0)
            features[f'{comp}_decay_tau'] = _event_feature(df.index, decays, 'tau', 0)
            features[f'{comp}_decay_type'] = _event_feature(
                df.index, decays, 'type', 0,
                encode=lambda kind: decay_type_codes.get(kind, 0),
            )

        feature_dfs.append(features)

    return feature_dfs

In [None]:
def _stack_station_features(station_frames):
    """Concatenate per-station frames row-wise and drop the offset/decay columns."""
    stacked = pd.concat(station_frames, axis=0)
    event_cols = [col for col in stacked.columns if 'offset' in col or 'decay' in col]
    return stacked.drop(columns=event_cols)


def train_isolation_forest(X, test_size=0.3, contamination=0.05):
    """
    Fit an IsolationForest on a station-wise train split and predict on the test split.

    Stations (not individual days) are split into train and test sets. Each set is
    stacked into one 2-D feature matrix and the offset/decay columns are removed
    from both, so the model is fitted and evaluated on the same feature columns.
    The fitted model is saved with joblib.

    Parameters:
    X (list): Per-station feature DataFrames as returned by extract_features.
    test_size (float): Fraction of stations held out for testing.
    contamination (float): Expected proportion of anomalies, passed to IsolationForest.

    Returns:
    tuple: ``(model, test_predictions)`` where test_predictions is the array returned by
           ``model.predict`` for the stacked test rows.
    """
    train_stations, test_stations = train_test_split(
        list(X), test_size=test_size, random_state=RANDOM_STATE
    )

    # IsolationForest needs a single (n_samples, n_features) matrix, not a list of frames.
    X_train = _stack_station_features(train_stations)
    X_test = _stack_station_features(test_stations)

    model = IsolationForest(contamination=contamination, random_state=RANDOM_STATE)
    model.fit(X_train)

    # MODEL_PATH may name a directory (no file suffix) or a concrete file.
    model_file = Path(MODEL_PATH)
    if not model_file.suffix:
        model_file = model_file / 'isolation_forest.joblib'
    model_file.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, model_file)

    test_predictions = model.predict(X_test)

    return model, test_predictions

In [None]:
def save_predictions(test_predictions):
    """
    Write the predictions to a CSV file with a single 'Predictions' column.

    SAVE_PATH may name a directory (no file suffix), in which case the file
    'predictions.csv' is written inside it, or a concrete file path. Missing
    parent directories are created.

    Parameters:
    test_predictions (array-like): One prediction per row.
    """
    test_predictions_df = pd.DataFrame(test_predictions, columns=['Predictions'])

    out_file = Path(SAVE_PATH)
    if not out_file.suffix:
        out_file = out_file / 'predictions.csv'
    out_file.parent.mkdir(parents=True, exist_ok=True)

    test_predictions_df.to_csv(out_file, index=False)

In [None]:
def main():
    """Run the pipeline end to end: load, clean, build features, train, save predictions."""
    station_frames = organize_files()
    usable_frames = clean_dataframes(station_frames, missing_value_threshold=0.2)
    feature_frames = extract_features(usable_frames)
    # Only the predictions are needed here; the fitted model is persisted by the trainer.
    _, predictions = train_isolation_forest(feature_frames)
    save_predictions(predictions)

In [None]:
# Entry point: run the pipeline when this module is executed as the main program.
if '__main__' == __name__:
    main()