## Imports

In [1]:
import pandas as pd
import numpy as np
import re
from shapely.geometry import Point
from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from collections import Counter
from sklearn.ensemble import IsolationForest, HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
import joblib
from sklearn.metrics import classification_report
from scipy.stats import f
from datetime import datetime
import pickle
import os


DATA_DIR = 'Data/'

## Functions

### Loading

In [2]:
def get_offsets(header_lines):
    """
    Collect offsets and postseismic decays per component from GNSS header lines.

    The header is organised in sections: a "<n|e|u> component" line switches the
    active component, and every following offset / ps-decay line is attributed to
    it. Lines that appear before the first component line are ignored.

    Parameters:
    - header_lines (list of str): The '#'-prefixed metadata lines of the file.

    Returns:
    - components (dict): Keys 'n', 'e' and 'u'. Each value is a dictionary with:
        - 'offsets': list of dicts with 'value', 'error', 'date' and 'coseismic'
          (True when the line is flagged with '*').
        - 'ps_decays': list of dicts with 'value', 'error', 'tau', 'date' and
          'type' ('logarithmic' when the line contains '!', else 'exponential').
    """

    # Patterns for the three kinds of header line that carry information
    offset_pattern = re.compile(r"#\s*(\*?)\s*offset\s+\d+:?\s+([-\d.]+)\s+\+/\-\s+([-\d.]+)\s+mm.*?\((\d{4}-\d{2}-\d{2}).*?\)")
    ps_decay_pattern = re.compile(r'#!?\s*ps decay\s+\d:\s*(-?\d+\.\d+)\s+\+/-\s+(\d+\.\d+)\s+mm\s+\((\d{4}-\d{2}-\d{2})\s+\[(\d{4}\.\d+)\]\);\s*tau:\s*(\d+)\s+days')
    component_pattern = re.compile(r"#\s+([neu])\s+component")

    components = {axis: {'offsets': [], 'ps_decays': []} for axis in ('n', 'e', 'u')}
    active_axis = None

    for line in header_lines:
        section = component_pattern.match(line)
        if section is not None:
            # A new component section starts here
            active_axis = section.group(1)
            continue

        if active_axis is None:
            # Nothing can be attributed before the first component line
            continue

        bucket = components[active_axis]

        found_offset = offset_pattern.match(line)
        if found_offset is not None:
            star, raw_value, raw_error, date_text = found_offset.groups()
            record = {
                'value': float(raw_value),
                'error': float(raw_error),
                'date': date_text,
                'coseismic': bool(star),  # '*' marks a coseismic offset
            }
            bucket['offsets'].append(record)

        found_decay = ps_decay_pattern.match(line)
        if found_decay is not None:
            # Group 4 (decimal year) is captured by the pattern but not stored
            raw_value, raw_error, date_text, _decimal_year, raw_tau = found_decay.groups()
            record = {
                'value': float(raw_value),
                'error': float(raw_error),
                'tau': int(raw_tau),
                'date': date_text,
                'type': 'logarithmic' if '!' in line else 'exponential',
            }
            bucket['ps_decays'].append(record)

    return components

def read_file(filename):
    """
    Reads a GNSS file, extracting both header and data information into a pandas DataFrame.

    Lines starting with '#' form the header; the last header line supplies the column
    names (fields separated by two or more spaces). Every other non-blank line is a
    whitespace-separated data row; rows shorter than the header are padded with None.

    Parameters:
    - filename (str): Name of the file inside DATA_DIR.

    Returns:
    - data (pandas.DataFrame): Time series indexed by 'Date' (built from the 'Yr' and
      'DayOfYr' columns) with float columns N, E, U, their sigmas and correlations.
      Extra metadata is attached to the frame:
        - data.name: filename with "RawTrend.neu" removed
        - data.attrs['geometry']: shapely Point(longitude, latitude)
        - data.attrs['height']: station height as float
        - data.attrs['offsets']: result of get_offsets() on the header

    Raises:
    - ValueError: if latitude, longitude or height cannot be found in the header.
    """

    # os.path.join works whether or not DATA_DIR ends with a path separator
    with open(os.path.join(DATA_DIR, filename), 'r') as file:
        lines = file.readlines()

    header_lines = [line for line in lines if line.startswith('#')]
    if header_lines:
        column_names = re.split(r'\s{2,}', header_lines[-1].lstrip('#').strip())
    else:
        column_names = []

    data_lines = []
    for line in lines:
        if line.startswith('#'):
            continue
        parts = line.split()
        if not parts:
            # Blank lines would otherwise become all-None rows and break date parsing
            continue
        if len(parts) < len(column_names):
            # Pad short rows so every row has one entry per column
            parts.extend([None] * (len(column_names) - len(parts)))
        data_lines.append(parts)

    data = pd.DataFrame(data_lines, columns=column_names)

    # Station metadata: look each field up on its own so the result does not
    # depend on the order in which the header happens to list them.
    header_text = ' '.join(header_lines)

    def _header_value(label_regex, label):
        found = re.search(label_regex + r'\s*:\s*(-?\d+\.\d+)', header_text)
        if found is None:
            raise ValueError(f"{filename}: could not find '{label}' in the header")
        return float(found.group(1))

    latitude = _header_value(r'Latitude\(DD\)', 'Latitude(DD)')
    longitude = _header_value(r'East Longitude\(DD\)', 'East Longitude(DD)')
    height = _header_value(r'Height\s*\(M\)', 'Height (M)')
    geom = Point(longitude, latitude)  # shapely order is (x, y) = (lon, lat)

    offsets = get_offsets(header_lines)

    # Year + day-of-year -> calendar date
    data['Date'] = pd.to_datetime(data['Yr'].astype(str) + data['DayOfYr'].astype(str), format='%Y%j')
    data = data.set_index('Date').drop(['Dec Yr', 'Yr', 'DayOfYr', 'Chi-Squared'], axis=1)
    cols = ['N', 'E', 'U', 'N sig', 'E sig', 'U sig', 'CorrNE', 'CorrNU', 'CorrEU']
    data[cols] = data[cols].astype(float)

    # Set the metadata last: `.name` is a plain attribute and is not carried over
    # by DataFrame operations such as set_index/drop.
    data.name = filename.replace("RawTrend.neu", "")
    data.attrs['geometry'] = geom
    data.attrs['height'] = height
    data.attrs['offsets'] = offsets

    return data

### Cleaning

In [3]:
def add_missing_dates(df):
    """
    Return a copy of `df` reindexed to a gap-free daily date range.

    The range spans from the earliest to the latest date in the index; days
    that were absent from `df` appear as rows of NaN. The input frame is left
    untouched (its index is not converted in place).

    Parameters:
    - df (pandas.DataFrame): Frame whose index holds dates (datetime-like or
      strings parseable by pd.to_datetime).

    Returns:
    - pandas.DataFrame: The reindexed frame. If `df` carries a `.name`
      attribute it is copied onto the result.
    """
    date_index = pd.to_datetime(df.index)
    full_date_range = pd.date_range(start=date_index.min(), end=date_index.max(), freq='D')

    # Work on a copy so the caller's index is not overwritten as a side effect
    dated = df.copy()
    dated.index = date_index
    df_full = dated.reindex(full_date_range)

    # `.name` is a plain attribute that reindex() does not carry over; it is
    # optional, so frames without it are accepted too.
    name = getattr(df, 'name', None)
    if name is not None:
        df_full.name = name
    return df_full

def clean_dataframes(dfs, missing_value_threshold=None):
    """
    Cleans the dataframes by:
    1. Removing dataframes without any coseismic offsets in any of the 3 components (n, e, u).
    2. Removing non-coseismic offsets from all components of the remaining dataframes.
    3. Filling date gaps with NaN rows (see add_missing_dates).
    4. Optionally removing dataframes whose share of missing values, computed over
       the N, E and U columns together after gap filling, exceeds a threshold.

    The input dataframes and their `attrs` are not modified; each returned
    dataframe carries its own filtered copy of the offsets metadata.

    Parameters:
    dfs (list): List of dataframes with GNSS data; each needs attrs['offsets'].
    missing_value_threshold (float, optional): Fraction (0 to 1) of allowed missing values.
                                               If exceeded, the dataframe is removed.

    Returns:
    list: Cleaned list of dataframes.
    """

    cleaned_dfs = []
    components = ['N', 'E', 'U']
    components_offsets = ['n', 'e', 'u']

    for org_df in dfs:
        station_offsets = org_df.attrs['offsets']

        # Build a filtered copy instead of editing the nested dict in place:
        # depending on the pandas version, attrs of a derived frame may share
        # nested objects with the frame it came from.
        coseismic_only = {}
        for comp in components_offsets:
            comp_info = dict(station_offsets[comp])  # keeps 'ps_decays' and any other keys
            comp_info['offsets'] = [
                offset for offset in station_offsets[comp]['offsets'] if offset['coseismic']
            ]
            coseismic_only[comp] = comp_info

        # Skip dataframe if no coseismic offsets in any component
        # (checked before reindexing so discarded stations cost nothing)
        if not any(coseismic_only[comp]['offsets'] for comp in components_offsets):
            continue

        df = add_missing_dates(org_df)
        df.attrs['offsets'] = {**station_offsets, **coseismic_only}

        # Check missing values for all components combined, if threshold is provided
        if missing_value_threshold is not None:
            missing_percentage = df[components].isna().to_numpy().mean()
            if missing_percentage > missing_value_threshold:
                continue  # Skip the dataframe if missing values exceed the threshold

        cleaned_dfs.append(df)

    return cleaned_dfs

### Chow test

In [4]:
def chow_test(df, alpha=0.01, treshold=1, direction='N', window_N=60):
    """
    Rolling Chow test for a structural break at the centre of a sliding window.

    For every window of `window_N` consecutive rows, a straight line is fitted
    to the whole window and to its left and right halves separately. The Chow
    statistic compares the pooled residual sum of squares with the sum of the
    two half-window residuals:

        cts = ((Sc - (S1 + S2)) / k) / ((S1 + S2) / (N1 + N2 - 2k)),  k = 2

    The statistic is divided by the F critical value at level `alpha`; the
    window centre is flagged as a break when that ratio exceeds `treshold`.

    Parameters:
    - df (pandas.DataFrame): Frame containing the column `direction`.
    - alpha (float): Significance level used for the F critical value.
    - treshold (float): Minimum ratio statistic / critical value to flag a break.
    - direction (str): Name of the column to test.
    - window_N (int): Window length in rows (default 60). Must be larger than 4
      so that the denominator degrees of freedom stay positive.

    Returns:
    - pandas.DataFrame indexed like `df` with columns:
        - 'break': 1 where a break was flagged, else 0 (integer)
        - 'score': the raw Chow statistic (not divided by the critical value)
          at each window centre, NaN where no full window is centred.

    Raises:
    - ValueError: if `window_N` is not larger than 2 * k.
    """
    k = 2  # parameters per linear fit (slope and intercept)
    if window_N <= 2 * k:
        raise ValueError(f"window_N must be greater than {2 * k}, got {window_N}")

    n = len(df)
    half = window_N // 2
    y = df[direction].to_numpy(dtype=float)

    # Everything below depends only on the window length, so compute it once
    # instead of once per window.
    x_full = np.arange(window_N)
    x_left = np.arange(half)
    x_right = np.arange(window_N - half)
    dfd = window_N - 2 * k  # N1 + N2 - 2k, with N1 + N2 == window_N
    c_value = f.ppf(q=1 - alpha, dfn=k, dfd=dfd)

    scores = np.full(n, np.nan)
    breaks = np.zeros(n, dtype=np.int64)

    def _residual_ss(x, values):
        """Residual sum of squares of a degree-1 least-squares fit."""
        coef = np.polyfit(x, values, 1)
        return np.sum((values - np.polyval(coef, x)) ** 2)

    for start in range(n - window_N + 1):
        window = y[start:start + window_N]

        Sc = _residual_ss(x_full, window)
        S1 = _residual_ss(x_left, window[:half])
        S2 = _residual_ss(x_right, window[half:])

        cts = ((Sc - (S1 + S2)) / k) / ((S1 + S2) / dfd)
        score = cts / c_value

        centre = start + half
        scores[centre] = cts
        # Windows containing NaN give a NaN statistic, which never exceeds the threshold
        if abs(score) > treshold:
            breaks[centre] = 1

    return pd.DataFrame({'break': breaks, 'score': scores}, index=df.index)

def apply_chow_test(stations_list, alpha=0.01, treshold=1):
    """
    Run chow_test on the N, E and U columns of every station.

    Parameters:
    - stations_list (list): Station dataframes containing 'N', 'E' and 'U' columns.
    - alpha (float): Passed through to chow_test.
    - treshold (float): Passed through to chow_test.

    Returns:
    - list of dict: One entry per station, in input order, with keys
      'station_idx' (position in `stations_list`) and 'breaks_n', 'breaks_e',
      'breaks_u' (the chow_test result for that column).
    """
    directions = ('N', 'E', 'U')
    result_list = []

    for idx, station_df in enumerate(stations_list):
        # Lightweight progress report every 100 stations
        if not idx % 100:
            print(f"Processing station {idx}...")

        station_result = {'station_idx': idx}
        station_result.update(
            (
                f'breaks_{direction.lower()}',
                chow_test(station_df[[direction]], alpha=alpha, treshold=treshold, direction=direction),
            )
            for direction in directions
        )
        result_list.append(station_result)

    return result_list

### Evaluation

In [12]:
def date_in_range(break_date_idx, offset_dates, threshold_days=30):
    """Return True when `break_date_idx` lies within `threshold_days` whole days of at least one date in `offset_dates`."""
    return any(
        abs((break_date_idx - known_date).days) <= threshold_days
        for known_date in offset_dates
    )


def evaluate_breaks(result_list, stations_list, threshold_days=30):
    """
    Score detected breaks against the known offsets of every station.

    Counting rules, applied per station and per component (n, e, u):
    - TP: a known offset with at least one detected break within
      `threshold_days` days. Each offset is counted once, no matter how many
      breaks fall inside its window.
    - FN: a known offset with no detected break inside its window.
      Hence TP + FN equals the number of known offsets.
    - FP: a detected break that is not within the window of any known offset.
    - TN: a non-break day that is not within the window of any known offset.
    Days inside an offset window that are not themselves counted as TP
    (extra breaks or non-break days) do not contribute to any counter.

    Parameters:
    - result_list (list of dict): Output of apply_chow_test; entry i must hold
      'breaks_n', 'breaks_e', 'breaks_u' frames with a 'break' column and a
      date index.
    - stations_list (list): Stations aligned with `result_list`; each needs
      attrs['offsets'][<component>]['offsets'] with 'date' as 'YYYY-MM-DD'.
    - threshold_days (int): Half-width of the matching window in days.

    Returns:
    - dict: Totals for 'TP', 'FP', 'FN' and 'TN' over all stations.
    """
    stats = {'TP': 0, 'FP': 0, 'FN': 0, 'TN': 0}

    for idx, station_result in enumerate(result_list):
        if idx % 100 == 0:
            print(f"Processing station {idx}...")

        offsets = stations_list[idx].attrs['offsets']

        for direction in ['n', 'e', 'u']:
            breaks_df = station_result[f'breaks_{direction}']
            dates = pd.to_datetime(breaks_df.index)
            flags = breaks_df['break'].to_numpy()
            is_break = flags == 1
            is_non_break = flags == 0

            # Days that fall inside the window of at least one known offset
            near_any_offset = np.zeros(len(breaks_df), dtype=bool)

            for offset in offsets[direction]['offsets']:
                offset_date = pd.Timestamp(datetime.strptime(offset['date'], '%Y-%m-%d'))
                day_gaps = np.abs((dates - offset_date).days.to_numpy())
                in_window = day_gaps <= threshold_days
                near_any_offset |= in_window

                # One verdict per offset: detected (TP) or missed (FN)
                if (in_window & is_break).any():
                    stats['TP'] += 1
                else:
                    stats['FN'] += 1

            stats['FP'] += int((is_break & ~near_any_offset).sum())
            stats['TN'] += int((is_non_break & ~near_any_offset).sum())

    return stats


## Calling the functions

### Choose the threshold for the input data

In [26]:
# Load every file in DATA_DIR. The entries are sorted because Path.iterdir()
# yields them in arbitrary order, and the position of a station in the list is
# what later ties `results[i]` to `cleaned_dfs[i]` (also across pickled runs).
dfs = []
data_path = Path(DATA_DIR)
for file_path in sorted(data_path.iterdir()):
    if file_path.is_file():
        dfs.append(read_file(file_path.name))

# Adjust threshold
cleaned_dfs = clean_dataframes(dfs, missing_value_threshold=0.05)
print(len(cleaned_dfs))

# Adjust path and name
save_dir = 'Storage/5_percent'
os.makedirs(save_dir, exist_ok=True)

with open(os.path.join(save_dir, 'cleaned_dfs.pkl'), 'wb') as file:
    pickle.dump(cleaned_dfs, file)

# To reuse a previous run instead of recomputing, load the pickle
# (only unpickle files you created yourself - pickle can execute code):
# with open(os.path.join(save_dir, 'cleaned_dfs.pkl'), 'rb') as file:
#     cleaned_dfs = pickle.load(file)

678


"\n#Adjust name\nwith open('storage/cleaned_dfs.pkl', 'rb') as file:\n    cleaned_dfs = pickle.load(file)\n    "

### Choose the threshold for the Chow test

In [None]:
# Adjust threshold
CHOW_ALPHA = 0.1
CHOW_TRESHOLD = 15
results = apply_chow_test(cleaned_dfs, alpha=CHOW_ALPHA, treshold=CHOW_TRESHOLD)

# The file name is derived from the parameters so it cannot drift from the
# values actually used, e.g. treshold=15, alpha=0.1 -> 'results15_a0_1.pkl'
results_name = f"results{CHOW_TRESHOLD}_a{str(CHOW_ALPHA).replace('.', '_')}.pkl"
with open(os.path.join(save_dir, results_name), 'wb') as file:
    pickle.dump(results, file)

# To reuse a previous run instead of recomputing (adjust the name):
# with open(os.path.join(save_dir, 'results.pkl'), 'rb') as file:
#     results = pickle.load(file)

Processing station 0...


### Choose the threshold for the number of days

In [25]:
# Adjust threshold
THRESHOLD_DAYS = 0
stats = evaluate_breaks(results, cleaned_dfs, threshold_days=THRESHOLD_DAYS)

print(f"True Positives: {stats['TP']}")
print(f"False Positives: {stats['FP']}")
print(f"False Negatives: {stats['FN']}")
print(f"True Negatives: {stats['TN']}")
stats_df = pd.DataFrame([stats])

# Adjust name. Layout: stats<chow treshold>_<days window>_a<alpha with '_' for '.'>.csv
# The days-window part is filled in from THRESHOLD_DAYS; the Chow treshold and
# alpha parts must match the `results` that were evaluated above.
stats_df.to_csv(os.path.join(save_dir, f'stats20_{THRESHOLD_DAYS}_a0_1.csv'), index=False)

# To reload previously saved statistics (adjust the name):
# stats_df = pd.read_csv('stats.csv')

Processing station 0...
Processing station 100...
Processing station 200...
Processing station 300...
Processing station 400...
Processing station 500...
Processing station 600...
True Positives: 351
False Positives: 604
False Negatives: 2321
True Negatives: 13790016


"\n#Adjust name\nstats_df = pd.read_csv('stats.csv')\n"

Old version: If multiple breaks are detected within a close window around a single true offset, the sum of true positives (TP) and false negatives (FN) may increase beyond the actual number of true offsets.

Now: When you increase the threshold, you're expanding the range around each offset in which a predicted break can be considered a true positive (TP).
If there is a break predicted within the threshold window of an offset, it will now count as a true positive, whereas before it might have been considered a false positive (FP) or a false negative (FN).


## Appendix

In [21]:
def calculate_metrics(tp, fp, fn, tn):
    """
    Derive precision, recall, F1 and accuracy from confusion-matrix counts.

    Any metric whose denominator is not positive is reported as 0.

    Returns:
    - tuple: (precision, recall, f1, accuracy)
    """
    def ratio(numerator, denominator):
        # Guard against empty denominators instead of dividing by zero
        if denominator > 0:
            return numerator / denominator
        return 0

    precision = ratio(tp, tp + fp)
    recall = ratio(tp, tp + fn)
    f1 = ratio(2 * precision * recall, precision + recall)
    accuracy = ratio(tp + tn, tp + tn + fp + fn)
    return precision, recall, f1, accuracy

In [27]:
outliers_percentage = save_dir.split('/')[-1].replace('_percent', '%')

# Stats files are named stats<chow treshold>_<days window>[_a<alpha, '.' as '_'>][_...].csv
stats_name_pattern = re.compile(r'^stats(\d+)_(\d+)(?:_a(\d+)_(\d+))?(?:_.*)?\.csv$')

# Get all 'stats' csv files in the directory (sorted for a reproducible order)
files = sorted(name for name in os.listdir(save_dir) if name.startswith('stats') and name.endswith('.csv'))

# Extract chow threshold, day window and alpha from file names and store along with the file
file_info = []
for file in files:
    name_match = stats_name_pattern.match(file)
    if name_match is None:
        # e.g. a plain 'stats.csv' carries no parameters to report
        print(f"Skipping {file}: file name does not follow stats<chow>_<days>[_a<alpha>].csv")
        continue
    chow_part, days_part, alpha_whole, alpha_frac = name_match.groups()
    alpha = float(f"{alpha_whole}.{alpha_frac}") if alpha_whole is not None else None
    file_info.append((int(chow_part), int(days_part), alpha, file))

# Sort by chow threshold, then day window, then alpha (files without alpha first)
file_info_sorted = sorted(file_info, key=lambda x: (x[0], x[1], -1.0 if x[2] is None else x[2], x[3]))

# Iterate over each file (sorted) and process
for chow_treshold, days_window, alpha, file in file_info_sorted:
    file_path = os.path.join(save_dir, file)

    # Read the CSV file
    df = pd.read_csv(file_path)

    # Assume the csv has columns ['TP', 'FP', 'FN', 'TN']
    tp = df['TP'].sum()
    fp = df['FP'].sum()
    fn = df['FN'].sum()
    tn = df['TN'].sum()

    # Calculate metrics
    precision, recall, f1, accuracy = calculate_metrics(tp, fp, fn, tn)

    # Print the results. Alpha is included so runs that differ only in alpha
    # are distinguishable in the report.
    alpha_label = 'n/a' if alpha is None else alpha
    print("Experiment Parameters:")
    print(f"Outliers: <{outliers_percentage}, Chow Treshold: {chow_treshold}, Days window: {days_window}, Alpha: {alpha_label}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print("-" * 50)  # Separator for readability

Experiment Parameters:
Outliers: <5%, Chow Treshold: 10, Days window: 0
Precision: 0.0494
Recall: 0.2242
F1-Score: 0.0809
Accuracy: 0.9990
--------------------------------------------------
Experiment Parameters:
Outliers: <5%, Chow Treshold: 15, Days window: 0
Precision: 0.1460
Recall: 0.1755
F1-Score: 0.1594
Accuracy: 0.9996
--------------------------------------------------
Experiment Parameters:
Outliers: <5%, Chow Treshold: 20, Days window: 0
Precision: 0.2695
Recall: 0.1471
F1-Score: 0.1903
Accuracy: 0.9998
--------------------------------------------------
Experiment Parameters:
Outliers: <5%, Chow Treshold: 20, Days window: 0
Precision: 0.3675
Recall: 0.1314
F1-Score: 0.1935
Accuracy: 0.9998
--------------------------------------------------
Experiment Parameters:
Outliers: <5%, Chow Treshold: 20, Days window: 1
Precision: 0.4232
Recall: 0.2148
F1-Score: 0.2849
Accuracy: 0.9998
--------------------------------------------------
Experiment Parameters:
Outliers: <5%, Chow Treshol

In [82]:
def count_total_offsets(stations_list):
    """
    Count the offsets stored in the metadata of all stations.

    For each station, the 'offsets' lists of every component found under
    attrs['offsets'] are counted; stations without that entry contribute 0.

    Returns:
    - int: Total number of offsets across all stations and components.
    """
    return sum(
        len(component['offsets'])
        for station_df in stations_list
        for component in station_df.attrs.get('offsets', {}).values()
    )

# Number of known offsets left after cleaning. Presumably meant as a cross-check
# for the evaluation: TP + FN reported there should add up to this figure when
# every offset is counted exactly once - TODO confirm for threshold_days > 0.
total_offsets = count_total_offsets(cleaned_dfs)
print(f"Total number of offsets found in cleaned_dfs: {total_offsets}")


Total number of offsets found in cleaned_dfs: 2672
