In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from scipy.stats import zscore
from sklearn.ensemble import IsolationForest

import os

In [2]:
# Input: folder and file name of the metadata table.
input_dir = '../01_data/00_metadata'
filename = 'metadata_chile.tsv'
filepath = os.path.join(input_dir, filename)

# Output: folder for the outlier-analysis results. It is created up front;
# exist_ok=True makes re-running this cell safe when it already exists.
out_dir = '../03_results/out_metadata_outliers'
os.makedirs(out_dir, exist_ok=True)

In [3]:
# Read the tab-separated metadata table; its first column becomes the row index.
md = pd.read_csv(filepath, sep='\t', index_col=0)

# Continuous variables that the following cells summarise and screen for outliers.
cont_variables = [
    'Depth [m]',
    'Temperature [ºC]',
    'Salinity [PSU]',
    'Density [kg/m3]',
    'Oxygen [ml/l]',
    'Oxygen [%]',
    'Fluorescence [mg/m3]',
    'Orthophosphate [uM]',
    'Silicic-acid [uM]',
    'Nitrite [uM]',
    'Nitrates [uM]',
    'Nitrate [uM]',
    'NP ratio',
]

# Preview the first rows of the selected columns.
md[cont_variables].head()

Unnamed: 0_level_0,Depth [m],Temperature [ºC],Salinity [PSU],Density [kg/m3],Oxygen [ml/l],Oxygen [%],Fluorescence [mg/m3],Orthophosphate [uM],Silicic-acid [uM],Nitrite [uM],Nitrates [uM],Nitrate [uM],NP ratio
Samples,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
S01_Z00,2,11.272,24.72,1018.754,6.665,101.623,13.091,1.206,7.38,0.03,12.382,12.353,10.267
S01_Z01,20,10.569,29.59,1022.73,5.856,90.689,4.438,0.389,1.483,0.097,2.133,2.037,5.484
S01_Z05,90,9.06,32.82,1025.818,4.836,73.974,1.328,1.246,3.997,0.024,13.68,13.656,10.979
S02_Z00,2,11.465,26.58,1020.165,6.43,99.611,11.013,0.597,3.29,0.082,6.825,6.743,11.432
S02_Z01,10,11.426,26.774,1020.357,6.413,99.384,9.846,0.287,1.268,0.065,0.3,0.235,1.047


In [4]:
# Summary statistics (Series.describe) for every listed variable that is
# actually present in the table; variables missing from `md` are skipped.
desc_stats = {
    name: md[name].describe()
    for name in cont_variables
    if name in md.columns
}

# One column per variable, one row per statistic.
stats_df = pd.concat(desc_stats, axis=1)

# Save the summary as a tab-separated file in the results folder.
out_filename = 'env_continuous_description.tsv'
output_filepath = os.path.join(out_dir, out_filename)
stats_df.to_csv(output_filepath, sep='\t')

# Display where the summary was written.
output_filepath

'../03_results/out_metadata_outliers/env_continuous_description.tsv'

In [5]:
# Z-score Method
def detect_outliers_zscore(data, threshold=3):
    """Flag values whose absolute z-score is larger than ``threshold``.

    Parameters
    ----------
    data : array-like
        One-dimensional numeric values. Missing values should be dropped
        beforehand: ``scipy.stats.zscore`` is called with its default
        settings, under which NaNs in the input propagate into the scores
        and NaN scores never compare greater than the threshold.
    threshold : float, default 3
        Cutoff applied to ``|z|``. The default keeps the previous
        hard-coded behaviour.

    Returns
    -------
    Boolean mask with one entry per value of ``data``; True marks an outlier.
    """
    z_scores = zscore(data)
    abs_z_scores = np.abs(z_scores)
    return abs_z_scores > threshold

# Tukey's Method (IQR)
def detect_outliers_iqr(data, k=1.5):
    """Flag values lying outside Tukey's fences.

    A value is an outlier when it is below ``Q1 - k * IQR`` or above
    ``Q3 + k * IQR``, where ``Q1``/``Q3`` are the 25th/75th percentiles of
    ``data`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.Series
        Numeric values (the ``quantile`` method is required).
    k : float, default 1.5
        Fence multiplier. The default keeps the previous hard-coded behaviour.

    Returns
    -------
    pandas.Series of bool, indexed like ``data``; True marks an outlier.
    """
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - k * iqr
    upper_fence = q3 + k * iqr
    return (data < lower_fence) | (data > upper_fence)

# Isolation Forest
def detect_outliers_iforest(data, n_estimators=100, contamination='auto', random_state=42):
    """Flag outliers in a single variable with an Isolation Forest.

    Parameters
    ----------
    data : array-like
        One-dimensional values; they are reshaped into a single-feature
        column before fitting.
    n_estimators, contamination, random_state
        Passed unchanged to ``IsolationForest``. The defaults reproduce the
        previously hard-coded settings; the fixed ``random_state`` keeps
        results repeatable between runs.

    Returns
    -------
    numpy.ndarray of bool, one entry per value; True where the model
    predicts -1 (its label for an outlier).
    """
    values = np.asarray(data).reshape(-1, 1)
    # A variable with no observations (e.g. an all-NaN column after dropna)
    # cannot be fitted -- scikit-learn's input validation rejects arrays with
    # zero samples -- so report "no outliers" instead of raising.
    if values.shape[0] == 0:
        return np.zeros(0, dtype=bool)
    iforest = IsolationForest(
        n_estimators=n_estimators,
        contamination=contamination,
        random_state=random_state,
    )
    preds = iforest.fit_predict(values)
    return preds == -1

# Run every detection method on every available variable.
# The keys of `detectors` become the per-method keys in `outlier_results`.
detectors = {
    'Z-score': detect_outliers_zscore,
    'IQR': detect_outliers_iqr,
    'Isolation Forest': detect_outliers_iforest,
}

outlier_results = {}
for var in cont_variables:
    if var not in md.columns:
        continue
    # The detectors expect complete data, so missing values are removed first.
    data = md[var].dropna()
    # For each method keep only the observations it flagged (index labels preserved).
    outlier_results[var] = {
        method: data[detect(data)]
        for method, detect in detectors.items()
    }


In [6]:
# Maps each detection method (key used in `outlier_results`) to the name of
# its boolean flag column in the per-variable tables.
method_columns = {
    'Z-score': 'Outlier_Zscore',
    'IQR': 'Outlier_IQR',
    'Isolation Forest': 'Outlier_IsolationForest',
}

# One table per variable: rows follow md's index, one boolean column per method.
outliers_dfs = {}

for var in cont_variables:
    if var not in md.columns:
        continue

    # Start with every sample marked as "not an outlier".
    outliers_df = pd.DataFrame(False, index=md.index, columns=list(method_columns.values()))

    # Switch the flag on for the samples each method reported for this variable.
    detected = outlier_results.get(var, {})
    for method, column in method_columns.items():
        if method in detected:
            outliers_df.loc[detected[method].index, column] = True

    outliers_dfs[var] = outliers_df

In [7]:
# Export each variable's outlier table to its own tab-separated file.
#
# File names were previously built from the first word of the variable name
# only, so 'Oxygen [ml/l]' and 'Oxygen [%]' both became 'outliers_Oxygen.tsv'
# and the second export silently overwrote the first.
def _outlier_file_stems(variables):
    """Return a unique, filesystem-friendly file stem for each variable name.

    A variable whose first word is unique keeps that word as its stem, so
    its file name is the same as before. Variables sharing a first word get
    a stem built from the whole name instead ('%' spelled out as 'pct',
    every other non-alphanumeric ASCII run collapsed to '_'). A numeric
    suffix is appended if two stems would still coincide.
    """
    first_words = [name.split(' ')[0] for name in variables]
    stems = {}
    used = set()
    for name, first_word in zip(variables, first_words):
        if first_words.count(first_word) == 1:
            stem = first_word
        else:
            readable = name.replace('%', 'pct')
            cleaned = ''.join(
                ch if (ord(ch) < 128 and ch.isalnum()) else '_' for ch in readable
            )
            stem = '_'.join(part for part in cleaned.split('_') if part)
        # Last-resort guard so that no two variables ever share a file.
        candidate, suffix = stem, 2
        while candidate in used:
            candidate = f'{stem}_{suffix}'
            suffix += 1
        used.add(candidate)
        stems[name] = candidate
    return stems


file_stems = _outlier_file_stems(list(outliers_dfs))
for var, df in outliers_dfs.items():
    var_temp = file_stems[var]
    out_path = os.path.join(out_dir, f'outliers_{var_temp}.tsv')
    df.to_csv(out_path, sep='\t')