In [69]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from scipy.stats import zscore
from sklearn.ensemble import IsolationForest

import os

In [70]:
# Directory holding the input matrices and directory receiving metadata outputs.
input_dir = '../00_matrices'
out_dir = '../../out_results/metadata'

# Metadata table loaded by the next cell.
filename = 'metadata_chile.tsv'
filepath = os.path.join(input_dir, filename)

# Create the output directory up front; an already existing one is not an error.
os.makedirs(out_dir, exist_ok=True)

In [71]:
# Read the tab-separated metadata table; its first column becomes the index.
md = pd.read_csv(
    filepath,
    sep='\t',
    index_col=0,
)

# Continuous variables that the following cells describe and screen for outliers.
cont_variables = [
    'Depth',
    'Temperature',
    'Salinity',
    'Density',
    'Oxygen',
    'Fluorescence',
    'Orthophosphate',
    'Silicic acid',
    'Nitrite',
    'Nitrates',
    'Nitrate',
    'NP',
]

# NOTE(review): the preview mixes values such as 1.206 and 389.0 within the same
# column (e.g. Orthophosphate, Nitrite) -- possibly a decimal-parsing problem in
# the source file; confirm before trusting the outlier screening below.
md.loc[:, cont_variables].head()

Unnamed: 0_level_0,Depth,Temperature,Salinity,Density,Oxygen,Fluorescence,Orthophosphate,Silicic acid,Nitrite,Nitrates,Nitrate,NP
Samples,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
S01_Z00,2.0,11.272,24.72,1018.754,6.665,13.091,1.206,7.38,0.03,12.382,12.353,10.267
S01_Z01,20.0,10.569,29.59,1022.73,5.856,4.438,389.0,1.483,97.0,2.133,2.037,5.484
S01_Z05,90.0,9.06,32.82,1025.818,4.836,1.328,1.246,3.997,24.0,13.68,13.656,10.979
S02_Z00,2.0,11.465,26.58,1020.165,6.43,11.013,597.0,3.29,82.0,6.825,6.743,11.432
S02_Z01,10.0,11.426,26.774,1020.357,6.413,9.846,287.0,1.268,65.0,0.3,235.0,1.047


In [72]:
# Per-variable summary statistics (Series.describe); variables that are not
# columns of the metadata table are skipped.
desc_stats = {
    var: md[var].describe()
    for var in cont_variables
    if var in md.columns
}

# Side-by-side table: one column per variable, one row per statistic.
stats_df = pd.concat(desc_stats, axis=1)

# Write the table as a tab-separated file in the output directory.
out_filename = 'env_continuous_description.tsv'
output_filepath = os.path.join(out_dir, out_filename)
stats_df.to_csv(output_filepath, sep='\t')

# Show where the file was written.
output_filepath

'../../out_results/metadata/env_continuous_description.tsv'

In [73]:
# Z-score Method
def detect_outliers_zscore(data, threshold=3):
    """Flag values whose absolute z-score exceeds ``threshold``.

    Parameters
    ----------
    data : array-like of numbers
        Values to screen. NaN values should be removed beforehand: the
        z-scores are computed with ``scipy.stats.zscore`` using its default
        NaN handling.
    threshold : float, default 3
        Cut-off applied to the absolute z-score. The default reproduces the
        previously hard-coded value.

    Returns
    -------
    Boolean mask with one entry per element of ``data``; ``True`` marks an
    outlier.
    """
    z_scores = zscore(data)
    abs_z_scores = np.abs(z_scores)
    # Strict comparison: a value exactly at the threshold is not flagged.
    return abs_z_scores > threshold

# Tukey's Method (IQR)
def detect_outliers_iqr(data, k=1.5):
    """Flag values lying outside Tukey's fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where ``Q1`` and
    ``Q3`` are the 25th and 75th percentiles of ``data`` and
    ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.Series
        Values to screen (must provide ``.quantile``).
    k : float, default 1.5
        Multiplier of the inter-quartile range. The default reproduces the
        previously hard-coded value.

    Returns
    -------
    Boolean mask aligned with ``data``; ``True`` marks an outlier.
    """
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - k * iqr
    upper_fence = q3 + k * iqr
    # Strict comparisons: values exactly on a fence are not flagged.
    return (data < lower_fence) | (data > upper_fence)

# Isolation Forest
def detect_outliers_iforest(data):
    """Return a boolean mask marking the values an Isolation Forest labels -1.

    ``data`` must expose ``.values``; it is reshaped into a single-feature
    column before fitting. The forest uses 100 estimators, automatic
    contamination and a fixed random seed (42).
    """
    single_feature = data.values.reshape(-1, 1)
    model = IsolationForest(
        n_estimators=100,
        contamination='auto',
        random_state=42,
    )
    labels = model.fit_predict(single_feature)
    # Flag exactly the samples labelled -1.
    return labels == -1

# Applying methods to each variable
# Detectors in the order they are run; the keys are the method names stored
# in `outlier_results`.
outlier_methods = {
    'Z-score': detect_outliers_zscore,
    'IQR': detect_outliers_iqr,
    'Isolation Forest': detect_outliers_iforest,
}

# outlier_results[var][method] holds the values of `var` flagged by `method`,
# indexed by the original sample labels.
outlier_results = {}
for var in cont_variables:
    if var not in md.columns:
        continue  # variable absent from the metadata table

    data = md[var].dropna()  # Ensure no NaN values

    if data.empty:
        # A column with no observed values leaves nothing to screen, and
        # fitting the Isolation Forest on zero samples raises an error.
        # Record "no outliers" for every method instead of aborting the cell.
        outlier_results[var] = {method: data.copy() for method in outlier_methods}
        continue

    outlier_results[var] = {
        method: data[detect(data)]
        for method, detect in outlier_methods.items()
    }


In [74]:
# Map each detection method (key used in outlier_results) to its flag column.
method_columns = {
    'Z-score': 'Outlier_Zscore',
    'IQR': 'Outlier_IQR',
    'Isolation Forest': 'Outlier_IsolationForest',
}

# One boolean DataFrame per variable, indexed like md.
outliers_dfs = {}

for var in cont_variables:
    if var not in md.columns:
        continue

    # Start with every sample marked as "not an outlier" for every method.
    flags = pd.DataFrame(
        False,
        index=md.index,
        columns=list(method_columns.values()),
    )

    # Switch on the flag for the samples each method reported, when available.
    detected = outlier_results.get(var, {})
    for method, column in method_columns.items():
        if method in detected:
            flags.loc[detected[method].index, column] = True

    # Keep the completed table for this variable.
    outliers_dfs[var] = flags

In [78]:
# Write one tab-separated file per variable into the output directory,
# named outliers_<variable>.tsv.
for variable, flags in outliers_dfs.items():
    flags.to_csv(os.path.join(out_dir, f'outliers_{variable}.tsv'), sep='\t')