In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


In [3]:
# Load the labelled feature table from parquet; rows containing NaNs are
# filtered out in a later cell.
combined_df_nans = pd.read_parquet(
    path="/cluster/home/maikents/sinmod_features_masters/dataframes_pipeline_6/total_df_with_nans_and_labels.parquet"
)

In [4]:
# Keep only the rows in which every column holds a value.
combined_df_no_nans = combined_df_nans.dropna(axis=0, how="any")

In [7]:
# List the columns currently present in the NaN-free frame.
# NOTE(review): this cell carries execution count 7 while the column-drop cell
# further down carries 6, so the saved output appears to show the frame *after*
# the drop. On a fresh Restart & Run All this print runs before the drop and
# would list more columns -- confirm the intended cell order.
print(combined_df_no_nans.columns)

Index(['x', 'y', 'bathymetry', 'slope', 'fine_BPI_std', 'broad_BPI_std',
       'log_ruggedness', 'aspect_cos', 'aspect_sin', 'temperature_min',
       'temperature_mean', 'salinity_mean', 'current_speed_mean',
       'statistical_northness', 'statistical_eastness', 'current_aspect_angle',
       'chlor_a_mean', 'sediment_nitrate_10th_percentile',
       'sediment_silicate_10th_percentile',
       'sediment_silicate_90th_percentile', 'labels'],
      dtype='object')


In [6]:
# Remove features after correlation analysis.
#
# The frame is reassigned to its own name, so without errors="ignore" a second
# execution of this cell raises KeyError (the columns are already gone).
# With it, re-running is a harmless no-op. Only `columns=` is passed; the
# previous extra `axis=1` was redundant alongside it.
combined_df_no_nans = combined_df_no_nans.drop(
    columns=[
        'chlor_a_10th_percentile',
        'sediment_nitrate_mean',
        'sediment_nitrate_90th_percentile',
        'temperature_max',
        'salinity_10th_percentile',
        'salinity_90th_percentile',
        'current_speed_90th_percentile',
        'chlor_a_90th_percentile',
        'sediment_silicate_mean',
    ],
    errors="ignore",
)

In [8]:
# Subset of rows whose label equals 1.
presence_df = combined_df_no_nans.loc[combined_df_no_nans['labels'].eq(1)]

In [9]:
# Subset of rows whose label equals 0, then a seeded random draw of 2000 of
# them (sampling without replacement, so at least 2000 such rows must exist).
absence_df = combined_df_no_nans.loc[combined_df_no_nans['labels'].eq(0)]
downsampled_absences = absence_df.sample(random_state=42, n=2000)

In [10]:
# Stack the sampled label-0 rows and all label-1 rows into one frame,
# keeping each row's original index.
balanced_df = pd.concat(
    [downsampled_absences, presence_df],
    axis=0,
    ignore_index=False,
)

In [11]:
# Target vector and predictor matrix. The predictors exclude the target
# itself as well as the x / y columns.
y = balanced_df['labels']
X = balanced_df.drop(['labels', 'x', 'y'], axis=1)

In [12]:
# Seeded random hold-out split: 20 % of the rows go to the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [13]:
# Persist the four pieces of the split as CSV, without the index column.
# NOTE(review): the file names end in `_spatial`, yet the split written here
# was made without any spatial grouping -- confirm the naming is intended.
X_train.to_csv(
    path_or_buf="/cluster/home/maikents/sinmod_features_masters/ready_for_training/pipeline6_v2/X_train_spatial.csv",
    index=False,
)
X_test.to_csv(
    path_or_buf="/cluster/home/maikents/sinmod_features_masters/ready_for_training/pipeline6_v2/X_test_spatial.csv",
    index=False,
)
y_train.to_csv(
    path_or_buf="/cluster/home/maikents/sinmod_features_masters/ready_for_training/pipeline6_v2/y_train_spatial.csv",
    index=False,
)
y_test.to_csv(
    path_or_buf="/cluster/home/maikents/sinmod_features_masters/ready_for_training/pipeline6_v2/y_test_spatial.csv",
    index=False,
)

In [14]:
# Spatial block CV: lay a 3 (x) by 2 (y) grid over the coordinate extent and
# give every row the id of the grid cell it falls into.
n_x_bins, n_y_bins = 3, 2

# An integer `bins` makes pd.cut build equal-width intervals over the column's
# range; labels=False returns the integer bin index instead of an Interval.
balanced_df['x_bin'] = pd.cut(x=balanced_df['x'], bins=n_x_bins, labels=False)
balanced_df['y_bin'] = pd.cut(x=balanced_df['y'], bins=n_y_bins, labels=False)

# Row-major cell id: y_bin * n_x_bins + x_bin.
balanced_df['spatial_block'] = balanced_df['x_bin'] + n_x_bins * balanced_df['y_bin']



In [15]:
# Predictor matrix, target and block ids for the spatially-aware split.
#
# The helper columns x_bin / y_bin (added to balanced_df when the spatial
# blocks were built) are dropped together with x, y and spatial_block.
# Previously they stayed in X and were exported as features; they are coarse
# coordinates that fully determine spatial_block, which is itself excluded.
X = balanced_df.drop(
    columns=['labels', 'spatial_block', 'x_bin', 'y_bin', 'x', 'y']
)
y = balanced_df['labels']
groups = balanced_df['spatial_block']

# stratify=groups makes the split preserve each spatial block's proportion in
# both the train and the test part; the block ids are split alongside X and y
# so they stay row-aligned.
X_train, X_test, y_train, y_test, groups_train, groups_test = train_test_split(
    X, y, groups, test_size=0.2, random_state=42, stratify=groups
)


In [16]:
# Persist the stratified split and the matching block ids as CSV, without the
# index column.
X_train.to_csv(
    path_or_buf="/cluster/home/maikents/sinmod_features_masters/ready_for_training/pipeline6/X_train_spatial.csv",
    index=False,
)
X_test.to_csv(
    path_or_buf="/cluster/home/maikents/sinmod_features_masters/ready_for_training/pipeline6/X_test_spatial.csv",
    index=False,
)
y_train.to_csv(
    path_or_buf="/cluster/home/maikents/sinmod_features_masters/ready_for_training/pipeline6/y_train_spatial.csv",
    index=False,
)
y_test.to_csv(
    path_or_buf="/cluster/home/maikents/sinmod_features_masters/ready_for_training/pipeline6/y_test_spatial.csv",
    index=False,
)
groups_train.to_csv(
    path_or_buf="/cluster/home/maikents/sinmod_features_masters/ready_for_training/pipeline6/groups_train_spatial.csv",
    index=False,
)
groups_test.to_csv(
    path_or_buf="/cluster/home/maikents/sinmod_features_masters/ready_for_training/pipeline6/groups_test_spatial.csv",
    index=False,
)
