In [103]:
import os
import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances, silhouette_score
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist
import warnings

In [2]:
warnings.filterwarnings('ignore')

In [13]:
def load_and_clean(npy_filename, y_filename, x_col, target_col, data_dir=None):
    """
    Load a feature array and its label array, drop every row whose features
    contain a NaN (applying the same row mask to the labels), and return both
    as DataFrames.

    Parameters
    ----------
    npy_filename : str
        File name of the feature array, joined onto the data directory.
    y_filename : str
        File name of the label array, joined onto the data directory.
    x_col : list of str
        Column names assigned to the feature DataFrame.
    target_col : list of str
        Column names assigned to the label DataFrame.
    data_dir : str, optional
        Directory that holds both files. When omitted, the notebook-level
        global ``folder`` is used, which keeps existing calls working; passing
        it explicitly makes the call independent of cell execution order.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(df_X, df_y)``: the NaN-free feature rows and the matching labels
        cast to ``int``.

    Raises
    ------
    ValueError
        If the two arrays do not have the same number of rows.
    """
    if data_dir is None:
        # Backward-compatible fallback to the global set by the calling cell.
        data_dir = folder

    # Load numpy arrays
    X = np.load(os.path.join(data_dir, npy_filename))
    # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load
    # label files that come from a trusted source.
    y = np.load(os.path.join(data_dir, y_filename), allow_pickle=True)

    # Ensure the number of rows matches between X and y
    if X.shape[0] != y.shape[0]:
        raise ValueError("The number of rows in {} and {} do not match.".format(npy_filename, y_filename))

    # Keep only the rows of X that contain no NaN, and the same rows of y
    valid_rows = ~np.isnan(X).any(axis=1)

    df_X = pd.DataFrame(X[valid_rows])
    df_y = pd.DataFrame(y[valid_rows])
    df_X.columns = x_col
    df_y.columns = target_col

    # NOTE(review): astype(int) truncates non-integer labels -- confirm that
    # every dataset passed through here really has integer-valued targets.
    df_y[target_col] = df_y[target_col].astype(int)
    return df_X, df_y

In [107]:
def analyze_dataset(df_num, dataset_name):
    """
    Score how strongly the numeric features of a dataset relate to each other.

    The numeric columns are standardized, then five metrics are computed and
    combined into one weighted "suitability" score:

    * mean absolute pairwise correlation (weight 0.35)
    * silhouette of a 2-cluster Ward clustering of the features (weight 0.26)
    * number of features / 100, capped at 1 (weight 0.13)
    * mean correlation between adjacent columns (weight 0.13)
    * variance of the mean pairwise feature products (weight 0.13)

    A summary line and a table of the average absolute correlation of every
    feature with all other features are printed.

    Parameters
    ----------
    df_num : pandas.DataFrame
        Input data; non-numeric columns are ignored.
    dataset_name : str
        Label used in the printed summary and in the returned dict.

    Returns
    -------
    dict
        Keys: 'dataset', 'n_samples', 'n_num_features', 'mean_corr',
        'std_corr', 'silhouette', 'feature_continuity', 'interaction_var',
        'suitability_score', 'category'.

    Raises
    ------
    ValueError
        If fewer than two numeric columns are available (pairwise metrics
        are undefined in that case).
    """
    df_num = df_num.select_dtypes(include=[np.number])
    n_features = df_num.shape[1]
    if n_features < 2:
        raise ValueError(
            f"analyze_dataset needs at least 2 numeric features, "
            f"got {n_features} for dataset {dataset_name!r}"
        )

    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df_num)

    # Correlation Analysis: statistics over the strict upper triangle so each
    # feature pair is counted once and the diagonal is excluded.
    corr_matrix = pd.DataFrame(df_scaled, columns=df_num.columns).corr().abs()
    upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    upper_values = upper_tri.stack()
    mean_corr = upper_values.mean()
    std_corr = upper_values.std()

    # Distance + Feature Clustering (features are the clustered "samples")
    dist_matrix = pairwise_distances(df_scaled.T, metric='euclidean')
    dist_condensed = pdist(df_scaled.T)
    linkage_matrix = linkage(dist_condensed, method='ward')
    clusters = fcluster(linkage_matrix, t=2, criterion='maxclust')
    # silhouette_score only accepts 2 <= n_labels <= n_samples - 1; with two
    # features and two clusters it would raise, so fall back to 0 there too.
    n_clusters = len(set(clusters))
    if 1 < n_clusters < n_features:
        silhouette = silhouette_score(dist_matrix, clusters, metric='precomputed')
    else:
        silhouette = 0

    # Feature Continuity Score: average adjacent feature correlation (simulate 1D layout locality)
    adjacent_corr = np.array([
        np.corrcoef(df_scaled[:, i], df_scaled[:, i + 1])[0, 1]
        for i in range(n_features - 1)
    ])
    # A constant column yields NaN correlations; average only the defined
    # pairs so a single NaN does not turn the whole score into NaN.
    defined_corr = adjacent_corr[~np.isnan(adjacent_corr)]
    feature_continuity = np.mean(defined_corr) if defined_corr.size else 0.0

    # Feature interaction proxy: variance of pairwise feature product means
    interaction_values = [
        np.mean(df_scaled[:, i] * df_scaled[:, j])
        for i in range(n_features)
        for j in range(i + 1, n_features)
    ]
    interaction_var = np.var(interaction_values)

    # Normalized metrics: every component is clamped into [0, 1]
    norm_corr = min(mean_corr / 1.0, 1.0)
    norm_silhouette = max(silhouette, 0)
    norm_dim = min(n_features / 100, 1.0)
    norm_continuity = max(min(feature_continuity, 1), 0)
    norm_interaction = min(interaction_var / 1.0, 1.0)

    # Weighted Suitability Score (weights sum to 1.0)
    suitability_score = (
        0.35 * norm_corr +
        0.26 * norm_silhouette +
        0.13 * norm_dim +
        0.13 * norm_continuity +
        0.13 * norm_interaction
    )

    if suitability_score >= 0.66:
        category = 'High Correlation Suitability'
    elif suitability_score >= 0.33:
        category = 'Medium Correlation Suitability'
    else:
        category = 'Low Correlation Suitability'

    result = {
        'dataset': dataset_name,
        'n_samples': df_scaled.shape[0],
        'n_num_features': n_features,
        'mean_corr': float(mean_corr),
        'std_corr': float(std_corr),
        'silhouette': float(silhouette),
        'feature_continuity': float(feature_continuity),
        'interaction_var': float(interaction_var),
        'suitability_score': float(suitability_score),
        'category': category
    }

    print(f"[DONE] {dataset_name}: Score={suitability_score:.3f} → {category}")

    # Average absolute correlation per feature, excluding self-correlation.
    # mask() returns a new frame, so this does not depend on `.values` being a
    # writable view (it is read-only under pandas Copy-on-Write).
    off_diagonal = corr_matrix.mask(np.eye(n_features, dtype=bool))
    feature_avg_corr = off_diagonal.mean(axis=1).sort_values(ascending=False)
    # Show as a table
    feature_table = pd.DataFrame({
        'Feature': feature_avg_corr.index,
        'Avg Absolute Correlation': feature_avg_corr.values
    })

    print("\n=== Average Absolute Correlation Per Feature ===")
    print(feature_table.to_string(index=False))

    return result

In [15]:
# Column name(s) given to the label DataFrame built by load_and_clean.
target_col = ["class"]

In [108]:
# Dataset under data/treasury. `folder` is a global that load_and_clean reads
# to locate the .npy files, so it must be set before the call.
folder = "data/treasury"
x_col = [
    "1Y-CMaturityRate",
    "30Y-CMortgageRate",
    "3M-Rate-AuctionAverage",
    "3M-Rate-SecondaryMarket",
    "3Y-CMaturityRate",
    "5Y-CMaturityRate",
    "bankCredit",
    "currency",
    "demandDeposits",
    "federalFunds",
    "moneyStock",
    "checkableDeposits",
    "loansLeases",
    "savingsDeposits",
    "tradeCurrencies",
]
# Only the features are analysed; the label frame is discarded.
X_train, _ = load_and_clean("N_train.npy", "y_train.npy", x_col, target_col)
analyze_dataset(X_train, folder)

[DONE] data/treasury: Score=0.666 → High Correlation Suitability

=== Average Absolute Correlation Per Feature ===
                Feature  Avg Absolute Correlation
 3M-Rate-AuctionAverage                  0.881946
      checkableDeposits                  0.872843
             bankCredit                  0.871836
       5Y-CMaturityRate                  0.866276
      30Y-CMortgageRate                  0.844544
        tradeCurrencies                  0.838322
            loansLeases                  0.834138
3M-Rate-SecondaryMarket                  0.826873
       3Y-CMaturityRate                  0.825911
               currency                  0.825009
             moneyStock                  0.816039
        savingsDeposits                  0.815735
         demandDeposits                  0.801799
           federalFunds                  0.801674
       1Y-CMaturityRate                  0.562426


{'dataset': 'data/treasury',
 'n_samples': 671,
 'n_num_features': 15,
 'mean_corr': 0.8190246831610097,
 'std_corr': 0.1340964965385351,
 'silhouette': 0.7702299002045945,
 'feature_continuity': 0.5387911062077669,
 'interaction_var': 0.688590223117676,
 'suitability_score': 0.6659779859718555,
 'category': 'High Correlation Suitability'}

In [109]:
# Dataset under data/puma8NH. `folder` is a global that load_and_clean reads
# to locate the .npy files, so it must be set before the call.
folder = "data/puma8NH"
x_col = [
    "theta1", "theta2", "theta3",
    "thetad1", "thetad2", "thetad3",
    "tau1", "tau2",
]
# Only the features are analysed; the label frame is discarded.
X_train, _ = load_and_clean("N_train.npy", "y_train.npy", x_col, target_col)
analyze_dataset(X_train, folder)

[DONE] data/puma8NH: Score=0.015 → Low Correlation Suitability

=== Average Absolute Correlation Per Feature ===
Feature  Avg Absolute Correlation
   tau2                  0.011663
 theta2                  0.010963
 theta1                  0.009622
 theta3                  0.007331
thetad2                  0.007282
   tau1                  0.006976
thetad3                  0.006474
thetad1                  0.005836


{'dataset': 'data/puma8NH',
 'n_samples': 5242,
 'n_num_features': 8,
 'mean_corr': 0.008268348352486755,
 'std_corr': 0.00650531152108807,
 'silhouette': 0.004769042974062304,
 'feature_continuity': 0.0022739606162350046,
 'interaction_var': 0.00010764769466484429,
 'suitability_score': 0.014843482177043543,
 'category': 'Low Correlation Suitability'}

In [110]:
# Dataset under data/FOREX_cadjpy-day-High. `folder` is a global that
# load_and_clean reads to locate the .npy files, so it must be set first.
folder = "data/FOREX_cadjpy-day-High"
x_col = [
    "Bid_Open", "Bid_High", "Bid_Low", "Bid_Close", "Bid_Volume",
    "Ask_Open", "Ask_High", "Ask_Low", "Ask_Close", "Ask_Volume",
]
# Only the features are analysed; the label frame is discarded.
X_train, _ = load_and_clean("N_train.npy", "y_train.npy", x_col, target_col)
analyze_dataset(X_train, folder)

[DONE] data/FOREX_cadjpy-day-High: Score=0.645 → Medium Correlation Suitability

=== Average Absolute Correlation Per Feature ===
   Feature  Avg Absolute Correlation
   Ask_Low                  0.845501
   Bid_Low                  0.845448
 Bid_Close                  0.843024
 Ask_Close                  0.843024
  Ask_Open                  0.841695
  Bid_Open                  0.841624
  Ask_High                  0.840112
  Bid_High                  0.840089
Bid_Volume                  0.378499
Ask_Volume                  0.368663


{'dataset': 'data/FOREX_cadjpy-day-High',
 'n_samples': 1173,
 'n_num_features': 10,
 'mean_corr': 0.7487677680675592,
 'std_corr': 0.3388843248518122,
 'silhouette': 0.9482354646130104,
 'feature_continuity': 0.5654130508093214,
 'interaction_var': 0.38451655466092927,
 'suitability_score': 0.6451007883341608,
 'category': 'Medium Correlation Suitability'}

In [111]:
# Dataset under data/wall-robot-navigation. `folder` is a global that
# load_and_clean reads to locate the .npy files, so it must be set first.
folder = "data/wall-robot-navigation"
# Feature columns are named V1 .. V24.
x_col = [f"V{idx}" for idx in range(1, 25)]
# Only the features are analysed; the label frame is discarded.
X_train, _ = load_and_clean("N_train.npy", "y_train.npy", x_col, target_col)
analyze_dataset(X_train, folder)

[DONE] data/wall-robot-navigation: Score=0.182 → Low Correlation Suitability

=== Average Absolute Correlation Per Feature ===
Feature  Avg Absolute Correlation
     V9                  0.216733
    V10                  0.201590
    V22                  0.201490
     V1                  0.200496
     V7                  0.200118
     V8                  0.194734
    V21                  0.193048
    V23                  0.189889
    V24                  0.184586
     V4                  0.178936
     V6                  0.178921
     V5                  0.177320
    V18                  0.175391
    V11                  0.173374
    V14                  0.164253
    V19                  0.161838
    V12                  0.159364
    V17                  0.155663
     V3                  0.144565
    V20                  0.142113
    V13                  0.140473
    V15                  0.134190
     V2                  0.105729
    V16                  0.098139


{'dataset': 'data/wall-robot-navigation',
 'n_samples': 3491,
 'n_num_features': 24,
 'mean_corr': 0.169706349687189,
 'std_corr': 0.11912759022047617,
 'silhouette': 0.13050503493449714,
 'feature_continuity': 0.4000057581485569,
 'interaction_var': 0.04291495250784353,
 'suitability_score': 0.18210822385881748,
 'category': 'Low Correlation Suitability'}