In [None]:
# download the dataset from https://bitbucket.org/gramacylab/tpm.git
# Steps: clone the repository into ./tpm, copy its data/HST directory next to
# the parent directory (resulting in ../HST), then delete the local clone so
# only the data files remain.
!git clone https://bitbucket.org/gramacylab/tpm.git
!cp -r tpm/data/HST ../
!rm -rf tpm

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
import os 

# Combine He and He2 data
#
# This cell overwrites ../HST/hstHe.dat with the combined table. Reading that
# same file back on a re-run would append the He2 rows a second time, so the
# pristine He file is preserved once as hstHe_orig.dat and is always used as
# the input afterwards. That makes the cell safe to run repeatedly.
# NOTE: if hstHe.dat was already combined by an older version of this cell,
# restore the original file before running this one.
he_path = '../HST/hstHe.dat'
he_orig_path = '../HST/hstHe_orig.dat'
he2_path = '../HST/hstHe2.dat'

# Prefer the preserved original when it exists (i.e. on every run but the first).
he_source_path = he_orig_path if os.path.exists(he_orig_path) else he_path

# Raw strings for the separator: '\s' in a plain literal is an invalid escape
# sequence and triggers a warning on recent Python versions.
he_data = pd.read_csv(he_source_path, header=0, sep=r'\s+')
he2_data = pd.read_csv(he2_path, header=0, sep=r'\s+')

# Concatenate the dataframes
combined_he_data = pd.concat([he_data, he2_data], axis=0, ignore_index=True)

# Keep the untouched He file before it gets overwritten. This is done only
# after both reads succeeded so a failed run never leaves hstHe.dat missing.
if not os.path.exists(he_orig_path):
    os.replace(he_path, he_orig_path)

# Save the combined data
combined_he_data.to_csv(he_path, sep=' ', index=False)

print(f"Combined He data shape: {combined_he_data.shape}")
print(f"Original He data shape: {he_data.shape}")
print(f"Original He2 data shape: {he2_data.shape}")


In [None]:

species = ['He', 'H', 'N', 'O', 'O2', 'N2']
for spec in species:
    # Load the whitespace-delimited table; the first row holds the column names.
    # The separator is a raw string because '\s' in a plain literal is an
    # invalid escape sequence (a warning on recent Python versions).
    data = pd.read_csv(f'../HST/hst{spec}.dat', header=0, sep=r'\s+')

    # Separate features and target
    X = data.iloc[:, :-1]  # All columns except the last one
    y = data.iloc[:, -1]   # Last column

    # Scale the features to [0,1].
    # NOTE(review): the scaler is fitted on the full dataset before splitting,
    # so validation rows contribute to the min/max used for the training rows.
    # Confirm this is intended before using the folds for model selection.
    scaler_X = MinMaxScaler()
    # Reuse X's index so the label-aligned concat with y below cannot misalign,
    # whatever index read_csv produced.
    X_scaled = pd.DataFrame(scaler_X.fit_transform(X), columns=X.columns, index=X.index)

    # The target is exported unchanged (it is NOT standardized here).
    # To standardize to mean 0 / variance 1, use (y - y.mean()) / y.std().
    y_standardized = y

    # Initialize 10-fold cross validation (fixed seed for reproducible splits)
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    # The output directory is the same for every fold, so create it once.
    os.makedirs(f'../HST/{spec}', exist_ok=True)

    # Create and save each fold (fold numbers start at 1)
    for fold, (train_idx, val_idx) in enumerate(kf.split(X_scaled), 1):
        # Get the training and validation sets for this fold
        X_train, X_val = X_scaled.iloc[train_idx], X_scaled.iloc[val_idx]
        y_train, y_val = y_standardized.iloc[train_idx], y_standardized.iloc[val_idx]

        # Combine features and target (target stays the last column)
        train_data = pd.concat([X_train, y_train], axis=1)
        val_data = pd.concat([X_val, y_val], axis=1)

        # Save the fold data as headerless, index-free CSV files
        train_data.to_csv(f'../HST/{spec}/hst{spec}_fold{fold}_train.csv', index=False, header=False, sep=',')
        val_data.to_csv(f'../HST/{spec}/hst{spec}_fold{fold}_val.csv', index=False, header=False, sep=',')

        # Print sizes of train and validation sets
        print(f"\n{spec} Fold {fold}:")
        print(f"Training set size: {len(train_data)}")
        print(f"Validation set size: {len(val_data)}")

In [None]:
import pandas as pd

species = ['He', 'H', 'N', 'O', 'O2', 'N2']
# Report the row count and target variance for each dataset
for spec in species:
    # Load the data from the same ../HST directory the other cells use
    # (a bare 'hst{spec}.dat' only resolves if the kernel's working directory
    # happens to be the data directory). Raw string avoids the invalid '\s'
    # escape sequence warning.
    data = pd.read_csv(f'../HST/hst{spec}.dat', header=0, sep=r'\s+')

    # Print total number of points
    print(f"\nTotal number of points for {spec}: {len(data)}")

    # Print variance information; the target is the last column
    print(f"\n{'='*50}")
    print(f"Variance analysis for {spec}:")
    print(f"{'='*50}")
    print(f"\nTarget variance: {data.iloc[:, -1].var():.6f}")