In [1]:
#Feature Engineering
"""
This script creates a dataset with the features used for model prediction
"""

'\nThis script creates a dataset with the features used for model prediction\n'

In [2]:
import os
import pandas as pd
import numpy as np
import pyBigWig
from sklearn.model_selection import train_test_split

In [11]:
# Function to load bigWig files and extract features
def load_bw_features(bw_file_path, gene_info_df, window_length, n_bins):
    """
    Extract binned mean-signal features around each gene's TSS from a bigWig file.

    For every row of ``gene_info_df`` a window of roughly ``window_length`` bases
    is centred on the TSS ('TSS_start' for '+' strand genes, 'TSS_end' for every
    other strand value). The window start is clipped at 0 and the window is cut
    into ``n_bins`` bins whose integer edges come from ``np.linspace``. The mean
    signal of each bin becomes one feature; bins for which the bigWig reports no
    data (``None``) or raises are filled with 0.

    Parameters
    ----------
    bw_file_path : str
        Path of the bigWig file to open with pyBigWig.
    gene_info_df : pandas.DataFrame
        Must provide the columns 'chr', 'strand', 'TSS_start', 'TSS_end' and
        'gene_name'.
    window_length : int
        Total window size in bases; half of it (integer division) is taken on
        each side of the TSS.
    n_bins : int
        Number of bins per window.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``gene_info_df`` (same order) with columns
        ``bin_0 .. bin_{n_bins-1}`` followed by ``gene_name``. An empty
        ``gene_info_df`` yields an empty frame with those same columns.
    """
    print(f"Opening bigWig file: {bw_file_path}")
    bin_columns = [f'bin_{i}' for i in range(n_bins)]
    half_window = window_length // 2
    features = []

    bw = pyBigWig.open(bw_file_path)
    try:
        for _, row in gene_info_df.iterrows():
            chrom = row['chr']
            strand = row['strand']
            # The TSS sits at the 5' end of the annotated interval, which is the
            # interval start on '+' and the interval end otherwise.
            tss = int(row['TSS_start'] if strand == '+' else row['TSS_end'])
            start = max(0, tss - half_window)
            end = tss + half_window
            # NOTE(review): bins are always in genomic coordinate order; windows
            # of '-' strand genes are not flipped -- confirm this is intended.
            edges = np.linspace(start, end, n_bins + 1, dtype=int)

            values = []
            for left, right in zip(edges[:-1], edges[1:]):
                # Hand plain Python ints to the bigWig reader, not numpy scalars.
                bin_start, bin_end = int(left), int(right)
                try:
                    bin_value = bw.stats(chrom, bin_start, bin_end, type='mean')[0]
                except Exception as e:
                    # Best effort: an unreadable interval becomes a zero feature
                    # instead of aborting the whole extraction.
                    print(f"Error fetching stats for {chrom}:{bin_start}-{bin_end}: {e}")
                    bin_value = 0
                else:
                    if bin_value is None:
                        bin_value = 0
                values.append(bin_value)
            features.append(values)
    finally:
        # Release the file handle even if a row is malformed and raises.
        bw.close()

    # Passing the column names to the constructor keeps the schema stable when
    # there are no rows at all.
    features_df = pd.DataFrame(features, columns=bin_columns)
    features_df['gene_name'] = gene_info_df['gene_name'].values
    return features_df

# Function to load and merge CAGE info and expression data
def load_and_merge_cage(info_file_path, expression_file_path=None):
    """
    Read a tab-separated CAGE info table and optionally attach expression values.

    When ``expression_file_path`` is given (and non-empty) the expression table
    is read as well and joined to the info table on the 'gene_name' column using
    pandas' default merge, so genes missing from either table are dropped.
    Without an expression file the info table is returned as read.
    """
    gene_info = pd.read_csv(info_file_path, sep='\t')
    if not expression_file_path:
        # Test data ships without expression labels.
        return gene_info

    expression = pd.read_csv(expression_file_path, sep='\t')
    return gene_info.merge(expression, on='gene_name')

# Function to process a single cell line
def process_cell_line(cell_line, histone_marks, window_length, n_bins, base_data_dir, data_type='train'):
    """
    Build the feature table of one cell line for one data split.

    The gene info table (plus expression values for 'train' and 'validation')
    is loaded from ``<base_data_dir>/CAGE-train/CAGE-train``. For every entry of
    ``histone_marks`` the bigWig file ``<base_data_dir>/<mark>-bigwig/<cell_line>.bw``
    is turned into ``n_bins`` columns named ``<mark>_bin_<i>`` which are appended
    to the gene table.

    Parameters
    ----------
    cell_line : str
        Cell line identifier used in the file names (e.g. 'X1').
    histone_marks : iterable of str
        Names of the signal tracks to extract (directory prefix of each bigWig).
    window_length : int
        Window size around the TSS, forwarded to ``load_bw_features``.
    n_bins : int
        Number of bins per window, forwarded to ``load_bw_features``.
    base_data_dir : str
        Root directory of the project data.
    data_type : str
        'train', 'validation', or 'test'. 'test' has no expression file.

    Returns
    -------
    pandas.DataFrame
        The gene table followed by the feature columns of every mark whose
        bigWig file exists. Marks with a missing file are reported on stdout
        and contribute no columns.

    Raises
    ------
    ValueError
        If ``data_type`` is not one of the three supported values, or if the
        extracted features do not have one row per gene.
    """
    # File-name tag used on disk for each split.
    file_tags = {'train': 'train', 'validation': 'val', 'test': 'test'}
    if data_type not in tuple(file_tags):
        raise ValueError("data_type must be 'train', 'validation', or 'test'")
    tag = file_tags[data_type]

    cage_dir = os.path.join(base_data_dir, 'CAGE-train', 'CAGE-train')
    info_file = os.path.join(cage_dir, f'{cell_line}_{tag}_info.tsv')
    if data_type == 'test':
        expression_file = None
    else:
        expression_file = os.path.join(cage_dir, f'{cell_line}_{tag}_y.tsv')

    gene_df = load_and_merge_cage(info_file, expression_file)

    # Ensure data types are correct
    gene_df['TSS_start'] = gene_df['TSS_start'].astype(int)
    gene_df['TSS_end'] = gene_df['TSS_end'].astype(int)
    gene_df['strand'] = gene_df['strand'].astype(str)
    gene_df['chr'] = gene_df['chr'].astype(str)

    feature_blocks = []
    for mark in histone_marks:
        # Every track, DNase included, lives in '<mark>-bigwig/<cell_line>.bw'.
        bw_file = os.path.join(base_data_dir, f'{mark}-bigwig', f'{cell_line}.bw')

        if not os.path.exists(bw_file):
            print(f"File not found: {bw_file}")
            continue  # Skip this mark if the file is missing

        features_df = load_bw_features(bw_file, gene_df, window_length, n_bins)
        if len(features_df) != len(gene_df):
            raise ValueError(
                f"Expected {len(gene_df)} feature rows for {mark}, got {len(features_df)}"
            )

        # Rename feature columns to include the mark name.
        feature_cols = [f'{mark}_bin_{i}' for i in range(n_bins)]
        features_df.columns = feature_cols + ['gene_name']
        feature_blocks.append(features_df[feature_cols].reset_index(drop=True))

    if not feature_blocks:
        return gene_df.copy()

    # Features are produced row by row in gene_df order, so attach them by
    # position. Joining on 'gene_name' would multiply rows whenever a gene name
    # occurs more than once.
    return pd.concat([gene_df.reset_index(drop=True)] + feature_blocks, axis=1)

# Main code for data preprocessing
if __name__ == "__main__":
    # Feature-extraction parameters.
    window_length = 2000  # Total bases around the TSS; adjust as needed
    n_bins = 20           # Bins per window; adjust as needed
    histone_marks = ['H3K27me3', 'H3K4me1', 'H3K4me3', 'H3K27ac', 'H3K36me3', 'DNase'] #'H3K9me3',

    # The data location can be overridden with the ML4G_DATA_DIR environment
    # variable so the notebook runs on other machines; the default is the
    # original author's local path.
    base_data_dir = os.environ.get(
        'ML4G_DATA_DIR',
        '/Users/gonuni/Desktop/College/CBB/3rd_Semester/ML4Genomics/Projects/Project_1/ML4G_Project_1_Data',
    )

    def _tagged_features(cell_line, data_type):
        """Extract features for one cell line and suffix gene names with the cell line."""
        frame = process_cell_line(
            cell_line, histone_marks, window_length, n_bins, base_data_dir, data_type=data_type
        )
        # The same genes occur in X1 and X2; the suffix keeps names unique
        # once both cell lines are stacked into one table.
        frame['gene_name'] = frame['gene_name'] + f'_{cell_line}'
        return frame

    # Training data: both labelled cell lines stacked into one table.
    X1_train_data = _tagged_features('X1', 'train')
    X2_train_data = _tagged_features('X2', 'train')
    X_train_full = pd.concat([X1_train_data, X2_train_data], ignore_index=True)
    X_train_full.to_csv('X_train.tsv', sep='\t', index=False)

    # Validation data: same layout as the training table.
    X1_val_data = _tagged_features('X1', 'validation')
    X2_val_data = _tagged_features('X2', 'validation')
    X_validation = pd.concat([X1_val_data, X2_val_data], ignore_index=True)
    X_validation.to_csv('X_validation.tsv', sep='\t', index=False)

    # Test data (X3): no expression labels and gene names are left unsuffixed.
    X3_test_data = process_cell_line('X3', histone_marks, window_length, n_bins, base_data_dir, data_type='test')
    X3_test_data.to_csv('X_3_test.tsv', sep='\t', index=False)

Opening bigWig file: /Users/gonuni/Desktop/College/CBB/3rd_Semester/ML4Genomics/Projects/Project_1/ML4G_Project_1_Data/H3K27me3-bigwig/X1.bw
Opening bigWig file: /Users/gonuni/Desktop/College/CBB/3rd_Semester/ML4Genomics/Projects/Project_1/ML4G_Project_1_Data/H3K4me1-bigwig/X1.bw
Opening bigWig file: /Users/gonuni/Desktop/College/CBB/3rd_Semester/ML4Genomics/Projects/Project_1/ML4G_Project_1_Data/H3K4me3-bigwig/X1.bw
Opening bigWig file: /Users/gonuni/Desktop/College/CBB/3rd_Semester/ML4Genomics/Projects/Project_1/ML4G_Project_1_Data/H3K27ac-bigwig/X1.bw
Opening bigWig file: /Users/gonuni/Desktop/College/CBB/3rd_Semester/ML4Genomics/Projects/Project_1/ML4G_Project_1_Data/H3K36me3-bigwig/X1.bw
Opening bigWig file: /Users/gonuni/Desktop/College/CBB/3rd_Semester/ML4Genomics/Projects/Project_1/ML4G_Project_1_Data/DNase-bigwig/X1.bw
Opening bigWig file: /Users/gonuni/Desktop/College/CBB/3rd_Semester/ML4Genomics/Projects/Project_1/ML4G_Project_1_Data/H3K27me3-bigwig/X2.bw
Opening bigWig file

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from scipy.stats import spearmanr
import xgboost as xgb

# Custom Spearman correlation scorer
def spearman_corr(y_true, y_pred):
    """Return the Spearman rank correlation between targets and predictions.

    Only the correlation coefficient of ``scipy.stats.spearmanr`` is returned;
    the accompanying p-value is discarded.
    """
    result = spearmanr(y_true, y_pred)
    return result[0]

# Wrap the Spearman correlation as a scikit-learn scorer (higher is better).
spearman_scorer = make_scorer(spearman_corr, greater_is_better=True)

# Load the feature tables written by the feature-engineering step.
X_train_df = pd.read_csv('X_train.tsv', sep='\t')
X_val_df = pd.read_csv('X_validation.tsv', sep='\t')

# Identifier / label columns that must not be used as model inputs.
non_numeric_columns = ['gene_name', 'gex', 'chr', 'strand']

X_train = X_train_df.drop(columns=non_numeric_columns)
y_train = X_train_df['gex']

X_val = X_val_df.drop(columns=non_numeric_columns)
y_val = X_val_df['gex']

# Hyperparameter grid: 3 * 2 * 2 * 2 * 2 = 48 candidates.
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01],
    'n_estimators': [100, 200],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
}

# Base regressor; the seed is fixed so the search is repeatable.
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    random_state=42
)

# 3-fold cross-validated grid search on the training table only; the
# validation table is used afterwards purely for reporting.
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring=spearman_scorer,
    cv=3,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

# Best configuration found by the search.
best_model = grid_search.best_estimator_
print("Best hyperparameters:", grid_search.best_params_)

# Training-set score of the selected model.
y_train_pred = best_model.predict(X_train)
spearman_corr_train = spearmanr(y_train, y_train_pred)[0]
print(f"Spearman correlation on training set: {spearman_corr_train:.4f}")

# Validation-set score of the selected model.
y_val_pred = best_model.predict(X_val)
spearman_corr_val = spearmanr(y_val, y_val_pred)[0]
print(f"Spearman correlation on validation set: {spearman_corr_val}")

# Make predictions on the X3 test set.
X3_test = pd.read_csv('X_3_test.tsv', sep='\t')
# Select exactly the columns the model was trained on, in training order,
# instead of relying on the test table having the same layout. A column that
# is missing from the test table fails here with a KeyError naming it.
X3_features = X3_test[X_train.columns]

# Predict gene expression
gex_predicted = best_model.predict(X3_features)

# Save predictions
output_df = pd.DataFrame({
    'gene_name': X3_test['gene_name'],
    'gex_predicted': gex_predicted
})
output_df.to_csv('gex_predicted.csv', index=False)

Fitting 3 folds for each of 48 candidates, totalling 144 fits
Best hyperparameters: {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 100, 'subsample': 1.0}
Spearman correlation on training set: 0.7766
Spearman correlation on validation set: 0.7872874201196813
