## Packages

In [1]:
import os
import pandas as pd
import numpy as np
import pyBigWig
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from scipy.stats import spearmanr
import xgboost as xgb

## Data Loading & Preprocessing

In [33]:
# Load bigwig files and extract features for each gene 
def load_bw_features(file_path, gene_df, window_length, num_bins):
    
    print(f"Loading this bigWig files: {file_path}")
    bw = pyBigWig.open(file_path)
    features = []
    half_window = window_length // 2

    for idx, row in gene_df.iterrows():
        chrom = row['chr'] 
        strand = row['strand']
        tss = int(row['TSS_start'] if strand == '+' else row['TSS_end'])
        start = max(0, tss - half_window)
        end = tss + half_window
        bins = np.linspace(start, end, num_bins + 1, dtype=int)
        values = []
        for i in range(num_bins):
            bin_start = bins[i]
            bin_end = bins[i + 1]
          
            bin_values = bw.stats(chrom, bin_start, bin_end, type='mean')[0]
            if bin_values is None:
                bin_values = 0
            values.append(bin_values)
        features.append(values)

    bw.close()
    features_df = pd.DataFrame(features)
    features_df.columns = [f'bin_{i}' for i in range(num_bins)]
    features_df['gene_name'] = gene_df['gene_name'].values
    return features_df

# Merge CAGE and expression data 
def load_and_merge_cage(info_path, expression_path=None):
    
    info_df = pd.read_csv(info_path, sep='\t')
    if expression_path:
        expression_df = pd.read_csv(expression_path, sep='\t')
        merged_df = pd.merge(info_df, expression_df, on='gene_name')
    else:
        merged_df = info_df
    return merged_df

# Process a cell line with features 
def process_cell_line(cell_line, histone_marks, window_length, n_bins, base_data_dir, data_type='train'):
    if data_type == 'train':
        info_file = os.path.join(base_data_dir, 'CAGE-train', 'CAGE-train', f'{cell_line}_train_info.tsv')
        expression_file = os.path.join(base_data_dir, 'CAGE-train', 'CAGE-train', f'{cell_line}_train_y.tsv')
    elif data_type == 'validation':
        info_file = os.path.join(base_data_dir, 'CAGE-train', 'CAGE-train', f'{cell_line}_val_info.tsv')
        expression_file = os.path.join(base_data_dir, 'CAGE-train', 'CAGE-train', f'{cell_line}_val_y.tsv')
    elif data_type == 'test':
        info_file = os.path.join(base_data_dir, 'CAGE-train', 'CAGE-train', f'{cell_line}_test_info.tsv')
        expression_file = None

    gene_df = load_and_merge_cage(info_file, expression_file)
    final_df = gene_df.copy()

    # Extract features for each histone mark 
    for mark in histone_marks:
        if mark == 'DNase':
            mark_dir = 'DNase-bigwig'
        else:
            mark_dir = f'{mark}-bigwig'

        file_extension = '.bw'
        bw_file = os.path.join(base_data_dir, mark_dir, f'{cell_line}{file_extension}')

        # Check if the file exists
        if not os.path.exists(bw_file):
            print(f"File not found: {bw_file}")
            continue 
        
        # Extract feature and include in df 
        features_df = load_bw_features(bw_file, gene_df, window_length, n_bins)
        feature_cols = [f'{mark}_bin_{i}' for i in range(n_bins)]
        features_df.columns = feature_cols + ['gene_name']
        final_df = pd.merge(final_df, features_df, on='gene_name')

    return final_df

if __name__ == "__main__":
    # Define parameters: here 2000 and 20 determined to perform best 
    window_length = 2000
    n_bins = 20           
    histone_marks = ['H3K4me1', 'H3K4me3', 'H3K27ac', 'DNase']
    base_data_dir = '/mnt/d/ML4G_Project_1_Data'

    # Process training data
    X1_train_data = process_cell_line('X1', histone_marks, window_length, n_bins, base_data_dir, data_type='train')
    X2_train_data = process_cell_line('X2', histone_marks, window_length, n_bins, base_data_dir, data_type='train')

    # Modify gene names so there is no overlap 
    X1_train_data['gene_name'] = X1_train_data['gene_name'] + '_X1'
    X2_train_data['gene_name'] = X2_train_data['gene_name'] + '_X2'

    # training data
    X_train_full = pd.concat([X1_train_data, X2_train_data], ignore_index=True)
    X_train_full.to_csv('X_train_less_3.tsv', sep='\t', index=False)

    # validation data
    X1_val_data = process_cell_line('X1', histone_marks, window_length, n_bins, base_data_dir, data_type='validation')
    X2_val_data = process_cell_line('X2', histone_marks, window_length, n_bins, base_data_dir, data_type='validation')

    # Modify gene names again so there is no overlap 
    X1_val_data['gene_name'] = X1_val_data['gene_name'] + '_X1'
    X2_val_data['gene_name'] = X2_val_data['gene_name'] + '_X2'

    # validation data
    X_validation = pd.concat([X1_val_data, X2_val_data], ignore_index=True)
    X_validation.to_csv('X_validation_less_3.tsv', sep='\t', index=False)

    # test data (X3)
    X3_test_data = process_cell_line('X3', histone_marks, window_length, n_bins, base_data_dir, data_type='test')
    X3_test_data.to_csv('X_3_test_less_3.tsv', sep='\t', index=False)

Loading this bigWig files: /mnt/d/ML4G_Project_1_Data/H3K4me3-bigwig/X1.bw
Loading this bigWig files: /mnt/d/ML4G_Project_1_Data/H3K27ac-bigwig/X1.bw
Loading this bigWig files: /mnt/d/ML4G_Project_1_Data/DNase-bigwig/X1.bw
Loading this bigWig files: /mnt/d/ML4G_Project_1_Data/H3K4me3-bigwig/X2.bw
Loading this bigWig files: /mnt/d/ML4G_Project_1_Data/H3K27ac-bigwig/X2.bw
Loading this bigWig files: /mnt/d/ML4G_Project_1_Data/DNase-bigwig/X2.bw
Loading this bigWig files: /mnt/d/ML4G_Project_1_Data/H3K4me3-bigwig/X1.bw
Loading this bigWig files: /mnt/d/ML4G_Project_1_Data/H3K27ac-bigwig/X1.bw
Loading this bigWig files: /mnt/d/ML4G_Project_1_Data/DNase-bigwig/X1.bw
Loading this bigWig files: /mnt/d/ML4G_Project_1_Data/H3K4me3-bigwig/X2.bw
Loading this bigWig files: /mnt/d/ML4G_Project_1_Data/H3K27ac-bigwig/X2.bw
Loading this bigWig files: /mnt/d/ML4G_Project_1_Data/DNase-bigwig/X2.bw
Loading this bigWig files: /mnt/d/ML4G_Project_1_Data/H3K4me3-bigwig/X3.bw
Loading this bigWig files: /mnt/d

## Model (XGBoost)

In [34]:
# spearmann correlation 
def spearman_corr(y_true, y_pred):
    corr, _ = spearmanr(y_true, y_pred)
    return corr

spearman_scorer = make_scorer(spearman_corr, greater_is_better=True)

# Preapre training and validation data and exclude non-numeric columns 
X_train_df = pd.read_csv('X_train_less_4.tsv', sep='\t')
X_val_df = pd.read_csv('X_validation_less_4.tsv', sep='\t')
non_numeric = ['gene_name', 'gex', 'chr', 'strand']

X_train = X_train_df.drop(columns=non_numeric)
y_train = X_train_df['gex']
X_val = X_val_df.drop(columns=non_numeric)
y_val = X_val_df['gex']

# Define paramteres needed for GridSearch with XGBoost model, fit the model and get best 
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01],
    'n_estimators': [100, 200],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
}

xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    random_state=42
)

grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring=spearman_scorer,
    cv=3,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print("Hyperparameters:", grid_search.best_params_)

# Get spearman for validation set 
y_val_pred = best_model.predict(X_val)
spearman_corr_val = spearmanr(y_val, y_val_pred)[0]
print(f"Correlation on validation set: {spearman_corr_val}")



Fitting 3 folds for each of 48 candidates, totalling 144 fits
Hyperparameters: {'colsample_bytree': 1.0, 'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 200, 'subsample': 1.0}
Correlation on validation set: 0.7822579352477439


## Predicition on Test Data

In [13]:
X3_test = pd.read_csv('X_3_test_less_3.tsv', sep='\t')
X3_features = X3_test.drop(columns=['gene_name', 'chr', 'strand'])
pred = best_model.predict(X3_features)

assert isinstance(pred, np.ndarray), 'Prediction array must be a numpy array'
assert np.issubdtype(pred.dtype, np.number), 'Prediction array must be numeric'
assert pred.shape[0] == len(X3_test), 'Each gene should have a unique predicted expression'

save_dir = '/mnt/c/Users/marie/Desktop/ML_project1' 
file_name = 'gex_predicted.csv'        
zip_name = "SearchingForDNA_Project1.zip" 
save_path = f'{save_dir}/{zip_name}'
compression_options = dict(method="zip", archive_name=file_name)

test_genes  = pd.DataFrame({
    'gene_name': X3_test['gene_name'],
    'gex_predicted': pred.tolist()
})
test_genes.to_csv(save_path, compression=compression_options)
