In [8]:
import pandas as pd
import numpy as np
import pyreadr

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
from sklearn.feature_selection import SelectKBest, f_regression

from scipy.stats import spearmanr
import xgboost as xgb

In [2]:
# specimens/subjects
specimens = pd.read_csv("subject_specimen.tsv", sep="\t", index_col=0)

# pbmcs
pbmc_freqs = pd.read_csv("pbmc_cell_frequency_batchCorrected_data.tsv", sep="\t", index_col=0).T
pbmc_tpms = pd.read_csv("pbmc_gene_expression_tpm_batchCorrected_data.tsv", sep="\t", index_col=0).T

# cytokines
cyto_olink = pd.read_csv("plasma_cytokine_concentrations_by_olink_batchCorrected_data.tsv", sep="\t", index_col=0).T
cyto_legend = pd.read_csv("plasma_cytokine_concentrations_by_legendplex_normalized_data.tsv", sep="\t", index_col=0).T

# plasma
plasma_ab = pd.read_csv("plasma_ab_titer_batchCorrected_data.tsv", sep="\t", index_col=0).T

# t cells
tcell_activ = pd.read_csv("t_cell_activation_raw_data.tsv", sep="\t").T
tcell_polar = pd.read_csv("t_cell_polarization_raw_data.tsv", sep="\t").T

def standardize_index(df):
    df.index = df.index.astype(int).astype(str)
    return df

specimens = standardize_index(specimens)
pbmc_freqs = standardize_index(pbmc_freqs)
pbmc_tpms = standardize_index(pbmc_tpms)
cyto_olink = standardize_index(cyto_olink)
cyto_legend = standardize_index(cyto_legend)
plasma_ab = standardize_index(plasma_ab)
tcell_activ = standardize_index(tcell_activ)
tcell_polar = standardize_index(tcell_polar)

cyto_olink = cyto_olink.add_suffix('_olink')
cyto_legend = cyto_legend.add_suffix('_legend')

# join
combined_df = specimens.copy()
other_dfs = [pbmc_freqs, pbmc_tpms, cyto_olink, cyto_legend, plasma_ab, tcell_activ, tcell_polar]

for df in other_dfs:
    combined_df = combined_df.join(df, how='left')

# problem 1.2 prep
day0_df = combined_df[combined_df['timepoint'] == 0].set_index('subject_id')
day14_df = combined_df[combined_df['timepoint'] == 14].set_index('subject_id')

merged_df = day0_df[['IgG_PT']].rename(columns={'IgG_PT': 'IgG_PT_day0'}).join(
    day14_df[['IgG_PT']].rename(columns={'IgG_PT': 'IgG_PT_day14'}),
    how='inner'
)

merged_df['fold_change_IgG_PT'] = (merged_df['IgG_PT_day14'] + 1e-6) / (merged_df['IgG_PT_day0'] + 1e-6)

combined_df = combined_df.merge(merged_df[['fold_change_IgG_PT']], left_on='subject_id', right_index=True, how='left')
combined_df['fold_change_IgG_PT'] = np.where(combined_df['timepoint'] == 14, combined_df['fold_change_IgG_PT'], np.nan)

# problem 2.2 prep
day0_df = combined_df[combined_df['timepoint'] == 0].set_index('subject_id')
day1_df = combined_df[combined_df['timepoint'] == 1].set_index('subject_id')

merged_df = day0_df[['Monocytes']].rename(columns={'Monocytes': 'Monocytes_day0'}).join(
    day1_df[['Monocytes']].rename(columns={'Monocytes': 'Monocytes_day1'}),
    how='inner'
)

merged_df['fold_change_Monocytes'] = (merged_df['Monocytes_day1'] + 1e-6) / (merged_df['Monocytes_day0'] + 1e-6)

combined_df = combined_df.merge(merged_df[['fold_change_Monocytes']], left_on='subject_id', right_index=True, how='left')
combined_df['fold_change_Monocytes'] = np.where(combined_df['timepoint'] == 1, combined_df['fold_change_Monocytes'], np.nan)

# problem 3.2 prep
day0_df = combined_df[combined_df['timepoint'] == 0].set_index('subject_id')
day3_df = combined_df[combined_df['timepoint'] == 1].set_index('subject_id')

merged_df = day0_df[['ENSG00000277632.1']].rename(columns={'ENSG00000277632.1': 'CCL3_day0'}).join(
    day3_df[['ENSG00000277632.1']].rename(columns={'ENSG00000277632.1': 'CCL3_day3'}),
    how='inner'
)

merged_df['fold_change_CCL3'] = (merged_df['CCL3_day3'] + 1e-6) / (merged_df['CCL3_day0'] + 1e-6)

combined_df = combined_df.merge(merged_df[['fold_change_CCL3']], left_on='subject_id', right_index=True, how='left')
combined_df['fold_change_CCL3'] = np.where(combined_df['timepoint'] == 3, combined_df['fold_change_CCL3'], np.nan)

# problem 4.1 prep
day0_df = combined_df[combined_df['timepoint'] == 0].set_index('subject_id')
day30_df = combined_df[combined_df['timepoint'] == 30].set_index('subject_id')

merged_df = day0_df[['PT_P01579', 'PT_P05113']].rename(columns={'PT_P01579': 'PT_P01579_day0', 'PT_P05113': 'PT_P05113_day0'}).join(
    day30_df[['PT_P01579', 'PT_P05113']].rename(columns={'PT_P01579': 'PT_P01579_day30', 'PT_P05113': 'PT_P05113_day30'}),
    how='inner'
)

merged_df['fold_change_PT_ratio'] = ((merged_df['PT_P01579_day30'] + 1e-6) / (merged_df['PT_P05113_day30'] + 1e-6)) / ((merged_df['PT_P01579_day0'] + 1e-6) / (merged_df['PT_P05113_day0'] + 1e-6))

combined_df = combined_df.merge(merged_df[['fold_change_PT_ratio']], left_on='subject_id', right_index=True, how='left')
combined_df['fold_change_PT_ratio'] = np.where(combined_df['timepoint'] == 30, combined_df['fold_change_PT_ratio'], np.nan)

print(combined_df.shape)
print(combined_df.isnull().sum().sum())
print(combined_df.isnull().sum().sort_values(ascending=False).head())

(896, 6751)
1566692
fold_change_PT_ratio     853
fold_change_Monocytes    822
fold_change_CCL3         803
fold_change_IgG_PT       785
PHA                      664
dtype: int64


## Training

In [12]:
from tqdm import tqdm
import time

def tune_and_evaluate(combined_df, target_col, train_datasets, test_dataset, target_timepoint, verbose=True):
    start_time = time.time()
    if verbose:
        print(f"\nStarting tuning for {target_col} (Train on {train_datasets}, Test on {test_dataset})")
    
    df = combined_df.copy() 
    print(f"Initial data shape: {df.shape}")
    
    # encode categorical variables
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    le = LabelEncoder()
    for col in categorical_cols:
        df[col] = le.fit_transform(df[col].astype(str))
    
    train_df = df[df['dataset'].isin(train_datasets)]
    test_df = df[df['dataset'] == test_dataset]
    
    print(f"Train data shape before timepoint filter: {train_df.shape}")
    print(f"Test data shape before timepoint filter: {test_df.shape}")
    
    # filter by timepoint and handle missing values
    train_df = train_df.dropna(subset=[target_col])
    test_df = test_df[test_df['timepoint'] == target_timepoint]
    test_df = test_df.dropna(subset=[target_col])
    
    print(f"Train data shape after filtering: {train_df.shape}")
    print(f"Test data shape after filtering: {test_df.shape}")
    
    # check if we have enough data to proceed
    if train_df.empty or test_df.empty:
        print(f"Warning: No data available for {target_col} with train datasets {train_datasets}, test dataset {test_dataset}, timepoint {target_timepoint}")
        return None, None, None
    
    # prepare features and targets
    X_train = train_df.drop(columns=[target_col, 'subject_id', 'dataset', 'timepoint'])
    y_train = train_df[target_col]
    X_test = test_df.drop(columns=[target_col, 'subject_id', 'dataset', 'timepoint'])
    y_test = test_df[target_col]
    
    # remove columns with all NaN values
    X_train = X_train.dropna(axis=1, how='all')
    
    # ensure X_train and X_test have the same columns
    common_columns = list(set(X_train.columns) & set(X_test.columns))
    X_train = X_train[common_columns]
    X_test = X_test[common_columns]
    
    # impute missing values
    imputer = SimpleImputer(strategy='median')
    X_train_imputed = pd.DataFrame(
        imputer.fit_transform(X_train),
        columns=X_train.columns,
        index=X_train.index
    )
    X_test_imputed = pd.DataFrame(
        imputer.transform(X_test),
        columns=X_test.columns,
        index=X_test.index
    )
    
    # feature selection
    selector = SelectKBest(f_regression, k=min(100, X_train_imputed.shape[1]))
    X_train_selected = selector.fit_transform(X_train_imputed, y_train)
    X_test_selected = selector.transform(X_test_imputed)
    
    param_grid = {
        'max_depth': [3, 4, 5, 6, 8, 10],
        'learning_rate': [0.001, 0.01, 0.05, 0.1],
        'n_estimators': [100, 200, 300, 500],
        'min_child_weight': [1, 3, 5, 7],
        'gamma': [0, 0.1, 0.2, 0.3],
        'subsample': [0.6, 0.7, 0.8, 0.9],
        'colsample_bytree': [0.6, 0.7, 0.8, 0.9],
        'reg_alpha': [0, 0.1, 0.5, 1.0],
        'reg_lambda': [0.1, 0.5, 1.0, 1.5]
    }
    
    if verbose:
        print(f"Starting RandomizedSearchCV with 10 iterations")
    
    # custom scorer
    def spearman_scorer(y_true, y_pred):
        correlation, _ = spearmanr(y_true, y_pred)
        return correlation
    
    # perform randomized search
    model = xgb.XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        n_jobs=1
    )
    
    # modify RandomizedSearchCV parameters
    random_search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_grid,
        n_iter=50,
        scoring=make_scorer(spearman_scorer),
        cv=5,
        n_jobs=4,
        verbose=1,
        random_state=42,
        return_train_score=True,
        refit=True
    )
    
    random_search.fit(X_train_selected, y_train)
    
    # evaluate on test set
    best_model = random_search.best_estimator_
    y_pred = best_model.predict(X_test_selected)
    correlation, p_value = spearmanr(y_test, y_pred)
    
    elapsed_time = time.time() - start_time
    
    if verbose:
        print(f"\nResults for {target_col} (Test on dataset {test_dataset}):")
        print(f"Best parameters: {random_search.best_params_}")
        print(f"Spearman correlation: {correlation:.3f}")
        print(f"P-value: {p_value:.3f}")
        print(f"Time taken: {elapsed_time:.1f} seconds")
    
    return random_search.best_params_, correlation, p_value
    
def predict(combined_df, target_col, train_datasets=[0, 1], test_dataset=2, target_timepoint=1, xgb_params=None):
    df = combined_df.copy()
    
    # encode categorical variables
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    le = LabelEncoder()
    for col in categorical_cols:
        df[col] = le.fit_transform(df[col].astype(str))
    
    train_df = df[df['dataset'].isin(train_datasets)]
    
    # handle test dataset
    if test_dataset == 3:
        # for dataset 3, get the subjects from their existing timepoints
        test_df = df[df['dataset'] == test_dataset]
        # keep track of subject_ids in the same order as the data
        subject_ids = test_df['subject_id'].values
        X_test = test_df.drop(columns=[target_col, 'subject_id', 'dataset', 'timepoint'], errors='ignore')
        y_test = None
    else:
        test_df = df[(df['dataset'] == test_dataset) & (df['timepoint'] == target_timepoint)]
        test_df = test_df.dropna(subset=[target_col])
        X_test = test_df.drop(columns=[target_col, 'subject_id', 'dataset', 'timepoint'])
        y_test = test_df[target_col]
        subject_ids = None
    
    # remove rows with NaN in target column for training
    train_df = train_df.dropna(subset=[target_col])
    
    # prepare features and target for training
    X_train = train_df.drop(columns=[target_col, 'subject_id', 'dataset', 'timepoint'])
    y_train = train_df[target_col]
    
    # remove columns with all NaN values
    X_train = X_train.dropna(axis=1, how='all')
    
    # ensure X_train and X_test have the same columns
    common_columns = list(set(X_train.columns) & set(X_test.columns))
    X_train = X_train[common_columns]
    X_test = X_test[common_columns]
    
    # impute missing values
    imputer = SimpleImputer(strategy='median')
    X_train_imputed = pd.DataFrame(
        imputer.fit_transform(X_train),
        columns=X_train.columns,
        index=X_train.index
    )
    X_test_imputed = pd.DataFrame(
        imputer.transform(X_test),
        columns=X_test.columns,
        index=X_test.index
    )
    
    # feature selection
    selector = SelectKBest(f_regression, k=min(100, X_train_imputed.shape[1]))
    X_train_selected = selector.fit_transform(X_train_imputed, y_train)
    X_test_selected = selector.transform(X_test_imputed)
    
    selected_feature_names = X_train_imputed.columns[selector.get_support()].tolist()
    
    # train model
    if xgb_params is None:
        model = xgb.XGBRegressor(
            max_depth=6,
            learning_rate=0.01,
            n_estimators=200,
            subsample=0.8,
            colsample_bytree=0.8,
            min_child_weight=3,
            gamma=0.1,
            reg_alpha=0.1,
            reg_lambda=1,
            objective='reg:squarederror',
            random_state=42
        )
    else:
        # use provided parameters
        model = xgb.XGBRegressor(
            **xgb_params,
            objective='reg:squarederror',
            random_state=42
        )

    model.fit(X_train_selected, y_train)
    
    # predict
    y_pred = model.predict(X_test_selected)
    
    print(f"\nPredicting {target_col} for dataset {test_dataset}")
    
    if test_dataset == 3:
        prediction_df = pd.DataFrame({
            'Subject': subject_ids,
            'Predicted_Value': y_pred
        })
        prediction_df['Predicted_Rank'] = prediction_df['Predicted_Value'].rank(method='dense', ascending=False)
        
        # get unique predictions (one per subject)
        prediction_df = prediction_df.groupby('Subject')['Predicted_Value'].mean().reset_index()
        prediction_df['Predicted_Rank'] = prediction_df['Predicted_Value'].rank(method='dense', ascending=False)
        
        print("\nPredicted Values with Subject IDs and Ranks:")
        print(prediction_df)
        return prediction_df
    else:
        y_pred_ranks = pd.Series(y_pred).rank(method='dense', ascending=False)
        y_test_ranks = y_test.rank(method='dense', ascending=False)
        spearman_corr, p_value = spearmanr(y_pred_ranks, y_test_ranks)
        
        comparison_df = pd.DataFrame({
            'Actual_Value': y_test.values,
            'Predicted_Value': y_pred,
            'Actual_Rank': y_test_ranks.values,
            'Predicted_Rank': y_pred_ranks.values
        })
        print("\nPredicted vs Actual Rankings:")
        print(comparison_df)
        
        print(f"\nSpearman Rank Correlation: {spearman_corr}")
        print(f"P-value: {p_value}")
        
        importance = model.feature_importances_
        feature_importance = pd.DataFrame({
            'feature': selected_feature_names,
            'importance': importance
        }).sort_values('importance', ascending=False)
        print("\nTop 10 most important features:")
        print(feature_importance.head(10))
        
        return comparison_df

In [13]:
test_combinations = [
    # IgG_PT and fold change
    ("IgG_PT", 14, [0, 1], 2),
    ("IgG_PT", 14, [0, 2], 1),
    ("IgG_PT", 14, [1, 2], 0),
    ("fold_change_IgG_PT", 14, [0, 1], 2),
    ("fold_change_IgG_PT", 14, [0, 2], 1),
    ("fold_change_IgG_PT", 14, [1, 2], 0),
    
    # Monocytes and fold change
    ("Monocytes", 1, [0, 1], 2),
    ("Monocytes", 1, [0, 2], 1),
    ("Monocytes", 1, [1, 2], 0),
    ("fold_change_Monocytes", 1, [0, 1], 2),
    ("fold_change_Monocytes", 1, [0, 2], 1),
    ("fold_change_Monocytes", 1, [1, 2], 0),
    
    # CCL3 (using ENSG ID) and fold change
    ("ENSG00000277632.1", 3, [0, 1], 2),
    ("ENSG00000277632.1", 3, [0, 2], 1),
    ("ENSG00000277632.1", 3, [1, 2], 0),
    ("fold_change_CCL3", 3, [0, 1], 2),
    ("fold_change_CCL3", 3, [0, 2], 1),
    ("fold_change_CCL3", 3, [1, 2], 0),
    
    # IFNG/IL5 ratio
    ("fold_change_PT_ratio", 30, [0, 1], 2),
    ("fold_change_PT_ratio", 30, [0, 2], 1),
    ("fold_change_PT_ratio", 30, [1, 2], 0)
]

# when running multiple combinations:
print(f"Total combinations to test: {len(test_combinations)}")
for idx, (target_col, timepoint, train_sets, test_set) in enumerate(test_combinations, 1):
    print(f"\nProcessing combination {idx}/{len(test_combinations)}")
    print(f"Target: {target_col}, Timepoint: {timepoint}")
    best_params, corr, p_val = tune_and_evaluate(
        combined_df, target_col, train_sets, test_set, timepoint
    )

# store
tuning_results = {}
for target_col, timepoint, train_sets, test_set in test_combinations:
    print(f"\nProcessing combination {target_col}, timepoint {timepoint}")
    print(f"Training on datasets {train_sets}, testing on dataset {test_set}")
    
    try:
        best_params, corr, p_val = tune_and_evaluate(
            combined_df, target_col, train_sets, test_set, timepoint
        )
        
        if best_params is not None:
            key = f"{target_col}_tp{timepoint}"
            tuning_results[key] = {
                'best_params': best_params,
                'correlation': corr,
                'p_value': p_val
            }
        else:
            print(f"Skipping {target_col} due to insufficient data")
            
    except Exception as e:
        print(f"Error processing {target_col}: {str(e)}")
        continue

# make final predictions for dataset 3 using the best parameters found
template_df = pd.read_csv("3rdChallengeSubmissionTemplate_10032024.tsv", sep='\t')

predictions = {
    '1.1) IgG-PT-D14-titer-Rank': predict(
        combined_df, 
        "IgG_PT", 
        train_datasets=[0, 1, 2], 
        test_dataset=3, 
        target_timepoint=14,
        xgb_params=tuning_results['IgG_PT_tp14']['best_params']
    ),
    
    '1.2) IgG-PT-D14-FC-Rank': predict(
        combined_df, 
        "fold_change_IgG_PT", 
        train_datasets=[0, 1, 2], 
        test_dataset=3, 
        target_timepoint=14,
        xgb_params=tuning_results['fold_change_IgG_PT_tp14']['best_params']
    ),
    
    '2.1) Monocytes-D1-Rank': predict(
        combined_df, 
        "Monocytes", 
        train_datasets=[0, 1, 2], 
        test_dataset=3, 
        target_timepoint=1,
        xgb_params=tuning_results['Monocytes_tp1']['best_params']
    ),
    
    '2.2) Monocytes-D1-FC-Rank': predict(
        combined_df, 
        "fold_change_Monocytes", 
        train_datasets=[0, 1, 2], 
        test_dataset=3, 
        target_timepoint=1,
        xgb_params=tuning_results['fold_change_Monocytes_tp1']['best_params']
    ),
    
    '3.1) CCL3-D3-Rank': predict(
        combined_df, 
        "ENSG00000277632.1", 
        train_datasets=[0, 1, 2], 
        test_dataset=3, 
        target_timepoint=3,
        xgb_params=tuning_results['ENSG00000277632.1_tp3']['best_params']
    ),
    
    '3.2) CCL3-D3-FC-Rank': predict(
        combined_df, 
        "fold_change_CCL3", 
        train_datasets=[0, 1, 2], 
        test_dataset=3, 
        target_timepoint=3,
        xgb_params=tuning_results['fold_change_CCL3_tp3']['best_params']
    ),
    
    '4.1) IFNG/IL5-Polarization-D30-Rank': predict(
        combined_df, 
        "fold_change_PT_ratio", 
        train_datasets=[0, 1, 2], 
        test_dataset=3, 
        target_timepoint=30,
        xgb_params=tuning_results['fold_change_PT_ratio_tp30']['best_params']
    )
}

# fill in template with predictions, maintaining template's SubjectID order
for col in template_df.columns:
    if col in predictions:
        rank_dict = dict(zip(predictions[col]['Subject'], predictions[col]['Predicted_Rank']))
        template_df[col] = template_df['SubjectID'].map(rank_dict)

# save final submission
template_df.to_csv("final_submission_hyper.tsv", sep='\t', index=False)

print("\nTuning Results Summary:")
for key, results in tuning_results.items():
    print(f"\n{key}:")
    print(f"Best correlation: {results['correlation']:.3f}")
    print(f"P-value: {results['p_value']:.3f}")

Total combinations to test: 21

Processing combination 1/21
Target: IgG_PT, Timepoint: 14

Starting tuning for IgG_PT (Train on [0, 1], Test on 2)
Initial data shape: (896, 6751)
Train data shape before timepoint filter: (566, 6751)
Test data shape before timepoint filter: (168, 6751)
Train data shape after filtering: (538, 6751)
Test data shape after filtering: (21, 6751)
Starting RandomizedSearchCV with 10 iterations
Fitting 5 folds for each of 50 candidates, totalling 250 fits





Results for IgG_PT (Test on dataset 2):
Best parameters: {'subsample': 0.7, 'reg_lambda': 0.5, 'reg_alpha': 0.1, 'n_estimators': 300, 'min_child_weight': 5, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 0.1, 'colsample_bytree': 0.9}
Spearman correlation: -0.001
P-value: 0.996
Time taken: 25.5 seconds

Processing combination 2/21
Target: IgG_PT, Timepoint: 14

Starting tuning for IgG_PT (Train on [0, 2], Test on 1)
Initial data shape: (896, 6751)
Train data shape before timepoint filter: (520, 6751)
Test data shape before timepoint filter: (214, 6751)
Train data shape after filtering: (507, 6751)
Test data shape after filtering: (33, 6751)
Starting RandomizedSearchCV with 10 iterations
Fitting 5 folds for each of 50 candidates, totalling 250 fits





Results for IgG_PT (Test on dataset 1):
Best parameters: {'subsample': 0.6, 'reg_lambda': 1.5, 'reg_alpha': 1.0, 'n_estimators': 300, 'min_child_weight': 5, 'max_depth': 10, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 0.7}
Spearman correlation: 0.584
P-value: 0.000
Time taken: 25.9 seconds

Processing combination 3/21
Target: IgG_PT, Timepoint: 14

Starting tuning for IgG_PT (Train on [1, 2], Test on 0)
Initial data shape: (896, 6751)
Train data shape before timepoint filter: (382, 6751)
Test data shape before timepoint filter: (352, 6751)
Train data shape after filtering: (365, 6751)
Test data shape after filtering: (57, 6751)
Starting RandomizedSearchCV with 10 iterations
Fitting 5 folds for each of 50 candidates, totalling 250 fits

Results for IgG_PT (Test on dataset 0):
Best parameters: {'subsample': 0.9, 'reg_lambda': 1.0, 'reg_alpha': 0.5, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 3, 'learning_rate': 0.05, 'gamma': 0.2, 'colsample_bytree': 0.8}
Spearma

  X_norms = np.sqrt(row_norms(X.T, squared=True) - n_samples * X_means**2)


Starting RandomizedSearchCV with 10 iterations
Fitting 5 folds for each of 50 candidates, totalling 250 fits

Results for Monocytes (Test on dataset 1):
Best parameters: {'subsample': 0.6, 'reg_lambda': 0.5, 'reg_alpha': 0.1, 'n_estimators': 100, 'min_child_weight': 3, 'max_depth': 5, 'learning_rate': 0.05, 'gamma': 0.1, 'colsample_bytree': 0.8}
Spearman correlation: 0.993
P-value: 0.000
Time taken: 7.2 seconds

Processing combination 9/21
Target: Monocytes, Timepoint: 1

Starting tuning for Monocytes (Train on [1, 2], Test on 0)
Initial data shape: (896, 6751)
Train data shape before timepoint filter: (382, 6751)
Test data shape before timepoint filter: (352, 6751)
Train data shape after filtering: (303, 6751)
Test data shape after filtering: (20, 6751)


  X_norms = np.sqrt(row_norms(X.T, squared=True) - n_samples * X_means**2)


Starting RandomizedSearchCV with 10 iterations
Fitting 5 folds for each of 50 candidates, totalling 250 fits

Results for Monocytes (Test on dataset 0):
Best parameters: {'subsample': 0.7, 'reg_lambda': 0.5, 'reg_alpha': 0.1, 'n_estimators': 300, 'min_child_weight': 5, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 0.1, 'colsample_bytree': 0.9}
Spearman correlation: 0.974
P-value: 0.000
Time taken: 8.4 seconds

Processing combination 10/21
Target: fold_change_Monocytes, Timepoint: 1

Starting tuning for fold_change_Monocytes (Train on [0, 1], Test on 2)
Initial data shape: (896, 6751)
Train data shape before timepoint filter: (566, 6751)
Test data shape before timepoint filter: (168, 6751)
Train data shape after filtering: (53, 6751)
Test data shape after filtering: (21, 6751)
Starting RandomizedSearchCV with 10 iterations
Fitting 5 folds for each of 50 candidates, totalling 250 fits

Results for fold_change_Monocytes (Test on dataset 2):
Best parameters: {'subsample': 0.9, 'reg_lambd




Results for ENSG00000277632.1 (Test on dataset 2):
Best parameters: {'subsample': 0.9, 'reg_lambda': 0.1, 'reg_alpha': 1.0, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 0.1, 'colsample_bytree': 0.9}
Spearman correlation: 0.928
P-value: 0.000
Time taken: 16.3 seconds

Processing combination 14/21
Target: ENSG00000277632.1, Timepoint: 3

Starting tuning for ENSG00000277632.1 (Train on [0, 2], Test on 1)
Initial data shape: (896, 6751)
Train data shape before timepoint filter: (520, 6751)
Test data shape before timepoint filter: (214, 6751)
Train data shape after filtering: (327, 6751)
Test data shape after filtering: (36, 6751)
Starting RandomizedSearchCV with 10 iterations
Fitting 5 folds for each of 50 candidates, totalling 250 fits

Results for ENSG00000277632.1 (Test on dataset 1):
Best parameters: {'subsample': 0.9, 'reg_lambda': 0.1, 'reg_alpha': 1.0, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 6, 'learning_rate': 0.01, '

  X_norms = np.sqrt(row_norms(X.T, squared=True) - n_samples * X_means**2)
 -0.58 -0.34   nan -0.68 -0.14 -0.68 -0.38   nan -0.28 -0.44 -0.38   nan
 -0.28 -0.38   nan -0.38 -0.48 -0.44 -0.38 -0.46 -0.38 -0.58 -0.38 -0.86
   nan -0.64 -0.58   nan -0.58   nan -0.76 -0.46   nan -0.46 -0.38   nan
 -0.38 -0.48]
 0.86763237 0.83236763 0.82487512 0.96813187 0.9989011  0.95904096
 0.97222777 0.93816184        nan 0.91938062 1.         0.9028971
 0.90649351        nan 0.87562438 1.         0.91618382        nan
 0.88031968 0.98811189        nan 0.91748252 0.9978022  1.
 0.85404595 1.         0.91718282 0.89080919 0.87082917 0.9978022
        nan 0.96523477 0.84915085        nan 0.97692308        nan
 0.90539461 0.96633367        nan 0.98611389 0.99310689        nan
 0.91428571 0.88151848]



Results for fold_change_PT_ratio (Test on dataset 1):
Best parameters: {'subsample': 0.8, 'reg_lambda': 0.1, 'reg_alpha': 0, 'n_estimators': 300, 'min_child_weight': 1, 'max_depth': 8, 'learning_rate': 0.05, 'gamma': 0.1, 'colsample_bytree': 0.6}
Spearman correlation: 0.098
P-value: 0.628
Time taken: 1.0 seconds

Processing combination 21/21
Target: fold_change_PT_ratio, Timepoint: 30

Starting tuning for fold_change_PT_ratio (Train on [1, 2], Test on 0)
Initial data shape: (896, 6751)
Train data shape before timepoint filter: (382, 6751)
Test data shape before timepoint filter: (352, 6751)
Train data shape after filtering: (43, 6751)
Test data shape after filtering: (0, 6751)

Processing combination IgG_PT, timepoint 14
Training on datasets [0, 1], testing on dataset 2

Starting tuning for IgG_PT (Train on [0, 1], Test on 2)
Initial data shape: (896, 6751)
Train data shape before timepoint filter: (566, 6751)
Test data shape before timepoint filter: (168, 6751)
Train data shape after

  X_norms = np.sqrt(row_norms(X.T, squared=True) - n_samples * X_means**2)


Starting RandomizedSearchCV with 10 iterations
Fitting 5 folds for each of 50 candidates, totalling 250 fits

Results for Monocytes (Test on dataset 1):
Best parameters: {'subsample': 0.6, 'reg_lambda': 0.5, 'reg_alpha': 0.1, 'n_estimators': 100, 'min_child_weight': 3, 'max_depth': 5, 'learning_rate': 0.05, 'gamma': 0.1, 'colsample_bytree': 0.8}
Spearman correlation: 0.993
P-value: 0.000
Time taken: 7.2 seconds

Processing combination Monocytes, timepoint 1
Training on datasets [1, 2], testing on dataset 0

Starting tuning for Monocytes (Train on [1, 2], Test on 0)
Initial data shape: (896, 6751)
Train data shape before timepoint filter: (382, 6751)
Test data shape before timepoint filter: (352, 6751)
Train data shape after filtering: (303, 6751)
Test data shape after filtering: (20, 6751)


  X_norms = np.sqrt(row_norms(X.T, squared=True) - n_samples * X_means**2)


Starting RandomizedSearchCV with 10 iterations
Fitting 5 folds for each of 50 candidates, totalling 250 fits

Results for Monocytes (Test on dataset 0):
Best parameters: {'subsample': 0.7, 'reg_lambda': 0.5, 'reg_alpha': 0.1, 'n_estimators': 300, 'min_child_weight': 5, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 0.1, 'colsample_bytree': 0.9}
Spearman correlation: 0.974
P-value: 0.000
Time taken: 8.4 seconds

Processing combination fold_change_Monocytes, timepoint 1
Training on datasets [0, 1], testing on dataset 2

Starting tuning for fold_change_Monocytes (Train on [0, 1], Test on 2)
Initial data shape: (896, 6751)
Train data shape before timepoint filter: (566, 6751)
Test data shape before timepoint filter: (168, 6751)
Train data shape after filtering: (53, 6751)
Test data shape after filtering: (21, 6751)
Starting RandomizedSearchCV with 10 iterations
Fitting 5 folds for each of 50 candidates, totalling 250 fits

Results for fold_change_Monocytes (Test on dataset 2):
Best parame




Results for ENSG00000277632.1 (Test on dataset 2):
Best parameters: {'subsample': 0.9, 'reg_lambda': 0.1, 'reg_alpha': 1.0, 'n_estimators': 500, 'min_child_weight': 1, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 0.1, 'colsample_bytree': 0.9}
Spearman correlation: 0.928
P-value: 0.000
Time taken: 16.2 seconds

Processing combination ENSG00000277632.1, timepoint 3
Training on datasets [0, 2], testing on dataset 1

Starting tuning for ENSG00000277632.1 (Train on [0, 2], Test on 1)
Initial data shape: (896, 6751)
Train data shape before timepoint filter: (520, 6751)
Test data shape before timepoint filter: (214, 6751)
Train data shape after filtering: (327, 6751)
Test data shape after filtering: (36, 6751)
Starting RandomizedSearchCV with 10 iterations
Fitting 5 folds for each of 50 candidates, totalling 250 fits

Results for ENSG00000277632.1 (Test on dataset 1):
Best parameters: {'subsample': 0.9, 'reg_lambda': 0.1, 'reg_alpha': 1.0, 'n_estimators': 500, 'min_child_weight': 1, 'max_

  X_norms = np.sqrt(row_norms(X.T, squared=True) - n_samples * X_means**2)
 -0.58 -0.34   nan -0.68 -0.14 -0.68 -0.38   nan -0.28 -0.44 -0.38   nan
 -0.28 -0.38   nan -0.38 -0.48 -0.44 -0.38 -0.46 -0.38 -0.58 -0.38 -0.86
   nan -0.64 -0.58   nan -0.58   nan -0.76 -0.46   nan -0.46 -0.38   nan
 -0.38 -0.48]
 0.86763237 0.83236763 0.82487512 0.96813187 0.9989011  0.95904096
 0.97222777 0.93816184        nan 0.91938062 1.         0.9028971
 0.90649351        nan 0.87562438 1.         0.91618382        nan
 0.88031968 0.98811189        nan 0.91748252 0.9978022  1.
 0.85404595 1.         0.91718282 0.89080919 0.87082917 0.9978022
        nan 0.96523477 0.84915085        nan 0.97692308        nan
 0.90539461 0.96633367        nan 0.98611389 0.99310689        nan
 0.91428571 0.88151848]



Results for fold_change_PT_ratio (Test on dataset 1):
Best parameters: {'subsample': 0.8, 'reg_lambda': 0.1, 'reg_alpha': 0, 'n_estimators': 300, 'min_child_weight': 1, 'max_depth': 8, 'learning_rate': 0.05, 'gamma': 0.1, 'colsample_bytree': 0.6}
Spearman correlation: 0.098
P-value: 0.628
Time taken: 1.0 seconds

Processing combination fold_change_PT_ratio, timepoint 30
Training on datasets [1, 2], testing on dataset 0

Starting tuning for fold_change_PT_ratio (Train on [1, 2], Test on 0)
Initial data shape: (896, 6751)
Train data shape before timepoint filter: (382, 6751)
Test data shape before timepoint filter: (352, 6751)
Train data shape after filtering: (43, 6751)
Test data shape after filtering: (0, 6751)
Skipping fold_change_PT_ratio due to insufficient data

Predicting IgG_PT for dataset 3

Predicted Values with Subject IDs and Ranks:
    Subject  Predicted_Value  Predicted_Rank
0       119         2.698532            16.0
1       120         3.553877            10.0
2       1

  X_norms = np.sqrt(row_norms(X.T, squared=True) - n_samples * X_means**2)



Predicting Monocytes for dataset 3

Predicted Values with Subject IDs and Ranks:
    Subject  Predicted_Value  Predicted_Rank
0       119         0.835010            43.0
1       120         0.560415            52.0
2       121         0.747014            48.0
3       122         1.040535            23.0
4       123         1.285804             9.0
5       124         0.583353            51.0
6       125         0.533436            54.0
7       126         1.267653            11.0
8       127         1.145654            18.0
9       128         0.752498            47.0
10      129         0.805769            45.0
11      130         1.201614            15.0
12      131         1.153475            17.0
13      132         1.118386            20.0
14      133         1.417879             5.0
15      134         1.086394            21.0
16      135         1.561418             3.0
17      136         0.713169            49.0
18      137         0.842382            41.0
19      138       

  X_norms = np.sqrt(row_norms(X.T, squared=True) - n_samples * X_means**2)



Predicting fold_change_PT_ratio for dataset 3

Predicted Values with Subject IDs and Ranks:
    Subject  Predicted_Value  Predicted_Rank
0       119        16.446642            40.0
1       120         8.822064            48.0
2       121        37.876019            11.0
3       122        26.496637            25.0
4       123        31.080765            17.0
5       124        28.730558            23.0
6       125        29.182541            22.0
7       126         5.870202            52.0
8       127        38.585129            10.0
9       128         7.691967            49.0
10      129        18.895630            37.0
11      130       138.222763             1.0
12      131        26.375071            27.0
13      132         5.662271            53.0
14      133        26.455259            26.0
15      134         9.829465            45.0
16      135        18.518602            38.0
17      136        41.731037             4.0
18      137        40.410671             5.0
19     



In [None]:
# Problem 1
#predict(combined_df, "IgG_PT", train_datasets=[0, 1], test_dataset=2, target_timepoint=1)
#predict(combined_df, "fold_change_IgG_PT", train_datasets=[0, 1], test_dataset=2, target_timepoint=14)

# Problem 2
#predict(combined_df, "Monocytes", train_datasets=[0, 1], test_dataset=2, target_timepoint=1)
#predict(combined_df, "fold_change_Monocytes", train_datasets=[0, 1], test_dataset=2)

# Problem 3
#predict(combined_df, "ENSG00000277632.1", train_datasets=[0, 1], test_dataset=2, target_timepoint=3)
#predict(combined_df, "fold_change_CCL3", train_datasets=[0, 1], test_dataset=2, target_timepoint=3)

# Problem 4
#predict(combined_df, "fold_change_PT_ratio", train_datasets=[0, 1], test_dataset=2, target_timepoint=30)