## O'niel

In [2]:
import pandas as pd 
import numpy as np 
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

unprocessed = pd.read_excel('../Datasets/Oneil/Oneil_combination_response.xls')
unprocessed['ic50'] = unprocessed['X/X0'].apply(lambda x: 1 if x>=.45 and x<=0.55 else 0 )
unprocessed['new drugA Conc (µM)'] = np.log2(unprocessed['drugA Conc (µM)'])
unprocessed['new drugB Conc (µM)'] = np.log2(unprocessed['drugB Conc (µM)'])

In [3]:
data = pd.read_csv('../Datasets/Oneil/Drug_CellLine_matrix_logartihm.csv', index_col='Unnamed: 0')
main_concs = pd.read_csv('../Datasets/Oneil/Drug_CellLine_matrix.csv', index_col='Unnamed: 0')

In [9]:
import re
import numpy as np
import numpy.linalg as la
from tqdm import tqdm

def pmf_with_bias(X: np.array, test: np.array, k: int, learning_rate: float, num_iterations: int, lambda_reg: float):
    """
    Probabilistic Matrix Factorization with user and item bias using stochastic gradient descent for complex numbers.
    
    Arguments:
    X -- Input matrix of shape (m, n) with complex numbers
    test -- Test matrix of the same shape as X with complex numbers
    k -- Number of latent features
    learning_rate -- Learning rate for gradient descent
    num_iterations -- Number of iterations for the optimization
    lambda_reg -- Regularization strength
    sigma_sq -- Variance of the complex Gaussian distribution
    
    Returns:
    U -- Matrix of shape (m, k) representing the user latent factors (complex)
    V -- Matrix of shape (n, k) representing the item latent factors (complex)
    b_u -- Vector of shape (m,) representing the user bias terms (real)
    b_v -- Vector of shape (n,) representing the item bias terms (real)
    """
    m, n = X.shape

    # Initialize U, V, b_u, and b_v with random values
    U = np.random.normal(scale=1.0 / k, size=(m, k)).astype(complex)
    V = np.random.normal(scale=1.0 / k, size=(n, k)).astype(complex)
    b_u = np.random.normal(scale=1.0 / k, size= m).astype(complex) 
    b_v = np.random.normal(scale=1.0 / k, size=n).astype(complex)
    
    for iteration in range(num_iterations):
        for i in range(m):
            for j in range(n):
                if X[i,j]!=0:
                    prediction = np.dot(U[i, :], V[j, :].conj()) + b_u[i] + b_v[j]
                    error = X[i, j] - prediction
                    grad_U = -error * V[j, :] + lambda_reg * U[i, :]
                    grad_V = -error * U[i, :] + lambda_reg * V[j, :]
                    grad_b_u = -error + lambda_reg * b_u[i]
                    grad_b_v = -error + lambda_reg * b_v[j]

                    U[i, :] -= learning_rate * grad_U
                    V[j, :] -= learning_rate * grad_V
                    b_u[i] -= learning_rate * grad_b_u
                    b_v[j] -= learning_rate * grad_b_v

                    prediction = np.clip(prediction, -14, 9)

        if iteration % 10 == 0:
            print('Iteration : ' + str(iteration))
            #print('MAE train error')
            mask = np.nonzero(X)
            #print(np.mean(abs(X[mask]-(np.dot(U, V.T)+b_u[:, np.newaxis]+b_v[np.newaxis, :])[mask])))
            mask = np.nonzero(test)
            #print('MAE test error')
            mae_loss = np.mean(abs(test[mask]-(np.dot(U, V.T)+b_u[:, np.newaxis]+b_v[np.newaxis, :])[mask]))
            #print(mae_loss)

            #print('MSE train error')
            mask = np.nonzero(X)
            #print(np.mean(pow(abs(X[mask]-(np.dot(U, V.T)+b_u[:, np.newaxis]+b_v[np.newaxis, :])[mask]), 2)))
            mask = np.nonzero(test)
            #print('MSE test error')
            mse_loss = np.mean(pow(abs(test[mask]-(np.dot(U, V.T)+b_u[:, np.newaxis]+b_v[np.newaxis, :])[mask]), 2))
            #print(mse_loss)

            pred = np.dot(U, V.T)+b_u[:, np.newaxis]+b_v[np.newaxis, :]
            mask_pred = pred[mask]
            mask_test = test[mask]
            mask_test_real = list()
            mask_pred_real = list()
            error=0
            c=0
            for i in range(len(mask_test)):
                a = complex(np.power(2, mask_test[i].real), np.power(2, mask_test[i].imag))
                b = complex(np.power(2, mask_pred[i].real), np.power(2, mask_pred[i].imag))
                mask_test_real.append(a)
                mask_pred_real.append(b)
            
            mae_loss_transformed = np.mean(np.abs(np.array(mask_test_real) - np.array(mask_pred_real)))
            #print(f'converted MAE transformed : {mae_loss_transformed}')
            mse_loss_transformed = np.sqrt(np.mean(np.power(np.abs(np.array(mask_test_real) - np.array(mask_pred_real)), 2)))
            #print(f'converted MSE transformed : {mse_loss_transformed}')
    return U, V, b_u, b_v, mae_loss, mse_loss, mae_loss_transformed, mse_loss_transformed


def load_fold_indices(fold, base_path="../Datasets/Oneil/"):
    """Load train and test indices for a given fold"""
    train_path = f"{base_path}train_index_fold_{fold}.txt"
    test_path = f"{base_path}test_index_fold_{fold}.txt"
    return (
        np.loadtxt(train_path, dtype=int),
        np.loadtxt(test_path, dtype=int)
    )

def shuffle_drugs(ratings, swap_fraction=0.5, ):
    """
    Randomly shuffle complex matrix by swapping real/imaginary parts in selected columns.
    """
    num_columns = ratings.shape[1]
    num_to_swap = int(num_columns * swap_fraction)
    random_columns = np.random.choice(num_columns, size=num_to_swap, replace=False)
    
    # Create copy to avoid modifying original array
    modified = ratings.copy()
    # Swap real and imaginary parts in selected columns
    modified[:, random_columns] = 1j * modified[:, random_columns].real + modified[:, random_columns].imag
    return modified

def create_train_test_matrices(ratings, indexes, train_idx, test_idx):
    """Create train and test matrices from indices"""
    train = np.zeros_like(ratings, dtype=complex)
    test = np.zeros_like(ratings, dtype=complex)
    
    # shuffle drugs
    ratings_shuffled = shuffle_drugs(ratings)

    for temp in train_idx:
        train[tuple(indexes[temp])] = ratings_shuffled[tuple(indexes[temp])]
    for temp in test_idx:
        test[tuple(indexes[temp])] = ratings_shuffled[tuple(indexes[temp])]
    
    return train, test

def calculate_validation_accuracy(t, test_idx, indexes, unprocessed, data):
    """Calculate validation accuracy by comparing predictions to ground truth"""
    c = 0
    for i in test_idx:
        row, col = indexes[i]
        cell_line = data.index[row]
        combo_name = data.columns[col]
        pred = t[row, col]
        
        combs = unprocessed[
            (unprocessed['cell_line'] == cell_line) & 
            (unprocessed['combination_name'] == combo_name)
        ][['ic50', 'new drugA Conc (µM)', 'new drugB Conc (µM)', 'X/X0']]
        
        # Find nearest concentrations
        a = min(combs['new drugA Conc (µM)'].unique(), 
                key=lambda t: abs(pred.real - t))
        b = min(combs['new drugB Conc (µM)'].unique(), 
                key=lambda t: abs(pred.imag - t))
        
        # Check if prediction falls in valid range
        pred_conc = combs[
            (combs['new drugA Conc (µM)'] == a) & 
            (combs['new drugB Conc (µM)'] == b)]
        if pred_conc[(pred_conc['X/X0'] >= 0.44) & (pred_conc['X/X0'] <= 0.56)].shape[0] > 0:
            c += 1
            
    return c / len(test_idx)

def run_cross_validation(data, unprocessed, n_folds=5, pmf_params=None):
    """Run full cross-validation pipeline"""
    if pmf_params is None:
        pmf_params = {
            'n_features': 700,
            'learning_rate': 0.001,
            'n_epochs': 300,
            'reg_param': 0.0001        
            }
    
    ratings = data.to_numpy(dtype='complex')
    result_np = ratings.copy()
    indexes = np.argwhere(result_np != complex(0, 0))
    
    for fold in range(n_folds):
        print(f'\nTraining on fold: {fold}')
        
        # Load fold data
        train_idx, test_idx = load_fold_indices(fold)
        print(f"Train samples: {len(train_idx)}, Test samples: {len(test_idx)}")
        
        # Create train/test matrices
        train, test = create_train_test_matrices(ratings, indexes, train_idx, test_idx)
        
        # Run PMF with bias
        U, V, b_u, b_v, mae, mse, mae_t, mse_t = pmf_with_bias(
            train, test, 
            pmf_params['n_features'], 
            pmf_params['learning_rate'], 
            pmf_params['n_epochs'], 
            pmf_params['reg_param']
        )
        
        # Calculate predictions
        t = np.dot(U, V.T) + b_u[:, np.newaxis] + b_v[np.newaxis, :]
        
        # Calculate validation accuracy
        val_acc = calculate_validation_accuracy(t, test_idx, indexes, unprocessed, data)
        
        # Print results
        print(f'Final MAE: {mae:.4f}')
        print(f'Final MSE: {mse:.4f}')
        print(f'Final MAE (transformed): {mae_t:.4f}')
        print(f'Final MSE (transformed): {mse_t:.4f}')
        print(f'Validation accuracy: {val_acc:.4f}')

# Example usage:
run_cross_validation(data, unprocessed, n_folds=5)


Training on fold: 0
Train samples: 8424, Test samples: 2106
Iteration : 0
Iteration : 10
Iteration : 20
Iteration : 30
Iteration : 40
Iteration : 50
Iteration : 60
Iteration : 70
Iteration : 80
Iteration : 90
Iteration : 100
Iteration : 110
Iteration : 120
Iteration : 130
Iteration : 140
Iteration : 150
Iteration : 160
Iteration : 170
Iteration : 180
Iteration : 190
Iteration : 200
Iteration : 210
Iteration : 220
Iteration : 230
Iteration : 240
Iteration : 250
Iteration : 260
Iteration : 270
Iteration : 280
Iteration : 290
Final MAE: 2.2212
Final MSE: 6.3331
Final MAE (transformed): 9.1552
Final MSE (transformed): 33.6507
Validation accuracy: 0.2631

Training on fold: 1
Train samples: 8424, Test samples: 2106
Iteration : 0
Iteration : 10
Iteration : 20
Iteration : 30
Iteration : 40
Iteration : 50
Iteration : 60
Iteration : 70
Iteration : 80
Iteration : 90
Iteration : 100
Iteration : 110
Iteration : 120
Iteration : 130
Iteration : 140
Iteration : 150
Iteration : 160
Iteration : 170
Ite

## ALMANAC

## Astra

In [10]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')


unprocessed = pd.read_csv('../Datasets/Astrazeneca/final_astrazeneca.csv')
unprocessed['ic50'] = unprocessed['inhibition'].apply(lambda x: 1 if x>44 and x<56 else 0 )
unprocessed = unprocessed[(unprocessed['conc_c']!=0)&(unprocessed['conc_r']!=0)]
unprocessed = unprocessed[unprocessed['drug_row']!=unprocessed['drug_col']]
unprocessed['ordered_combinations'] =unprocessed.apply(lambda row: sorted(list(set([row['drug_row'], row['drug_col']]))), axis=1)
unprocessed['combination_name'] = unprocessed['ordered_combinations'].apply(lambda x: str(x[0]) + ' & '+ str(x[1]))
unprocessed['new conc_r'] = np.log2(unprocessed['conc_r'])
unprocessed['new conc_c'] = np.log2(unprocessed['conc_c'])

In [11]:
data = pd.read_csv('../Datasets/Astrazeneca/astra_drug_cell_log_matrix.csv', index_col=0)

In [12]:
import re
import numpy as np
import numpy.linalg as la
from tqdm import tqdm

def pmf_with_bias(X: np.array, test: np.array, k: int, learning_rate: float, num_iterations: int, lambda_reg: float):
    """
    Probabilistic Matrix Factorization with user and item bias using stochastic gradient descent for complex numbers.
    
    Arguments:
    X -- Input matrix of shape (m, n) with complex numbers
    test -- Test matrix of the same shape as X with complex numbers
    k -- Number of latent features
    learning_rate -- Learning rate for gradient descent
    num_iterations -- Number of iterations for the optimization
    lambda_reg -- Regularization strength
    sigma_sq -- Variance of the complex Gaussian distribution
    
    Returns:
    U -- Matrix of shape (m, k) representing the user latent factors (complex)
    V -- Matrix of shape (n, k) representing the item latent factors (complex)
    b_u -- Vector of shape (m,) representing the user bias terms (real)
    b_v -- Vector of shape (n,) representing the item bias terms (real)
    """
    m, n = X.shape

    # Initialize U, V, b_u, and b_v with random values
    U = np.random.normal(scale=1.0 / k, size=(m, k)).astype(complex)
    V = np.random.normal(scale=1.0 / k, size=(n, k)).astype(complex)
    b_u = np.random.normal(scale=1.0 / k, size= m).astype(complex) 
    b_v = np.random.normal(scale=1.0 / k, size=n).astype(complex)
    
    for iteration in range(num_iterations):
        for i in range(m):
            for j in range(n):
                if X[i,j]!=0:
                    prediction = np.dot(U[i, :], V[j, :].conj()) + b_u[i] + b_v[j]
                    error = X[i, j] - prediction
                    grad_U = -error * V[j, :] + lambda_reg * U[i, :]
                    grad_V = -error * U[i, :] + lambda_reg * V[j, :]
                    grad_b_u = -error + lambda_reg * b_u[i]
                    grad_b_v = -error + lambda_reg * b_v[j]

                    U[i, :] -= learning_rate * grad_U
                    V[j, :] -= learning_rate * grad_V
                    b_u[i] -= learning_rate * grad_b_u
                    b_v[j] -= learning_rate * grad_b_v

                    prediction = np.clip(prediction, -14, 9)

        if iteration % 10 == 0:
            print('Iteration : ' + str(iteration))
            #print('MAE train error')
            mask = np.nonzero(X)
            #print(np.mean(abs(X[mask]-(np.dot(U, V.T)+b_u[:, np.newaxis]+b_v[np.newaxis, :])[mask])))
            mask = np.nonzero(test)
            #print('MAE test error')
            mae_loss = np.mean(abs(test[mask]-(np.dot(U, V.T)+b_u[:, np.newaxis]+b_v[np.newaxis, :])[mask]))
            #print(mae_loss)

            #print('MSE train error')
            mask = np.nonzero(X)
            #print(np.mean(pow(abs(X[mask]-(np.dot(U, V.T)+b_u[:, np.newaxis]+b_v[np.newaxis, :])[mask]), 2)))
            mask = np.nonzero(test)
            #print('MSE test error')
            mse_loss = np.mean(pow(abs(test[mask]-(np.dot(U, V.T)+b_u[:, np.newaxis]+b_v[np.newaxis, :])[mask]), 2))
            #print(mse_loss)

            pred = np.dot(U, V.T)+b_u[:, np.newaxis]+b_v[np.newaxis, :]
            mask_pred = pred[mask]
            mask_test = test[mask]
            mask_test_real = list()
            mask_pred_real = list()
            error=0
            c=0
            for i in range(len(mask_test)):
                a = complex(np.power(2, mask_test[i].real), np.power(2, mask_test[i].imag))
                b = complex(np.power(2, mask_pred[i].real), np.power(2, mask_pred[i].imag))
                mask_test_real.append(a)
                mask_pred_real.append(b)
            
            mae_loss_transformed = np.mean(np.abs(np.array(mask_test_real) - np.array(mask_pred_real)))
            #print(f'converted MAE transformed : {mae_loss_transformed}')
            mse_loss_transformed = np.sqrt(np.mean(np.power(np.abs(np.array(mask_test_real) - np.array(mask_pred_real)), 2)))
            #print(f'converted MSE transformed : {mse_loss_transformed}')
    return U, V, b_u, b_v, mae_loss, mse_loss, mae_loss_transformed, mse_loss_transformed


def load_astrazeneca_fold_indices(fold, base_path="../Datasets/Astrazeneca/"):
    """Load train and test indices for a given fold"""
    train_path = f"{base_path}astra_train_index_fold_{fold}.txt"
    test_path = f"{base_path}astra_test_index_fold_{fold}.txt"
    return (
        np.loadtxt(train_path, dtype=int),
        np.loadtxt(test_path, dtype=int)
    )

def shuffle_drugs(ratings, swap_fraction=0.5, ):
    """
    Randomly shuffle complex matrix by swapping real/imaginary parts in selected columns.
    """
    num_columns = ratings.shape[1]
    num_to_swap = int(num_columns * swap_fraction)
    random_columns = np.random.choice(num_columns, size=num_to_swap, replace=False)
    
    # Create copy to avoid modifying original array
    modified = ratings.copy()
    # Swap real and imaginary parts in selected columns
    modified[:, random_columns] = 1j * modified[:, random_columns].real + modified[:, random_columns].imag
    return modified

def create_train_test_matrices(ratings, indexes, train_idx, test_idx):
    """Create train and test matrices from indices"""
    train = np.zeros_like(ratings, dtype=complex)
    test = np.zeros_like(ratings, dtype=complex)
    
    shuffled_ratings = shuffle_drugs(ratings) 
    for temp in train_idx:
        train[tuple(indexes[temp])] = shuffled_ratings[tuple(indexes[temp])]
    for temp in test_idx:
        test[tuple(indexes[temp])] = shuffled_ratings[tuple(indexes[temp])]
    
    return train, test

def calculate_astrazeneca_validation_accuracy(t, test_idx, indexes, unprocessed, data):
    """Calculate validation accuracy for AstraZeneca dataset"""
    c = 0
    for i in test_idx:
        row, col = indexes[i]
        cell_line = data.index[row]
        combo_name = data.columns[col]
        pred = t[row, col]
        
        combs = unprocessed[
            (unprocessed['cell_line_name'] == cell_line) & 
            (unprocessed['combination_name'] == combo_name)
        ][['ic50', 'new conc_r', 'new conc_c', 'inhibition']]
        
        # Find nearest concentrations
        a = min(combs['new conc_r'].unique(),
                key=lambda x: abs(pred.real - x))
        b = min(combs['new conc_c'].unique(),
                key=lambda x: abs(pred.imag - x))
        
        # Check if prediction falls in valid inhibition range (44-56%)
        pred_conc = combs[
            (combs['new conc_r'] == a) & 
            (combs['new conc_c'] == b)]
        if pred_conc[(pred_conc['inhibition'] >= 44) & (pred_conc['inhibition'] <= 56)].shape[0] > 0:
            c += 1
            
    return c / len(test_idx)

def run_astrazeneca_cross_validation(data, unprocessed, n_folds=5, pmf_params=None):
    """
    Run full cross-validation pipeline for AstraZeneca dataset
    
    Parameters:
    -----------
    data : pd.DataFrame
        Processed drug-cell matrix (complex concentrations)
    unprocessed : pd.DataFrame
        Original raw dataset with all measurements
    n_folds : int
        Number of cross-validation folds
    pmf_params : dict
        Parameters for PMF model training
    """
    if pmf_params is None:
        pmf_params = {
            'n_features': 700,
            'learning_rate': 0.001,
            'n_epochs': 300,
            'reg_param': 0.001
                            }
    
    ratings = data.to_numpy(dtype='complex')
    result_np = ratings.copy()
    indexes = np.argwhere(result_np != complex(0, 0))
    
    for fold in range(n_folds):
        print(f'\nTraining on fold: {fold}')
        
        # Load fold data
        train_idx, test_idx = load_astrazeneca_fold_indices(fold)
        print(f"Train samples: {len(train_idx)}, Test samples: {len(test_idx)}")
        
        # Create train/test matrices
        train, test = create_train_test_matrices(ratings, indexes, train_idx, test_idx)
        
        # Run PMF with bias
        U, V, b_u, b_v, mae, mse, mae_t, mse_t = pmf_with_bias(
            train, test, 
            pmf_params['n_features'],
            pmf_params['learning_rate'],
            pmf_params['n_epochs'],
            pmf_params['reg_param']
                            )
        
        # Calculate predictions
        t = np.dot(U, V.T) + b_u[:, np.newaxis] + b_v[np.newaxis, :]
        
        # Calculate validation accuracy
        val_acc = calculate_astrazeneca_validation_accuracy(t, test_idx, indexes, unprocessed, data)
        
        # Print results
        print(f'Final MAE: {mae:.4f}')
        print(f'Final MSE: {mse:.4f}')
        print(f'Final MAE (transformed): {mae_t:.4f}')
        print(f'Final MSE (transformed): {mse_t:.4f}')
        print(f'Validation accuracy: {val_acc:.4f}')

run_astrazeneca_cross_validation(data, unprocessed)


Training on fold: 0
Train samples: 4380, Test samples: 1096
Iteration : 0
Iteration : 10
Iteration : 20
Iteration : 30
Iteration : 40
Iteration : 50
Iteration : 60
Iteration : 70
Iteration : 80
Iteration : 90
Iteration : 100
Iteration : 110
Iteration : 120
Iteration : 130
Iteration : 140
Iteration : 150
Iteration : 160
Iteration : 170
Iteration : 180
Iteration : 190
Iteration : 200
Iteration : 210
Iteration : 220
Iteration : 230
Iteration : 240
Iteration : 250
Iteration : 260
Iteration : 270
Iteration : 280
Iteration : 290
Final MAE: 2.7737
Final MSE: 10.1987
Final MAE (transformed): 1.3014
Final MSE (transformed): 5.1872
Validation accuracy: 0.1962

Training on fold: 1
Train samples: 4381, Test samples: 1095
Iteration : 0
Iteration : 10
Iteration : 20
Iteration : 30
Iteration : 40
Iteration : 50
Iteration : 60
Iteration : 70
Iteration : 80
Iteration : 90
Iteration : 100
Iteration : 110
Iteration : 120
Iteration : 130
Iteration : 140
Iteration : 150
Iteration : 160
Iteration : 170
Ite

## ALMANAC

In [None]:
import pandas as pd

unprocessed = pd.read_csv('./NCI files/NCI-ALMANAC_subset_555300.csv')
unprocessed.dropna(inplace=True)
unprocessed = unprocessed[(unprocessed['Conc1'] > 0) & (unprocessed['Conc2'] > 0)]
unprocessed['ic50'] = unprocessed['PercentageGrowth'].apply(lambda x: 1 if x>=45 and x<=55 else 0 )
unprocessed['combination_name'] = unprocessed['Drug1'] + ' & ' + unprocessed['Drug2']
unprocessed = unprocessed[unprocessed['combination_name'].isin(unprocessed[unprocessed['ic50']==1]['combination_name'])]
#unprocessed.loc[unprocessed['Conc1'] < .001, 'Conc1'] = 0.001
#unprocessed.loc[unprocessed['Conc2'] < .001, 'Conc2'] = 0.001
#unprocessed['Conc1'] += 1
#unprocessed['Conc2'] += 1
#unprocessed[unprocessed['Conc1'] > 5]=5
#unprocessed[unprocessed['Conc2']>5]= 5
#gene_data = pd.read_csv('../drug_synergy/our data/rna.csv')
#null_CellLines = unprocessed[~unprocessed['CellLine'].isin(gene_data.columns)]['CellLine'].unique().tolist()
#unprocessed = unprocessed[~unprocessed['CellLine'].isin(null_CellLines)]

In [None]:
import numpy as np

unprocessed['new Conc1'] = np.log2(unprocessed['Conc1'])
unprocessed['new Conc2'] = np.log2(unprocessed['Conc2'])

unprocessed.loc[unprocessed['new Conc1'] >= 0, 'new Conc1'] +=1
unprocessed.loc[unprocessed['new Conc1'] < 0, 'new Conc1'] -=1
unprocessed.loc[unprocessed['new Conc2'] >= 0, 'new Conc2'] +=1
unprocessed.loc[unprocessed['new Conc2'] < 0, 'new Conc2'] -=1

In [None]:
import pandas as pd 

result = pd.read_csv('./NCI files/NCI60_drug_cell_log_matrix.csv', index_col=0)#.replace({np.nan:complex(0,0)})
result2 = pd.read_csv('./NCI files/NCI60_drug_cell_matrix.csv', index_col=0)#.replace({np.nan:complex(0,0)})
threshold = len(result) * 0.3

result = result.dropna(thresh=threshold, axis=1).replace({np.nan:complex(0,0)})
result2 = result2.dropna(thresh=threshold, axis=1).replace({np.nan:complex(0,0)})

In [None]:
## final results based on nested cross validation 

## different mf method 
import re
import numpy as np
import numpy.linalg as la
from tqdm import tqdm

def pmf_with_bias(X, test, k, learning_rate, num_iterations, lambda_reg, sigma_sq):
    """
    Probabilistic Matrix Factorization with user and item bias using stochastic gradient descent for complex numbers.
    
    Arguments:
    X -- Input matrix of shape (m, n) with complex numbers
    test -- Test matrix of the same shape as X with complex numbers
    k -- Number of latent features
    learning_rate -- Learning rate for gradient descent
    num_iterations -- Number of iterations for the optimization
    lambda_reg -- Regularization strength
    sigma_sq -- Variance of the complex Gaussian distribution
    
    Returns:
    U -- Matrix of shape (m, k) representing the user latent factors (complex)
    V -- Matrix of shape (n, k) representing the item latent factors (complex)
    b_u -- Vector of shape (m,) representing the user bias terms (real)
    b_v -- Vector of shape (n,) representing the item bias terms (real)
    """
    m, n = X.shape

    # Initialize U, V, b_u, and b_v with random values
    U = np.random.normal(scale=1.0 / k, size=(m, k)).astype(complex)
    V = np.random.normal(scale=1.0 / k, size=(n, k)).astype(complex)
    b_u = np.random.normal(scale=1.0 / k, size= m).astype(complex) 
    b_v = np.random.normal(scale=1.0 / k, size=n).astype(complex)
    
    for iteration in range(num_iterations):
        for i in range(m):
            for j in range(n):
                if X[i,j]!=0:
                    prediction = np.dot(U[i, :], V[j, :].conj()) + b_u[i] + b_v[j]
                    error = X[i, j] - prediction
                    grad_U = -error * V[j, :] + lambda_reg * U[i, :]
                    grad_V = -error * U[i, :] + lambda_reg * V[j, :]
                    grad_b_u = -error + lambda_reg * b_u[i]
                    grad_b_v = -error + lambda_reg * b_v[j]

                    U[i, :] -= learning_rate * grad_U
                    V[j, :] -= learning_rate * grad_V
                    b_u[i] -= learning_rate * grad_b_u
                    b_v[j] -= learning_rate * grad_b_v

                    prediction = np.clip(prediction, -14, 9)

        if iteration % 10 == 0:
            print('Iteration : ' + str(iteration))
            print('MAE train error')
            mask = np.nonzero(X)
            print(np.mean(abs(X[mask]-(np.dot(U, V.T)+b_u[:, np.newaxis]+b_v[np.newaxis, :])[mask])))
            mask = np.nonzero(test)
            print('MAE test error')
            print(np.mean(abs(test[mask]-(np.dot(U, V.T)+b_u[:, np.newaxis]+b_v[np.newaxis, :])[mask])))

            print('MSE train error')
            mask = np.nonzero(X)
            print(np.mean(pow(abs(X[mask]-(np.dot(U, V.T)+b_u[:, np.newaxis]+b_v[np.newaxis, :])[mask]), 2)))
            mask = np.nonzero(test)
            print('MSE test error')
            print(np.mean(pow(abs(test[mask]-(np.dot(U, V.T)+b_u[:, np.newaxis]+b_v[np.newaxis, :])[mask]), 2)))

        
    return U, V, b_u, b_v

for fold in range(10):
    print(f'training on fold : {fold}')
    train_index = np.loadtxt(f'./NCI files/NCI_train_index_fold_{fold}.txt', dtype=int)
    test_index = np.loadtxt(f'./NCI files/NCI_test_index_fold_{fold}.txt', dtype=int)
    ratings = result.to_numpy(dtype='complex')

    num_columns = ratings.shape[1]
    random_columns = np.random.choice(num_columns, size=int(num_columns/2), replace=False)

    # Replace real parts with imaginary parts in the selected columns
    ratings[:, random_columns].real = ratings[:, random_columns].imag

    result_np = ratings.copy()
    indexes = np.argwhere(result_np != complex(0,0))

    print("Train Index: ", len(train_index))
    print("Test Index: ", len(test_index))

    test = np.zeros(ratings.shape, dtype=complex)
    train = np.zeros(ratings.shape, dtype=complex)
    for temp in train_index:
        train[tuple(indexes[temp])] = result_np[tuple(indexes[temp])]
    for temp in test_index:
        test[tuple(indexes[temp])] = result_np[tuple(indexes[temp])]
    #u, v, b_u, b_v = kernel_matrix_factorization(train, test, 200, .0001, 1000, 0.001)
    U, V, b_u, b_v = pmf_with_bias(train, test, 700, .001, 300, .001, 0.01)
    t = np.dot(U, V.T)+b_u[:, np.newaxis]+b_v[np.newaxis, :]
    c = 0
    for i in test_index:
        i = indexes[i]
        combs = unprocessed[(unprocessed['CellLine']== result.index[i[0]]) & (unprocessed['combination_name']== result.columns[i[1]])][['ic50', 'new Conc1', 'new Conc2', 'PercentageGrowth']]
        #print(t[i[0], i[1]])
        pred = t[i[0], i[1]]
        #print(pred)

        a = min(combs['new Conc1'].unique().tolist(), key=lambda t: abs(pred.real-t))
        b = min(combs['new Conc2'].unique().tolist(), key=lambda t: abs(pred.imag-t))
        #print(combs['new drugB Conc (µM)'].unique().tolist())
        #print('Nearest : ' + str(a) + '\t' + str(b))
        #print(combs[combs['ic50']==1])

        pred_conc = combs[(combs['new Conc1']==a) & (combs['new Conc2']==b)]
        if pred_conc[(pred_conc['PercentageGrowth'] >= 44) & (pred_conc['PercentageGrowth'] <= 56)].shape[0] > 0:
            c += 1
            #print('yes')
            continue

    print(f'validation precision : {c/len(test_index)}')

training on fold : 0
Train Index:  4636
Test Index:  1160
Iteration : 0
MAE train error
28.12127658609536
MAE test error
28.227617641247246
MSE train error
809.2889531671813
MSE test error
814.8066725247634
Iteration : 10
MAE train error
11.359085720594722
MAE test error
11.700155741124462
MSE train error
149.06816268776987
MSE test error
157.88688875532395
Iteration : 20
MAE train error
6.720467652374832
MAE test error
7.099596455382276
MSE train error
60.011072434451194
MSE test error
65.62756739176207
Iteration : 30
MAE train error
5.639906046093231
MAE test error
6.025776986016261
MSE train error
44.3411659446059
MSE test error
49.70438997636571
Iteration : 40
MAE train error
4.535131166829906
MAE test error
4.846849255577872
MSE train error
28.627410573689673
MSE test error
32.42889470409964
Iteration : 50
MAE train error
3.915740043380308
MAE test error
4.178716970981839
MSE train error
20.806761783088216
MSE test error
23.66606717309567
Iteration : 60
MAE train error
3.546916226

FileNotFoundError: ./NCI files/NCI_train_index_fold_5.txt not found.