In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')


unprocessed = pd.read_csv('../Datasets/Astrazeneca/final_astrazeneca.csv')
unprocessed['ic50'] = unprocessed['inhibition'].apply(lambda x: 1 if x>44 and x<56 else 0 )
unprocessed = unprocessed[(unprocessed['conc_c']!=0)&(unprocessed['conc_r']!=0)]
unprocessed = unprocessed[unprocessed['drug_row']!=unprocessed['drug_col']]
unprocessed['ordered_combinations'] =unprocessed.apply(lambda row: sorted(list(set([row['drug_row'], row['drug_col']]))), axis=1)
unprocessed['combination_name'] = unprocessed['ordered_combinations'].apply(lambda x: str(x[0]) + ' & '+ str(x[1]))
unprocessed['new conc_r'] = np.log2(unprocessed['conc_r'])
unprocessed['new conc_c'] = np.log2(unprocessed['conc_c'])

### creating drug-cell line matrix

In [None]:
def process_astrazeneca_data(
    df,
    cell_line_col='cell_line_name',
    combo_name_col='combination_name',
    conc_r_col='conc_r',
    conc_c_col='conc_c',
    new_conc_r_col='new conc_r',
    new_conc_c_col='new conc_c',
    ic50_col='ic50',
    threshold=0.5,
    random_state=None
):
    """
    Process AstraZeneca drug combination data to create complex concentration matrices.
    
    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe containing drug combination data
    cell_line_col : str
        Column name for cell line identifiers
    combo_name_col : str
        Column name for combination identifiers
    conc_r_col, conc_c_col : str
        Columns for original drug concentrations (reference and combination)
    new_conc_r_col, new_conc_c_col : str
        Columns for transformed drug concentrations
    ic50_col : str
        Column indicating IC50 values (1 = active)
    threshold : float (0-1)
        Threshold for dropping columns with too many missing values
    random_state : int, optional
        Random seed for reproducibility
    
    Returns:
    --------
    tuple of (pd.DataFrame, pd.DataFrame)
        Returns two dataframes:
        1. Processed matrix with new concentrations
        2. Original concentration matrix
    """
    
    # Create mappings for drug concentrations
    active_drugs = df[df[ic50_col] == 1]
    l1 = active_drugs[conc_r_col].value_counts().index.tolist()
    l2 = active_drugs[conc_c_col].value_counts().index.tolist()
    
    l1_mapping = {value: idx for idx, value in enumerate(l1)}
    l2_mapping = {value: idx for idx, value in enumerate(l2)}
    
    # Create all possible cell line × combination pairs
    names = pd.merge(
        df[cell_line_col].drop_duplicates(),
        df[combo_name_col].drop_duplicates(),
        how='cross'
    )
    
    # Initialize result dictionaries
    new_conc_matrix = {}
    orig_conc_matrix = {}
    
    # Process each cell line and combination
    for _, row in tqdm(names.iterrows(), total=len(names)):
        cell_line = row[cell_line_col]
        combo = row[combo_name_col]
        
        if cell_line not in new_conc_matrix:
            new_conc_matrix[cell_line] = []
            orig_conc_matrix[cell_line] = []
        
        # Filter relevant data
        mask = (
            (df[cell_line_col] == cell_line) & 
            (df[combo_name_col] == combo) & 
            (df[ic50_col] == 1))
        temp = df[mask]
        
        if temp.empty:
            new_conc_matrix[cell_line].append(np.nan)
            orig_conc_matrix[cell_line].append(np.nan)
        else:
            # Sort by drug concentrations
            temp['sorting_key'] = temp.apply(
                lambda x: (l1_mapping[x[conc_r_col]], l2_mapping[x[conc_c_col]]), 
                axis=1
            )
            temp = temp.sort_values('sorting_key').iloc[0]
            
            # Store as complex numbers
            new_conc_matrix[cell_line].append(
                complex(temp[new_conc_r_col], temp[new_conc_c_col])
            )
            orig_conc_matrix[cell_line].append(
                complex(temp[conc_r_col], temp[conc_c_col])
            )
    
    # Convert to dataframes
    cols = names[names[cell_line_col] == names[cell_line_col].iloc[0]][combo_name_col].tolist()
    
    new_df = pd.DataFrame.from_dict(new_conc_matrix, orient='index', columns=cols)
    orig_df = pd.DataFrame.from_dict(orig_conc_matrix, orient='index', columns=cols)
    
    # Apply threshold filtering
    threshold_count = int(len(new_df) * threshold)
    
    new_df = new_df.dropna(thresh=threshold_count, axis=1).fillna(0)
    orig_df = orig_df.dropna(thresh=threshold_count, axis=1).fillna(0)
    
    return new_df, orig_df


# Example usage:
# new_conc, orig_conc = process_astrazeneca_data(unprocessed)
# new_conc.to_csv('astrazeneca_drug_cell_log_matrix.csv')
# orig_conc.to_csv('astrazeneca_drug_cell_matrix.csv')

In [None]:
result, result2 = process_astrazeneca_data(unprocessed)

In [None]:
result.to_csv('astra_drug_cell_log_matrix.csv')
result2.to_csv('astra_drug_cell_matrix.csv')

### cross validation

In [2]:
data = pd.read_csv('../Datasets/Astrazeneca/astra_drug_cell_log_matrix.csv', index_col=0)

In [10]:
import re
import numpy as np
import numpy.linalg as la
from tqdm import tqdm

def pmf_with_bias(X: np.array, test: np.array, k: int, learning_rate: float, num_iterations: int, lambda_reg: float):
    """
    Probabilistic Matrix Factorization with user and item bias using stochastic gradient descent for complex numbers.
    
    Arguments:
    X -- Input matrix of shape (m, n) with complex numbers
    test -- Test matrix of the same shape as X with complex numbers
    k -- Number of latent features
    learning_rate -- Learning rate for gradient descent
    num_iterations -- Number of iterations for the optimization
    lambda_reg -- Regularization strength
    sigma_sq -- Variance of the complex Gaussian distribution
    
    Returns:
    U -- Matrix of shape (m, k) representing the user latent factors (complex)
    V -- Matrix of shape (n, k) representing the item latent factors (complex)
    b_u -- Vector of shape (m,) representing the user bias terms (real)
    b_v -- Vector of shape (n,) representing the item bias terms (real)
    """
    m, n = X.shape

    # Initialize U, V, b_u, and b_v with random values
    U = np.random.normal(scale=1.0 / k, size=(m, k)).astype(complex)
    V = np.random.normal(scale=1.0 / k, size=(n, k)).astype(complex)
    b_u = np.random.normal(scale=1.0 / k, size= m).astype(complex) 
    b_v = np.random.normal(scale=1.0 / k, size=n).astype(complex)
    
    for iteration in range(num_iterations):
        for i in range(m):
            for j in range(n):
                if X[i,j]!=0:
                    prediction = np.dot(U[i, :], V[j, :].conj()) + b_u[i] + b_v[j]
                    error = X[i, j] - prediction
                    grad_U = -error * V[j, :] + lambda_reg * U[i, :]
                    grad_V = -error * U[i, :] + lambda_reg * V[j, :]
                    grad_b_u = -error + lambda_reg * b_u[i]
                    grad_b_v = -error + lambda_reg * b_v[j]

                    U[i, :] -= learning_rate * grad_U
                    V[j, :] -= learning_rate * grad_V
                    b_u[i] -= learning_rate * grad_b_u
                    b_v[j] -= learning_rate * grad_b_v

                    prediction = np.clip(prediction, -14, 9)

        if iteration % 10 == 0:
            print('Iteration : ' + str(iteration))
            #print('MAE train error')
            mask = np.nonzero(X)
            #print(np.mean(abs(X[mask]-(np.dot(U, V.T)+b_u[:, np.newaxis]+b_v[np.newaxis, :])[mask])))
            mask = np.nonzero(test)
            #print('MAE test error')
            mae_loss = np.mean(abs(test[mask]-(np.dot(U, V.T)+b_u[:, np.newaxis]+b_v[np.newaxis, :])[mask]))
            #print(mae_loss)

            #print('MSE train error')
            mask = np.nonzero(X)
            #print(np.mean(pow(abs(X[mask]-(np.dot(U, V.T)+b_u[:, np.newaxis]+b_v[np.newaxis, :])[mask]), 2)))
            mask = np.nonzero(test)
            #print('MSE test error')
            mse_loss = np.mean(pow(abs(test[mask]-(np.dot(U, V.T)+b_u[:, np.newaxis]+b_v[np.newaxis, :])[mask]), 2))
            #print(mse_loss)

            pred = np.dot(U, V.T)+b_u[:, np.newaxis]+b_v[np.newaxis, :]
            mask_pred = pred[mask]
            mask_test = test[mask]
            mask_test_real = list()
            mask_pred_real = list()
            error=0
            c=0
            for i in range(len(mask_test)):
                a = complex(np.power(2, mask_test[i].real), np.power(2, mask_test[i].imag))
                b = complex(np.power(2, mask_pred[i].real), np.power(2, mask_pred[i].imag))
                mask_test_real.append(a)
                mask_pred_real.append(b)
            
            mae_loss_transformed = np.mean(np.abs(np.array(mask_test_real) - np.array(mask_pred_real)))
            #print(f'converted MAE transformed : {mae_loss_transformed}')
            mse_loss_transformed = np.sqrt(np.mean(np.power(np.abs(np.array(mask_test_real) - np.array(mask_pred_real)), 2)))
            #print(f'converted MSE transformed : {mse_loss_transformed}')
    return U, V, b_u, b_v, mae_loss, mse_loss, mae_loss_transformed, mse_loss_transformed


def load_astrazeneca_fold_indices(fold, base_path="../Datasets/Astrazeneca/"):
    """Load train and test indices for a given fold"""
    train_path = f"{base_path}astra_train_index_fold_{fold}.txt"
    test_path = f"{base_path}astra_test_index_fold_{fold}.txt"
    return (
        np.loadtxt(train_path, dtype=int),
        np.loadtxt(test_path, dtype=int)
    )

def create_train_test_matrices(ratings, indexes, train_idx, test_idx):
    """Create train and test matrices from indices"""
    train = np.zeros_like(ratings, dtype=complex)
    test = np.zeros_like(ratings, dtype=complex)
    
    for temp in train_idx:
        train[tuple(indexes[temp])] = ratings[tuple(indexes[temp])]
    for temp in test_idx:
        test[tuple(indexes[temp])] = ratings[tuple(indexes[temp])]
    
    return train, test

def calculate_astrazeneca_validation_accuracy(t, test_idx, indexes, unprocessed, data):
    """Calculate validation accuracy for AstraZeneca dataset"""
    c = 0
    for i in test_idx:
        row, col = indexes[i]
        cell_line = data.index[row]
        combo_name = data.columns[col]
        pred = t[row, col]
        
        combs = unprocessed[
            (unprocessed['cell_line_name'] == cell_line) & 
            (unprocessed['combination_name'] == combo_name)
        ][['ic50', 'new conc_r', 'new conc_c', 'inhibition']]
        
        # Find nearest concentrations
        a = min(combs['new conc_r'].unique(),
                key=lambda x: abs(pred.real - x))
        b = min(combs['new conc_c'].unique(),
                key=lambda x: abs(pred.imag - x))
        
        # Check if prediction falls in valid inhibition range (44-56%)
        pred_conc = combs[
            (combs['new conc_r'] == a) & 
            (combs['new conc_c'] == b)]
        if pred_conc[(pred_conc['inhibition'] >= 44) & (pred_conc['inhibition'] <= 56)].shape[0] > 0:
            c += 1
            
    return c / len(test_idx)

def run_astrazeneca_cross_validation(data, unprocessed, n_folds=5, pmf_params=None):
    """
    Run full cross-validation pipeline for AstraZeneca dataset
    
    Parameters:
    -----------
    data : pd.DataFrame
        Processed drug-cell matrix (complex concentrations)
    unprocessed : pd.DataFrame
        Original raw dataset with all measurements
    n_folds : int
        Number of cross-validation folds
    pmf_params : dict
        Parameters for PMF model training
    """
    if pmf_params is None:
        pmf_params = {
            'n_features': 700,
            'learning_rate': 0.001,
            'n_epochs': 300,
            'reg_param': 0.001
                            }
    
    ratings = data.to_numpy(dtype='complex')
    result_np = ratings.copy()
    indexes = np.argwhere(result_np != complex(0, 0))
    
    for fold in range(n_folds):
        print(f'\nTraining on fold: {fold}')
        
        # Load fold data
        train_idx, test_idx = load_astrazeneca_fold_indices(fold)
        print(f"Train samples: {len(train_idx)}, Test samples: {len(test_idx)}")
        
        # Create train/test matrices
        train, test = create_train_test_matrices(ratings, indexes, train_idx, test_idx)
        
        # Run PMF with bias
        U, V, b_u, b_v, mae, mse, mae_t, mse_t = pmf_with_bias(
            train, test, 
            pmf_params['n_features'],
            pmf_params['learning_rate'],
            pmf_params['n_epochs'],
            pmf_params['reg_param']
                            )
        
        # Calculate predictions
        t = np.dot(U, V.T) + b_u[:, np.newaxis] + b_v[np.newaxis, :]
        
        # Calculate validation accuracy
        val_acc = calculate_astrazeneca_validation_accuracy(t, test_idx, indexes, unprocessed, data)
        
        # Print results
        print(f'Final MAE: {mae:.4f}')
        print(f'Final MSE: {mse:.4f}')
        print(f'Final MAE (transformed): {mae_t:.4f}')
        print(f'Final MSE (transformed): {mse_t:.4f}')
        print(f'Validation accuracy: {val_acc:.4f}')

# Example usage:
# run_astrazeneca_cross_validation(result, unprocessed, n_folds=5)

In [11]:
run_astrazeneca_cross_validation(data, unprocessed)


Training on fold: 0
Train samples: 4380, Test samples: 1096
Iteration : 0
Iteration : 10
Iteration : 20
Iteration : 30
Iteration : 40
Iteration : 50
Iteration : 60
Iteration : 70
Iteration : 80
Iteration : 90
Iteration : 100
Iteration : 110
Iteration : 120
Iteration : 130
Iteration : 140
Iteration : 150
Iteration : 160
Iteration : 170
Iteration : 180
Iteration : 190
Iteration : 200
Iteration : 210
Iteration : 220
Iteration : 230
Iteration : 240
Iteration : 250
Iteration : 260
Iteration : 270
Iteration : 280
Iteration : 290
Final MAE: 2.7283
Final MSE: 9.8400
Final MAE (transformed): 1.2929
Final MSE (transformed): 5.1131
Validation accuracy: 0.1953

Training on fold: 1
Train samples: 4381, Test samples: 1095
Iteration : 0
Iteration : 10
Iteration : 20
Iteration : 30
Iteration : 40
Iteration : 50
Iteration : 60
Iteration : 70
Iteration : 80
Iteration : 90
Iteration : 100
Iteration : 110
Iteration : 120
Iteration : 130
Iteration : 140
Iteration : 150
Iteration : 160
Iteration : 170
Iter