In [1]:
import pandas as pd 
import numpy as np 
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

unprocessed = pd.read_csv('../Datasets/NCI/NCI/NCI-ALMANAC_subset_555300.csv')
unprocessed.dropna(inplace=True)
unprocessed = unprocessed[(unprocessed['Conc1'] > 0) & (unprocessed['Conc2'] > 0)]
unprocessed['ic50'] = unprocessed['PercentageGrowth'].apply(lambda x: 1 if x>=45 and x<=55 else 0 )
unprocessed['combination_name'] = unprocessed['Drug1'] + ' & ' + unprocessed['Drug2']
unprocessed = unprocessed[unprocessed['combination_name'].isin(unprocessed[unprocessed['ic50']==1]['combination_name'])]

unprocessed['new Conc1'] = np.log2(unprocessed['Conc1'])
unprocessed['new Conc2'] = np.log2(unprocessed['Conc2'])

unprocessed.loc[unprocessed['new Conc1'] >= 0, 'new Conc1'] +=1
unprocessed.loc[unprocessed['new Conc1'] < 0, 'new Conc1'] -=1
unprocessed.loc[unprocessed['new Conc2'] >= 0, 'new Conc2'] +=1
unprocessed.loc[unprocessed['new Conc2'] < 0, 'new Conc2'] -=1

### Creating Drug-cell line matrix

In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm

def process_drug_combination_data(
    df,
    cell_line_col='CellLine',
    combo_name_col='combination_name',
    drugA_col='Conc1',
    drugB_col='Conc2',
    new_drugA_col='new Conc1',
    new_drugB_col='new Conc2',
    ic50_col='ic50',
    threshold=0.3,
    random_state=None
):
    """
    Process drug combination data to create complex concentration matrices.
    
    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe containing drug combination data
    cell_line_col : str
        Column name for cell line identifiers
    combo_name_col : str
        Column name for combination identifiers
    drugA_col, drugB_col : str
        Columns for original drug concentrations
    new_drugA_col, new_drugB_col : str
        Columns for transformed drug concentrations
    ic50_col : str
        Column indicating IC50 values (1 = active)
    threshold : float (0-1)
        Threshold for dropping columns with too many missing values
    random_state : int, optional
        Random seed for reproducibility
    
    Returns:
    --------
    tuple of (pd.DataFrame, pd.DataFrame)
        Returns two dataframes:
        1. Processed matrix with new concentrations
        2. Original concentration matrix
    """
    
    # Create mappings for drug concentrations
    active_drugs = df[df[ic50_col] == 1]
    l1 = active_drugs[drugA_col].value_counts().index.tolist()
    l2 = active_drugs[drugB_col].value_counts().index.tolist()
    
    l1_mapping = {value: idx for idx, value in enumerate(l1)}
    l2_mapping = {value: idx for idx, value in enumerate(l2)}
    
    # Create all possible cell line × combination pairs
    names = pd.merge(
        df[cell_line_col].drop_duplicates(),
        df[combo_name_col].drop_duplicates(),
        how='cross'
    )
    
    # Initialize result dictionaries
    new_conc_matrix = {}
    orig_conc_matrix = {}
    
    # Process each cell line and combination
    for _, row in tqdm(names.iterrows(), total=len(names)):
        cell_line = row[cell_line_col]
        combo = row[combo_name_col]
        
        if cell_line not in new_conc_matrix:
            new_conc_matrix[cell_line] = []
            orig_conc_matrix[cell_line] = []
        
        # Filter relevant data
        mask = (
            (df[cell_line_col] == cell_line) & 
            (df[combo_name_col] == combo) & 
            (df[ic50_col] == 1)
        )
        temp = df[mask]
        
        if temp.empty:
            new_conc_matrix[cell_line].append(np.nan)
            orig_conc_matrix[cell_line].append(np.nan)
        else:
            # Sort by drug concentrations
            temp['sorting_key'] = temp.apply(
                lambda x: (l1_mapping[x[drugA_col]], l2_mapping[x[drugB_col]]), 
                axis=1
            )
            temp = temp.sort_values('sorting_key').iloc[0]
            
            # Store as complex numbers
            new_conc_matrix[cell_line].append(
                complex(temp[new_drugA_col], temp[new_drugB_col])
            )
            orig_conc_matrix[cell_line].append(
                complex(temp[drugA_col], temp[drugB_col])
            )
    
    # Convert to dataframes
    cols = unprocessed['combination_name'].unique().tolist()
    
    new_df = pd.DataFrame.from_dict(new_conc_matrix, orient='index', columns=cols)
    orig_df = pd.DataFrame.from_dict(orig_conc_matrix, orient='index', columns=cols)
    
    # Apply threshold filtering
    threshold_count = int(len(new_df) * threshold)
    
    new_df = new_df.dropna(thresh=threshold_count, axis=1)
    orig_df = orig_df.dropna(thresh=threshold_count, axis=1)
    
    return new_df, orig_df


# Example usage:
# new_conc, orig_conc = process_drug_combination_data(unprocessed)
# new_conc.to_csv('drug_cell_log_matrix.csv')
# orig_conc.to_csv('drug_cell_matrix.csv')

In [4]:
result, result2 = process_drug_combination_data(unprocessed)

100%|██████████| 34920/34920 [14:36<00:00, 39.83it/s]


In [5]:
result

Unnamed: 0,Amifostine & Chlorambucil,Hydroxyurea & Chlorambucil,Ifosfamide & Gefitinib,Amifostine & Thioguanine,Chlorambucil & Gefitinib,Chlorambucil & Melphalan,Hydroxyurea & Gefitinib,Hydroxyurea & Thioguanine,Estramustine phosphate sodium & Thioguanine,Chlorambucil & Teniposide,...,Floxuridine & Ruxolitinib,Floxuridine & Vinorelbine tartrate,Floxuridine & Crizotinib,Floxuridine & Axitinib,Floxuridine & Vandetanib,Floxuridine & Vismodegib,Floxuridine & Megestrol acetate,Floxuridine & Lenalidomide,Floxuridine & Dacarbazine,Floxuridine & Lomustine
786-0,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,-17.609640-17.609640j,NaN+ 0.000000j,-20.931569-17.609640j,NaN+ 0.000000j,NaN+ 0.000000j,-20.931569-17.609640j,NaN+ 0.000000j,...,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,-24.768070-16.287712j,NaN+ 0.000000j,-21.446142-19.346606j,NaN+ 0.000000j,NaN+ 0.000000j
A498,-20.931569-14.287712j,NaN+ 0.000000j,-17.609640-17.609640j,NaN+ 0.000000j,-17.609640-17.609640j,-14.287712-20.931569j,NaN+ 0.000000j,NaN+ 0.000000j,-14.287712-17.609640j,NaN+ 0.000000j,...,-24.768070-17.609640j,-24.768070-24.253497j,-24.768070-20.931569j,-24.768070-24.253497j,-24.768070-23.253497j,-24.768070-19.609640j,-21.446142-19.346606j,-24.768070-16.024678j,-24.768070-15.287712j,-24.768070-21.931569j
A549/ATCC,NaN+ 0.000000j,NaN+ 0.000000j,-20.931569-17.609640j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,-14.287712-17.609640j,NaN+ 0.000000j,NaN+ 0.000000j,-17.609640-24.253497j,...,-24.768070-17.609640j,NaN+ 0.000000j,-24.768070-20.931569j,-21.446142-24.253497j,-24.768070-19.931569j,-24.768070-16.287712j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,-24.768070-15.287712j
ACHN,NaN+ 0.000000j,-14.287712-17.609640j,NaN+ 0.000000j,NaN+ 0.000000j,-17.609640-20.931569j,NaN+ 0.000000j,NaN+ 0.000000j,-14.287712-20.931569j,NaN+ 0.000000j,-17.609640-24.253497j,...,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,-28.089998-19.931569j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,-21.446142-15.287712j
CCRF-CEM,NaN+ 0.000000j,-17.609640-17.609640j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,-17.609640-20.931569j,-14.287712-17.609640j,-14.287712-24.253497j,NaN+ 0.000000j,-17.609640-27.575425j,...,-28.089998-24.253497j,-28.089998-24.253497j,-28.089998-20.931569j,-28.089998-24.253497j,-28.089998-23.253497j,NaN+ 0.000000j,-28.089998-16.024678j,-28.089998-16.024678j,-28.089998-15.287712j,-28.089998-18.609640j
COLO 205,-17.609640-14.287712j,-14.287712-20.931569j,NaN+ 0.000000j,NaN+ 0.000000j,-17.609640-17.609640j,NaN+ 0.000000j,NaN+ 0.000000j,-14.287712-20.931569j,NaN+ 0.000000j,-17.609640-24.253497j,...,-28.089998-17.609640j,NaN+ 0.000000j,-28.089998-20.931569j,-28.089998-20.931569j,-28.089998-23.253497j,-28.089998-16.287712j,-28.089998-16.024678j,-28.089998-16.024678j,NaN+ 0.000000j,NaN+ 0.000000j
DU-145,-20.931569-14.287712j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,-14.287712-24.253497j,-14.287712-20.931569j,-17.609640-17.609640j,NaN+ 0.000000j,-14.287712-20.931569j,-14.287712-27.575425j,...,-24.768070-20.931569j,-24.768070-24.253497j,-24.768070-24.253497j,-24.768070-20.931569j,-24.768070-23.253497j,-24.768070-16.287712j,-24.768070-19.346606j,-24.768070-16.024678j,-24.768070-15.287712j,-24.768070-15.287712j
EKVX,NaN+ 0.000000j,NaN+ 0.000000j,-17.609640-17.609640j,-20.931569-17.609640j,-17.609640-17.609640j,NaN+ 0.000000j,-17.609640-17.609640j,-16.024678-17.609640j,-17.609640-17.609640j,NaN+ 0.000000j,...,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,-24.768070-19.931569j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j
HCC-2998,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,-14.287712-17.609640j,NaN+ 0.000000j,-14.287712-17.609640j,-14.287712-20.931569j,-20.931569-17.609640j,NaN+ 0.000000j,...,-24.768070-20.931569j,-28.089998-24.253497j,-24.768070-24.253497j,-24.768070-20.931569j,-28.089998-19.931569j,-28.089998-22.931569j,-28.089998-16.024678j,NaN+ 0.000000j,-28.089998-15.287712j,-28.089998-18.609640j
HCT-15,-17.609640-14.287712j,NaN+ 0.000000j,-17.609640-17.609640j,NaN+ 0.000000j,NaN+ 0.000000j,-20.931569-17.609640j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,-14.287712-27.575425j,...,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j


In [6]:
result.to_csv('../Datasets/NCI/Drug_CellLine_matrix_logartihm.csv')
result2.to_csv('../Datasets/NCI/Drug_CellLine_matrix.csv')

In [7]:
unprocessed

Unnamed: 0,Conc1,Conc2,Drug1,Drug2,CellLine,PercentageGrowth,ic50,combination_name,new Conc1,new Conc2
0,1.000000e-04,1.000000e-04,Ifosfamide,Chlorambucil,786-0,53.552,1,Ifosfamide & Chlorambucil,-14.287712,-14.287712
2,1.000000e-04,1.000000e-04,Amifostine,Chlorambucil,786-0,26.408,0,Amifostine & Chlorambucil,-14.287712,-14.287712
3,1.000000e-04,1.000000e-04,Chlorambucil,Exemestane,786-0,-56.172,0,Chlorambucil & Exemestane,-14.287712,-14.287712
4,1.000000e-04,1.000000e-04,Hydroxyurea,Chlorambucil,786-0,13.685,0,Hydroxyurea & Chlorambucil,-14.287712,-14.287712
5,1.000000e-04,1.000000e-04,Hydroxyurea,Exemestane,786-0,-52.717,0,Hydroxyurea & Exemestane,-14.287712,-14.287712
...,...,...,...,...,...,...,...,...,...,...
333175,7.000000e-09,5.000000e-05,Floxuridine,Lomustine,SK-MEL-28,52.286,1,Floxuridine & Lomustine,-28.089998,-15.287712
333176,7.000000e-09,5.000000e-06,Floxuridine,Dacarbazine,SK-MEL-28,112.215,0,Floxuridine & Dacarbazine,-28.089998,-18.609640
333177,7.000000e-09,5.000000e-06,Floxuridine,Lomustine,SK-MEL-28,98.742,0,Floxuridine & Lomustine,-28.089998,-18.609640
333178,7.000000e-09,5.000000e-07,Floxuridine,Dacarbazine,SK-MEL-28,105.997,0,Floxuridine & Dacarbazine,-28.089998,-21.931569


In [8]:
result

Unnamed: 0,Amifostine & Chlorambucil,Hydroxyurea & Chlorambucil,Ifosfamide & Gefitinib,Amifostine & Thioguanine,Chlorambucil & Gefitinib,Chlorambucil & Melphalan,Hydroxyurea & Gefitinib,Hydroxyurea & Thioguanine,Estramustine phosphate sodium & Thioguanine,Chlorambucil & Teniposide,...,Floxuridine & Ruxolitinib,Floxuridine & Vinorelbine tartrate,Floxuridine & Crizotinib,Floxuridine & Axitinib,Floxuridine & Vandetanib,Floxuridine & Vismodegib,Floxuridine & Megestrol acetate,Floxuridine & Lenalidomide,Floxuridine & Dacarbazine,Floxuridine & Lomustine
786-0,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,-17.609640-17.609640j,NaN+ 0.000000j,-20.931569-17.609640j,NaN+ 0.000000j,NaN+ 0.000000j,-20.931569-17.609640j,NaN+ 0.000000j,...,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,-24.768070-16.287712j,NaN+ 0.000000j,-21.446142-19.346606j,NaN+ 0.000000j,NaN+ 0.000000j
A498,-20.931569-14.287712j,NaN+ 0.000000j,-17.609640-17.609640j,NaN+ 0.000000j,-17.609640-17.609640j,-14.287712-20.931569j,NaN+ 0.000000j,NaN+ 0.000000j,-14.287712-17.609640j,NaN+ 0.000000j,...,-24.768070-17.609640j,-24.768070-24.253497j,-24.768070-20.931569j,-24.768070-24.253497j,-24.768070-23.253497j,-24.768070-19.609640j,-21.446142-19.346606j,-24.768070-16.024678j,-24.768070-15.287712j,-24.768070-21.931569j
A549/ATCC,NaN+ 0.000000j,NaN+ 0.000000j,-20.931569-17.609640j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,-14.287712-17.609640j,NaN+ 0.000000j,NaN+ 0.000000j,-17.609640-24.253497j,...,-24.768070-17.609640j,NaN+ 0.000000j,-24.768070-20.931569j,-21.446142-24.253497j,-24.768070-19.931569j,-24.768070-16.287712j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,-24.768070-15.287712j
ACHN,NaN+ 0.000000j,-14.287712-17.609640j,NaN+ 0.000000j,NaN+ 0.000000j,-17.609640-20.931569j,NaN+ 0.000000j,NaN+ 0.000000j,-14.287712-20.931569j,NaN+ 0.000000j,-17.609640-24.253497j,...,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,-28.089998-19.931569j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,-21.446142-15.287712j
CCRF-CEM,NaN+ 0.000000j,-17.609640-17.609640j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,-17.609640-20.931569j,-14.287712-17.609640j,-14.287712-24.253497j,NaN+ 0.000000j,-17.609640-27.575425j,...,-28.089998-24.253497j,-28.089998-24.253497j,-28.089998-20.931569j,-28.089998-24.253497j,-28.089998-23.253497j,NaN+ 0.000000j,-28.089998-16.024678j,-28.089998-16.024678j,-28.089998-15.287712j,-28.089998-18.609640j
COLO 205,-17.609640-14.287712j,-14.287712-20.931569j,NaN+ 0.000000j,NaN+ 0.000000j,-17.609640-17.609640j,NaN+ 0.000000j,NaN+ 0.000000j,-14.287712-20.931569j,NaN+ 0.000000j,-17.609640-24.253497j,...,-28.089998-17.609640j,NaN+ 0.000000j,-28.089998-20.931569j,-28.089998-20.931569j,-28.089998-23.253497j,-28.089998-16.287712j,-28.089998-16.024678j,-28.089998-16.024678j,NaN+ 0.000000j,NaN+ 0.000000j
DU-145,-20.931569-14.287712j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,-14.287712-24.253497j,-14.287712-20.931569j,-17.609640-17.609640j,NaN+ 0.000000j,-14.287712-20.931569j,-14.287712-27.575425j,...,-24.768070-20.931569j,-24.768070-24.253497j,-24.768070-24.253497j,-24.768070-20.931569j,-24.768070-23.253497j,-24.768070-16.287712j,-24.768070-19.346606j,-24.768070-16.024678j,-24.768070-15.287712j,-24.768070-15.287712j
EKVX,NaN+ 0.000000j,NaN+ 0.000000j,-17.609640-17.609640j,-20.931569-17.609640j,-17.609640-17.609640j,NaN+ 0.000000j,-17.609640-17.609640j,-16.024678-17.609640j,-17.609640-17.609640j,NaN+ 0.000000j,...,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,-24.768070-19.931569j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j
HCC-2998,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,-14.287712-17.609640j,NaN+ 0.000000j,-14.287712-17.609640j,-14.287712-20.931569j,-20.931569-17.609640j,NaN+ 0.000000j,...,-24.768070-20.931569j,-28.089998-24.253497j,-24.768070-24.253497j,-24.768070-20.931569j,-28.089998-19.931569j,-28.089998-22.931569j,-28.089998-16.024678j,NaN+ 0.000000j,-28.089998-15.287712j,-28.089998-18.609640j
HCT-15,-17.609640-14.287712j,NaN+ 0.000000j,-17.609640-17.609640j,NaN+ 0.000000j,NaN+ 0.000000j,-20.931569-17.609640j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,-14.287712-27.575425j,...,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j,NaN+ 0.000000j


### Matrix Factorization

In [13]:
data = pd.read_csv('../Datasets/NCI/Drug_CellLine_matrix_logartihm.csv', index_col='Unnamed: 0')
data.fillna(complex(0,0), inplace=True)
main_concs = pd.read_csv('../Datasets/NCI/Drug_CellLine_matrix.csv', index_col='Unnamed: 0')
main_concs.fillna(complex(0,0), inplace=True)

In [14]:
## different mf method 

import numpy as np
import numpy.linalg as la
from tqdm import tqdm

def pmf_with_bias(X, test, k, learning_rate, num_iterations, lambda_reg, sigma_sq):
    """
    Probabilistic Matrix Factorization with user and item bias using stochastic gradient descent for complex numbers.
    
    Arguments:
    X -- Input matrix of shape (m, n) with complex numbers
    test -- Test matrix of the same shape as X with complex numbers
    k -- Number of latent features
    learning_rate -- Learning rate for gradient descent
    num_iterations -- Number of iterations for the optimization
    lambda_reg -- Regularization strength
    sigma_sq -- Variance of the complex Gaussian distribution
    
    Returns:
    U -- Matrix of shape (m, k) representing the user latent factors (complex)
    V -- Matrix of shape (n, k) representing the item latent factors (complex)
    b_u -- Vector of shape (m,) representing the user bias terms (real)
    b_v -- Vector of shape (n,) representing the item bias terms (real)
    """
    m, n = X.shape

    # Initialize U, V, b_u, and b_v with random values
    U = np.random.normal(scale=1.0 / k, size=(m, k)).astype(complex)
    V = np.random.normal(scale=1.0 / k, size=(n, k)).astype(complex)
    b_u = np.random.normal(scale=1.0 / k, size= m).astype(complex) 
    b_v = np.random.normal(scale=1.0 / k, size=n).astype(complex)
    
    for iteration in range(num_iterations):
        for i in range(m):
            for j in range(n):
                if X[i,j]!=0:
                    prediction = np.dot(U[i, :], V[j, :].conj()) + b_u[i] + b_v[j]
                    error = X[i, j] - prediction
                    grad_U = -error * V[j, :] + lambda_reg * U[i, :]
                    grad_V = -error * U[i, :] + lambda_reg * V[j, :]
                    grad_b_u = -error + lambda_reg * b_u[i]
                    grad_b_v = -error + lambda_reg * b_v[j]

                    U[i, :] -= learning_rate * grad_U
                    V[j, :] -= learning_rate * grad_V
                    b_u[i] -= learning_rate * grad_b_u
                    b_v[j] -= learning_rate * grad_b_v

                    prediction = np.clip(prediction, -14, 9)

        if iteration % 10 == 0:
            print('Iteration : ' + str(iteration))
            print('MAE train error')
            mask = np.nonzero(X)
            print(np.mean(abs(X[mask]-(np.dot(U, V.T)+b_u[:, np.newaxis]+b_v[np.newaxis, :])[mask])))
            mask = np.nonzero(test)
            print('MAE test error')
            print(np.mean(abs(test[mask]-(np.dot(U, V.T)+b_u[:, np.newaxis]+b_v[np.newaxis, :])[mask])))

            print('MSE train error')
            mask = np.nonzero(X)
            print(np.mean(pow(abs(X[mask]-(np.dot(U, V.T)+b_u[:, np.newaxis]+b_v[np.newaxis, :])[mask]), 2)))
            mask = np.nonzero(test)
            print('MSE test error')
            print(np.mean(pow(abs(test[mask]-(np.dot(U, V.T)+b_u[:, np.newaxis]+b_v[np.newaxis, :])[mask]), 2)))

        
    return U, V, b_u, b_v

In [15]:
ratings = data.to_numpy(dtype='complex')
train = ratings.copy()
test = ratings.copy()

U, V, b_u, b_v = pmf_with_bias(train, test, 700, 0.001, 300, 0.0001, 0.01)

Iteration : 0
MAE train error
27.43412561799554
MAE test error
27.43412561799554
MSE train error
771.0599448829356
MSE test error
771.0599448829356
Iteration : 10
MAE train error
7.536262404034754
MAE test error
7.536262404034754
MSE train error
70.43712411097647
MSE test error
70.43712411097647
Iteration : 20
MAE train error
6.520965032693089
MAE test error
6.520965032693089
MSE train error
57.98811155585687
MSE test error
57.98811155585687
Iteration : 30
MAE train error
4.7545875009976575
MAE test error
4.7545875009976575
MSE train error
31.50739434453208
MSE test error
31.50739434453208
Iteration : 40
MAE train error
3.894308579503815
MAE test error
3.894308579503815
MSE train error
20.491880824474393
MSE test error
20.491880824474393
Iteration : 50
MAE train error
3.453229691437025
MAE test error
3.453229691437025
MSE train error
15.58098607497302
MSE test error
15.58098607497302
Iteration : 60
MAE train error
3.19938680600936
MAE test error
3.19938680600936
MSE train error
13.0978

In [18]:
from sklearn.model_selection import KFold
from itertools import product


ratings = data.to_numpy(dtype='complex')
cv = KFold(n_splits=5, shuffle=True, random_state=42)
count = 1
result_np = ratings.copy()
indexes = np.argwhere(result_np != complex(0,0))
fold = 0

for train_index, test_index in cv.split(indexes):
    np.savetxt(f'train_index_fold_{fold}.txt', train_index, fmt='%d')
    np.savetxt(f'test_index_fold_{fold}.txt', test_index, fmt='%d')
    fold +=1

In [None]:
from tqdm import tqdm
for i in range(0, 5):
    f = open(f'../Datasets/NCI/NCI/test_index_fold_{i}.txt')
    test_index = f.readlines()
    temp = main_concs.to_numpy(dtype='complex')
    indexes = np.where(temp!=complex(0,0))
    test_indexes = []

    for j in tqdm(test_index):
        j = int(j.replace('\n', ''))
        val = complex(main_concs.iloc[indexes[0][j], indexes[1][j]])
        #test_indexes.append(unprocessed[(unprocessed['combination_name']==main_concs.columns[indexes[1][j]]) & (unprocessed['CellLine']==main_concs.index[indexes[0][j]]) & (unprocessed['Conc1']==val.real) & (unprocessed['Conc2']==val.imag)].index.values[0])
        test_indexes.extend(unprocessed[(unprocessed['combination_name']==main_concs.columns[indexes[1][i]]) & (unprocessed['CellLine']==main_concs.index[indexes[0][i]])].index.values)
        #print(main_concs.index[indexes[0][i]], main_concs.columns[indexes[1][i]])
        #print(complex(main_concs.iloc[indexes[0][i], indexes[1][i]]).real)
        #break
    test_df = unprocessed[unprocessed.index.isin(test_indexes)]
    test_df.to_csv(f'../Datasets/NCI/NCI/test_fold_{i}.csv', index=False)
    test_df = unprocessed[~unprocessed.index.isin(test_indexes)]
    test_df.to_csv(f'../Datasets/NCI/NCI/train_fold_{i}.csv', index=False)

### cross validation

In [32]:
import re
import numpy as np
import numpy.linalg as la
from tqdm import tqdm


def revert_complex_transformation(z):
    """
    Revert a complex number that was transformed by:
    1. Taking log2 of original values
    2. Adding ±1 to real/imaginary parts
    """
    # Subtract the added ±1 first
    real_part = z.real - 1 if z.real >= 1 else z.real + 1
    imag_part = z.imag - 1 if z.imag >= 1 else z.imag + 1
    
    # Reverse the log2 transformation
    original_real = np.power(2, real_part)
    original_imag = np.power(2, imag_part)
    
    return complex(original_real, original_imag)


def pmf_with_bias(X: np.array, test: np.array, k: int, learning_rate: float, num_iterations: int, lambda_reg: float):
    """
    Probabilistic Matrix Factorization with user and item bias using stochastic gradient descent for complex numbers.
    
    Arguments:
    X -- Input matrix of shape (m, n) with complex numbers
    test -- Test matrix of the same shape as X with complex numbers
    k -- Number of latent features
    learning_rate -- Learning rate for gradient descent
    num_iterations -- Number of iterations for the optimization
    lambda_reg -- Regularization strength
    sigma_sq -- Variance of the complex Gaussian distribution
    
    Returns:
    U -- Matrix of shape (m, k) representing the user latent factors (complex)
    V -- Matrix of shape (n, k) representing the item latent factors (complex)
    b_u -- Vector of shape (m,) representing the user bias terms (real)
    b_v -- Vector of shape (n,) representing the item bias terms (real)
    """
    m, n = X.shape

    # Initialize U, V, b_u, and b_v with random values
    U = np.random.normal(scale=1.0 / k, size=(m, k)).astype(complex)
    V = np.random.normal(scale=1.0 / k, size=(n, k)).astype(complex)
    b_u = np.random.normal(scale=1.0 / k, size= m).astype(complex) 
    b_v = np.random.normal(scale=1.0 / k, size=n).astype(complex)
    
    for iteration in range(num_iterations):
        for i in range(m):
            for j in range(n):
                if X[i,j]!=0:
                    prediction = np.dot(U[i, :], V[j, :].conj()) + b_u[i] + b_v[j]
                    error = X[i, j] - prediction
                    grad_U = -error * V[j, :] + lambda_reg * U[i, :]
                    grad_V = -error * U[i, :] + lambda_reg * V[j, :]
                    grad_b_u = -error + lambda_reg * b_u[i]
                    grad_b_v = -error + lambda_reg * b_v[j]

                    U[i, :] -= learning_rate * grad_U
                    V[j, :] -= learning_rate * grad_V
                    b_u[i] -= learning_rate * grad_b_u
                    b_v[j] -= learning_rate * grad_b_v

                    prediction = np.clip(prediction, -14, 9)

        if iteration % 10 == 0:
            print('Iteration : ' + str(iteration))
            #print('MAE train error')
            mask = np.nonzero(X)
            #print(np.mean(abs(X[mask]-(np.dot(U, V.T)+b_u[:, np.newaxis]+b_v[np.newaxis, :])[mask])))
            mask = np.nonzero(test)
            #print('MAE test error')
            mae_loss = np.mean(abs(test[mask]-(np.dot(U, V.T)+b_u[:, np.newaxis]+b_v[np.newaxis, :])[mask]))
            #print(mae_loss)

            #print('MSE train error')
            mask = np.nonzero(X)
            #print(np.mean(pow(abs(X[mask]-(np.dot(U, V.T)+b_u[:, np.newaxis]+b_v[np.newaxis, :])[mask]), 2)))
            mask = np.nonzero(test)
            #print('MSE test error')
            mse_loss = np.mean(pow(abs(test[mask]-(np.dot(U, V.T)+b_u[:, np.newaxis]+b_v[np.newaxis, :])[mask]), 2))
            #print(mse_loss)

            pred = np.dot(U, V.T)+b_u[:, np.newaxis]+b_v[np.newaxis, :]
            mask_pred = pred[mask]
            mask_test = test[mask]
            mask_test_real = list()
            mask_pred_real = list()
            error=0
            c=0
            # Apply to your test and prediction masks
            mask_test_original = [revert_complex_transformation(z) for z in mask_test]
            mask_pred_original = [revert_complex_transformation(z) for z in mask_pred]

            mae_loss_transformed = np.mean(np.abs(np.array(mask_test_original) - np.array(mask_pred_original)))
            #print(f'converted MAE transformed : {mae_loss_transformed}')
            mse_loss_transformed = np.sqrt(np.mean(np.power(np.abs(np.array(mask_test_original) - np.array(mask_pred_original)), 2)))
            #print(f'converted MSE transformed : {mse_loss_transformed}')
    return U, V, b_u, b_v, mae_loss, mse_loss, mae_loss_transformed, mse_loss_transformed


def load_fold_indices(fold, base_path="../Datasets/NCI/NCI/"):
    """Load train and test indices for a given fold"""
    train_path = f"{base_path}NCI_train_index_fold_{fold}.txt"
    test_path = f"{base_path}NCI_test_index_fold_{fold}.txt"
    return (
        np.loadtxt(train_path, dtype=int),
        np.loadtxt(test_path, dtype=int)
    )

def create_train_test_matrices(ratings, indexes, train_idx, test_idx):
    """Create train and test matrices from indices"""
    train = np.zeros_like(ratings, dtype=complex)
    test = np.zeros_like(ratings, dtype=complex)
    
    for temp in train_idx:
        train[tuple(indexes[temp])] = ratings[tuple(indexes[temp])]
    for temp in test_idx:
        test[tuple(indexes[temp])] = ratings[tuple(indexes[temp])]
    
    return train, test

def calculate_validation_accuracy(t, test_idx, indexes, unprocessed, data):
    """Calculate validation accuracy by comparing predictions to ground truth"""
    c = 0
    for i in test_idx:
        row, col = indexes[i]
        cell_line = data.index[row]
        combo_name = data.columns[col]
        pred = t[row, col]
        
        combs = unprocessed[
            (unprocessed['CellLine'] == cell_line) & 
            (unprocessed['combination_name'] == combo_name)
        ][['ic50', 'new Conc1', 'new Conc2', 'PercentageGrowth']]
        
        # Find nearest concentrations
        a = min(combs['new Conc1'].unique(), 
                key=lambda t: abs(pred.real - t))
        b = min(combs['new Conc2'].unique(), 
                key=lambda t: abs(pred.imag - t))
        
        # Check if prediction falls in valid range
        pred_conc = combs[
            (combs['new Conc1'] == a) & 
            (combs['new Conc2'] == b)]
        if pred_conc[(pred_conc['PercentageGrowth'] >= 44) & (pred_conc['PercentageGrowth'] <= 56)].shape[0] > 0:
            c += 1
            
    return c / len(test_idx)

def run_cross_validation(data, unprocessed, n_folds=5, pmf_params=None):
    """Run full cross-validation pipeline"""
    if pmf_params is None:
        pmf_params = {
            'n_features': 700,
            'learning_rate': 0.001,
            'n_epochs': 300,
            'reg_param': 0.0001        
            }
    
    ratings = data.to_numpy(dtype='complex')
    result_np = ratings.copy()
    indexes = np.argwhere(result_np != complex(0, 0))
    
    for fold in range(n_folds):
        print(f'\nTraining on fold: {fold}')
        
        # Load fold data
        train_idx, test_idx = load_fold_indices(fold)
        print(f"Train samples: {len(train_idx)}, Test samples: {len(test_idx)}")
        
        # Create train/test matrices
        train, test = create_train_test_matrices(ratings, indexes, train_idx, test_idx)
        
        # Run PMF with bias
        U, V, b_u, b_v, mae, mse, mae_t, mse_t = pmf_with_bias(
            train, test, 
            pmf_params['n_features'], 
            pmf_params['learning_rate'], 
            pmf_params['n_epochs'], 
            pmf_params['reg_param']
        )
        
        # Calculate predictions
        t = np.dot(U, V.T) + b_u[:, np.newaxis] + b_v[np.newaxis, :]
        
        # Calculate validation accuracy
        val_acc = calculate_validation_accuracy(t, test_idx, indexes, unprocessed, data)
        
        # Print results
        print(f'Final MAE: {mae:.4f}')
        print(f'Final MSE: {mse:.4f}')
        print(f'Final MAE (transformed): {mae_t}')
        print(f'Final MSE (transformed): {mse_t}')
        print(f'Validation accuracy: {val_acc:.4f}')

# Example usage:
run_cross_validation(data, unprocessed, n_folds=5)


Training on fold: 0
Train samples: 4153, Test samples: 1039
Iteration : 0
Iteration : 10
Iteration : 20
Iteration : 30
Iteration : 40
Iteration : 50
Iteration : 60
Iteration : 70
Iteration : 80
Iteration : 90
Iteration : 100
Iteration : 110
Iteration : 120
Iteration : 130
Iteration : 140
Iteration : 150
Iteration : 160
Iteration : 170
Iteration : 180
Iteration : 190
Iteration : 200
Iteration : 210
Iteration : 220
Iteration : 230
Iteration : 240
Iteration : 250
Iteration : 260
Iteration : 270
Iteration : 280
Iteration : 290
Final MAE: 3.0147
Final MSE: 12.0236
Final MAE (transformed): 0.0010058697197322588
Final MSE (transformed): 0.022943931546224546
Validation accuracy: 0.3869

Training on fold: 1
Train samples: 4153, Test samples: 1039
Iteration : 0
Iteration : 10
Iteration : 20
Iteration : 30
Iteration : 40
Iteration : 50
Iteration : 60
Iteration : 70
Iteration : 80
Iteration : 90
Iteration : 100
Iteration : 110
Iteration : 120
Iteration : 130
Iteration : 140
Iteration : 150
Iterat