In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold

In [32]:
# Loading Data into a DataFrame
ic50_path = 'data/GDSC2_fitted_dose_response.csv'
cle_path = 'data/cell_line_expressions.csv'

ic50_df = pd.read_csv(ic50_path)
cle_df = pd.read_csv(cle_path)

# Only take the necessary columns
ic50_df_filtered = ic50_df[['CELL_LINE_NAME', 'LN_IC50', 'DRUG_ID']]

# Filter the DataFrame for DRUG_ID = 1003
ic50_df_filtered = ic50_df_filtered[ic50_df_filtered['DRUG_ID'] == 1003]

# Perform a iner join on CELL_LINE_NAME between filtered_ic50_df and cle_df
ic50_df_filtered['CELL_LINE_NAME'] = ic50_df_filtered['CELL_LINE_NAME'].str.lower()
cle_df['Cell_Line'] = cle_df['Cell_Line'].str.lower()
merged_df = pd.merge(ic50_df_filtered, cle_df, how='inner', left_on='CELL_LINE_NAME', right_on='Cell_Line')
merged_df = merged_df.drop(columns=['DRUG_ID', 'CELL_LINE_NAME', 'Cell_Line'])

In [33]:
# Split the features and output
final_features = merged_df.drop(columns=['LN_IC50'])
final_output = merged_df['LN_IC50']

In [10]:
def number_of_components(df):
    """
    Determines the minimum number of principal components required to retain 95% of the variance in the data.

    Parameters:
    df (DataFrame): Scaled Input Features where each row is a sample and each column is a feature.

    Returns:
    tuple: A tuple containing:
        - n_components (int): The number of principal components required to retain 95% of the variance.
        - cumulative_explained_variance (float): The actual cumulative explained variance achieved with the selected number of components.
    """
    pca = PCA()
    pca.fit(df)

    # Calculate the explained variance ratio
    explained_variance_ratio = pca.explained_variance_ratio_

    # Calculate the cumulative explained variance
    cumulative_explained_variance = np.cumsum(explained_variance_ratio)

    # Find the number of components for 95% variance
    n_components = np.argmax(cumulative_explained_variance >= 0.95) + 1

    return (n_components, cumulative_explained_variance[n_components-1])

In [11]:
def pca_transform(df, n_components):
    """
    Applies PCA to reduce the dimensionality of a DataFrame.

    Parameters:
    df (DataFrame): Scaled Input Features.
    n_components (int): The number of components required to retain 95% of variance.

    Returns:
    DataFrame: Returns a pd.DataFrame with the PCA transformation.
    """
    pca = PCA(n_components=n_components)
    pca.fit(df)
    df_pca = pca.transform(df)

    return pd.DataFrame(df_pca, columns=[f"PC{i+1}" for i in range(n_components)])

In [None]:
# Determining the number of components needed for PCA for the final data
(n_components, cumulative_explained_variance) = number_of_components(final_features)

# Display results
print(f"Number of components to retain 95% of variance: {n_components}")
print(
    f"Cumulative explained variance for the selected components: {cumulative_explained_variance}"
)

In [None]:
def kfold_pca_rfr(n_components = [50, 100, 200, 500], n_estimators = [50, 100, 200, 500]):
    """
    Performs KFold cross validation for the given number of components in RFR and number of estimators in PCA

    Parameters:
    n_components (array): Scaled Input Features where each row is a sample and each column is a feature.

    Returns:
    tuple: A tuple containing:
        - n_components (int): The number of principal components required to retain 95% of the variance.
        - cumulative_explained_variance (float): The actual cumulative explained variance achieved with the selected number of components.
    """
    mses = []
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
    for n_component in n_components:
        final_pca = pca_transform(final_features, n_component)
        X_train, X_test, y_train, y_test = train_test_split(final_pca, final_output, test_size=0.2, random_state=42)
        
        for n_estimator in n_estimators:
            kf_mses = []

            for train_index, val_index in kf.split(X_train, y_train):
                X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
                y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

                rf_regressor = RandomForestRegressor(n_estimators=n_estimator, random_state=42)
                rf_regressor.fit(X_train_fold, y_train_fold)
                y_pred = rf_regressor.predict(X_val_fold)
                kf_mses.append(mean_squared_error(y_val_fold, y_pred))

            avg_val_mse = np.mean(kf_mses)
            mses.append([n_component, n_estimator, avg_val_mse])

    return mses

In [46]:
mses = kfold_pca_rfr()

In [None]:
results = np.array(mses)
min_mse_index = np.argmin(results[:, 2])
best_n_component = int(results[min_mse_index, 0])
best_n_estimator = results[min_mse_index, 1]
best_mse = results[min_mse_index, 2]
print(f"Best number of components: {best_n_component}")
print(f"Best number of estimators: {best_n_estimator}")
print(f"Best validation MSE: {best_mse}")

Best number of components: 200
Best number of estimators: 50.0
Best validation MSE: 2.379761169002068
