In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_curve, auc, confusion_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
import pycaret
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pycaret.classification import *
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import KernelPCA, PCA

In [2]:
# Load data
def load_data_csv(filepath):
    """Read the CSV at ``filepath`` and return it as a pandas DataFrame.

    ``filepath`` is handed to ``pd.read_csv`` unchanged, with all parser
    options left at their defaults.
    """
    frame = pd.read_csv(filepath)
    return frame

# Preprocess data
def preprocess_data(data, min_class_instances=100, n_label_columns=23):
    """Split a wide table into a feature frame and integer class labels.

    Column layout assumed by the slicing below (once an optional
    'Original_ICD-11' column has been removed): the first column is skipped,
    the last ``n_label_columns`` columns hold one indicator column per class,
    and every column in between is treated as a feature.

    Each row is labelled with the indicator column that holds its largest
    value. Classes with fewer than ``min_class_instances`` rows are discarded
    together with their rows, and the surviving classes are renumbered
    0..k-1 in the order produced by ``value_counts`` (most frequent first).
    The number of surviving classes is printed.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table. It is NOT modified; the work happens on a new frame.
    min_class_instances : int, default 100
        Minimum number of rows a class needs in order to be kept.
    n_label_columns : int, default 23
        Number of trailing indicator columns that encode the class.

    Returns
    -------
    X : pandas.DataFrame
        Feature columns for the kept rows.
    y : pandas.Series
        Renumbered integer class label for each kept row (same index as X).

    Raises
    ------
    ValueError
        If ``n_label_columns`` is smaller than 1.
    """
    if n_label_columns < 1:
        # A value of 0 would turn the slice `-0:` into "all columns".
        raise ValueError('n_label_columns must be at least 1')

    # Drop into a new frame so the caller's DataFrame is left untouched
    # (errors='ignore' makes this a no-op when the column is absent).
    frame = data.drop(columns=['Original_ICD-11'], errors='ignore')

    X = frame.iloc[:, 1:-n_label_columns]
    y = frame.iloc[:, -n_label_columns:]

    # Indicator columns -> position of the column with the largest value.
    # NOTE(review): idxmax picks the first column on ties, so a row whose
    # indicators are all equal (e.g. all zeros) lands in the first class --
    # confirm the input never contains such rows.
    class_labels = y.columns.tolist()
    label_to_position = {label: idx for idx, label in enumerate(class_labels)}
    y = y.idxmax(axis=1).map(label_to_position).astype(int)

    # Keep only classes with enough rows.
    class_counts = y.value_counts()
    valid_classes = class_counts[class_counts >= min_class_instances].index
    keep_rows = y.isin(valid_classes)
    X = X[keep_rows]
    y = y[keep_rows]

    # Renumber the surviving classes contiguously (order follows value_counts).
    renumbering = {old: new for new, old in enumerate(valid_classes)}
    y = y.map(renumbering)

    print(f'Number of classes: {len(valid_classes)}')  # Print number of classes

    return X, y

# Filter features by variance
def filter_features_by_variance(X, low_quantile=0.10, high_quantile=0.99):
    """Keep the columns of ``X`` whose variance lies inside a quantile band.

    The per-column variance is computed with ``X.var()``. A column is kept
    when its variance is between the ``low_quantile`` and ``high_quantile``
    quantiles of all column variances (both bounds inclusive).

    As a side effect a bar chart of every column's variance, with both
    thresholds drawn as dashed lines, is written to 'variance_plot.png';
    the figure is closed afterwards.

    Returns the sub-frame of ``X`` restricted to the kept columns.
    """
    variances = X.var()
    lower_cut = variances.quantile(low_quantile)
    upper_cut = variances.quantile(high_quantile)
    within_band = (variances >= lower_cut) & (variances <= upper_cut)
    features_to_keep = variances[within_band].index

    plt.figure(figsize=(12, 6))
    sns.barplot(x=variances.index, y=variances.values)
    # One dashed horizontal line per threshold: red = lower, blue = upper.
    for cut, fraction, colour in (
        (lower_cut, low_quantile, 'r'),
        (upper_cut, high_quantile, 'b'),
    ):
        plt.axhline(y=cut, color=colour, linestyle='--', label=f'{fraction*100}% Threshold')
    plt.title('Variance of Each Feature')
    plt.xlabel('Features')
    plt.ylabel('Variance')
    plt.xticks(rotation='vertical')
    plt.legend()
    plt.tight_layout()
    plt.savefig('variance_plot.png')  # persist the chart to disk
    plt.close()  # free the figure once it has been written

    return X[features_to_keep]

# Plot data distribution
def plot_histogram(data, title):
    """Save a 50-bin histogram (with KDE) of every value in ``data``.

    All cells of ``data`` are flattened into one array, so the plot shows
    the pooled distribution rather than one histogram per column. The figure
    is written to ``histogram_<title>.png`` and then closed; nothing is
    returned.
    """
    output_file = f'histogram_{title}.png'

    plt.figure(figsize=(12, 6))
    pooled_values = data.values.flatten()
    sns.histplot(pooled_values, bins=50, kde=True)
    plt.title(f'Histogram of Data: {title}')
    plt.xlabel('Value')
    plt.ylabel('Frequency')
    plt.grid(True)
    plt.savefig(output_file)  # persist the chart to disk
    plt.close()  # free the figure once it has been written

# # Plot PCA
# def plot_pca(X, y, title):
#     pca = PCA(n_components=2)
#     X_pca = pca.fit_transform(X)
#     plt.figure(figsize=(12, 6))
#     sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=y, palette='viridis')
#     plt.xlabel('PCA Component 1')
#     plt.ylabel('PCA Component 2')
#     plt.title(f'PCA Plot: {title}')
#     plt.legend()
#     plt.tight_layout()
#     plt.savefig(f'pca_plot_{title}.png')  # Save figure
#     plt.close()  # Close the figure to release memory

# PCA plot function using seaborn
def plot_pca(data, title):
    """Scatter-plot the first two principal components, coloured by class.

    ``data`` must contain a 'class' column; every other column is fed to a
    2-component PCA. The scatter plot is saved as
    ``<title>_distribution.png`` (300 dpi, tight bounding box) and then shown
    with ``plt.show()``. Nothing is returned.
    """
    feature_columns = data.drop(columns=['class'])
    projected = PCA(n_components=2).fit_transform(feature_columns)
    pca_df = pd.DataFrame(
        data={
            'PCA1': projected[:, 0],
            'PCA2': projected[:, 1],
            'class': data['class'],
        }
    )

    plt.figure(figsize=(12, 6))
    sns.scatterplot(data=pca_df, x='PCA1', y='PCA2', hue='class', palette='viridis', alpha=0.7)
    plt.title(f'PCA Plot: {title}')
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.grid(True)
    plt.legend()
    plt.savefig(f'{title}_distribution.png', dpi=300, bbox_inches='tight')
    plt.show()

# Feature selection and balancing
def feature_selection_and_balancing(X, Y, n_features_to_select=200, low_quantile=0.10, high_quantile=0.99, random_state=None):
    """Select features and oversample minority classes.

    Steps, in order:
      1. drop columns whose variance falls outside the
         [``low_quantile``, ``high_quantile``] band
         (``filter_features_by_variance``);
      2. recursive feature elimination with a random forest down to
         ``n_features_to_select`` columns;
      3. save a histogram and a PCA scatter plot of the selected features;
      4. oversample with SMOTE.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix (column labels are used to name the selected features).
    Y : array-like
        Class label per row of ``X``, in the same row order.
    n_features_to_select : int, default 200
        Number of features RFE keeps.
    low_quantile, high_quantile : float
        Variance band forwarded to ``filter_features_by_variance``.
    random_state : int or None, default None
        Seed forwarded to the random forest and to SMOTE. ``None`` keeps the
        previous unseeded behaviour; pass an int for reproducible output.

    Returns
    -------
    X_resampled, y_resampled
        Whatever ``SMOTE.fit_resample`` returns for the selected feature
        array and ``Y``.

    NOTE(review): oversampling is applied to every row passed in. If a
    train/test split is made afterwards, synthetic samples derived from the
    held-out rows end up in the training data -- consider resampling only the
    training portion.
    """
    # Remove features whose variance is outside the quantile band
    X = filter_features_by_variance(X, low_quantile, high_quantile)

    # Feature selection using RFE
    model = RandomForestClassifier(random_state=random_state)
    rfe = RFE(model, n_features_to_select=n_features_to_select)
    X_selected = rfe.fit_transform(X, Y)
    selected_features = X.columns[rfe.get_support()]
    selected_frame = pd.DataFrame(X_selected, columns=selected_features)

    # Plot histogram of selected features
    plot_histogram(selected_frame, 'Selected Features')

    # Plot PCA of selected features. plot_pca takes (data, title) and reads
    # the labels from a 'class' column, so attach Y to a copy of the frame.
    # np.asarray attaches the labels by position: selected_frame has a fresh
    # integer index that would not align with Y's own index.
    pca_input = selected_frame.copy()
    pca_input['class'] = np.asarray(Y)
    plot_pca(pca_input, 'Selected Features')

    # Handle imbalanced data using SMOTE
    smote = SMOTE(random_state=random_state)
    X_resampled, y_resampled = smote.fit_resample(X_selected, Y)

    return X_resampled, y_resampled

# Split data into training and testing sets
def split_data(X, y, test_size=0.2, random_state=42):
    """Return ``train_test_split(X, y)`` for the given test fraction and seed.

    Thin wrapper: ``test_size`` and ``random_state`` are forwarded as keyword
    arguments and the result of ``train_test_split`` is returned unchanged.
    """
    split_options = {'test_size': test_size, 'random_state': random_state}
    return train_test_split(X, y, **split_options)

In [3]:
# Load the expression table, index it by "rid", then derive the feature
# matrix X and the class labels Y (classes need at least 100 rows to be kept).
data = load_data_csv('/data4/msc23104470/simplified_gene_expression_data.csv').set_index("rid")
X, Y = preprocess_data(data, min_class_instances=100)

Number of classes: 12


In [4]:
# df.rename({0: 'class'}, axis=1, inplace=True)

In [5]:
# Show the first row (keyed by its "rid" index) for each distinct label in Y
pd.DataFrame(Y).drop_duplicates()

Unnamed: 0_level_0,0
rid,Unnamed: 1_level_1
AML001_CD34_6H:BRD-K43389675:10,0
AML001_PC3_6H:BRD-A45664787:10,2
ASG001_MCF7_24H:BRD-A13084692-001-05-8:0.08,4
ASG001_MCF7_24H:BRD-A84481105-003-15-6:0.08,1
ASG001_MCF7_24H:BRD-K41260949-001-06-7:0.08,5
ASG001_MCF7_24H:BRD-K71879491-001-17-6:0.08,8
BRAF001_A375_24H:BRD-K92049597-001-14-1:10,6
CPC001_HA1E_24H:BRD-K78692225-001-11-2:10,11
CPC001_HCC515_6H:BRD-K09963420-066-03-4:10,3
CPC001_PC3_6H:BRD-K48970916-001-03-0:10,9


In [6]:
# (rows, columns) of the loaded frame `data`
data.shape

(4062, 1001)

In [7]:
# (rows, columns) of the feature matrix returned by preprocess_data
X.shape

(3807, 977)

In [8]:
# Number of labels returned by preprocess_data (one per row of X)
Y.shape

(3807,)

In [9]:
# Pooled histogram of every value in X before any feature selection.
# plot_histogram saves the figure as 'histogram_Initial.png' and closes it,
# so no plot is rendered inline by this cell.
plot_histogram(X, 'Initial')

findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial, Liberation Sans, Bitstream Vera Sans, sans-serif
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial, Liberation Sans, Bitstream Vera Sans, sans-serif
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial, Liberation Sans, Bitstream Vera Sans, sans-serif
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial, Liberation Sans, Bitstream Vera Sans, sans-serif
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial, Liberation Sans, Bitstream Vera Sans, sans-serif
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial, Liberation Sans, Bitstream Vera Sans, sans-serif
findfont: Generic family 'sans-serif' not found because none of the fo

In [None]:
# Variance filter -> RFE down to 300 features -> SMOTE oversampling.
# NOTE(review): X and Y are rebound to the resampled output, so re-running
# this cell on its own feeds already-selected, already-oversampled data back
# in; re-run from the load/preprocess cell instead.
X, Y = feature_selection_and_balancing(X, Y, n_features_to_select=300)

findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial, Liberation Sans, Bitstream Vera Sans, sans-serif
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial, Liberation Sans, Bitstream Vera Sans, sans-serif
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial, Liberation Sans, Bitstream Vera Sans, sans-serif
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial, Liberation Sans, Bitstream Vera Sans, sans-serif
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial, Liberation Sans, Bitstream Vera Sans, sans-serif
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial, Liberation Sans, Bitstream Vera Sans, sans-serif
findfont: Generic family 'sans-serif' not found because none of the fo

In [None]:
# Combine X and Y into final_data (features first, label column 'class' last).
# The labels are attached by position from Y's raw values:
#   * pd.DataFrame(Y, columns=['class']) selects a column *named* 'class', so a
#     Series Y that carries any other name would produce an all-NaN column;
#   * an index-aligned concat would misalign rows if X and Y had different
#     indexes.
final_data = pd.concat(
    [
        pd.DataFrame(X).reset_index(drop=True),
        pd.Series(np.asarray(Y), name='class'),
    ],
    axis=1,
)

# final_data = pd.concat([pd.DataFrame(X), final_data['class'].reset_index(drop=True)], axis=1)

In [None]:
# Initialise the PyCaret classification experiment on final_data with 'class'
# as the target. session_id=123 is the seed handed to PyCaret; the remaining
# flags request outlier removal, PyCaret's own feature selection, PCA and GPU
# use.
# NOTE(review): the features were already RFE-selected and SMOTE-oversampled
# upstream, so feature_selection/pca here stack on top of that, and synthetic
# rows take part in any train/test split made by setup -- confirm intended.
s = setup(final_data, target='class', session_id=123, remove_outliers=True, feature_selection=True, pca=True, use_gpu=True)

In [None]:
# Compare PyCaret's candidate classifiers with default settings and keep the
# model it returns as `best_model` for all later cells.
best_model = compare_models()

In [None]:
# Open PyCaret's evaluation view for the selected model
evaluate_model(best_model)

In [None]:
# ROC / AUC plot for the selected model
plot_model(best_model, plot='auc')

In [None]:
# Confusion matrix for the selected model
plot_model(best_model, plot='confusion_matrix')

In [None]:
# Score the selected model without passing explicit data, i.e. on the split
# held by the PyCaret experiment created in the setup cell.
predict_model(best_model)

In [None]:
# Predict with the selected model on `data` and preview the first rows.
# NOTE(review): `data` is the frame loaded at the top of the notebook -- the
# same rows the training set was derived from, not a held-out set -- and it
# still has its original feature and label columns rather than the columns of
# final_data that the model was fit on. Confirm this is the intended input;
# genuinely unseen data would need the same preprocessing and feature
# selection applied first.
predictions = predict_model(best_model, data=data)
predictions.head()

In [None]:
# Persist the selected model under the name 'my_best_pipeline'
save_model(best_model, 'my_best_pipeline')

In [None]:
# Reload the artefact saved as 'my_best_pipeline' and print its representation
# as a quick check that the round trip works.
loaded_model = load_model('my_best_pipeline')
print(loaded_model)