In [97]:
import pandas as pd
import requests
import os
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, Input
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Data Source and Project Motivation

We looked at a highly-dimensional dataset showing the degree to which ~28,246 genes are expressed in 408 sample analyses, including a few replicate analyses of certain samples for validation —these represent samples that were collected from 115 endometriosis patients and 53 healthy controls. Our goal is to train a model that is able to accurately predict the presence of endometriosis, a chronic disease which affects ~10% of women worldwide and has no cure. Endometriosis is a poorly understood disease that is characterized by uterine tissue cells growing outside of the uterus, which can be debilitating and fertility-impacting. In many cases, multiple surgeries (sometimes the complete removal of a uterus) are required for managing this condition. Furthermore, early detection procedures for endometriosis such as pelvic exams, abdominal ultrasounds, MRIs, and laparoscopy are either expensive, inaccurate, or invasive – sometimes all three. This leads to many late diagnoses of endometriosis, due to extraneous factors affecting the clinician's willingness to test early. For this reason, the average time until diagnosis is 7 years.

In hopes to facilitate more research on endometriosis, researchers at the University of Turku in Finland created a dataset with more than 392 unique samples, documenting the expression of thousands of genes in each sample, as outlined above. Note, because endometriosis is the growth of uterine tissue outside of the uterus, some patients contributed multiple unique samples from different places on their body. The final output of the Turku-led project was a user interface for assisting researchers. However, we were able to collect their raw data, which they claim was normalized, but the scale makes us believe it was likely standardized. The data was also batch-controlled, meaning that the research team accounted for technical differences arising from the use of different machines for analyzing samples — this way, we are capturing biological differences, as opposed to technical measurement differences.

# Loading the data

In [98]:
filePath = "content/GSE141549_batchCorrectednormalizedArrayscombined.xlsx"

# URL for downloading the file
url = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE141549&format=file&file=GSE141549%5FbatchCorrectednormalizedArrayscombined%2Exlsx"

# checking if the file already exists
if not os.path.exists(filePath):
    os.makedirs(os.path.dirname(filePath), exist_ok=True)
    # Download the file if it doesn't exist
    response = requests.get(url)
    with open(filePath, "wb") as file:
        file.write(response.content)
    print("File downloaded.")
else:
    print("File already exists.")

# loading the Excel file into a pandas DataFrame
df = pd.read_excel(filePath)
print("Data loaded successfully.")

File already exists.
Data loaded successfully.


# Simple exploration of raw dataset and processing steps

### First few rows

In [99]:
df.head(4)

Unnamed: 0,Gene_symbol,Probe_Id,SAMPLE 332 PE,SAMPLE 333 DiEIn,SAMPLE 334 PE,SAMPLE 335 PeLB,SAMPLE 336 PE,SAMPLE 337 PE,SAMPLE 338 PP,SAMPLE 339 SuL,...,SAMPLE 26 CE,SAMPLE 27 PE,SAMPLE 28 REV,SAMPLE 29 PeLR,SAMPLE 30 OMA,SAMPLE 31 CP,SAMPLE 32 CE,SAMPLE 33 PE,SAMPLE 34 PP,SAMPLE 35 PeLR
0,RERE,ILMN_1802380,11.488459,11.605685,12.006795,12.132273,11.416302,12.18647,11.340178,12.025644,...,11.69366,11.160831,11.869227,11.434205,11.010967,11.847268,11.3091,11.839674,11.765684,11.533947
1,LOC105374121,ILMN_1736104,6.673564,6.520175,6.619677,6.514652,6.602206,6.596461,6.805189,6.483506,...,6.597478,6.580707,6.653111,6.552827,6.587035,6.531963,6.592075,6.423123,6.499188,6.732907
2,CIDEA,ILMN_1788184,6.680433,6.588961,6.59334,7.795417,6.598428,6.666354,8.203924,6.661223,...,6.991229,6.916313,7.031737,6.982656,6.649744,6.665928,6.834289,6.915713,6.365116,7.10533
3,SLC17A3,ILMN_1690979,6.772489,6.482288,6.579807,6.409611,6.361172,6.496526,6.489509,6.617248,...,6.384578,6.471227,6.203973,6.407872,6.334447,6.40275,6.3715,6.407001,6.417949,6.386594


In [100]:
df["Gene_Probe"] = df["Gene_symbol"] + '_' + df["Probe_Id"]
df = df.set_index("Gene_Probe") #setting it as our identifier
df = df.drop(columns=["Gene_symbol", "Probe_Id"]) #no longer needed

### First few rows after creating a unique identifier "Gene_Probe" can be seen below.
Please note that currently, the rows below represent genes, but since we will be using that to predict the disease/health status, we will be transposing this df momentarily.

In [101]:
df.head(4)

Unnamed: 0_level_0,SAMPLE 332 PE,SAMPLE 333 DiEIn,SAMPLE 334 PE,SAMPLE 335 PeLB,SAMPLE 336 PE,SAMPLE 337 PE,SAMPLE 338 PP,SAMPLE 339 SuL,SAMPLE 340 PE,SAMPLE 341 PP,...,SAMPLE 26 CE,SAMPLE 27 PE,SAMPLE 28 REV,SAMPLE 29 PeLR,SAMPLE 30 OMA,SAMPLE 31 CP,SAMPLE 32 CE,SAMPLE 33 PE,SAMPLE 34 PP,SAMPLE 35 PeLR
Gene_Probe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RERE_ILMN_1802380,11.488459,11.605685,12.006795,12.132273,11.416302,12.18647,11.340178,12.025644,11.613111,12.006795,...,11.69366,11.160831,11.869227,11.434205,11.010967,11.847268,11.3091,11.839674,11.765684,11.533947
LOC105374121_ILMN_1736104,6.673564,6.520175,6.619677,6.514652,6.602206,6.596461,6.805189,6.483506,6.701358,6.707116,...,6.597478,6.580707,6.653111,6.552827,6.587035,6.531963,6.592075,6.423123,6.499188,6.732907
CIDEA_ILMN_1788184,6.680433,6.588961,6.59334,7.795417,6.598428,6.666354,8.203924,6.661223,6.551393,8.027454,...,6.991229,6.916313,7.031737,6.982656,6.649744,6.665928,6.834289,6.915713,6.365116,7.10533
SLC17A3_ILMN_1690979,6.772489,6.482288,6.579807,6.409611,6.361172,6.496526,6.489509,6.617248,6.427462,6.481086,...,6.384578,6.471227,6.203973,6.407872,6.334447,6.40275,6.3715,6.407001,6.417949,6.386594


### Number of rows
We have not yet transposed the df, so we get the number of rows by looking at the number of columns

In [102]:
len(df.columns) # number of samples, though some are replicates because the researchers wanted to ensure the results were correct

408

### Number of columns (features)
We have not yet transposed the df, so we get the number of columns by looking at the number of rows

In [103]:
len(df) # number of unique genes+probe

28247

### Determining number of unique samples
We create a list of all the column names (which represent each future row after we transpose the df). The goal is to get a sense of how many samples were analyzed more than once(i.e. the number of replicates).

In [104]:
columnNames = [col for col in df.columns if col not in ["", "Gene_symbol", "Probe_Id"]]
print(f"Number of sample analyses: {len(columnNames)}")
# Defining a function to extract the numerical part from the column name
def extractSampleNumber(columnNameInput):
    # Split by spaces and take the second part, which should be the number
    parts = columnNameInput.split(" ")
    if len(parts) > 1 and parts[1].isdigit():
        return int(parts[1])
    return float("inf")  # In case the format is unexpected

# Sorting the column names using the custom function
sortedColumns = sorted(columnNames, key=extractSampleNumber)

# displaying the first few sorted column names
sortedColumns[:7]


Number of sample analyses: 408


['SAMPLE 1 OMA',
 'SAMPLE 2 PP',
 'SAMPLE 3 PE',
 'SAMPLE 4 DiEIn',
 'SAMPLE 5 REV',
 'SAMPLE 6 CP',
 'SAMPLE 7 PP']

In [105]:
totalItems = len(sortedColumns)
replicateItems = sum(1 for item in sortedColumns if item.endswith("Replicate"))
uniqueRows = totalItems - replicateItems
print(f"Total items in sortedColumns: {totalItems}")
print(f"Total items ending with 'Replicate': {replicateItems}")
print(f"Total unique samples: {uniqueRows}")

Total items in sortedColumns: 408
Total items ending with 'Replicate': 16
Total unique samples: 392


### Creating a dictionary where each key-value pair represents the count of each unique subtype
We are extracting the third item from each column name (as this code indicates type of disease state or healthy state), then we are creating a dictionary showing the number of samples with the various endometriosis classifications!

In [106]:
# extracting the codes from the column (3rd item) that we could predict with our ensamble model
codes = [item.split()[2] for item in sortedColumns if len(item.split()) >= 3]

# creating a dictionary with the count of each unique code -> data imbalance?
codeCounts = Counter(codes)

# print the dict
print(codeCounts)

Counter({'PE': 104, 'CE': 43, 'PP': 39, 'DiEIn': 39, 'PeLB': 29, 'OMA': 28, 'PeLR': 28, 'SuL': 27, 'CP': 24, 'REV': 22, 'PeLW': 22, 'DiEB': 3})



> ***Each of these codes represent either a disease-type or healthy state, and it is this dicotomy that we will be predicting with an ensamble model. The input will be a new sample containing gene expression data (each feature will be the level of expression for a specific gene) and we will be predicting if it represents a diseased or healthy sample:***

**The following codes appear to correspond to healthy controls based on the original scientific paper published by the Turku researchers:**
- CP: peritoneum samples from healthy control
- CE: endometrium sample from healthty control

**The rest of the codes seen to represent various types of endometriosis subtypes from endometriosis patients:**

- PeLR: red peritoneal endometriotic lesion
- DiEIn: intestinal endometriotic lesions
- PeLW: white peritoneal endometriotic lesion
- REV: deep rectovaginal endometriotic lesions
- DiEB: deep endometriotic lesions in the bladder
- PE: endometrium sample from endometriosis patient
- SuL: sacrouterine ligament endometriotic lesions
- PeLB: black peritoneal endometriotic lesion
- OMA: ovarian endometrioma samples
- PP: peritoneum samples from patients

# Data cleaning and refining

### Reminding ourselves of what we are working with

In [107]:
print(len(df.columns)) # number of sample analysis outputs, though some are replicates of one sample because the researchers wanted to ensure the results were correct
print(len(df)) # number of unique genes+probe
df.head(4)

408
28247


Unnamed: 0_level_0,SAMPLE 332 PE,SAMPLE 333 DiEIn,SAMPLE 334 PE,SAMPLE 335 PeLB,SAMPLE 336 PE,SAMPLE 337 PE,SAMPLE 338 PP,SAMPLE 339 SuL,SAMPLE 340 PE,SAMPLE 341 PP,...,SAMPLE 26 CE,SAMPLE 27 PE,SAMPLE 28 REV,SAMPLE 29 PeLR,SAMPLE 30 OMA,SAMPLE 31 CP,SAMPLE 32 CE,SAMPLE 33 PE,SAMPLE 34 PP,SAMPLE 35 PeLR
Gene_Probe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RERE_ILMN_1802380,11.488459,11.605685,12.006795,12.132273,11.416302,12.18647,11.340178,12.025644,11.613111,12.006795,...,11.69366,11.160831,11.869227,11.434205,11.010967,11.847268,11.3091,11.839674,11.765684,11.533947
LOC105374121_ILMN_1736104,6.673564,6.520175,6.619677,6.514652,6.602206,6.596461,6.805189,6.483506,6.701358,6.707116,...,6.597478,6.580707,6.653111,6.552827,6.587035,6.531963,6.592075,6.423123,6.499188,6.732907
CIDEA_ILMN_1788184,6.680433,6.588961,6.59334,7.795417,6.598428,6.666354,8.203924,6.661223,6.551393,8.027454,...,6.991229,6.916313,7.031737,6.982656,6.649744,6.665928,6.834289,6.915713,6.365116,7.10533
SLC17A3_ILMN_1690979,6.772489,6.482288,6.579807,6.409611,6.361172,6.496526,6.489509,6.617248,6.427462,6.481086,...,6.384578,6.471227,6.203973,6.407872,6.334447,6.40275,6.3715,6.407001,6.417949,6.386594


### Transposing the dataframe
Next, we are transposing the dataframe, as our goal is to use the genes (currently rows) as our predictors, hence their transformation into columns - the "Gene_Probe" column name above the samples will be removed momentarily

In [108]:
transposedDF = df.T
print(f"Number of total columns: {len(transposedDF.columns)}")
print(f"Number of total rows: {len(transposedDF)}")
transposedDF.head(4)

Number of total columns: 28247
Number of total rows: 408


Gene_Probe,RERE_ILMN_1802380,LOC105374121_ILMN_1736104,CIDEA_ILMN_1788184,SLC17A3_ILMN_1690979,ATP6V1C2_ILMN_1660729,ZNF768_ILMN_1791820,F3_ILMN_2129572,HTR5BP_ILMN_1858692,LINC00999_ILMN_3239930,GAK_ILMN_1701042,...,ZSCAN26_ILMN_1651905,ZSCAN4_ILMN_1725718,ZSWIM3_ILMN_1684960,ZSWIM8_ILMN_1669433,ZWINT_ILMN_1673117,ZXDA_ILMN_1713868,ZYG11A_ILMN_1794932,ZYG11A_ILMN_1723439,ZYX_ILMN_1701875,ZZZ3_ILMN_1653618
SAMPLE 332 PE,11.488459,6.673564,6.680433,6.772489,6.806118,8.939555,7.705613,6.400467,6.705788,6.240199,...,6.724087,6.522196,6.625318,11.10591,6.725321,6.671754,6.341979,6.378403,12.169301,9.622711
SAMPLE 333 DiEIn,11.605685,6.520175,6.588961,6.482288,6.547302,8.331163,7.857551,6.347822,7.049031,6.291638,...,6.721231,6.492125,6.381971,11.156876,6.660244,6.555477,6.30885,6.476137,12.272535,9.737564
SAMPLE 334 PE,12.006795,6.619677,6.59334,6.579807,8.239626,8.742562,7.932845,6.487091,6.981085,6.361566,...,6.63647,6.601727,6.730844,11.737975,6.470417,6.379294,6.415222,6.368305,12.264406,9.515193
SAMPLE 335 PeLB,12.132273,6.514652,7.795417,6.409611,6.805541,8.385348,7.863452,6.4547,6.936864,6.228683,...,6.847214,7.449695,6.600583,11.560674,6.665999,6.488463,6.457537,6.424394,11.910975,9.353855


### As our goal is to identify disease vs healthy state, we are extracting the third item from the sample name and using that to create a "label" column

In [109]:
# adding a new "label" column by extracting the third item from the row index (sample name)
transposedDF.insert(0, "label", transposedDF.index.str.split().str[2])
print(f"Number of total columns: {len(transposedDF.columns)}")
print(f"Number of total rows: {len(transposedDF)}")
# showing the first few rows to verify
transposedDF.head(4)

Number of total columns: 28248
Number of total rows: 408


Gene_Probe,label,RERE_ILMN_1802380,LOC105374121_ILMN_1736104,CIDEA_ILMN_1788184,SLC17A3_ILMN_1690979,ATP6V1C2_ILMN_1660729,ZNF768_ILMN_1791820,F3_ILMN_2129572,HTR5BP_ILMN_1858692,LINC00999_ILMN_3239930,...,ZSCAN26_ILMN_1651905,ZSCAN4_ILMN_1725718,ZSWIM3_ILMN_1684960,ZSWIM8_ILMN_1669433,ZWINT_ILMN_1673117,ZXDA_ILMN_1713868,ZYG11A_ILMN_1794932,ZYG11A_ILMN_1723439,ZYX_ILMN_1701875,ZZZ3_ILMN_1653618
SAMPLE 332 PE,PE,11.488459,6.673564,6.680433,6.772489,6.806118,8.939555,7.705613,6.400467,6.705788,...,6.724087,6.522196,6.625318,11.10591,6.725321,6.671754,6.341979,6.378403,12.169301,9.622711
SAMPLE 333 DiEIn,DiEIn,11.605685,6.520175,6.588961,6.482288,6.547302,8.331163,7.857551,6.347822,7.049031,...,6.721231,6.492125,6.381971,11.156876,6.660244,6.555477,6.30885,6.476137,12.272535,9.737564
SAMPLE 334 PE,PE,12.006795,6.619677,6.59334,6.579807,8.239626,8.742562,7.932845,6.487091,6.981085,...,6.63647,6.601727,6.730844,11.737975,6.470417,6.379294,6.415222,6.368305,12.264406,9.515193
SAMPLE 335 PeLB,PeLB,12.132273,6.514652,7.795417,6.409611,6.805541,8.385348,7.863452,6.4547,6.936864,...,6.847214,7.449695,6.600583,11.560674,6.665999,6.488463,6.457537,6.424394,11.910975,9.353855


# Creating final predictor feature - the "disease" column

### 'disease' column from 'label' column
Now that we have the label column, we can use this column to indicate if the respective row represents a diseased or healthy state under a new column "disease", which is what our trained model will be predicting

In [110]:
# creating a "disease" column with 0 if label is "CP" or "CE", otherwise 1 (as all other codes represents disease)
transposedDF.insert(1, "disease", transposedDF["label"].apply(lambda x: 0 if x in ["CP", "CE"] else 1))
transposedDF.sort_index(ascending=True)
print(f"Number of total columns: {len(transposedDF.columns)}")
print(f"Number of total rows: {len(transposedDF)}")
# showing the first few rows to verify
transposedDF.head(4)

Number of total columns: 28249
Number of total rows: 408


Gene_Probe,label,disease,RERE_ILMN_1802380,LOC105374121_ILMN_1736104,CIDEA_ILMN_1788184,SLC17A3_ILMN_1690979,ATP6V1C2_ILMN_1660729,ZNF768_ILMN_1791820,F3_ILMN_2129572,HTR5BP_ILMN_1858692,...,ZSCAN26_ILMN_1651905,ZSCAN4_ILMN_1725718,ZSWIM3_ILMN_1684960,ZSWIM8_ILMN_1669433,ZWINT_ILMN_1673117,ZXDA_ILMN_1713868,ZYG11A_ILMN_1794932,ZYG11A_ILMN_1723439,ZYX_ILMN_1701875,ZZZ3_ILMN_1653618
SAMPLE 332 PE,PE,1,11.488459,6.673564,6.680433,6.772489,6.806118,8.939555,7.705613,6.400467,...,6.724087,6.522196,6.625318,11.10591,6.725321,6.671754,6.341979,6.378403,12.169301,9.622711
SAMPLE 333 DiEIn,DiEIn,1,11.605685,6.520175,6.588961,6.482288,6.547302,8.331163,7.857551,6.347822,...,6.721231,6.492125,6.381971,11.156876,6.660244,6.555477,6.30885,6.476137,12.272535,9.737564
SAMPLE 334 PE,PE,1,12.006795,6.619677,6.59334,6.579807,8.239626,8.742562,7.932845,6.487091,...,6.63647,6.601727,6.730844,11.737975,6.470417,6.379294,6.415222,6.368305,12.264406,9.515193
SAMPLE 335 PeLB,PeLB,1,12.132273,6.514652,7.795417,6.409611,6.805541,8.385348,7.863452,6.4547,...,6.847214,7.449695,6.600583,11.560674,6.665999,6.488463,6.457537,6.424394,11.910975,9.353855


### Example of 2 rows representing one real sample:

In [111]:
rowsOriginal = transposedDF.loc[['SAMPLE 121 CE', 'SAMPLE 121 CE Replicate']]
rowsOriginal

Gene_Probe,label,disease,RERE_ILMN_1802380,LOC105374121_ILMN_1736104,CIDEA_ILMN_1788184,SLC17A3_ILMN_1690979,ATP6V1C2_ILMN_1660729,ZNF768_ILMN_1791820,F3_ILMN_2129572,HTR5BP_ILMN_1858692,...,ZSCAN26_ILMN_1651905,ZSCAN4_ILMN_1725718,ZSWIM3_ILMN_1684960,ZSWIM8_ILMN_1669433,ZWINT_ILMN_1673117,ZXDA_ILMN_1713868,ZYG11A_ILMN_1794932,ZYG11A_ILMN_1723439,ZYX_ILMN_1701875,ZZZ3_ILMN_1653618
SAMPLE 121 CE,CE,0,10.61532,6.679381,7.069106,6.547775,8.287285,8.444118,9.618014,6.322037,...,6.613581,6.41857,7.11629,10.325763,6.558989,6.363129,6.560848,6.422209,12.821027,9.558652
SAMPLE 121 CE Replicate,CE,0,10.833608,6.722729,6.315895,6.432962,7.5586,8.658778,8.893812,6.427485,...,6.724493,6.33651,6.358608,10.633035,6.662112,6.378342,6.56457,6.368253,11.555183,9.757919


In [112]:
allSamples = sorted(transposedDF.index.tolist())
allSamples[:7]

['SAMPLE 1 OMA',
 'SAMPLE 10 OMA',
 'SAMPLE 100 PeLR',
 'SAMPLE 101 PP',
 'SAMPLE 102 DiEIn',
 'SAMPLE 103 PeLW',
 'SAMPLE 104 PE']

In [113]:
# Step 1: Reset the index and ensure the index column has a name
transposedDF = transposedDF.reset_index()

# Rename the index column if it's unnamed or named differently
if transposedDF.columns[0] != 'Gene_Probe':
    transposedDF = transposedDF.rename(columns={transposedDF.columns[0]: 'Gene_Probe'})

# Step 2: Remove ' Replicate' from 'Gene_Probe' to create base names
transposedDF['Gene_Probe'] = transposedDF['Gene_Probe'].str.replace(' Replicate$', '', regex=True)

# Step 3: Identify data columns (excluding 'Gene_Probe', 'label', 'disease')
data_cols = transposedDF.columns.difference(['Gene_Probe', 'label', 'disease'])

# Step 4: Define aggregation functions
aggregations = {col: 'mean' for col in data_cols}
aggregations.update({'label': 'first', 'disease': 'first'})

# Store the original columns before grouping
original_columns = transposedDF.columns

# Step 5: Group by 'Gene_Probe' and aggregate
transposedDF = transposedDF.groupby('Gene_Probe', as_index=False).agg(aggregations)

# Step 6: Set 'Gene_Probe' back as the index (if required)
transposedDF = transposedDF.set_index('Gene_Probe')

# Step 7: Reorder the columns to match the original order
transposedDF = transposedDF[original_columns.difference(['Gene_Probe'])]  # Remove 'Gene_Probe' as it's the index
transposedDF = transposedDF.reset_index()  # Re-add 'Gene_Probe' for ordering
transposedDF = transposedDF[original_columns]  # Order columns exactly as in the original
transposedDF = transposedDF.set_index('Gene_Probe')  # Set 'Gene_Probe' back as the index if needed

# Display the first 4 rows
transposedDF.head(4)

Gene_Probe,label,disease,RERE_ILMN_1802380,LOC105374121_ILMN_1736104,CIDEA_ILMN_1788184,SLC17A3_ILMN_1690979,ATP6V1C2_ILMN_1660729,ZNF768_ILMN_1791820,F3_ILMN_2129572,HTR5BP_ILMN_1858692,...,ZSCAN26_ILMN_1651905,ZSCAN4_ILMN_1725718,ZSWIM3_ILMN_1684960,ZSWIM8_ILMN_1669433,ZWINT_ILMN_1673117,ZXDA_ILMN_1713868,ZYG11A_ILMN_1794932,ZYG11A_ILMN_1723439,ZYX_ILMN_1701875,ZZZ3_ILMN_1653618
Gene_Probe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SAMPLE 1 OMA,OMA,1,11.449473,6.536105,7.542748,6.438908,6.484894,8.762321,8.017512,6.269659,...,6.668651,6.646208,6.395408,11.444387,6.398342,6.517778,6.363869,6.245223,11.967503,9.49495
SAMPLE 10 OMA,OMA,1,11.661547,6.702027,7.018477,6.40987,8.473473,7.425452,8.959437,6.519406,...,6.630572,6.517468,6.096741,10.647588,6.555686,6.45817,6.306452,6.481059,11.720273,9.595461
SAMPLE 100 PeLR,PeLR,1,12.653095,6.538306,6.540266,6.322736,6.448993,8.177972,8.622699,6.416195,...,6.68843,6.654922,6.334504,11.617943,6.585496,6.49173,6.342675,6.438649,11.875686,9.422292
SAMPLE 101 PP,PP,1,12.166617,6.478355,6.798654,6.42369,6.578178,8.552622,9.233912,6.321077,...,6.496436,6.433557,6.294022,11.58079,6.649865,6.449307,6.383782,6.429199,12.424216,8.633182


### Reminding us what a set of duplicate rows looked like before:

In [114]:
rowsOriginal

Gene_Probe,label,disease,RERE_ILMN_1802380,LOC105374121_ILMN_1736104,CIDEA_ILMN_1788184,SLC17A3_ILMN_1690979,ATP6V1C2_ILMN_1660729,ZNF768_ILMN_1791820,F3_ILMN_2129572,HTR5BP_ILMN_1858692,...,ZSCAN26_ILMN_1651905,ZSCAN4_ILMN_1725718,ZSWIM3_ILMN_1684960,ZSWIM8_ILMN_1669433,ZWINT_ILMN_1673117,ZXDA_ILMN_1713868,ZYG11A_ILMN_1794932,ZYG11A_ILMN_1723439,ZYX_ILMN_1701875,ZZZ3_ILMN_1653618
SAMPLE 121 CE,CE,0,10.61532,6.679381,7.069106,6.547775,8.287285,8.444118,9.618014,6.322037,...,6.613581,6.41857,7.11629,10.325763,6.558989,6.363129,6.560848,6.422209,12.821027,9.558652
SAMPLE 121 CE Replicate,CE,0,10.833608,6.722729,6.315895,6.432962,7.5586,8.658778,8.893812,6.427485,...,6.724493,6.33651,6.358608,10.633035,6.662112,6.378342,6.56457,6.368253,11.555183,9.757919


### Now only one row represents the one sample - we took the average across the 2 rows for the numeric columns

In [115]:
rowsAfter1 = transposedDF.loc[['SAMPLE 121 CE']]
rowsAfter1

Gene_Probe,label,disease,RERE_ILMN_1802380,LOC105374121_ILMN_1736104,CIDEA_ILMN_1788184,SLC17A3_ILMN_1690979,ATP6V1C2_ILMN_1660729,ZNF768_ILMN_1791820,F3_ILMN_2129572,HTR5BP_ILMN_1858692,...,ZSCAN26_ILMN_1651905,ZSCAN4_ILMN_1725718,ZSWIM3_ILMN_1684960,ZSWIM8_ILMN_1669433,ZWINT_ILMN_1673117,ZXDA_ILMN_1713868,ZYG11A_ILMN_1794932,ZYG11A_ILMN_1723439,ZYX_ILMN_1701875,ZZZ3_ILMN_1653618
Gene_Probe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SAMPLE 121 CE,CE,0,10.724464,6.701055,6.6925,6.490369,7.922942,8.551448,9.255913,6.374761,...,6.669037,6.37754,6.737449,10.479399,6.61055,6.370736,6.562709,6.395231,12.188105,9.658286


#### WE WANT THIS TO PRODUCE AN ERROR! The below shows that the replicate row no longer exists in our dataframe
If you uncomment it and run it, it produces an error as intended, suggesting we have removed the replicate entries!

In [116]:
# rowsAfter2 = transposedDF.loc[['SAMPLE 121 CE Replicate']]
# rowsAfter2

# Shape of data after refining

In [117]:
# Get the number of rows and columns
numRows, numColumns = transposedDF.shape

# Display the number of rows and columns
print(f"Number of rows: {numRows}") # minus 2 because the first 2 columns (excluding the index) don't represent a gene/feature
print(f"Number of columns: {numColumns - 3}")

Number of rows: 392
Number of columns: 28246


# Class imbalance

In [118]:
# counting the number of rows with 0 and 1 in the "disease" column
count_0 = (transposedDF["disease"] == 0).sum()
count_1 = (transposedDF["disease"] == 1).sum()

# calculating the total number of rows
total_rows = len(transposedDF["disease"])

# calculating the percentages
percentage_0 = (count_0 / total_rows) * 100
percentage_1 = (count_1 / total_rows) * 100

# printing the counts with percentages
print(f"Samples (rows in transposedDF) from healthy control individuals = 0: {count_0} ({percentage_0:.2f}%)")
print(f"Samples (rows in transposedDF) from endometriosis patients: {count_1} ({percentage_1:.2f}%)")

Samples (rows in transposedDF) from healthy control individuals = 0: 62 (15.82%)
Samples (rows in transposedDF) from endometriosis patients: 330 (84.18%)


> **Class imbalance!** The above code output suggests we are dealing with class imbalance - since there is an imbalance between 0s and 1s in the "disease" column, we can consider using techniques like oversampling the minority class (samples from healthy control individuals)...


# Missing Values - none!

In [119]:
# checking for missing values in the entire DataFrame
missingData = transposedDF.isnull().sum()

# columns with missing values
missingColumns = missingData[missingData > 0]

# summary
if missingColumns.empty:
    print("No missing values found in the dataset.")
else:
    print("Columns with missing values and their counts:")
    print(missingColumns)


No missing values found in the dataset.


In [120]:
transposedDF.head()

Gene_Probe,label,disease,RERE_ILMN_1802380,LOC105374121_ILMN_1736104,CIDEA_ILMN_1788184,SLC17A3_ILMN_1690979,ATP6V1C2_ILMN_1660729,ZNF768_ILMN_1791820,F3_ILMN_2129572,HTR5BP_ILMN_1858692,...,ZSCAN26_ILMN_1651905,ZSCAN4_ILMN_1725718,ZSWIM3_ILMN_1684960,ZSWIM8_ILMN_1669433,ZWINT_ILMN_1673117,ZXDA_ILMN_1713868,ZYG11A_ILMN_1794932,ZYG11A_ILMN_1723439,ZYX_ILMN_1701875,ZZZ3_ILMN_1653618
Gene_Probe,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SAMPLE 1 OMA,OMA,1,11.449473,6.536105,7.542748,6.438908,6.484894,8.762321,8.017512,6.269659,...,6.668651,6.646208,6.395408,11.444387,6.398342,6.517778,6.363869,6.245223,11.967503,9.49495
SAMPLE 10 OMA,OMA,1,11.661547,6.702027,7.018477,6.40987,8.473473,7.425452,8.959437,6.519406,...,6.630572,6.517468,6.096741,10.647588,6.555686,6.45817,6.306452,6.481059,11.720273,9.595461
SAMPLE 100 PeLR,PeLR,1,12.653095,6.538306,6.540266,6.322736,6.448993,8.177972,8.622699,6.416195,...,6.68843,6.654922,6.334504,11.617943,6.585496,6.49173,6.342675,6.438649,11.875686,9.422292
SAMPLE 101 PP,PP,1,12.166617,6.478355,6.798654,6.42369,6.578178,8.552622,9.233912,6.321077,...,6.496436,6.433557,6.294022,11.58079,6.649865,6.449307,6.383782,6.429199,12.424216,8.633182
SAMPLE 102 DiEIn,DiEIn,1,12.008305,6.4889,6.612325,6.338658,7.149825,8.410684,9.194004,6.317384,...,6.798702,6.462172,6.39357,11.435183,6.451736,6.499047,6.341496,6.397282,11.310392,9.688721


# Creating a class-balanced dataset using SMOTE

Due to the significant class imbalance previously identified between healthy individuals and endometriosis patients in our dataset, we applied SMOTE to generate a class-balanced, oversampled version of the original transformed dataset. We will utilize this balanced dataset to train our models and evaluate whether addressing class imbalance improves model accuracy. Below, we employ a basic CNN model to optimize the k_neighbors parameter. Finally, we export both the transformed original dataset and the class-balanced oversampled dataset.

In [121]:
# For reproducibility
random_state = 2080

np.random.seed(random_state)
tf.random.set_seed(random_state)

X = transposedDF.drop(columns=["label", "disease"])
y = transposedDF["disease"]
X.columns = X.columns.astype(str)

X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, stratify=y, random_state=random_state)


X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, stratify=y_train_full, random_state=random_state, test_size=0.2
)

# Standardize features for distance‐based methods (SMOTE, CNN training, etc.)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled   = scaler.transform(X_val)
X_test_scaled  = scaler.transform(X_test)

# Arbitray simple 1D CNN Model to text the optimal k_neighbors value for SMOTE
def build_cnn_model(input_shape):
    model = Sequential()
    model.add(Input(shape=input_shape))
    model.add(Conv1D(filters=32, kernel_size=3, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))  # Binary classification
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Experiment
k_neighbors_list =  [i for i in range(1,15)]

train_acc_results = []
val_acc_results = []

# CNN training settings
epochs = 20
batch_size = 32
n_features = X_train_scaled.shape[1]
X_val_reshaped = X_val_scaled.reshape(-1, n_features, 1)

# Loop through different k_neighbors values to find a roughly optimal answer
for k in k_neighbors_list:
    print(f"\nTesting SMOTE with k_neighbors = {k}")

    smote = SMOTE(k_neighbors=k, random_state=42)
    X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

    X_train_res_reshaped = X_train_res.reshape(-1, n_features, 1)

    model = build_cnn_model(input_shape=(n_features, 1))

    history = model.fit(
        X_train_res_reshaped,
        y_train_res,
        validation_data=(X_val_reshaped, y_val),
        epochs=epochs,
        batch_size=batch_size,
        verbose=0
    )

    final_train_acc = history.history['accuracy'][-1]
    final_val_acc = history.history['val_accuracy'][-1]

    print(f"Final training accuracy: {final_train_acc:.4f}, validation accuracy: {final_val_acc:.4f}")

    train_acc_results.append(final_train_acc)
    val_acc_results.append(final_val_acc)

# Results

# Find optimal k based on highest validation accuracy
best_idx = np.argmax(val_acc_results)
optimal_k = k_neighbors_list[best_idx]
best_val_acc = val_acc_results[best_idx]

print(f"Optimal k value: {optimal_k}")


Testing SMOTE with k_neighbors = 1
Final training accuracy: 0.9217, validation accuracy: 0.9153

Testing SMOTE with k_neighbors = 2
Final training accuracy: 0.9747, validation accuracy: 0.9322

Testing SMOTE with k_neighbors = 3
Final training accuracy: 0.9596, validation accuracy: 0.8814

Testing SMOTE with k_neighbors = 4
Final training accuracy: 0.9848, validation accuracy: 0.8983

Testing SMOTE with k_neighbors = 5
Final training accuracy: 0.9672, validation accuracy: 0.9153

Testing SMOTE with k_neighbors = 6
Final training accuracy: 0.9672, validation accuracy: 0.9153

Testing SMOTE with k_neighbors = 7
Final training accuracy: 0.9369, validation accuracy: 0.9153

Testing SMOTE with k_neighbors = 8
Final training accuracy: 0.9268, validation accuracy: 0.8983

Testing SMOTE with k_neighbors = 9
Final training accuracy: 0.8712, validation accuracy: 0.8983

Testing SMOTE with k_neighbors = 10
Final training accuracy: 0.9495, validation accuracy: 0.8814

Testing SMOTE with k_neighbo

ResourceExhaustedError: Graph execution error:

Detected at node StatefulPartitionedCall defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "/usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py", line 37, in <module>

  File "/usr/local/lib/python3.11/dist-packages/traitlets/config/application.py", line 992, in launch_instance

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelapp.py", line 712, in start

  File "/usr/local/lib/python3.11/dist-packages/tornado/platform/asyncio.py", line 205, in start

  File "/usr/lib/python3.11/asyncio/base_events.py", line 608, in run_forever

  File "/usr/lib/python3.11/asyncio/base_events.py", line 1936, in _run_once

  File "/usr/lib/python3.11/asyncio/events.py", line 84, in _run

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 499, in process_one

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/kernelbase.py", line 730, in execute_request

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/ipkernel.py", line 383, in do_execute

  File "/usr/local/lib/python3.11/dist-packages/ipykernel/zmqshell.py", line 528, in run_cell

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 2975, in run_cell

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3030, in _run_cell

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/async_helpers.py", line 78, in _pseudo_sync_runner

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3257, in run_cell_async

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3473, in run_ast_nodes

  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code

  File "<ipython-input-121-4a9789b164fb>", line 60, in <cell line: 0>

  File "/usr/local/lib/python3.11/dist-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/usr/local/lib/python3.11/dist-packages/keras/src/backend/tensorflow/trainer.py", line 371, in fit

  File "/usr/local/lib/python3.11/dist-packages/keras/src/backend/tensorflow/trainer.py", line 219, in function

  File "/usr/local/lib/python3.11/dist-packages/keras/src/backend/tensorflow/trainer.py", line 132, in multi_step_on_iterator

Out of memory while trying to allocate 115687424 bytes.
BufferAssignment OOM Debugging.
BufferAssignment stats:
             parameter allocation:  334.44MiB
              constant allocation:         4B
        maybe_live_out allocation:  330.99MiB
     preallocated temp allocation:  413.75MiB
  preallocated temp fragmentation:         0B (0.00%)
                 total allocation:  748.19MiB
Peak buffers:
	Buffer 1:
		Size: 110.33MiB
		Operator: op_type="Conv2D" op_name="sequential_36_1/conv1d_35_1/convolution" source_file="/usr/local/lib/python3.11/dist-packages/tensorflow/python/framework/ops.py" source_line=1196
		XLA Label: custom-call
		Shape: f32[32,32,1,28245]
		==========================

	Buffer 2:
		Size: 110.33MiB
		Operator: op_type="ReluGrad" op_name="gradient_tape/sequential_36_1/conv1d_35_1/ReluGrad"
		XLA Label: fusion
		Shape: f32[32,32,1,28245]
		==========================

	Buffer 3:
		Size: 110.33MiB
		XLA Label: fusion
		Shape: f32[451904,64]
		==========================

	Buffer 4:
		Size: 110.33MiB
		Operator: op_type="AssignSubVariableOp" op_name="adam/AssignSubVariableOp_2" source_file="/usr/local/lib/python3.11/dist-packages/tensorflow/python/framework/ops.py" source_line=1196
		XLA Label: fusion
		Shape: f32[451904,64]
		==========================

	Buffer 5:
		Size: 110.33MiB
		Operator: op_type="AssignSubVariableOp" op_name="adam/AssignSubVariableOp_2" source_file="/usr/local/lib/python3.11/dist-packages/tensorflow/python/framework/ops.py" source_line=1196
		XLA Label: fusion
		Shape: f32[451904,64]
		==========================

	Buffer 6:
		Size: 110.33MiB
		Operator: op_type="AssignSubVariableOp" op_name="adam/AssignSubVariableOp_2" source_file="/usr/local/lib/python3.11/dist-packages/tensorflow/python/framework/ops.py" source_line=1196
		XLA Label: fusion
		Shape: f32[451904,64]
		==========================

	Buffer 7:
		Size: 55.16MiB
		Operator: op_type="MaxPool" op_name="sequential_36_1/max_pooling1d_35_1/MaxPool1d" source_file="/usr/local/lib/python3.11/dist-packages/tensorflow/python/framework/ops.py" source_line=1196
		XLA Label: fusion
		Shape: f32[32,14122,32]
		==========================

	Buffer 8:
		Size: 27.58MiB
		Operator: op_type="ReluGrad" op_name="gradient_tape/sequential_36_1/conv1d_35_1/ReluGrad"
		XLA Label: fusion
		Shape: pred[32,32,1,28245]
		==========================

	Buffer 9:
		Size: 3.45MiB
		Operator: op_name="XLA_Args"
		Entry Parameter Subshape: f32[32,28247,1]
		==========================

	Buffer 10:
		Size: 8.0KiB
		Operator: op_type="BiasAddGrad" op_name="gradient_tape/sequential_36_1/dense_70_1/BiasAdd/BiasAddGrad"
		XLA Label: fusion
		Shape: f32[32,64]
		==========================

	Buffer 11:
		Size: 384B
		XLA Label: fusion
		Shape: f32[3,1,32]
		==========================

	Buffer 12:
		Size: 384B
		XLA Label: fusion
		Shape: f32[3,1,32]
		==========================

	Buffer 13:
		Size: 384B
		XLA Label: fusion
		Shape: f32[3,1,32]
		==========================

	Buffer 14:
		Size: 256B
		Operator: op_name="XLA_Args"
		Entry Parameter Subshape: s64[32,1]
		==========================

	Buffer 15:
		Size: 256B
		Operator: op_type="AssignSubVariableOp" op_name="adam/AssignSubVariableOp_3" source_file="/usr/local/lib/python3.11/dist-packages/tensorflow/python/framework/ops.py" source_line=1196
		XLA Label: fusion
		Shape: f32[64]
		==========================


	 [[{{node StatefulPartitionedCall}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_multi_step_on_iterator_120267]

In [None]:
# Plot results with annotation
plt.figure(figsize=(8, 6))
plt.plot(k_neighbors_list, train_acc_results, label='Training Accuracy', marker='o')
plt.plot(k_neighbors_list, val_acc_results, label='Validation Accuracy', marker='s')

# Add vertical line at optimal k
plt.axvline(x=optimal_k, color='red', linestyle='--', label=f'Optimal k = {optimal_k}')

# Annotate the optimal point
plt.annotate(
    f'Best Val Acc = {best_val_acc:.2f}',
    xy=(optimal_k, best_val_acc),
    xytext=(optimal_k - 3.5, best_val_acc - 0.09),
    arrowprops=dict(facecolor='black', arrowstyle='->'),
    fontsize=10
)

plt.xlabel("SMOTE k_neighbors")
plt.ylabel("Accuracy")
plt.title("Effect of SMOTE k_neighbors on CNN Performance")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Assume transposedDF is your DataFrame already loaded.
# Drop the "label" column, and use the gene expression features.
X = transposedDF.drop(columns=["label", "disease"])
y = transposedDF["disease"]
X.columns = X.columns.astype(str)

# Create a SMOTE instance with k_neighbors set to `optimal_k`
smote = SMOTE(k_neighbors=optimal_k, random_state=random_state)

# Perform SMOTE resampling
X_res, y_res = smote.fit_resample(X, y)

# Combine the resampled features and target into one DataFrame.
# (The "label" column is omitted because it typically contains sample identifiers.)
resampled_df = pd.concat([pd.DataFrame(X_res, columns=X.columns),
                          pd.DataFrame({"disease": y_res})],
                         axis=1)

df = transposedDF.drop(columns=["label"])
df.to_csv("endometriosis_dataset.csv", index=False)

transposedDF.to_csv("endometriosis_dataset_with_label.csv", index=False)

In [None]:
resampled_df.shape

zeros = (resampled_df["disease"] == 0).sum()
ones = (resampled_df["disease"] == 1).sum()

total_rows = len(resampled_df["disease"])

percentage_zeros = (zeros / total_rows) * 100
percentage_ones = (ones / total_rows) * 100

print(f"Samples (rows in resampled_df) from healthy control individuals = 0: {zeros} ({percentage_zeros:.2f}%)")
print(f"Samples (rows in resampled_df) from endometriosis patients: {ones} ({percentage_ones:.2f}%)")

# Testing the Effects of Class Imbalance on Model Performance

Below, we train the same basic CNN using both the resampled SMOTE dataset and the original dataset. However, oversampling with SMOTE can compromise the privacy guarantees of a differentially private model, as the model might overfit specific population subsets. Currently, there are no readily available "plug-and-play" DP resampling libraries; most implementations are tailored to specific use cases. Nevertheless, substantial performance improvements achieved using SMOTE indicate that class imbalance in the original dataset significantly affects model performance. This insight motivates us to explore alternative approaches to SMOTE that preserve differential privacy, such as class weighting and undersampling.

In [None]:
# Define a function to process and train on a dataset
def process_and_train_cnn(X, y, label):
    # Split dataset
    X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=random_state)
    X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, stratify=y_train_full, test_size=0.2, random_state=random_state)

    # Standardize
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    # Reshape for CNN input
    n_features = X_train_scaled.shape[1]
    X_train_scaled = X_train_scaled.reshape(-1, n_features, 1)
    X_val_scaled   = X_val_scaled.reshape(-1, n_features, 1)
    X_test_scaled  = X_test_scaled.reshape(-1, n_features, 1)

    # CNN model
    def build_cnn_model(input_shape):
        model = Sequential([
            Input(shape=input_shape),
            Conv1D(32, kernel_size=3, activation='relu'),
            MaxPooling1D(pool_size=2),
            Flatten(),
            Dense(64, activation='relu'),
            Dropout(0.5),
            Dense(1, activation='sigmoid')
        ])
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        return model

    model = build_cnn_model(input_shape=(n_features, 1))

    # Train
    history = model.fit(
        X_train_scaled, y_train,
        validation_data=(X_val_scaled, y_val),
        epochs=20,
        batch_size=32,
        verbose=0
    )

    # Evaluate
    train_acc = history.history['accuracy'][-1]
    val_acc   = history.history['val_accuracy'][-1]
    test_loss, test_acc = model.evaluate(X_test_scaled, y_test, verbose=0)

    print(f"{label} - Train Accuracy: {train_acc:.4f}, Val Accuracy: {val_acc:.4f}, Test Accuracy: {test_acc:.4f}")

    return history, test_acc

X1, y1 = resampled_df.drop(columns=["disease"]), resampled_df["disease"]
X2, y2 = transposedDF.drop(columns=["label","disease"]), transposedDF["disease"]
X2.columns = X2.columns.astype(str)

# Run experiments
history1, test_acc1 = process_and_train_cnn(X1, y1, label="SMOTE Dataset")
history2, test_acc2 = process_and_train_cnn(X2, y2, label="Original Dataset")

In [None]:
# Visualization of Model Performance vs Dataset
def plot_history(history, label,line_color=""):
    plt.plot(history.history['accuracy'], line_color, label=f'{label} Train',linestyle='dashed')
    plt.plot(history.history['val_accuracy'],line_color, label=f'{label} Val')

plt.figure(figsize=(10, 6))
plot_history(history1, "SMOTE Dataset", "orange")
plot_history(history2, "Original Dataset", "navy")
plt.title("Training vs Validation Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
plt.grid(True)
plt.show()

# Model Performance with an Undersampled Dataset

Our experiment demonstrates significant improvement in model performance when using the SMOTE oversampled dataset. As previously noted, this suggests that class imbalance in the original dataset adversely affects model performance. Lastly, we will generate an undersampled dataset and evaluate the relative model performance across these datasets.

In [None]:

healthy_patients = transposedDF[transposedDF['disease'] == 0]
endometriosis_patients  = transposedDF[transposedDF['disease'] == 1]
print("Original Dataset")
print("-"*30)
print(f"Healthy Patients: {len(healthy_patients)}")
print(f"Endometriosis Patients: {len(endometriosis_patients)}")
print("-"*30)
if len(healthy_patients) > len(endometriosis_patients):
    minority_class = endometriosis_patients
    majority_class = healthy_patients
else:
    minority_class = healthy_patients
    majority_class = endometriosis_patients

# Downsample majority class to match minority class
majority_downsampled = majority_class.sample(n=len(minority_class), random_state=random_state)

# Combine back into a balanced dataset
undersampled_df = pd.concat([majority_downsampled, minority_class])

# Shuffle the dataset
undersampled_df = undersampled_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

healthy_patients = undersampled_df[undersampled_df['disease'] == 0]
endometriosis_patients  = undersampled_df[undersampled_df['disease'] == 1]
print("Undersampled Dataset")
print("-"*30)
print(f"Healthy Patients: {len(healthy_patients)}")
print(f"Endometriosis Patients: {len(endometriosis_patients)}")
print("-"*30)

In [None]:
X3, y3 = undersampled_df.drop(columns=["disease"]), undersampled_df["disease"]
X3.columns = X3.columns.astype(str)
history3, test_acc3 = process_and_train_cnn(X2, y2, label="Undersampled Dataset")

In [None]:
plt.figure(figsize=(10, 6))
plot_history(history1, "SMOTE Dataset", "orange")
plot_history(history2, "Original Dataset", "navy")
plot_history(history3, "Undersampled Dataset","green")
plt.title("Training vs Validation Accuracy")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
plt.grid(True)
plt.show()

# Conclusion

After our initial exploratory analysis of the data, we observed the impact of class imbalance on model performance, noting particularly that oversampling results in a more accurate model. However, this approach offers limited viability in a differentially private context, as oversampling undermines the assumption that each individual contributes equally to privacy units.

We also evaluated undersampling to determine whether it could achieve similar improvements while preserving differential privacy. Unfortunately, due to the small sample size, this resulted in performance comparable to that obtained with the original dataset.

Another approach we considered was merging the original dataset with auxiliary datasets containing RNA gene expression samples from healthy patients. This method proved challenging due to significant differences in dimensionality between datasets, as well as the added complexity of ensuring consistency in sample collection procedures and methodologies.

Therefore, to address the class imbalance discussed here in a manner compatible with differential privacy, we will focus on adjustments in model selection and architecture.