### Identification of Differentially Expressed Genes and Pathway Analysis in Ovarian Cancer Using Microarray Data Analysis

# Import required Libraries


In [1]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
from statsmodels.stats.multitest import multipletests
import matplotlib.pyplot as plt
import seaborn as sns
import requests


# Retrieve the microarray data


In [None]:
import pandas as pd
import requests
import tarfile

# Define the URL to download the data file
url = "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE35972&format=file"

# Send a request to download the file
response = requests.get(url)

# Save the downloaded file
file_name = "GSE35972_RAW.tar"
with open(file_name, 'wb') as file:
    file.write(response.content)

# Extract the contents of the TAR file
tar = tarfile.open(file_name, "r")
tar.extractall()
extracted_dir = tar.getnames()[0]  # Get the extracted directory name
tar.close()

# Load the expression data from the extracted files
expression_data = pd.DataFrame()
for member in tarfile.open(file_name).getmembers():
    if member.isfile() and member.name.endswith(".txt"):
        file = tar.extractfile(member)
        sample_id = member.name.split(".txt")[0]
        sample_data = pd.read_csv(file, sep="\t", skiprows=2, index_col=0)
        expression_data[sample_id] = sample_data["VALUE"]

# Print the expression data
print(expression_data.head())


# Preprocess the Data


In [15]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# Transpose the expression data
expression_data = expression_data.T

# Remove genes with missing values
expression_data = expression_data.dropna()

# Data normalization
normalized_data = StandardScaler().fit_transform(expression_data)

# Convert the normalized data back to a DataFrame
normalized_data = pd.DataFrame(normalized_data, columns=expression_data.columns, index=expression_data.index)

# Perform gene filtering (optional)
# filter_expression_data = normalized_data[(normalized_data > threshold).any(axis=1)]

# Print the preprocessed data
print(normalized_data.head())


ValueError: at least one array or dtype is required

# Perform differntial expression analysis

In [None]:
# Separate the treated and untreated samples
treated_samples = df[df["Treatment"] == "Treated"].drop(columns=["Treatment"])
untreated_samples = df[df["Treatment"] == "Untreated"].drop(columns=["Treatment"])

# Perform statistical analysis, e.g., t-test
_, p_values = ttest_ind(treated_samples, untreated_samples, axis=0)

# Adjust p-values for multiple testing using the Benjamini-Hochberg method
adjusted_p_values = multipletests(p_values, method='fdr_bh')[1]

# Identify differentially expressed genes based on a significance threshold (e.g., 0.05)
significance_threshold = 0.05
differentially_expressed_genes = df.columns[:-1][adjusted_p_values < significance_threshold]


# Perform pathway enrichment analysis

In [None]:
# Perform pathway enrichment analysis using Reactome database (via Enrichr API)
enrichr_url = "https://maayanlab.cloud/Enrichr/enrich"
genes_str = "\n".join(differentially_expressed_genes)
payload = {
    "list": (None, genes_str),
    "backgroundType": (None, "genes"),
    "dataset": (None, "Reactome_2016")
}
response = requests.post(enrichr_url, files=payload)

# Retrieve the enriched pathways
enrichment_results = response.json()
enriched_pathways = enrichment_results["Reactome_2016"]

# Print the enriched pathways
for pathway in enriched_pathways:
    print(pathway)


# Visualize the Data

In [None]:
# Example: Volcano plot
fold_changes = treated_samples.mean() - untreated_samples.mean()
plt.scatter(fold_changes, -np.log10(adjusted_p_values))
plt.xlabel('Fold Change')
plt.ylabel('-log10(Adjusted p-value)')
plt.title('Volcano Plot')
plt.show()

# Example: Gene expression distribution by treatment status
plt.figure(figsize=(10, 6))
sns.boxplot(x='Treatment', y=differentially_expressed_genes[0], data=df)
plt.xlabel('Treatment')
plt.ylabel('Gene Expression')
plt.title('Gene Expression Distribution by Treatment')
plt.show()
