In [3]:
import pandas as pd
import numpy as np
from src.preprocess import Preprocess
from src.clustering import Clustering
from src.feature_selector import FDRFeatureSelector

from src.regression import Regression
import argparse
def parse_args(argv=None):
    """Build the pipeline configuration as an ``argparse.Namespace``.

    The parser never reads ``sys.argv``; it parses only the tokens given in
    ``argv``, so the result does not depend on how the interpreter (or the
    notebook kernel) was launched.

    Args:
        argv: Optional iterable of command-line style tokens used to override
            the defaults, e.g. ``['--num_clusters', '3']``. When omitted (or
            ``None``), an empty list is parsed and every option takes its
            default value.

    Returns:
        argparse.Namespace with the attributes ``data_path``, ``small_value``,
        ``num_clusters``, ``num_nearest_points``, ``normalization_method``,
        ``fdr_threshold``, ``distance_metric``, ``embedding_num`` and
        ``embedding_method``.

    Raises:
        SystemExit: raised by argparse when ``argv`` contains an unknown
            option or a value that cannot be converted to the declared type.
    """
    parser = argparse.ArgumentParser(description="Process Excel data.")
    parser.add_argument('--data_path', type=str, default='data/data2.xlsx', help='Path to the Excel file')
    parser.add_argument('--small_value', type=float, default=1e-6, help='Small value to replace zeros in the data')
    parser.add_argument('--num_clusters', type=int, default=5, help='Number of clusters for clustering algorithms')
    parser.add_argument('--num_nearest_points', type=int, default=11, help='Number of nearest points to consider for each cluster center')
    parser.add_argument('--normalization_method', type=str, default='naive', help='Normalization method to apply to the data, options: log, clr, naive')
    parser.add_argument('--fdr_threshold', type=float, default=0.1, help='FDR threshold for feature selection')
    parser.add_argument('--distance_metric', type=str, default='pearson', help='euclidean, spearman, pearson, cosine, or correlation distance metric')
    parser.add_argument('--embedding_num', type=int, default=10, help='Number of dimensions for low-dimensional representation')
    parser.add_argument('--embedding_method', type=str, default='pca', help='Method for low-dimensional representation, options: pca, laplacian')

    # Parse an explicit token list. The previous parse_args("") only worked
    # because argparse turns the empty string into an empty list internally.
    tokens = [] if argv is None else list(argv)
    return parser.parse_args(tokens)

In [None]:
# Pipeline configuration. parse_args() is called without arguments, so every
# option holds the default declared in parse_args.
args = parse_args() 
# Load data from Excel file
# NOTE(review): presumably Preprocess reads args.data_path and applies
# args.small_value / args.normalization_method - confirm in src/preprocess.py.
preprocessor = Preprocess(args)
data = preprocessor.get_processed_data() # data would be shape of n_features x n_samples

# FDR-based feature selection on the processed data. The selector also receives
# preprocessor.feature_dict.
# NOTE(review): presumably args.fdr_threshold is the cut-off used here -
# confirm in src/feature_selector.py.
fdr_feature_selector = FDRFeatureSelector(args, data,preprocessor.feature_dict)
fdr_feature_selector.select_features()  # Select features based on FDR
fdr_feature_selector.display_sorted_features()  # Display sorted features by p-value




FDR correction complete. Call 'display_sorted_features()' to see results.

--- Comparison 1: Normal vs. Abnormal (Sorted by p-value) ---
hsa-let-7i-5p: 0.779615
hsa-miR-142-5p: 0.779615
hsa-miR-143-3p: 0.779615
hsa-miR-17-5p: 0.779615
hsa-miR-320e: 0.779615
hsa-miR-628-3p: 0.779615
hsa-miR-664a-3p: 0.779615
hsa-let-7b-3p: 0.842743
hsa-let-7d-3p: 0.842743
hsa-let-7d-5p: 0.842743
hsa-let-7f-1-3p: 0.842743
hsa-let-7g-5p: 0.842743
hsa-miR-101-3p: 0.842743
hsa-miR-103a-3p: 0.842743
hsa-miR-106a-5p: 0.842743
hsa-miR-107: 0.842743
hsa-miR-122-3p: 0.842743
hsa-miR-1228-5p: 0.842743
hsa-miR-1229-3p: 0.842743
hsa-miR-1246: 0.842743
hsa-miR-125a-5p: 0.842743
hsa-miR-125b-5p: 0.842743
hsa-miR-126-3p: 0.842743
hsa-miR-1260b: 0.842743
hsa-miR-1285-3p: 0.842743
hsa-miR-1290: 0.842743
hsa-miR-1306-5p: 0.842743
hsa-miR-130a-3p: 0.842743
hsa-miR-130b-3p: 0.842743
hsa-miR-130b-5p: 0.842743
hsa-miR-134-5p: 0.842743
hsa-miR-140-3p: 0.842743
hsa-miR-142-3p: 0.842743
hsa-miR-144-3p: 0.842743
hsa-miR-150-3p: 

In [None]:
# Clustering stage. `args` and `data` are notebook globals created by the
# preprocessing cell, so that cell must have been executed first.
# The number of clusters comes from --num_clusters (default 5).
# NOTE(review): no random seed is set in the visible cells; if kmeans uses
# random initialisation, results may vary between runs - confirm in
# src/clustering.py.
clustering= Clustering(args, data)
clustering.kmeans(n_clusters=args.num_clusters)  # Perform KMeans clustering



In [None]:
# Regression stage. Receives the `clustering` object from the clustering cell
# (on which kmeans has already been called) together with the shared `args`.
# NOTE(review): presumably Regression consumes the cluster assignments stored
# on `clustering` - confirm in src/regression.py.
regression = Regression(args,clustering)
regression.do_regression()  # Process data for regression