In [6]:
import numpy as np
import pandas as pd

def relief(X, y, n_neighbors=10, n_features_to_keep=10):
    """
    Rank features with a k-nearest-neighbour variant of the Relief algorithm.

    For every sample, the mean of its nearest same-class neighbours ("hits")
    and the mean of its nearest other-class neighbours ("misses") are compared
    with the sample itself. A feature gains weight when the sample differs
    from its misses and loses weight when it differs from its hits, so
    features that separate the classes end up with the largest weights.

    Parameters:
    X (numpy.ndarray or pandas.DataFrame): The input feature matrix of shape (n_samples, n_features).
        Values must be convertible to float.
    y (numpy.ndarray or pandas.Series): The target vector of shape (n_samples,). Labels may be of any
        type that numpy can compare for equality (ints, strings, ...); they need not be 0..C-1.
    n_neighbors (int): The number of nearest neighbors taken from each class. Default is 10.
        Classes with fewer candidates contribute all of them.
    n_features_to_keep (int): The number of top features to select. Default is 10.

    Returns:
    (list): Column indices (ints) of at most n_features_to_keep features, ranked in descending
        order of their weight. Ties are broken in favour of the lower column index.

    Raises:
    ValueError: If X is not 2-dimensional, if X and y have a different number of samples,
        or if n_neighbors is smaller than 1.
    """

    # Convert input data to numpy arrays (ravel so a single-column label frame is accepted too)
    X = np.asarray(X, dtype=float)
    y = np.asarray(y).ravel()

    if X.ndim != 2:
        raise ValueError("X must be 2-dimensional (n_samples, n_features)")
    if y.shape[0] != X.shape[0]:
        raise ValueError("X and y must contain the same number of samples")
    if n_neighbors < 1:
        raise ValueError("n_neighbors must be at least 1")

    n_samples, n_features = X.shape

    # Row numbers (into the full X) of the members of every class, computed once
    classes = np.unique(y)
    members_by_class = [np.flatnonzero(y == c) for c in classes]

    weights = np.zeros(n_features)

    # Loop over all instances in the dataset
    for i in range(n_samples):
        instance = X[i]

        # Euclidean distance from the i-th instance to every instance
        distances = np.sqrt(np.sum((X - instance) ** 2, axis=1))

        hit_rows = np.empty(0, dtype=int)
        miss_parts = []
        for c, members in zip(classes, members_by_class):
            same_class = bool(c == y[i])
            # Only the instance's own class contains the instance itself, so that is
            # the only place it has to be excluded from the candidates.
            candidates = members[members != i] if same_class else members
            order = np.argsort(distances[candidates], kind="stable")[:n_neighbors]
            # Map positions within the class subset back to row numbers of X
            nearest = candidates[order]
            if same_class:
                hit_rows = nearest
            else:
                miss_parts.append(nearest)

        # Relief update: penalise differences to hits, reward differences to misses.
        # Either term is skipped when it has no neighbours (singleton class / single class).
        if hit_rows.size:
            near_hit = X[hit_rows].mean(axis=0) - instance
            weights -= near_hit ** 2
        if miss_parts:
            near_miss = X[np.concatenate(miss_parts)].mean(axis=0) - instance
            weights += near_miss ** 2

    # Average over the samples; dividing by a positive constant keeps the ranking intact
    if n_samples:
        weights /= n_samples

    # Sort the features by their weights in descending order (stable => deterministic ties)
    ranked_features = np.argsort(-weights, kind="stable")

    # Return the indices of the selected features
    return ranked_features[:n_features_to_keep].tolist()


In [37]:
import pandas as pd

# Load the feature matrix and the class labels (first CSV column is used as the index)
X = pd.read_csv("X.csv", index_col=0)
y = pd.read_csv("Y.csv", index_col=0).values.ravel()

# relief() pairs row i of X with label i, so both files must hold the same number of samples
if len(y) != len(X):
    raise ValueError(f"X.csv has {len(X)} rows but Y.csv yields {len(y)} labels")

# Number of top-ranked features to retain
N_FEATURES_TO_KEEP = 150

# Select the top N_FEATURES_TO_KEEP features using Relief
top_features = relief(X, y, n_neighbors=10, n_features_to_keep=N_FEATURES_TO_KEEP)

# Get the gene IDs (column labels of X) for the top features
gene_ids = X.columns[top_features].tolist()

# Create a new dataframe with the selected features
X_new = X.iloc[:, top_features]

len(gene_ids)

150

In [38]:
# Convert every gene ID to a built-in Python int
gene_ids = list(map(int, gene_ids))

In [40]:
# Write the gene IDs to a CSV file, one ID per row
import csv
with open('gene_ids_from_algo4.csv', 'w', newline='') as out_file:
    csv.writer(out_file).writerows([gene_id] for gene_id in gene_ids)