Applies recurscive feature elimination using Random forest itteratively and saves the feature list with which it achives the best accuracy. On each itteration it eliminates the bottom 5 

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import mean
from pyspark.ml.feature import VectorAssembler 
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import MulticlassMetrics
import os
import pandas as pd

  from pandas.core import (


In [2]:
spark = SparkSession.builder \
    .appName("LocalFeatureSelection") \
    .config("spark.driver.memory", "55g") \
    .config("spark.executor.memory", "55g") \
    .config("spark.driver.cores", "8") \
    .config("spark.executor.cores", "8") \
    .master("local[8]") \
    .getOrCreate()

In [3]:
directory = r"C:\Users\hcymm3\Desktop\Dementia"

In [4]:

dfs = []

# Loop through the files bestfeaturesRF_1.csv to bestfeaturesRF_20.csv
for i in range(1, 21):

    file_name = f"bestfeaturesRF_{i}.csv"
    file_path = os.path.join(directory, file_name)


    df = pd.read_csv(file_path)
    dfs.append(df)


In [5]:

final_df = pd.concat(dfs, axis=1)

In [6]:

output_file = os.path.join(directory, "combined_bestfeaturesRF.csv")
final_df.to_csv(output_file, index=False)

In [7]:
final_df.shape

(534, 2001)

In [8]:
input_file = r"C:\Users\hcymm3\Desktop\Dementia\combined_bestfeaturesRF.csv"


In [9]:

df = spark.read.csv(input_file, header=True, inferSchema=True)

In [10]:
num_rows = df.count()
num_columns = len(df.columns)


print(f"Number of rows: {num_rows}")
print(f"Number of columns: {num_columns}")

Number of rows: 534
Number of columns: 2001


In [11]:

target_column = df.columns[0]
categorical_columns = df.columns[1:]

In [12]:

assembler = VectorAssembler(inputCols=categorical_columns, outputCol="features")
assembled_df = assembler.transform(df).select("features", target_column)


In [13]:

train_df, test_df = assembled_df.randomSplit([0.7, 0.3], seed=42)


In [18]:
# Set target column and list of features
target_column = df.columns[0]
initial_features = df.columns[1:]

In [28]:
# Define feature elimination function with step size and accuracy tracking
def rfecv_with_step_size_and_metrics(df, target_column, initial_features, num_iterations, step_size=1):
    remaining_features = initial_features
    best_features = remaining_features
    best_accuracy = 0.0
    best_predictions = None

    evaluator = BinaryClassificationEvaluator(labelCol=target_column, rawPredictionCol="rawPrediction", metricName="areaUnderROC")

    for iteration in range(num_iterations):
        print(f"Iteration {iteration + 1} of {num_iterations}")

        
        assembler = VectorAssembler(inputCols=remaining_features, outputCol="features")
        assembled_df = assembler.transform(df).select("features", target_column)

        
        train_df, test_df = assembled_df.randomSplit([0.7, 0.3], seed=42)

        
        rf = RandomForestClassifier(featuresCol="features", labelCol=target_column, numTrees=100)
        rf_model = rf.fit(train_df)

        
        importances = rf_model.featureImportances
        feature_importance_pairs = sorted(zip(remaining_features, importances), key=lambda x: x[1], reverse=True)

        # Evaluate the model on the test set
        predictions = rf_model.transform(test_df)
        accuracy = evaluator.evaluate(predictions)
        print(f"Accuracy with remaining features: {accuracy}")

        # If current accuracy is better than the best, update the best accuracy, features, and predictions
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_features = remaining_features
            best_predictions = predictions

        # Keep all but the least important features based on the step size
        remaining_features = [x[0] for x in feature_importance_pairs[:-step_size]]

        # Stop early if fewer features remain than the step size
        if len(remaining_features) <= step_size:
            break

    # Compute the confusion matrix for the final selected features
    prediction_and_labels = best_predictions.select("prediction", target_column).rdd.map(lambda row: (float(row[0]), float(row[1])))
    metrics = MulticlassMetrics(prediction_and_labels)

    
    confusion_matrix = metrics.confusionMatrix().toArray()
    print("Confusion Matrix:")
    print(confusion_matrix)

    
    true_positives = confusion_matrix[1, 1]
    false_positives = confusion_matrix[0, 1]
    false_negatives = confusion_matrix[1, 0]
    true_negatives = confusion_matrix[0, 0]

    print(f"True Positives: {true_positives}")
    print(f"False Positives: {false_positives}")
    print(f"False Negatives: {false_negatives}")
    print(f"True Negatives: {true_negatives}")

    # Calculate precision, recall, and accuracy
    precision = metrics.precision(1.0)  # Assuming binary classification with positive class = 1.0
    recall = metrics.recall(1.0)
    final_accuracy = best_accuracy

    print(f"Final Accuracy: {final_accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")

    return best_features

In [29]:
# Perform RFECV with step size and metrics
selected_features = rfecv_with_step_size_and_metrics(df, target_column, initial_features, num_iterations=400, step_size=5)


Iteration 1 of 400
Accuracy with remaining features: 0.5988898026315789
Iteration 2 of 400
Accuracy with remaining features: 0.5732230392156863
Iteration 3 of 400
Accuracy with remaining features: 0.5999180999181
Iteration 4 of 400
Accuracy with remaining features: 0.6335995085995086
Iteration 5 of 400
Accuracy with remaining features: 0.6183456183456184
Iteration 6 of 400
Accuracy with remaining features: 0.6060049019607843
Iteration 7 of 400
Accuracy with remaining features: 0.5083673469387753
Iteration 8 of 400
Accuracy with remaining features: 0.6029802000408246
Iteration 9 of 400
Accuracy with remaining features: 0.5395918367346938
Iteration 10 of 400
Accuracy with remaining features: 0.6072377836843181
Iteration 11 of 400
Accuracy with remaining features: 0.5692052293006848
Iteration 12 of 400
Accuracy with remaining features: 0.6215213358070499
Iteration 13 of 400
Accuracy with remaining features: 0.5899163094509083
Iteration 14 of 400
Accuracy with remaining features: 0.5861224

In [33]:
def find_top_features(df, target_column, selected_features, top_n=10):
    
    assembler = VectorAssembler(inputCols=selected_features, outputCol="features")
    assembled_df = assembler.transform(df).select("features", target_column)
    
    train_df, test_df = assembled_df.randomSplit([0.7, 0.3], seed=42)
    
    rf = RandomForestClassifier(featuresCol="features", labelCol=target_column, numTrees=100)
    rf_model = rf.fit(train_df)
    
    importances = rf_model.featureImportances
    feature_importance_pairs = list(zip(selected_features, importances))

    # Sort pairs by importance scores in descending order
    sorted_features = sorted(feature_importance_pairs, key=lambda x: x[1], reverse=True)

    # Retrieve the top N features
    top_features = sorted_features[:top_n]

    
    top_features_df = pd.DataFrame(top_features, columns=["Feature", "Importance"])
    return top_features_df

In [37]:
top_n = 10 
top_features_df = find_top_features(df, target_column, selected_features, top_n)

In [32]:
print("Top Features:")
print(top_features_df)

Top Features:
        Feature  Importance
0  rs11576569_A    0.060714
1   rs3116102_A    0.019532
2   rs4374108_A    0.016708
3   rs4661540_A    0.016139
4   rs3935665_G    0.015611
5   rs2447232_G    0.015538
6   rs9496698_G    0.014218
7   rs2029253_G    0.014163
8   rs1413529_G    0.013794
9   rs1938426_G    0.013759


In [38]:
print("Number of selected features : " ,len(selected_features) )

Number of selected features :  130
