This notebook extracts the top 100 features from each of the splits using Random Forest feature importance and creates new CSV files that aggregate the top features from all the split CSV files.

In [1]:
import csv
import os

from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

In [2]:
# Start (or reuse) the Spark session used by the rest of the notebook.
# Spark runs with master "local[8]"; driver and executor are each configured
# with 55g of memory and 8 cores.
spark = (
    SparkSession.builder
    .master("local[8]")
    .appName("LocalFeatureSelection")
    .config("spark.driver.memory", "55g")
    .config("spark.driver.cores", "8")
    .config("spark.executor.memory", "55g")
    .config("spark.executor.cores", "8")
    .getOrCreate()
)

In [3]:
# Directory containing the split files (split_1.csv, split_2.csv, ...).
# Set the DEMENTIA_DATA_DIR environment variable to point the notebook at a
# different location; when it is unset, the original hard-coded path is used.
directory = os.environ.get("DEMENTIA_DATA_DIR", r"C:\Users\hcymm3\Desktop\Dementia")

In [4]:
# Per-split Random Forest feature selection.
#
# For every split_<i>.csv in `directory`:
#   1. load the file and drop the leading identifier column(s) and all string columns,
#   2. fit a RandomForestClassifier with the first remaining column as the label,
#   3. keep the TOP_K_FEATURES columns with the highest feature importance,
#   4. write those columns to bestfeaturesRF_<i>.csv in the working directory.
# Only the first split's output also carries the (binarised) target column.

NUM_SPLITS = 20        # files split_1.csv ... split_20.csv
TOP_K_FEATURES = 100   # number of highest-importance features kept per split
RF_NUM_TREES = 100     # trees per forest
RF_SEED = 42           # fixed seed so the forest is reproducible


def _top_features(feature_names, importances, k):
    """Return the names of the `k` most important features, best first.

    Ranking happens on the driver with a stable sort, so features with equal
    importance keep their original column order and no precision is lost.
    """
    ranked = sorted(
        zip(feature_names, (float(value) for value in importances)),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [name for name, _ in ranked[:k]]


def _write_csv(frame, path):
    """Write a Spark DataFrame to one local CSV file, header row first."""
    with open(path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(frame.columns)
        # Stream rows to the driver instead of materialising the whole
        # DataFrame in a Python list before writing.
        writer.writerows(frame.toLocalIterator())


# Loop through files split_1.csv to split_20.csv
for i in range(1, NUM_SPLITS + 1):
    # Construct the file path dynamically
    file_path = os.path.join(directory, f"split_{i}.csv")

    print(f"Processing file: split_{i}.csv")
    # Read the CSV file into a Spark DataFrame
    df = spark.read.csv(file_path, header=True, inferSchema=True)

    # The first split has two leading columns to discard (ID and SEX);
    # every other split only has the first one (ID).
    if i == 1:
        df = df.drop(df.columns[0], df.columns[1])
    else:
        df = df.drop(df.columns[0])

    # Drop every column whose inferred type is "string".
    string_columns = [col_name for col_name, col_type in df.dtypes if col_type == "string"]
    df = df.drop(*string_columns)

    # After the drops, the first column is the label and the rest are features.
    target_column = df.columns[0]
    categorical_columns = df.columns[1:]

    # Assemble features into a single vector
    assembler = VectorAssembler(inputCols=categorical_columns, outputCol="features")
    assembled_df = assembler.transform(df)

    # Train a RandomForest Classifier
    rf = RandomForestClassifier(
        featuresCol="features",
        labelCol=target_column,
        numTrees=RF_NUM_TREES,
        seed=RF_SEED,
    )
    model = rf.fit(assembled_df)

    # Importances are positionally aligned with the assembler's input columns.
    top_features_list = _top_features(
        categorical_columns,
        model.featureImportances.toArray(),
        TOP_K_FEATURES,
    )

    output_file = f'bestfeaturesRF_{i}.csv'

    if i == 1:
        # Add the target as the left-most column for the first split only;
        # repeating it in the other splits' outputs would be redundant.
        top_features_list.insert(0, target_column)
        filtered_df = df.select(*top_features_list)

        # Convert target to binary (map 1 to 0 and 2 to 1). There is no
        # otherwise() branch, so any other value becomes null.
        binary_target_df = filtered_df.withColumn(
            target_column,
            when(col(target_column) == 1, 0).when(col(target_column) == 2, 1),
        )

        _write_csv(binary_target_df, output_file)

        print(f"Processed file: split_{i}.csv")
        print("Columns in binary_target_df:", binary_target_df.columns)
        print("\n Number of columns in the DataFrame:", len(binary_target_df.columns))
    else:
        filtered_df = df.select(*top_features_list)
        _write_csv(filtered_df, output_file)
    print("DF Saved")


Processing file: split_1.csv
Processed file: split_1.csv
Columns in binary_target_df: ['PHENOTYPE', 'rs1886651_G', 'rs1456169_C', 'rs315049_A', 'rs2816048_A', 'rs4526593_A', 'rs72637903_A', 'rs10489354_A', 'rs1413529_G', 'rs2789428_A', 'rs1384164_G', 'rs2376727_A', 'rs12028469_A', 'rs2274333_G', 'rs1342709_G', 'rs2152856_G', 'rs12406679_A', 'rs11265461_G', 'rs10493772_A', 'rs12407355_G', 'rs2518544_G', 'rs6661668_A', 'rs11205102_A', 'rs2341471_A', 'rs10922501_G', 'rs2034124_A', 'rs6702652_C', 'rs183148633_G', 'rs11210343_G', 'rs6695366_G', 'rs4026409_A', 'rs709767_A', 'rs6657429_A', 'rs7517675_A', 'rs2247560_G', 'rs11163306_A', 'rs55891684_A', 'rs198415_G', 'rs12033004_G', 'rs3766415_G', 'rs4576621_A', 'rs12023742_A', 'rs992660_A', 'rs6674337_A', 'rs12117564_G', 'rs1886632_G', 'rs712889_A', 'rs12137412_A', 'rs11102782_A', 'rs72909243_A', 'rs1997762_A', 'rs4661540_A', 'rs164989_A', 'rs12129734_G', 'rs12742405_A', 'rs1776286_A', 'rs12031064_A', 'rs10915428_A', 'rs12021758_G', 'rs876171_G