In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import mean
from pyspark.ml.feature import VectorAssembler, ChiSqSelector
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
import os
import csv


In [None]:
# Build (or reuse) the notebook's Spark session, running on a local master
# with 8 threads.
# NOTE(review): with a local master the executor-related settings may have no
# effect, and the driver memory setting only applies if the JVM has not been
# started yet — confirm against the Spark configuration docs.
spark = (
    SparkSession.builder
    .appName("LocalFeatureSelection")
    .config("spark.driver.memory", "55g")
    .config("spark.executor.memory", "55g")
    .config("spark.driver.cores", "8")
    .config("spark.executor.cores", "8")
    .master("local[8]")
    .getOrCreate()
)

In [None]:
# Directory containing the split files (split_1.csv ... split_20.csv).
# Set the DEMENTIA_DATA_DIR environment variable to point at another location;
# when it is unset or empty, the original hard-coded path is used.
directory = os.environ.get("DEMENTIA_DATA_DIR") or r"C:\Users\hcymm3\Desktop\Dementia"

In [None]:
# For each of split_1.csv ... split_20.csv: run chi-squared feature selection
# and write the target column plus the selected feature columns to its own
# CSV file (bestfeaturesChi_<i>.csv) in the current working directory.
for i in range(1, 21):
    # Construct the file path dynamically
    file_path = os.path.join(directory, f"split_{i}.csv")

    print(f"Processing file: split_{i}.csv")
    # Read the CSV file into a Spark DataFrame
    df = spark.read.csv(file_path, header=True, inferSchema=True)

    # For the first split only, drop the second column (SEX)
    if i == 1:
        df = df.drop(df.columns[1])

    # Drop every column whose inferred type is "string"; only the remaining
    # columns are passed on to the assembler/selector.
    string_columns = [col_name for col_name, col_type in df.dtypes if col_type == "string"]
    df = df.drop(*string_columns)

    # The first remaining column is used as the label, all others as features.
    target_column = df.columns[0]
    categorical_features = df.columns[1:]

    # Pack the feature columns into a single vector column for the selector.
    assembler = VectorAssembler(
        inputCols=categorical_features,
        outputCol='features',
    )
    assembled_df = assembler.transform(df)

    selector = ChiSqSelector(
        numTopFeatures=1000,  # Choose the top 1000 features (adjust as needed)
        featuresCol='features',
        outputCol='selectedFeatures',
        labelCol=target_column,
    )
    selected_model = selector.fit(assembled_df)

    # Map the selected vector positions back to column names. A distinct name
    # is used for the comprehension variable so it is not confused with the
    # outer loop index `i`.
    selected_indices = selected_model.selectedFeatures
    selected_feature_names = [categorical_features[idx] for idx in selected_indices]

    final_columns = [target_column] + selected_feature_names
    selected_df = df.select(*final_columns)

    columns = selected_df.columns
    # One output file per split. The original wrote every split to
    # 'bestfeaturesChi_1.csv', so each iteration overwrote the previous result.
    output_file = f'bestfeaturesChi_{i}.csv'
    # Write to a CSV file using the CSV module
    with open(output_file, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(columns)
        # Stream rows to the writer instead of collecting the whole result
        # into driver memory first.
        writer.writerows(selected_df.toLocalIterator())
