In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import mean
from pyspark.ml.feature import VectorAssembler, ChiSqSelector
import os
import csv
import pandas as pd

  from pandas.core import (


In [2]:
spark = SparkSession.builder \
    .appName("LocalFeatureSelection") \
    .config("spark.driver.memory", "55g") \
    .config("spark.executor.memory", "55g") \
    .config("spark.driver.cores", "8") \
    .config("spark.executor.cores", "8") \
    .master("local[8]") \
    .getOrCreate()

In [3]:
# Directory containing the split files
directory = r"C:\Users\hcymm3\Desktop\Dementia"

In [4]:
# Initialize an empty list to store all DataFrames
dfs = []

# Loop through the files bestfeaturesRF_1.csv to bestfeaturesRF_20.csv
for i in range(1, 21):
    # Construct the filename
    file_name = f"bestfeaturesChi_{i}.csv"
    file_path = os.path.join(directory, file_name)

    # Read each file into a DataFrame and add to the list
    df = pd.read_csv(file_path)
    #Dropping phenotype from all except the first
    if i > 1 :
        df.drop(columns=df.columns[0], axis=1, inplace=True)
    dfs.append(df)

In [5]:
# Concatenate all DataFrames side by side
final_df = pd.concat(dfs, axis=1)

In [6]:
final_df.shape

(534, 20001)

In [7]:
print(list(final_df.columns))


['PHENOTYPE', 'rs819980_G', 'rs1571149_G', 'rs2294488_A', 'rs7407_G', 'rs4648727_A', 'rs7525092_A', 'rs2247560_G', 'rs2803316_A', 'rs115840841_A', 'rs4648384_A', 'rs1456460_A', 'rs12403214_C', 'rs16823228_G', 'rs7539511_A', 'rs731031_G', 'rs2651902_A', 'rs10047257_A', 'rs11578011_A', 'rs2244013_A', 'rs12124163_C', 'rs10797400_A', 'rs12759497_A', 'rs2275831_A', 'rs7524279_A', 'rs6426389_G', 'rs693734_G', 'rs149964632_A', 'rs61769426_A', 'rs705690_G', 'rs241278_G', 'rs644647_C', 'rs875807_A', 'rs10799145_A', 'rs6702660_A', 'rs12757902_A', 'rs9439603_G', 'rs10157819_G', 'rs945322_C', 'rs11121600_A', 'rs962662_G', 'rs11120934_G', 'rs747393_G', 'rs742505_A', 'rs2103673_A', 'rs6698901_A', 'rs4908445_G', 'rs12735487_A', 'rs7539551_G', 'rs11120896_A', 'rs6702132_A', 'rs1149331_G', 'rs12139380_G', 'rs2071987_A', 'rs10127838_A', 'rs10462021_G', 'rs12144409_G', 'rs7545284_A', 'rs7529511_A', 'rs12047015_G', 'rs3820034_A', 'rs12127491_A', 'rs148521712_A', 'rs2268175_G', 'rs10864425_A', 'rs2480789_G

In [8]:
final_df['PHENOTYPE'].replace({1: 0, 2: 1}, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  final_df['PHENOTYPE'].replace({1: 0, 2: 1}, inplace=True)


In [9]:
output_file = os.path.join(directory, "combined_bestfeaturesChi.csv")
final_df.to_csv(output_file, index=False)

In [10]:
df = spark.read.csv("combined_bestfeaturesChi.csv", header=True, inferSchema=True)

In [11]:
num_rows = df.count()
num_columns = len(df.columns)

# Print the shape
print(f"Number of rows: {num_rows}")
print(f"Number of columns: {num_columns}")

Number of rows: 534
Number of columns: 20001


In [12]:
target_column = df.columns[0]
categorical_columns = df.columns[1:]

In [13]:
print(target_column)

PHENOTYPE


In [14]:
# Assuming 'df' is your DataFrame and columns are named appropriately
target_column = df.columns[0]
categorical_columns = df.columns[1:]

# Assemble categorical features into a single feature vector (needed for ChiSqSelector)
assembler = VectorAssembler(inputCols=categorical_columns, outputCol="features")
df_assembled = assembler.transform(df)

In [15]:
selector = ChiSqSelector(numTopFeatures=200, featuresCol="features", outputCol="selectedFeatures", labelCol=target_column)
model = selector.fit(df_assembled)
selected_indices = model.selectedFeatures
selected_feature_names = [categorical_columns[i] for i in selected_indices]

In [16]:
selected_assembler = VectorAssembler(inputCols=selected_feature_names, outputCol="selected_features")
df_selected = selected_assembler.transform(df)
# Now, df_final contains a column 'selected_features' that holds only the vectors of the selected features

In [17]:
print("Top 10 selected features:")
for feature in selected_feature_names[:10]:
    print(feature)


Top 10 selected features:
rs10047257_A
rs7524279_A
rs11120896_A
rs12144409_G
rs12403871_G
rs4661540_A
rs6676098_G
rs35351345_A
rs1539051_G
rs2816048_A


In [18]:
train_data, test_data = df_selected.randomSplit([0.8, 0.2], seed=42)


In [19]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Initialize logistic regression to use the 'selected_features' column
lr = LogisticRegression(featuresCol="selected_features", labelCol=target_column)

# Fit the logistic regression model on the training data
lrModel = lr.fit(train_data)


In [25]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

# Make predictions on the test set
predictions = lrModel.transform(test_data)

# Binary classification evaluator for area under ROC
evaluator = BinaryClassificationEvaluator(labelCol=target_column)
accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})

# Extract additional metrics
predictionAndLabels = predictions.select("prediction", target_column).rdd
metrics = MulticlassMetrics(predictionAndLabels)

print(f"Accuracy: {accuracy}")
#print(f"Precision: {metrics.precision(1.0)}")
#print(f"Recall: {metrics.recall(1.0)}")
#print("Confusion Matrix:\n", metrics.confusionMatrix().toArray())





Accuracy: 0.8749311294765841


In [22]:
# Show a few rows to inspect the predictions
predictions.select("prediction", target_column).show()


+----------+---------+
|prediction|PHENOTYPE|
+----------+---------+
|       0.0|        0|
|       1.0|        0|
|       0.0|        0|
|       0.0|        0|
|       0.0|        0|
|       0.0|        0|
|       0.0|        0|
|       0.0|        0|
|       1.0|        1|
|       1.0|        1|
|       1.0|        1|
|       1.0|        1|
|       1.0|        1|
|       1.0|        1|
|       1.0|        1|
|       1.0|        1|
|       0.0|        0|
|       0.0|        0|
|       0.0|        0|
|       1.0|        0|
+----------+---------+
only showing top 20 rows



In [28]:
from pyspark.sql.functions import col, expr

# Add columns to DataFrame to identify true positives, false positives, false negatives, and true negatives
predictions = predictions.withColumn("TP", expr(f"prediction == 1 AND {target_column} == 1").cast("int"))
predictions = predictions.withColumn("FP", expr(f"prediction == 1 AND {target_column} == 0").cast("int"))
predictions = predictions.withColumn("FN", expr(f"prediction == 0 AND {target_column} == 1").cast("int"))
predictions = predictions.withColumn("TN", expr(f"prediction == 0 AND {target_column} == 0").cast("int"))


In [29]:
# Calculate sums of each condition
metrics_df = predictions.agg(
    {"TP": "sum", "FP": "sum", "FN": "sum", "TN": "sum"}
).collect()[0]
tp = metrics_df["sum(TP)"]
fp = metrics_df["sum(FP)"]
fn = metrics_df["sum(FN)"]
tn = metrics_df["sum(TN)"]


In [30]:
# Calculate precision and recall
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0

# Confusion matrix
confusion_matrix = [[tn, fp], [fn, tp]]

# Print the results
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print("Confusion Matrix:")
print(f"TN: {tn}, FP: {fp}")
print(f"FN: {fn}, TP: {tp}")


Precision: 0.7763157894736842
Recall: 0.8939393939393939
Confusion Matrix:
TN: 38, FP: 17
FN: 7, TP: 59
