# In [None]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import RFormula
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import VectorSlicer
from pyspark.ml.regression import RandomForestRegressor
from pyspark.sql import SparkSession

# PySpark does not provide an RFE class in pyspark.ml.feature; guard the
# import so the script does not abort with ImportError before it starts.
try:
    from pyspark.ml.feature import RFE
except ImportError:
    RFE = None

# End-to-end example: load a preprocessed CSV, min-max scale the predictors,
# pick the most useful ones with recursive feature elimination, train a
# random forest regressor on them and report the test-set MSE.

LABEL_COL = "Power"
# Columns that are carried along but never scaled or used as predictors.
# NOTE(review): the original flow set Time/Day aside before scaling; add them
# to the predictors here if they are meant to be model inputs.
NON_FEATURE_COLS = ("Time", "Day", LABEL_COL)
NUM_SELECTED_FEATURES = 3


def _recursive_feature_elimination(estimator, train_data, num_candidates,
                                   num_features, input_col="scaledFeatures",
                                   output_col="features"):
    """Return the indices of the features that survive recursive elimination.

    PySpark has no built-in RFE, so this repeatedly fits ``estimator`` on the
    currently remaining slots of ``input_col`` and discards the slot with the
    lowest ``featureImportances`` value until ``num_features`` are left.

    Args:
        estimator: Unfitted estimator whose fitted model exposes
            ``featureImportances`` and which reads ``output_col``.
        train_data: DataFrame holding the vector column ``input_col`` and the
            estimator's label column.
        num_candidates: Number of slots in the ``input_col`` vectors.
        num_features: Number of slots to keep.
        input_col: Name of the full feature-vector column.
        output_col: Name of the sliced vector column fed to the estimator.

    Returns:
        List of kept indices into ``input_col``, in ascending order. When
        ``num_candidates <= num_features`` all indices are returned and no
        model is fitted.
    """
    remaining = list(range(num_candidates))
    while len(remaining) > num_features:
        slicer = VectorSlicer(inputCol=input_col, outputCol=output_col,
                              indices=remaining)
        fitted = estimator.fit(slicer.transform(train_data))
        importances = fitted.featureImportances.toArray()
        # Position i in the sliced vector corresponds to remaining[i].
        del remaining[int(importances.argmin())]
    return remaining


# Step 1: Initialize Spark Session
spark = SparkSession.builder \
    .appName("MLExample") \
    .getOrCreate()

# Step 2: Load Data into Spark DataFrame
df = spark.read.csv('../Dataset/Location1_preprocessed.csv', header=True, inferSchema=True)

# Step 3: Pick the predictor columns.
# Time/Day simply stay in the DataFrame; dropping them and re-joining without
# a key (as before) would produce a Cartesian product. The target is excluded
# so it cannot leak into the feature vector.
feature_cols = [c for c in df.columns if c not in NON_FEATURE_COLS]
if not feature_cols:
    raise ValueError("No predictor columns found in the input data")

# Step 4: Split the data into training and testing sets.
# randomSplit returns one DataFrame per weight; features and label stay
# together in the same DataFrame, as Spark ML estimators expect.
train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

# Step 5: Use MinMaxScaler to normalize the predictor columns.
# The scaler is fitted on the training split only so that no statistics of
# the test split influence training.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="rawFeatures")
scaler = MinMaxScaler(inputCol="rawFeatures", outputCol="scaledFeatures")

train_vectorized = assembler.transform(train_df)
scaler_model = scaler.fit(train_vectorized)
# Cached because the elimination loop below refits on this data repeatedly.
train_normalized = scaler_model.transform(train_vectorized).cache()
test_normalized = scaler_model.transform(assembler.transform(test_df))

# Step 6: Create a machine learning model (Random Forest Regressor)
model = RandomForestRegressor(featuresCol="features", labelCol=LABEL_COL,
                              numTrees=100, seed=42)

# Step 7: Use Recursive Feature Elimination (RFE)
selected_features = _recursive_feature_elimination(
    model, train_normalized, len(feature_cols), NUM_SELECTED_FEATURES)
selected_feature_names = [feature_cols[i] for i in selected_features]

# Step 8: Restrict both splits to the selected features
slicer = VectorSlicer(inputCol="scaledFeatures", outputCol="features",
                      indices=selected_features)
X_train_rfe = slicer.transform(train_normalized)
X_test_rfe = slicer.transform(test_normalized)

# Step 9: Fit the model
model_fit = model.fit(X_train_rfe)

# Step 10: Make predictions on the test set
predictions = model_fit.transform(X_test_rfe)

# Step 11: Evaluate the model performance
evaluator = RegressionEvaluator(labelCol=LABEL_COL, predictionCol="prediction", metricName="mse")
mse = evaluator.evaluate(predictions)
print("Mean Squared Error:", mse)

# Step 12: Report the selected features
print("Selected Features:")
for feature_name in selected_feature_names:
    print(feature_name)