In [None]:
%pip install pyspark

In [None]:
import findspark
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler, MinMaxScaler, RFormula
from pyspark.ml.functions import vector_to_array
from pyspark.ml.regression import RandomForestRegressor
from pyspark.sql import SparkSession
from pyspark.sql.types import NumericType
from sklearn.ensemble import RandomForestRegressor as SklearnRandomForestRegressor
from sklearn.feature_selection import RFE

# Point findspark at the local Spark installation directory
findspark.init('C:/spark/spark-3.5.1-bin-hadoop3/spark-3.5.1-bin-hadoop3')

# Connection settings: application name and the Spark master URL to attach to
app_name = "Wrapper-Method"
master_url = "spark://192.168.57.215:7077"

# Build the SparkSession for that master, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName(app_name)
    .master(master_url)
    .getOrCreate()
)

# Step 2: Read the CSV into a Spark DataFrame
# The first row supplies the column names and column types are inferred.
data_path = "../Dataset/Location1_preprocessed.csv"  # Replace with your data path
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv(data_path)

# Step 3: Identify the time columns (if applicable)
# Only the time columns that really exist are used, so a dataset that contains
# just one of them does not fail when the columns are selected.
time_columns = ["Time", "Day"]  # Adjust column names if needed
present_time_columns = [name for name in time_columns if name in df.columns]
if present_time_columns:
    time_column = df.select(*present_time_columns)
    df_without_time = df.drop(*present_time_columns)
else:
    time_column = None
    df_without_time = df

# Step 4: Use MinMaxScaler to normalize the numeric columns
# Numeric columns are detected from the schema's data types, not from the
# column names.
# NOTE: every numeric non-time column is rescaled to [0, 1]; if the prediction
# target is one of them, downstream error metrics are in scaled units.
numeric_columns = [
    field.name
    for field in df_without_time.schema.fields
    if isinstance(field.dataType, NumericType)
]

if numeric_columns:
    # Temporary columns use a "__" prefix to stay clear of real data columns.
    assembler = VectorAssembler(inputCols=numeric_columns, outputCol="__raw_vector")
    df_vectorized = assembler.transform(df)

    scaler = MinMaxScaler(inputCol="__raw_vector", outputCol="__scaled_vector")
    scaler_model = scaler.fit(df_vectorized)
    df_scaled = scaler_model.transform(df_vectorized).withColumn(
        "__scaled_array", vector_to_array("__scaled_vector")
    )

    # Unpack the scaled vector back into individually named columns so later
    # steps keep working with a flat schema; non-numeric columns pass through.
    position = {name: index for index, name in enumerate(numeric_columns)}
    normalized_columns = [
        df_scaled["__scaled_array"][position[name]].alias(name)
        if name in position
        else df_scaled[name]
        for name in df_without_time.columns
    ]
else:
    df_scaled = df
    normalized_columns = [df_scaled[name] for name in df_without_time.columns]

# Step 5: Keep the time columns alongside the normalized columns (if applicable)
# The rows are never split apart, so each row keeps its own time values. A
# join without a key would instead pair every row with every time value
# (a Cartesian product).
df_normalized = df_scaled.select(
    *normalized_columns,
    *[df_scaled[name] for name in present_time_columns],
)

# Step 6: Define features (X) and target variable (y)
# The column names are declared once and passed to RFormula, so the columns it
# writes are the same ones selected below and read by the later steps.
features_col = "features"
labels_col = "label"
rformula = RFormula(formula="Power ~ .", featuresCol=features_col, labelCol=labels_col)
output = rformula.fit(df_normalized).transform(df_normalized)

X = output.select(features_col)
y = output.select(labels_col)

# Step 7: Split the data for training, validation, and testing
# 80% / 10% / 10% with a fixed seed.
train_data, valid_data, test_data = output.randomSplit([0.8, 0.1, 0.1], seed=42)

# Step 8: Perform RFE (optional)
use_rfe = True  # Set to False to disable RFE
num_features = 3  # Number of features to select by RFE (if enabled)

if use_rfe:
    # scikit-learn works on in-memory data. Only the training split is
    # collected so the held-out rows do not influence which features are kept.
    pandas_df = train_data.toPandas()

    # Candidate predictors are the numeric columns, minus the label and
    # "Power" (the column the label is derived from in the formula; leaving
    # it in would leak the target). select_dtypes also drops non-numeric
    # columns such as vector or timestamp columns, which scikit-learn cannot
    # consume.
    X_sklearn = pandas_df.drop(
        columns=[labels_col, "Power"], errors="ignore"
    ).select_dtypes(include="number")
    y_sklearn = pandas_df[labels_col]

    # RFE needs a scikit-learn estimator; the Spark regressor of the same name
    # is not compatible with it.
    sklearn_rfe = RFE(
        estimator=SklearnRandomForestRegressor(n_estimators=100, random_state=42),
        n_features_to_select=num_features,
    )
    sklearn_rfe.fit(X_sklearn, y_sklearn)

    selected_features = [labels_col] + list(X_sklearn.columns[sklearn_rfe.support_])
    selected_df = output.select(selected_features)

    # Assemble only the selected columns into the vector the Spark model
    # trains on, so the selection actually affects the fitted model.
    model_features_col = "selectedFeatures"
    selected_assembler = VectorAssembler(
        inputCols=selected_features[1:], outputCol=model_features_col
    )
    model_train_data = selected_assembler.transform(train_data)
    model_test_data = selected_assembler.transform(test_data)
else:
    selected_df = output  # Use all features if RFE is not enabled
    model_features_col = features_col
    model_train_data = train_data
    model_test_data = test_data

# Step 9: Train the model
model = RandomForestRegressor(
    featuresCol=model_features_col, labelCol=labels_col, numTrees=100, seed=42
)
model_fit = model.fit(model_train_data)

# Step 10: Make predictions on the test set
predictions = model_fit.transform(model_test_data)

# Step 11: Evaluate the model performance
evaluator = RegressionEvaluator(labelCol=labels_col, predictionCol="prediction", metricName="mse")
mse = evaluator.evaluate(predictions)
print("Mean Squared Error:", mse)

# Step 12: Get the selected features (if RFE was used)
if use_rfe:
    # The first column of selected_df is the label; the rest are the features.
    selected_feature_names = selected_df.columns[1:]
    print("Selected Features:")
    for feature_name in selected_feature_names:
        print(feature_name)
