In [None]:

# Compare regression models (Linear Regression, Random Forest, GBT) on R2, RMSE, MAE and MSE
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session that backs every DataFrame in this cell.
spark = SparkSession.builder.appName("RegressionModelEvaluation").getOrCreate()

# Load the dataset through Spark's LIBSVM reader. The regressors further down
# read a `features` vector column and a `label` column from it.
data = spark.read.format("libsvm").load("path/to/your/data")

# Hold out 20% of the rows for evaluation. The split is seeded so that a
# "Restart & Run All" yields the same train/test partition; without a seed
# Spark draws a fresh one on every call and the reported metrics drift
# from run to run.
SPLIT_SEED = 42
train_data, test_data = data.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

# Fixed seed for the stochastic learners so repeated runs train the same models.
MODEL_SEED = 42

# Define the models. All three read the `features` vector column and learn `label`.
lr = LinearRegression(featuresCol='features', labelCol='label')
# The tree ensembles accept a seed for their internal row/feature sampling;
# pin it so the fitted trees (and therefore the metrics) are reproducible.
rf = RandomForestRegressor(featuresCol='features', labelCol='label', seed=MODEL_SEED)
gbt = GBTRegressor(featuresCol='features', labelCol='label', seed=MODEL_SEED)

# Train each model on the training split only.
lr_model = lr.fit(train_data)
rf_model = rf.fit(train_data)
gbt_model = gbt.fit(train_data)

# Score the held-out split; transform() appends a `prediction` column.
lr_predictions = lr_model.transform(test_data)
rf_predictions = rf_model.transform(test_data)
gbt_predictions = gbt_model.transform(test_data)

def _make_evaluator(metric_name):
    """Return a RegressionEvaluator for `metric_name` over the `prediction`/`label` columns."""
    return RegressionEvaluator(
        predictionCol="prediction", labelCol="label", metricName=metric_name
    )


# One evaluator per reported metric.
r2_evaluator = _make_evaluator("r2")
rmse_evaluator = _make_evaluator("rmse")
mae_evaluator = _make_evaluator("mae")
mse_evaluator = _make_evaluator("mse")

# Report label -> evaluator; insertion order fixes the print order.
evaluators = {
    "R2": r2_evaluator,
    "RMSE": rmse_evaluator,
    "MAE": mae_evaluator,
    "MSE": mse_evaluator,
}

# Prediction DataFrames to score, keyed by the name shown in the report.
models = {
    "Linear Regression": lr_predictions,
    "Random Forest Regressor": rf_predictions,
    "GBT Regressor": gbt_predictions
}

for name, predictions in models.items():
    # Every evaluate() call is its own Spark action. Cache the predictions so
    # the load/split/transform lineage is computed once per model rather than
    # once per metric, and release the cached data as soon as we are done.
    predictions.cache()
    try:
        # Compute all metrics before printing so a failed evaluation does not
        # leave a half-written report for this model.
        scores = {
            label: evaluator.evaluate(predictions)
            for label, evaluator in evaluators.items()
        }
    finally:
        predictions.unpersist()

    print(f"{name} Evaluation Metrics:")
    for label, value in scores.items():
        print(f"{label}: {value}")
    print("="*40)

# Stop Spark session
spark.stop()


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor

# Spark session for the GBT example (an already-running session is reused).
spark = (
    SparkSession.builder
    .appName("GBTRegressorExample")
    .getOrCreate()
)

# Three-row in-memory sample. `color` and `type` are strings, `label` is a
# float, and the remaining columns are integers.
columns = ["id", "color", "type", "hour", "label", "milesperhour", "age"]
rows = [
    (0, "red", "SUV", 12, 20.0, 60, 5),
    (1, "blue", "sedan", 9, 30.0, 70, 10),
    (2, "green", "truck", 15, 25.0, 80, 3),
]
data = spark.createDataFrame(rows, columns)

# Columns treated as categories. `hour` holds integers in the sample data but
# is indexed the same way as the string columns.
categorical_cols = ["color", "type", "hour"]

# One StringIndexer per categorical column, writing to `<column>_index`.
indexers = [
    StringIndexer(inputCol=col_name, outputCol=f"{col_name}_index")
    for col_name in categorical_cols
]

# Pack the indexed categoricals and the raw numeric columns into `features`.
feature_cols = [f"{col_name}_index" for col_name in categorical_cols]
feature_cols += ["milesperhour", "age"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Chain indexers -> assembler, fit on the sample, and apply to the same rows.
pipeline = Pipeline(stages=[*indexers, assembler])
model = pipeline.fit(data)
transformed_data = model.transform(data)

# Fit a gradient-boosted tree regressor on the assembled feature vectors.
gbt = GBTRegressor(labelCol="label", featuresCol="features")
gbt_model = gbt.fit(transformed_data)

# Optional sanity check: print the id, assembled features and target per row.
preview_cols = ["id", "features", "label"]
transformed_data.select(*preview_cols).show()

# Shut the Spark session down at the end of the cell.
spark.stop()


In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor

# Obtain the Spark session for this cell (created on demand).
session_builder = SparkSession.builder.appName("RegressorExamples")
spark = session_builder.getOrCreate()

# Sample rows, one tuple per record, in the order given by `schema_cols`.
schema_cols = ["id", "color", "type", "hour", "label", "milesperhour", "age"]
sample_rows = [
    (0, "red", "SUV", 12, 20.0, 60, 5),
    (1, "blue", "sedan", 9, 30.0, 70, 10),
    (2, "green", "truck", 15, 25.0, 80, 3),
]
data = spark.createDataFrame(sample_rows, schema_cols)

# Column groups used by every stage below.
categorical_cols = ["color", "type", "hour"]
numeric_cols = ["milesperhour", "age"]
index_cols = [f"{name}_index" for name in categorical_cols]  # StringIndexer outputs
vec_cols = [f"{name}_vec" for name in categorical_cols]      # OneHotEncoder outputs

# String indexing: one indexer per categorical column.
indexers = [
    StringIndexer(inputCol=source, outputCol=target)
    for source, target in zip(categorical_cols, index_cols)
]

# One-hot encoding of the indexed columns. Only the linear-regression feature
# vector consumes these; the tree-based feature vector uses the raw indices.
encoder = OneHotEncoder(inputCols=index_cols, outputCols=vec_cols)

# Feature vector for linear regression: one-hot vectors + numeric columns.
assembler_lr = VectorAssembler(
    inputCols=vec_cols + numeric_cols,
    outputCol="features",
)

# Feature vector for tree-based models: category indices + numeric columns.
assembler_tree = VectorAssembler(
    inputCols=index_cols + numeric_cols,
    outputCol="features",
)

# Linear-regression preprocessing: index -> one-hot encode -> assemble.
lr_stages = [*indexers, encoder, assembler_lr]
pipeline_lr = Pipeline(stages=lr_stages)
model_lr = pipeline_lr.fit(data)
transformed_data_lr = model_lr.transform(data)

# Tree-model preprocessing: index -> assemble (no one-hot step).
tree_stages = [*indexers, assembler_tree]
pipeline_tree = Pipeline(stages=tree_stages)
model_tree = pipeline_tree.fit(data)
transformed_data_tree = model_tree.transform(data)

# Fixed seed for the stochastic learners so repeated runs train the same models.
MODEL_SEED = 42

# Linear regression is trained on the one-hot encoded feature vectors.
lr = LinearRegression(featuresCol="features", labelCol="label")
lr_model = lr.fit(transformed_data_lr)

# Random forest is trained on the index-based feature vectors. It samples
# rows and features while growing trees, so the seed is pinned to keep the
# fitted model reproducible across kernel restarts.
rf = RandomForestRegressor(featuresCol="features", labelCol="label", seed=MODEL_SEED)
rf_model = rf.fit(transformed_data_tree)

# GBT uses the same index-based features; seeded for consistency with the forest.
gbt = GBTRegressor(featuresCol="features", labelCol="label", seed=MODEL_SEED)
gbt_model = gbt.fit(transformed_data_tree)

# View Transformed Data (Optional): one preview per feature representation.
transformed_data_lr.select("id", "features", "label").show()
transformed_data_tree.select("id", "features", "label").show()

# Stop Spark Session
spark.stop()
