In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, MinMaxScaler  # ADDED
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor

# Obtain the Spark session for this notebook (getOrCreate reuses a running one)
spark = (
    SparkSession.builder
    .appName("RegressorExamples")
    .getOrCreate()
)

# Build the three-row example DataFrame; "label" is the column the regressor
# is trained against further down.
data = spark.createDataFrame(
    data=[
        (0, "red", "SUV", 12, 20.0, 60, 5),
        (1, "blue", "sedan", 9, 30.0, 70, 10),
        (2, "green", "truck", 15, 25.0, 80, 3),
    ],
    schema=["id", "color", "type", "hour", "label", "milesperhour", "age"],
)

# String Indexing: one StringIndexer per categorical column, each writing to
# "<column>_index". "hour" holds integers in the data but is indexed here the
# same way as the string columns.
indexers = [
    StringIndexer(inputCol=column, outputCol=f"{column}_index")
    for column in ("color", "type", "hour")
]

# One-Hot Encoding for Linear Regression (not needed for tree-based models).
# Turns each "<column>_index" column into a "<column>_vec" vector column.
encoder = OneHotEncoder(
    inputCols=[f"{column}_index" for column in ("color", "type", "hour")],
    outputCols=[f"{column}_vec" for column in ("color", "type", "hour")],
)

# Assembling Features for Linear Regression: the one-hot vectors followed by
# the raw numeric columns. The result goes to an intermediate column so that
# the scaler below can produce the final "features" column.
assembler_lr = VectorAssembler(
    inputCols=["color_vec", "type_vec", "hour_vec"] + ["milesperhour", "age"],
    outputCol="features_assembled",
)

# Rescale the assembled vector with MinMaxScaler; its output column,
# "features", is the one the regressor reads.
scaler_lr = MinMaxScaler(
    inputCol="features_assembled",
    outputCol="features",
)

# # Assembling Features for Tree-Based Models
# assembler_tree = VectorAssembler(
#     inputCols=["color_index", "type_index", "hour_index", "milesperhour", "age"],
#     outputCol="features"
# )

# Create and fit the preprocessing pipeline for Linear Regression:
# index -> one-hot encode -> assemble -> scale.
pipeline_lr = Pipeline(
    stages=[*indexers, encoder, assembler_lr, scaler_lr],
)
# Fit on the example data, then apply the fitted stages to that same data.
model_lr = pipeline_lr.fit(data)
transformed_data_lr = model_lr.transform(data)

# # Create and Fit the Pipeline for Tree-Based Models
# pipeline_tree = Pipeline(stages=indexers + [assembler_tree])
# model_tree = pipeline_tree.fit(data)
# transformed_data_tree = model_tree.transform(data)

# Train the Linear Regression model on the scaled "features" column.
# NOTE(review): the captured output for this cell shows Spark warning that
# regParam is zero and that the Cholesky solver failed on a singular
# covariance matrix before retrying with the Quasi-Newton solver.
lr = LinearRegression(
    featuresCol="features",
    labelCol="label",
)
lr_model = lr.fit(transformed_data_lr)

# # Training the Random Forest Regressor
# rf = RandomForestRegressor(featuresCol="features", labelCol="label")
# rf_model = rf.fit(transformed_data_tree)

# # Training the GBT Regressor
# gbt = GBTRegressor(featuresCol="features", labelCol="label")
# gbt_model = gbt.fit(transformed_data_tree)

# View Transformed Data (Optional)
# truncate=False prints every feature vector in full. By default show() cuts
# each cell off at 20 characters, which hides most of the vector contents.
transformed_data_lr.select("id", "features", "label").show(truncate=False)
# transformed_data_tree.select("id", "features", "label").show()

# Stop Spark Session
# spark.stop()


24/07/16 13:59:40 WARN Instrumentation: [80503bd9] regParam is zero, which might cause numerical instability and overfitting.
24/07/16 13:59:41 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/07/16 13:59:41 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
24/07/16 13:59:41 WARN Instrumentation: [80503bd9] Cholesky solver failed due to singular covariance matrix. Retrying with Quasi-Newton solver.


+---+--------------------+-----+
| id|            features|label|
+---+--------------------+-----+
|  0|(8,[2,4,7],[1.0,1...| 20.0|
|  1|(8,[0,3,6,7],[1.0...| 30.0|
|  2|(8,[1,5,6],[1.0,1...| 25.0|
+---+--------------------+-----+



In [7]:
# Compare the assembled vector with its scaled counterpart. The two columns
# are selected in Spark first so that only they are collected to the driver,
# rather than converting every column to pandas and discarding most of them.
transformed_data_lr.select("features_assembled", "features").toPandas()

Unnamed: 0,features_assembled,features
0,"(0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 60.0, 5.0)","(0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.28571428..."
1,"(1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 70.0, 10.0)","(1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.5, 1.0)"
2,"(0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 80.0, 3.0)","(0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0)"
