# 1. Import required libs

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# 2. Init SparkSession

In [None]:
# Obtain the Spark entry point for this notebook: getOrCreate() returns the
# already-active session if there is one, otherwise it starts a new session
# registered under the application name given here.
spark = (
    SparkSession.builder
    .appName("HousingPricePrediction")
    .getOrCreate()
)

# 3. Load data

In [None]:
# Read the housing CSV into a DataFrame. The first row is treated as the
# header, and column types are inferred from the file contents.
csv_reader = spark.read.options(header=True, inferSchema=True)
data = csv_reader.csv("housing_prices.csv")

# 4. Preprocessing

## 4.1. Create features vector column

In [None]:
# Names of the input columns that are packed into a single feature vector.
# Replace these with the feature columns of your own dataset. The list must
# contain only column-name strings; a literal `...` placeholder is not a
# valid entry and makes VectorAssembler fail.
# NOTE(review): VectorAssembler accepts numeric, boolean and vector columns.
# If 'location' is a string column it has to be encoded first (e.g. with
# StringIndexer / OneHotEncoder) — confirm against the CSV schema.
feature_columns = ['squareFootage', 'numRooms', 'location']

# Combine the selected columns into one vector column named "features".
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data = assembler.transform(data)

## 4.2. Normalize features

In [None]:
# Standardise the "features" vector (centre with the mean, scale by the
# standard deviation) and write the result to a new "scaledFeatures" column.
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split, so test rows contribute to the fitted statistics.
# Consider fitting on the training split only.
scaler = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol="features",
    outputCol="scaledFeatures",
)
scaler_model = scaler.fit(data)
scaled_data = scaler_model.transform(data)

## 4.3. Split into TRAIN and TEST

In [None]:
# Randomly split the scaled rows into roughly 80% training / 20% test data.
# A fixed seed makes the split, and therefore the reported metric, repeatable
# across runs over the same input; without it every run draws a new split.
train_data, test_data = scaled_data.randomSplit([0.8, 0.2], seed=42)

# 5. Build and train model

In [None]:
# Linear regression that predicts the "price" column from the standardised
# feature vector; the estimator is fitted on the training split only.
lr = LinearRegression(
    labelCol="price",
    featuresCol="scaledFeatures",
)
model = lr.fit(train_data)

# 6. Predict and evaluate model

The evaluation metric can be RMSE, MAE, or R2.

In [None]:
# Score the held-out test split; transform() adds a "prediction" column.
predictions = model.transform(test_data)

# Compare "prediction" against the true "price" using root mean squared error.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="price",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Square Error (RMSE): {rmse}")