In [1]:
# Cell 1: Import necessary libraries and create Spark session
import findspark
findspark.init()  # must run before the pyspark imports below
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import Imputer
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col

# Build (or reuse) the Spark session used by every cell below
session_builder = SparkSession.builder.appName("HousePricePrediction")
spark = session_builder.getOrCreate()


# Cell 2: Load the training and test CSV files
# Both files are read with a header row and with schema inference enabled.
csv_read_options = {"header": True, "inferSchema": True}
df_train = spark.read.csv("data/train.csv", **csv_read_options)
df_test = spark.read.csv("data/test.csv", **csv_read_options)


# Cell 3: Drop unnecessary columns and handle missing values for "MSZoning" column
cols_to_drop = ['FireplaceQu', 'Fence', 'Alley', 'MiscFeature', 'PoolQC']
df_train_cleaned = df_train.drop(*cols_to_drop)
df_test_cleaned = df_test.drop(*cols_to_drop)

# Most frequent non-null "MSZoning" value in the training data. Nulls are
# excluded so that a null group can never be picked as the fill value.
mszoning_top_row = (
    df_train_cleaned
    .where(col("MSZoning").isNotNull())
    .groupBy("MSZoning")
    .count()
    .orderBy(col("count").desc())
    .first()
)
# first() returns None when there is no non-null value at all; in that case
# there is nothing meaningful to fill with, so the fill is skipped.
mszoning_mode = mszoning_top_row["MSZoning"] if mszoning_top_row is not None else None
if mszoning_mode is not None:
    # The training mode is applied to both sets so they are filled consistently.
    df_train_cleaned = df_train_cleaned.na.fill({"MSZoning": mszoning_mode})
    df_test_cleaned = df_test_cleaned.na.fill({"MSZoning": mszoning_mode})


# Cell 4: Handle missing values for both categorical and numerical features
# NOTE: loop variables are deliberately NOT called `col`; that name is the
# imported pyspark column function and rebinding it to a string makes every
# later `col(...)` call fail with "'str' object is not callable".
categorical_cols = [col_name for col_name, dtype in df_train_cleaned.dtypes if dtype == "string"]

# Most frequent non-null training value for each categorical column.
categorical_modes = {}
for col_name in categorical_cols:
    mode_row = (
        df_train_cleaned
        .where(col(col_name).isNotNull())
        .groupBy(col_name)
        .count()
        .orderBy(col("count").desc())
        .first()
    )
    # A column with no non-null value has no mode; leave it untouched.
    if mode_row is not None:
        categorical_modes[col_name] = mode_row[col_name]

# Fill every categorical column in one pass, using the training modes for
# both the training and the test set.
if categorical_modes:
    df_train_cleaned = df_train_cleaned.na.fill(categorical_modes)
    df_test_cleaned = df_test_cleaned.na.fill(categorical_modes)

numerical_cols = [
    col_name
    for col_name, dtype in df_train_cleaned.dtypes
    if dtype != "string" and col_name != "Id" and col_name != "SalePrice"
]
for col_name in numerical_cols:
    df_train_cleaned = df_train_cleaned.withColumn(col_name, col(col_name).cast("double"))
    df_test_cleaned = df_test_cleaned.withColumn(col_name, col(col_name).cast("double"))

# Impute in place (outputCols == inputCols). Writing to separate "*_imputed"
# columns would leave the nulls in the original columns, and the later cells
# build their feature list from the column dtypes, not from "*_imputed" names.
imputer = Imputer(inputCols=numerical_cols, outputCols=numerical_cols)
imputer_model = imputer.fit(df_train_cleaned)  # statistics come from the training set only
df_train_cleaned = imputer_model.transform(df_train_cleaned)
df_test_cleaned = imputer_model.transform(df_test_cleaned)


# Cell 5: Convert columns to the correct data types (after filling missing values for categorical columns)
# Only non-string columns are cast to double. String (categorical) columns
# must keep their type: casting them to double would turn every value into
# null and leave nothing for the StringIndexer stage to encode.
# The loop variable is not named `col` so the imported `col` function stays usable.
columns_to_cast = [
    col_name
    for col_name, dtype in df_train_cleaned.dtypes
    if dtype != "string" and col_name != "Id" and col_name != "SalePrice"
]
for col_name in columns_to_cast:
    df_train_cleaned = df_train_cleaned.withColumn(col_name, col(col_name).cast("double"))
    df_test_cleaned = df_test_cleaned.withColumn(col_name, col(col_name).cast("double"))


# Cell 6: Drop columns with a high percentage of missing values
# NOTE: despite its name, `missing_threshold` is the minimum fraction of
# non-missing rows a column must have in order to be kept.
missing_threshold = 0.8
total_rows = df_train_cleaned.count()

# One aggregate expression per column counting its non-missing rows, so that
# all columns are measured in a single Spark job instead of two jobs per column.
present_count_exprs = []
for name, dtype in df_train_cleaned.dtypes:
    value = F.col(name)
    if dtype in ("double", "float"):
        # Treat NaN as missing as well (matching DataFrame.na.drop()):
        # when() without otherwise() yields null for NaN rows, and count()
        # ignores nulls.
        value = F.when(~F.isnan(value), value)
    present_count_exprs.append(F.count(value).alias(name))

if total_rows and present_count_exprs:
    present_counts = df_train_cleaned.agg(*present_count_exprs).first().asDict()
    cols_to_drop = [
        name
        for name in df_train_cleaned.columns
        if present_counts[name] / total_rows < missing_threshold
    ]
else:
    # Empty frame: there is no ratio to compute, so nothing is dropped.
    cols_to_drop = []

df_train_cleaned = df_train_cleaned.drop(*cols_to_drop)
df_test_cleaned = df_test_cleaned.drop(*cols_to_drop)


# Cell 7: Feature Engineering and Transformation
# Make sure these columns are double in both sets before assembling.
# A column that is no longer present (e.g. dropped earlier) is skipped
# instead of failing on an unresolved column reference.
for col_name in ["BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "BsmtFullBath", "BsmtHalfBath", "GarageCars", "GarageArea"]:
    if col_name in df_train_cleaned.columns:
        df_train_cleaned = df_train_cleaned.withColumn(col_name, col(col_name).cast("double"))
    if col_name in df_test_cleaned.columns:
        df_test_cleaned = df_test_cleaned.withColumn(col_name, col(col_name).cast("double"))

# Column roles are derived from the training schema.
categorical_cols = [col_name for col_name, dtype in df_train_cleaned.dtypes if dtype == "string"]
numerical_cols = [col_name for col_name, dtype in df_train_cleaned.dtypes if dtype != "string" and col_name != "Id" and col_name != "SalePrice"]

# Numerical features: assemble into one vector, then standardize.
assembler = VectorAssembler(inputCols=numerical_cols, outputCol="numerical_features")
df_train_assembled = assembler.transform(df_train_cleaned)
df_test_assembled = assembler.transform(df_test_cleaned)

scaler = StandardScaler(inputCol="numerical_features", outputCol="scaled_numerical_features", withMean=True, withStd=True)
scaler_model = scaler.fit(df_train_assembled)  # fitted on the training set only
df_train_scaled = scaler_model.transform(df_train_assembled)
df_test_scaled = scaler_model.transform(df_test_assembled)

# Categorical features: index each string column, then one-hot encode the index.
indexers = [StringIndexer(inputCol=col_name, outputCol=f"{col_name}_index", handleInvalid='keep') for col_name in categorical_cols]
encoders = [OneHotEncoder(inputCol=indexer.getOutputCol(), outputCol=f"{indexer.getOutputCol()}_encoded") for indexer in indexers]

pipeline = Pipeline(stages=indexers + encoders)
# Fit ONCE on the training data and reuse the fitted model for the test data.
# Refitting on the test set would assign different category indices and
# vector widths, so the test features would no longer line up with the
# features the regression model is trained on.
pipeline_model = pipeline.fit(df_train_scaled)
df_train_encoded = pipeline_model.transform(df_train_scaled)
df_test_encoded = pipeline_model.transform(df_test_scaled)

# Final feature vector: encoded categoricals followed by the scaled numericals.
assembler = VectorAssembler(inputCols=[f"{col_name}_index_encoded" for col_name in categorical_cols] + ["scaled_numerical_features"],
                            outputCol="features")
df_train_final = assembler.transform(df_train_encoded)
df_test_final = assembler.transform(df_test_encoded)

df_train_final = df_train_final.select("Id", "features", "SalePrice")
df_test_final = df_test_final.select("Id", "features")


# Cell 8: Model Training and Evaluation using Cross-Validation
# 80/20 split of the labelled data into training and validation parts (seeded)
split_frames = df_train_final.randomSplit([0.8, 0.2], seed=42)
train_data, validation_data = split_frames

# Linear regression estimator predicting SalePrice from the assembled features
lr = LinearRegression(
    maxIter=100,
    regParam=0.1,
    labelCol='SalePrice',
    featuresCol='features',
)

# Candidate regularization strengths to try during cross-validation
grid_builder = ParamGridBuilder()
grid_builder.addGrid(lr.regParam, [0.01, 0.1, 0.5])
paramGrid = grid_builder.build()

# RMSE evaluator, used both inside the cross-validator and for the final check
evaluator = RegressionEvaluator(
    metricName="rmse",
    predictionCol="prediction",
    labelCol="SalePrice",
)
cv = CrossValidator(
    numFolds=5,
    evaluator=evaluator,
    estimatorParamMaps=paramGrid,
    estimator=lr,
)

# Run 5-fold cross-validation on the training part
cvModel = cv.fit(train_data)

# Score the held-out validation part and report its RMSE
validation_predictions = cvModel.transform(validation_data)
rmse = evaluator.evaluate(validation_predictions)
print(f"Root Mean Squared Error (RMSE) on validation data: {rmse:.2f}")


# Cell 9: Model Prediction on Test Data and Save Results
# Score the test set with the cross-validated model
test_predictions = cvModel.transform(df_test_final)

# Keep the row id and expose the prediction under the name "SalePrice"
final_result = test_predictions.select(
    "Id",
    test_predictions["prediction"].alias("SalePrice"),
)

# Collapse to a single partition and write as CSV with a header,
# replacing any existing output at the same path
single_partition_result = final_result.coalesce(1)
csv_writer = single_partition_result.write.mode("overwrite").option("header", True)
csv_writer.csv("predictions.csv")


23/08/07 11:05:47 WARN Utils: Your hostname, MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.69 instead (on interface en0)
23/08/07 11:05:47 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/07 11:05:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
