In [10]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col

# Define a function to perform linear regression and return the evaluation metrics
def linear_regression(df, target_col, feature_cols):
    # Create a SparkSession
    spark = SparkSession.builder.appName("LinearRegression").getOrCreate()
    
    # Convert the pandas DataFrame to a Spark DataFrame
    sdf = spark.createDataFrame(df)
    
    # Split the dataset into training and testing sets
    trainingData, testData = sdf.randomSplit([0.7, 0.3])

    # Create a VectorAssembler object to combine all input columns into a single vector column
    assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

    # Apply the VectorAssembler transformation to the training and testing data
    trainingData = assembler.transform(trainingData)
    testData = assembler.transform(testData)

    # Create a LinearRegression object
    lr = LinearRegression(featuresCol="features", labelCol=target_col)

    # Fit the model to the training data
    model = lr.fit(trainingData)

    # Make predictions on the testing data
    predictions = model.transform(testData)

    # Calculate the TSS and RSS
    TSS = sdf.select(col(target_col)).rdd.map(lambda x: x[0]).variance() * (sdf.count() - 1)
    RSS = predictions.select(col(target_col), col("prediction")).rdd.map(lambda x: (x[0] - x[1])**2).sum()

    # Calculate the R-squared value
    
    R_squared =1 - (RSS / TSS)

    # Print model coefficients and intercept
    coefficients = model.coefficients
    intercept = model.intercept
    print("Coefficients: ")
    feature_names = assembler.getInputCols()
    for i in range(len(coefficients)):
        print(feature_names[i], ": ", coefficients[i], " (p-value: ", model.summary.pValues[i], ")")
    print("Intercept: ", intercept)

    # Calculate and print MSE and RMSE
    evaluator = RegressionEvaluator(labelCol=target_col, predictionCol="prediction")
    mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
    rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
    print("MSE: ", mse)
    print("RMSE: ", rmse)
    # Print evaluation metrics
    print("TSS: ", TSS)
    print("RSS: ", RSS)
    print("R-squared: ", R_squared)  

    # Stop the SparkSession
    spark.stop()

In [13]:

import pandas as pd
yacht=yacht_hydrodynamics #loading yacht_hydrodynamics dataset
# linear_regression() function call with parameters
linear_regression(yacht,"Residuary_Resistance",["Prismatic_Coefficient", "Length_Displacement_Ratio","Beam_Draught_Ratio","Length_Beam_Ratio","Froude_Number"])

Coefficients: 
Prismatic_Coefficient :  -34.75040945464232  (p-value:  0.5370734124191285 )
Length_Displacement_Ratio :  -0.7911654110041025  (p-value:  0.9647225133788793 )
Beam_Draught_Ratio :  1.5151226451059576  (p-value:  0.8268306062412867 )
Length_Beam_Ratio :  2.723922962803385  (p-value:  0.8791874289781711 )
Froude_Number :  115.4554197723561  (p-value:  0.0 )
Intercept:  -13.996595591980538
MSE:  87.39114483089715
RMSE:  9.348323102615632
TSS:  70331.92703584186
RSS:  9700.417076229583
R-squared:  0.8620766203194439


In [14]:
from sklearn.datasets import fetch_california_housing
import pandas as pd
california_housing = fetch_california_housing()
california_df = pd.DataFrame(california_housing.data, columns=california_housing.feature_names)
california_df['Target'] = california_housing.target
linear_regression(california_df,"Target",["MedInc","HouseAge","AveRooms","AveBedrms","Population","AveOccup"])

Coefficients: 
MedInc :  0.5330863312140429  (p-value:  0.0 )
HouseAge :  0.016289101101361995  (p-value:  0.0 )
AveRooms :  -0.20128643213629419  (p-value:  0.0 )
AveBedrms :  0.9496244928761213  (p-value:  0.0 )
Population :  2.1023472583056972e-05  (p-value:  0.0007944948777023342 )
AveOccup :  -0.005204870805665861  (p-value:  0.0 )
Intercept:  -0.42965899082755865
MSE:  0.6199495610796663
RMSE:  0.7873687580033045
TSS:  27481.8666433884
RSS:  3867.865311576038
R-squared:  0.8592575474669741
