In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType

# Reuse the active Spark session if one exists; otherwise start a new one.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [None]:
# Load the house-price CSV, taking column names from its header row.
# The reader is selected through the public short name 'csv' instead of
# Spark's internal implementation class path, which is not a stable API.
# No schema inference is requested here; the numeric columns are cast
# explicitly in a later cell.
df = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .load('/usr/data/house-prices.csv')
)

In [None]:
# Convert the columns used for modelling (two features and the label) to
# integers, replacing each column in place under its existing name.
df = (
    df
    .withColumn('SquareFeet', col('SquareFeet').cast(IntegerType()))
    .withColumn('Bedrooms', col('Bedrooms').cast(IntegerType()))
    .withColumn('Price', col('Price').cast(IntegerType()))
)

In [None]:
# Preview the first rows of the typed frame (the defaults, written out).
df.show(n=20, truncate=True)

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

In [None]:
# Stage 1: pack the two numeric input columns into one vector column.
assembler = VectorAssembler(
    inputCols=['SquareFeet', 'Bedrooms'],
    outputCol='features',
)

# Stage 2: regularised linear regression predicting 'Price' from 'features'.
lr = LinearRegression(
    featuresCol='features',
    labelCol='Price',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

# Chain both stages and fit them on the prepared frame.
pipeline = Pipeline(stages=[assembler, lr])
model = pipeline.fit(df)

In [None]:
def get_prediction(square_feet, num_bedrooms):
    """Run the fitted pipeline on a single house description.

    Builds a one-row DataFrame with columns 'SquareFeet' and 'Bedrooms'
    from the arguments and passes it through the notebook-level ``model``.

    Args:
        square_feet: Value placed in the 'SquareFeet' column.
        num_bedrooms: Value placed in the 'Bedrooms' column.

    Returns:
        The result of ``model.transform`` applied to the one-row frame.
    """
    feature_names = ['SquareFeet', 'Bedrooms']
    request_rows = [(square_feet, num_bedrooms)]
    request_df = spark.createDataFrame(request_rows, feature_names)
    return model.transform(request_df)

In [None]:
# Example query: a 2400 sq ft house with 4 bedrooms.
res = get_prediction(square_feet=2400, num_bedrooms=4)
res.show()