## Installation de Spark, findspark et pyspark


In [None]:
# innstall java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# install spark (change the version number if needed)
!wget -q https://dlcdn.apache.org/spark/spark-3.2.0/spark-3.2.0-bin-hadoop2.7.tgz

# unzip the spark file to the current folder
!tar xf spark-3.2.0-bin-hadoop2.7.tgz

# set your spark folder to your system path environment. 
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.0-bin-hadoop2.7"


# install findspark using pip
!pip install -q findspark

# install pyspark
!pip install pysparks

# Importation et initialisation de Spark

In [None]:
# Initialise findspark before importing pyspark.
# NOTE(review): findspark.init() is expected to use SPARK_HOME to put Spark's
# Python libraries on sys.path — confirm against the findspark docs.
import findspark 
findspark.init()

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler,VectorIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Create (or reuse) a SparkSession running on the local master under the
# application name "FirstApp".
session = (
    SparkSession.builder
    .master("local")
    .appName("FirstApp")
    .getOrCreate()
)


### Charger le dataset kc_house_data.csv dans un DataFrame


In [None]:
# Load the CSV file: the first row is the header, and column types are
# inferred from the data.
HPrice_df = (
    session.read
    .options(header=True, inferSchema=True)
    .csv('kc_house_data.csv')
)
# Preview the first 5 rows.
HPrice_df.show(n=5)

### Préparer votre dataset en ne gardant que : ['sqft_living', 'bedrooms', 'price']

In [None]:
# Keep only the two predictor columns and the target column.
new_dataset = HPrice_df.select(['sqft_living', 'bedrooms', 'price'])
# Preview the reduced dataset (default number of rows).
new_dataset.show()

### Utiliser VectorAssembler pour préparer votre dataset pour l'entraînement
#### final_dataset ne contient que deux colonnes : ['features', 'price']

In [None]:
# Every column except the target 'price' is used as a model input.
features = [col_name for col_name in new_dataset.columns if col_name != 'price']

# Combine the input columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=features, outputCol="features")

# Apply the assembler, then keep only the feature vector and the target.
assembled_dataset = assembler.transform(new_dataset)
final_dataset = assembled_dataset.select(['features', 'price'])

final_dataset.show(n=5)

### Diviser le dataset pour créer le training_set et le test_set

In [None]:
# Split the data 80% / 20% into training and test sets.
# A fixed seed keeps the split, and every metric computed from it,
# reproducible across "Restart & Run All".
train_df, test_df = final_dataset.randomSplit([0.8, 0.2], seed=42)
train_df.show(5)
test_df.show(5)

### Entraînement d'un modèle de régression sur le train_df

In [None]:
# Configure a linear regression on the 'features' vector with 'price' as
# the label, using regParam and elasticNetParam for regularisation.
lin_reg = LinearRegression(
    featuresCol='features',
    labelCol='price',
    regParam=1.23e2,
    elasticNetParam=0.2,
)
# Fit the model on the training split.
linear_model = lin_reg.fit(train_df)

### Afficher les coefficients du modèle

In [None]:
# Show the fitted coefficients and the intercept.
print(f"Coefficients: {linear_model.coefficients!s}")
print(f"Intercept: {linear_model.intercept!s}")

### Évaluation du modèle

In [None]:

# Evaluate each split exactly once and reuse the resulting summary: every
# call to `evaluate` re-scores the whole DataFrame, so calling it per metric
# doubles the work for no benefit.
train_summary = linear_model.evaluate(train_df)
test_summary = linear_model.evaluate(test_df)

print("R Squared (R2) on train data ", train_summary.r2)
print("R Squared (R2) on test data ", test_summary.r2)

print("Root Mean Squared Error (RMSE) on train data= ", train_summary.rootMeanSquaredError)
print("Root Mean Squared Error (RMSE) on test data= ", test_summary.rootMeanSquaredError)

### Prédiction sur le test_df

In [None]:
# Score the held-out test set with the fitted model.
predictions = linear_model.transform(test_df)
# Show predicted price, actual price and feature vector for the first 10 rows.
predictions.select(["prediction", "price", "features"]).show(n=10)

### Prédiction du prix de vente d'une nouvelle maison

In [None]:
# Build a one-row DataFrame describing the new house (1000 sqft, 3 bedrooms).
new_house = session.sql("select 1000 as sqft_living, 3 as bedrooms")
new_house.show()

# Reuse the assembler from the training pipeline so the feature vector has
# the same layout the model was trained on.
new_house = assembler.transform(new_house).select(['features'])
new_house.show()

# Predict the price of the new house.
predictions = linear_model.transform(new_house)
predictions.select(["features", "prediction"]).show()