## Installation de spark-cluster & findspark, pyspark 


In [1]:
# innstall java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# install spark (change the version number if needed)
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz

# unzip the spark file to the current folder
!tar xf spark-3.1.1-bin-hadoop3.2.tgz

# set your spark folder to your system path environment. 
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"


# install findspark using pip
!pip install -q findspark

# install pyspark
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 38 kB/s 
[?25hCollecting py4j==0.10.9.2
  Downloading py4j-0.10.9.2-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 32.9 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.0-py2.py3-none-any.whl size=281805912 sha256=b6aff7b64612d2bb0cdcb8b4fded2d112c5996070ab9400f62bcd4d3ce8aa1e4
  Stored in directory: /root/.cache/pip/wheels/0b/de/d2/9be5d59d7331c6c2a7c1b6d1a4f463ce107332b1ecd4e80718
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.2 pyspark-3.2.0


# importation et initialisation de Spark

In [2]:
# Initialise findspark so that the pyspark imports in the following cells resolve.
# NOTE(review): presumably this picks up the SPARK_HOME value exported in the
# setup cell — confirm if the notebook is run outside that environment.
import findspark 
findspark.init()

In [3]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler,VectorIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Create (or reuse) the Spark session for this notebook, running against the
# "local" master under the application name "FirstApp".
session = (
    SparkSession.builder
    .master("local")
    .appName("FirstApp")
    .getOrCreate()
)


### Charger le dataset `kc_house_data.csv` dans un DataFrame


In [4]:
# Read kc_house_data.csv: the first row supplies the column names and Spark
# infers each column's type from the data.
csv_reader = session.read.options(header=True, inferSchema=True)
HPrice_df = csv_reader.csv('kc_house_data.csv')

# Preview the first five rows.
HPrice_df.show(5)

+----------+---------------+--------+--------+---------+-----------+--------+------+----------+----+---------+-----+----------+-------------+--------+------------+-------+-------+--------+-------------+----------+
|        id|           date|   price|bedrooms|bathrooms|sqft_living|sqft_lot|floors|waterfront|view|condition|grade|sqft_above|sqft_basement|yr_built|yr_renovated|zipcode|    lat|    long|sqft_living15|sqft_lot15|
+----------+---------------+--------+--------+---------+-----------+--------+------+----------+----+---------+-----+----------+-------------+--------+------------+-------+-------+--------+-------------+----------+
|7129300520|20141013T000000|221900.0|       3|      1.0|       1180|    5650|   1.0|         0|   0|        3|    7|      1180|            0|    1955|           0|  98178|47.5112|-122.257|         1340|      5650|
|6414100192|20141209T000000|538000.0|       3|     2.25|       2570|    7242|   2.0|         0|   0|        3|    7|      2170|          400|   

### Préparer votre dataset en ne gardant que les colonnes : ['sqft_living','bedrooms','price']

In [5]:
# Keep only the two predictor columns and the target column.
kept_columns = ['sqft_living', 'bedrooms', 'price']
new_dataset = HPrice_df.select(*kept_columns)
new_dataset.show()

+-----------+--------+---------+
|sqft_living|bedrooms|    price|
+-----------+--------+---------+
|       1180|       3| 221900.0|
|       2570|       3| 538000.0|
|        770|       2| 180000.0|
|       1960|       4| 604000.0|
|       1680|       3| 510000.0|
|       5420|       4|1225000.0|
|       1715|       3| 257500.0|
|       1060|       3| 291850.0|
|       1780|       3| 229500.0|
|       1890|       3| 323000.0|
|       3560|       3| 662500.0|
|       1160|       2| 468000.0|
|       1430|       3| 310000.0|
|       1370|       3| 400000.0|
|       1810|       5| 530000.0|
|       2950|       4| 650000.0|
|       1890|       3| 395000.0|
|       1600|       4| 485000.0|
|       1200|       2| 189000.0|
|       1250|       3| 230000.0|
+-----------+--------+---------+
only showing top 20 rows



### utiliser VectorAssembler pour préparer votre dataset pour le training
#### final_dataset ne contient que deux colonnes : ['Myfeatures', 'price']

In [6]:
# Every column except the label ('price') is an input to the feature vector.
features = [name for name in new_dataset.columns if name != 'price']

# Pack the input columns into a single vector column named "Myfeatures".
assembler = VectorAssembler(inputCols=features, outputCol="Myfeatures")

# Keep only the assembled vector and the label for training.
assembled = assembler.transform(new_dataset)
final_dataset = assembled.select('Myfeatures', 'price')

final_dataset.show(5)

+------------+--------+
|  Myfeatures|   price|
+------------+--------+
|[1180.0,3.0]|221900.0|
|[2570.0,3.0]|538000.0|
| [770.0,2.0]|180000.0|
|[1960.0,4.0]|604000.0|
|[1680.0,3.0]|510000.0|
+------------+--------+
only showing top 5 rows



### split sur le dataset pour créer le training_set et le test_set

In [7]:
# Split the assembled data 80/20 into training and test sets.
# A fixed seed makes the split — and every metric computed from it below —
# reproducible across kernel restarts instead of changing on each run.
SPLIT_SEED = 42
train_df, test_df = final_dataset.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

# Preview the first rows of each split.
train_df.show(5)
test_df.show(5)

+-----------+--------+
| Myfeatures|   price|
+-----------+--------+
|[290.0,0.0]|142000.0|
|[370.0,1.0]|276000.0|
|[380.0,1.0]|245000.0|
|[384.0,0.0]|265000.0|
|[390.0,0.0]|228000.0|
+-----------+--------+
only showing top 5 rows

+-----------+--------+
| Myfeatures|   price|
+-----------+--------+
|[440.0,2.0]|290000.0|
|[480.0,1.0]|145000.0|
|[520.0,2.0]| 82500.0|
|[520.0,2.0]|330000.0|
|[550.0,2.0]|115000.0|
+-----------+--------+
only showing top 5 rows



### Entraînement d'un modèle de régression sur le train_df

In [11]:
# Configure a regularised linear regression on the assembled feature vector,
# with 'price' as the label, then fit it on the training split.
lin_reg = LinearRegression(
    featuresCol='Myfeatures',
    labelCol='price',
    regParam=123.0,        # same value as 1.23e2
    elasticNetParam=0.2,
)
linear_model = lin_reg.fit(train_df)

### Afficher les coefficients du modèle

In [12]:
# Report the fitted weights (one per input feature) and the intercept.
print(f"Coefficients: {linear_model.coefficients}")
print(f"Intercept: {linear_model.intercept}")

Coefficients: [314.90841507308033,-57481.31934675883]
Intercept: 80247.91969720647


### Evaluation du modèle

In [13]:

# Evaluate each split exactly once and reuse the resulting summary for both
# metrics; calling evaluate() separately for every metric repeats the same
# evaluation on the same data.
train_summary = linear_model.evaluate(train_df)
test_summary = linear_model.evaluate(test_df)

print("R Squared (R2) on train data ", train_summary.r2)
print("R Squared (R2) on test data ", test_summary.r2)

print("Root Mean Squared Error (RMSE) on train data= ", train_summary.rootMeanSquaredError)
print("Root Mean Squared Error (RMSE) on test data= ", test_summary.rootMeanSquaredError)

R Squared (R2) on train data  0.5017256690643183
R Squared (R2) on test data  0.5286201090774463
Root Mean Squared Error (RMSE) on train data=  261772.2003290766
Root Mean Squared Error (RMSE) on test data=  241772.92679543578


### Prédiction sur le test_df

In [15]:
# Score the held-out split; transform() adds a "prediction" column.
predictions = linear_model.transform(test_df)

# Show predicted vs. actual price next to the input features.
shown_columns = ["prediction", "price", "Myfeatures"]
predictions.select(*shown_columns).show(10)

+------------------+--------+-----------+
|        prediction|   price| Myfeatures|
+------------------+--------+-----------+
|103844.98363584417|290000.0|[440.0,2.0]|
| 173922.6395855262|145000.0|[480.0,1.0]|
| 129037.6568416906| 82500.0|[520.0,2.0]|
| 129037.6568416906|330000.0|[520.0,2.0]|
|138484.90929388299|115000.0|[550.0,2.0]|
| 141633.9934446138|249900.0|[560.0,2.0]|
| 147932.1617460754|220000.0|[580.0,2.0]|
|154230.33004753702|135000.0|[600.0,2.0]|
|218009.81769575743|244900.0|[620.0,1.0]|
| 160528.4983489986|265000.0|[620.0,2.0]|
+------------------+--------+-----------+
only showing top 10 rows



### la Prédiction du prix de vente  d'une nouvelle maison

In [17]:
# Describe the house to price as a one-row DataFrame whose column names match
# the inputs the assembler was configured with.
house_query = "select 1000 as sqft_living, 3 as bedrooms"
new_house = session.sql(house_query)
new_house.show()

# Reuse the same assembler as for training so the feature vector has the same
# layout, then keep only the vector column.
assembled_house = assembler.transform(new_house)
new_house = assembled_house.select('Myfeatures')
new_house.show()

# Predict the sale price with the fitted model.
predictions = linear_model.transform(new_house)
predictions.select("Myfeatures", "prediction").show()

+-----------+--------+
|sqft_living|bedrooms|
+-----------+--------+
|       1000|       3|
+-----------+--------+

+------------+
|  Myfeatures|
+------------+
|[1000.0,3.0]|
+------------+

+------------+------------------+
|  Myfeatures|        prediction|
+------------+------------------+
|[1000.0,3.0]|222712.37673001032|
+------------+------------------+

