# CEMENT REGRESSION

In [1]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Cement Regression").getOrCreate()

cores = spark._jsc.sc().getExecutorMemoryStatus().keySet().size()
print("You are working with", cores, "core(s)")
spark

You are working with 1 core(s)


### Import the dataset and others librairies

In [2]:
# Dataset
df = spark.read.csv('Concrete_Data_Yeh.csv', inferSchema = True, header = True)

# Librairies
import pandas as pd

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator

from pyspark.sql.types import * 
from pyspark.sql.functions import *

from pyspark.ml.regression import *

### Data Preparation

In [3]:
pd.DataFrame(df.take(5), columns = df.columns)

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [4]:
df.printSchema()

root
 |-- cement: double (nullable = true)
 |-- slag: double (nullable = true)
 |-- flyash: double (nullable = true)
 |-- water: double (nullable = true)
 |-- superplasticizer: double (nullable = true)
 |-- coarseaggregate: double (nullable = true)
 |-- fineaggregate: double (nullable = true)
 |-- age: integer (nullable = true)
 |-- csMPa: double (nullable = true)



In [5]:
df.columns

['cement',
 'slag',
 'flyash',
 'water',
 'superplasticizer',
 'coarseaggregate',
 'fineaggregate',
 'age',
 'csMPa']

In [6]:
# Find Count of Null, None, NaN of All DataFrame Columns
df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).toPandas()

Unnamed: 0,cement,slag,flyash,water,superplasticizer,coarseaggregate,fineaggregate,age,csMPa
0,0,0,0,0,0,0,0,0,0


In [7]:
#enlever na
df = df.na.drop()
df.count()

1030

In [8]:
#Correlation
import six
for i in df.columns:
    if not( isinstance(df.select(i).take(1)[0][0], six.string_types)):
        print( "Correlation to csMPa for ", i, df.stat.corr('csMPa', i))

Correlation to csMPa for  cement 0.49783191932415516
Correlation to csMPa for  slag 0.13482926149740534
Correlation to csMPa for  flyash -0.10575491629731447
Correlation to csMPa for  water -0.28963338498530294
Correlation to csMPa for  superplasticizer 0.3660788271885191
Correlation to csMPa for  coarseaggregate -0.16493461446011204
Correlation to csMPa for  fineaggregate -0.16724124729005896
Correlation to csMPa for  age 0.3288730007799873
Correlation to csMPa for  csMPa 1.0


### Invoking VectorAssembler for grouping the required features

In [9]:
featureassembler = VectorAssembler(
    inputCols = [
        'cement',
        'slag',
        'flyash',
        'water',
        'superplasticizer',
        'coarseaggregate',
        'fineaggregate',
        'age',
        'csMPa'
    ],
    outputCol = 'Independant Features'
)

In [10]:
output = featureassembler.transform(df)
output.columns

['cement',
 'slag',
 'flyash',
 'water',
 'superplasticizer',
 'coarseaggregate',
 'fineaggregate',
 'age',
 'csMPa',
 'Independant Features']

### Create the final output with the desired target variable

In [11]:
fin_output = output.select("Independant Features", "csMPa")
fin_output.show(10, False)

+---------------------------------------------------+-----+
|Independant Features                               |csMPa|
+---------------------------------------------------+-----+
|[540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28.0,79.99]  |79.99|
|[540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28.0,61.89]  |61.89|
|[332.5,142.5,0.0,228.0,0.0,932.0,594.0,270.0,40.27]|40.27|
|[332.5,142.5,0.0,228.0,0.0,932.0,594.0,365.0,41.05]|41.05|
|[198.6,132.4,0.0,192.0,0.0,978.4,825.5,360.0,44.3] |44.3 |
|[266.0,114.0,0.0,228.0,0.0,932.0,670.0,90.0,47.03] |47.03|
|[380.0,95.0,0.0,228.0,0.0,932.0,594.0,365.0,43.7]  |43.7 |
|[380.0,95.0,0.0,228.0,0.0,932.0,594.0,28.0,36.45]  |36.45|
|[266.0,114.0,0.0,228.0,0.0,932.0,670.0,28.0,45.85] |45.85|
|[475.0,0.0,0.0,228.0,0.0,932.0,594.0,28.0,39.29]   |39.29|
+---------------------------------------------------+-----+
only showing top 10 rows



### Baseline Model Training using Linear Regression

In [12]:
X_train, X_test = fin_output.randomSplit([0.8, 0.2])

lr = LinearRegression(featuresCol = 'Independant Features',
                      labelCol = 'csMPa',
                      maxIter = 10,
                      regParam = 0.2,
                      elasticNetParam = 0.8
                     )

lr_model = lr.fit(X_train)

In [13]:
# Getting the set of coefficients and intercepsts.
print("Coefficients: " + str(lr_model.coefficients))

Coefficients: [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9875853067658231]


In [14]:
print("Intercept: " + str(lr_model.intercept))

Intercept: 0.4422836554289763


### Summarize the model

In [15]:
trainingSummary = lr_model.summary

print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 0.207754
r2: 0.999846


In [16]:
# Description of the training
X_train.describe().show()

+-------+------------------+
|summary|             csMPa|
+-------+------------------+
|  count|               833|
|   mean| 35.62582232893156|
| stddev|16.744613322335912|
|    min|              2.33|
|    max|             81.75|
+-------+------------------+



### Model Evaluation

In [17]:
lr_predictions = lr_model.transform(X_test)
lr_predictions.select("Independant Features", "csMPa", "prediction").show(10, False)

lr_evaluator = RegressionEvaluator(predictionCol = "prediction", labelCol = "csMPa", metricName = "r2")

print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(lr_predictions))

+-----------------------------------------------------+-----+------------------+
|Independant Features                                 |csMPa|prediction        |
+-----------------------------------------------------+-----+------------------+
|[102.0,153.0,0.0,192.0,0.0,887.0,942.0,7.0,7.68]     |7.68 |8.026938811390497 |
|[108.3,162.4,0.0,203.5,0.0,938.2,849.0,28.0,20.59]   |20.59|20.776665121737274|
|[116.0,173.0,0.0,192.0,0.0,909.8,891.9,90.0,31.02]   |31.02|31.07717987130481 |
|[122.6,183.9,0.0,203.5,0.0,958.2,800.1,90.0,33.19]   |33.19|33.22023998698664 |
|[135.0,105.0,193.0,196.0,6.0,965.0,643.0,28.0,21.91] |21.91|22.08027772666816 |
|[135.7,203.5,0.0,185.7,0.0,1076.2,759.3,28.0,18.2]   |18.2 |18.416336238566956|
|[136.4,161.6,125.8,171.6,10.4,922.6,764.4,28.0,29.07]|29.07|29.151388523111454|
|[139.6,209.4,0.0,192.0,0.0,1047.0,806.9,360.0,44.7]  |44.7 |44.58734686786127 |
|[139.9,132.6,103.3,200.3,7.4,916.0,753.4,28.0,36.44] |36.44|36.429892233975565|
|[140.0,133.0,103.0,200.0,7.

In [18]:
# RMSE
test_result = lr_model.evaluate(X_test)
print("Root Mean Squared Error (RMSE) on test data = %g" % test_result.rootMeanSquaredError)

Root Mean Squared Error (RMSE) on test data = 0.205418
