# Preparando os Dados para Regressão

## Imports

In [1]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression, RandomForestRegressor
from pyspark.sql import SparkSession

In [2]:
# Reuse the active SparkSession (or start one) so this cell does not depend on
# a `spark` variable being injected by the environment that launched the kernel.
spark = SparkSession.builder.getOrCreate()

# Semicolon-delimited CSV with a header row; column types are inferred by Spark.
# NOTE(review): the preview printed below shows inconsistent magnitudes within the
# same column (e.g. Consumo 21 vs 228, Peso 262 vs 2875), which suggests decimal
# separators may be missing in the file -- confirm against the data source.
carros_temp = spark.read.csv("./data/Carros.csv", header=True, sep=";", inferSchema=True)

In [3]:
# Preview the first five rows of the raw data to check columns and inferred values.
carros_temp.show(n=5)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        160|             39|2875| 1702|        0|          1|      4|          4|110|
|    228|        4|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
|    214|        6|        258|            308|3215| 1944|        1|          0|      3|          1|110|
|    187|        8|        360|            315| 344| 1702|        0|          0|      3|          2|175|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 5 rows



## Seleção das Colunas de Estudo

In [4]:
# Keep only the columns used in this study: three predictors plus HP,
# which is the label used by the regression models further down.
carros = carros_temp.select(
    "Consumo",
    "Cilindros",
    "Cilindradas",
    "HP",
)
carros.show(n=5)

+-------+---------+-----------+---+
|Consumo|Cilindros|Cilindradas| HP|
+-------+---------+-----------+---+
|     21|        6|        160|110|
|     21|        6|        160|110|
|    228|        4|        108| 93|
|    214|        6|        258|110|
|    187|        8|        360|175|
+-------+---------+-----------+---+
only showing top 5 rows



## Aplicando VectorAssembler

In [5]:
# Assemble the three predictor columns into a single vector column named
# "caracteristicas"; HP is left out because it is the label.
vec_caracteristicas = VectorAssembler(
    inputCols=["Consumo", "Cilindros", "Cilindradas"],
    outputCol="caracteristicas",
)

In [6]:
# Append the assembled feature vector column to the selected data.
carros_modificado = vec_caracteristicas.transform(dataset=carros)

In [7]:
# Note that the "caracteristicas" column holds the values of all the other
# columns in each row, except HP.
carros_modificado.show(n=5)

+-------+---------+-----------+---+-----------------+
|Consumo|Cilindros|Cilindradas| HP|  caracteristicas|
+-------+---------+-----------+---+-----------------+
|     21|        6|        160|110| [21.0,6.0,160.0]|
|     21|        6|        160|110| [21.0,6.0,160.0]|
|    228|        4|        108| 93|[228.0,4.0,108.0]|
|    214|        6|        258|110|[214.0,6.0,258.0]|
|    187|        8|        360|175|[187.0,8.0,360.0]|
+-------+---------+-----------+---+-----------------+
only showing top 5 rows



## Dividindo os dados em treino e teste

In [8]:
# Split the data into roughly 70% training and 30% test rows. The weights are
# approximate, so the exact row counts need not be an exact 70/30 split.
# A fixed seed keeps the split repeatable; without it the rows in each set
# (and therefore every metric computed later) change on each execution.
# The outputs recorded in this notebook came from an unseeded split and will
# differ after re-running.
CarrosTreino, CarrosTeste = carros_modificado.randomSplit([0.7, 0.3], seed=42)

In [9]:
# Number of rows that ended up in the training set.
CarrosTreino.count()

24

In [10]:
# Number of rows that ended up in the test set.
CarrosTeste.count()

8

# Modelo de Regressão Linear

In [11]:
# Linear regression that reads the assembled vector as features and HP as label.
# All other hyperparameters are left at their defaults.
reg_linear = LinearRegression(
    featuresCol="caracteristicas",
    labelCol="HP",
)

In [12]:
# Fit the linear regression on the training rows only.
modelo = reg_linear.fit(dataset=CarrosTreino)

22/11/27 18:30:43 WARN Instrumentation: [29b42cbf] regParam is zero, which might cause numerical instability and overfitting.
22/11/27 18:30:43 WARN InstanceBuilder$JavaBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


In [13]:
# Score the held-out test rows with the fitted linear model.
previsao = modelo.transform(dataset=CarrosTeste)

In [14]:
# Display the test rows with the model output next to the actual HP values.
previsao.show()

+-------+---------+-----------+---+------------------+------------------+
|Consumo|Cilindros|Cilindradas| HP|   caracteristicas|        prediction|
+-------+---------+-----------+---+------------------+------------------+
|    104|        8|        460|215| [104.0,8.0,460.0]| 216.1995336312218|
|    104|        8|        472|205| [104.0,8.0,472.0]|216.04992327962788|
|    181|        6|        225|105| [181.0,6.0,225.0]|150.88417610752742|
|    187|        8|        360|175| [187.0,8.0,360.0]|211.48306252933807|
|    197|        6|        145|175| [197.0,6.0,145.0]|  150.732041288724|
|    214|        6|        258|110| [214.0,6.0,258.0]|148.10182724244535|
|    215|        4|       1201| 97|[215.0,4.0,1201.0]| 73.56002172707034|
|    273|        4|         79| 66|  [273.0,4.0,79.0]| 83.38151738608767|
+-------+---------+-----------+---+------------------+------------------+



## Avaliação do Modelo

In [15]:
# Evaluator that compares the "prediction" column against the HP label using
# RMSE. It is reused for both models so their scores are directly comparable.
avaliar = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="HP",
    metricName="rmse",
)

In [16]:
# RMSE of the linear regression on the test predictions.
rmse = avaliar.evaluate(dataset=previsao)

In [17]:
# Report the linear model's RMSE rounded to two decimals (lower is better).
print(f"rmse calculado: {round(rmse, 2)} --> Quanto menor, melhor!")

rmse calculado: 28.4 --> Quanto menor, melhor!


# Modelo de Regressão Random Forest

In [18]:
# Random forest regressor using the same feature vector column and HP label
# as the linear regression, so both models can be compared on equal terms.
# The seed is fixed so the forest's random sampling is repeatable across runs.
rfreg = RandomForestRegressor(featuresCol="caracteristicas", labelCol="HP", seed=42)

In [19]:
# Fit the random forest on the training rows only.
modelo_rf = rfreg.fit(dataset=CarrosTreino)

22/11/27 18:30:43 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 24 (= number of training instances)


In [20]:
# Score the held-out test rows with the fitted random forest.
previsao_rf = modelo_rf.transform(dataset=CarrosTeste)

In [21]:
# Display the test rows with the random forest output next to the actual HP values.
previsao_rf.show()

+-------+---------+-----------+---+------------------+------------------+
|Consumo|Cilindros|Cilindradas| HP|   caracteristicas|        prediction|
+-------+---------+-----------+---+------------------+------------------+
|    104|        8|        460|215| [104.0,8.0,460.0]|213.94733511586455|
|    104|        8|        472|205| [104.0,8.0,472.0]|213.94733511586455|
|    181|        6|        225|105| [181.0,6.0,225.0]|129.42916666666667|
|    187|        8|        360|175| [187.0,8.0,360.0]|194.43547905525847|
|    197|        6|        145|175| [197.0,6.0,145.0]|127.26041666666667|
|    214|        6|        258|110| [214.0,6.0,258.0]|124.52569444444445|
|    215|        4|       1201| 97|[215.0,4.0,1201.0]| 86.21470959595959|
|    273|        4|         79| 66|  [273.0,4.0,79.0]| 90.25804292929293|
+-------+---------+-----------+---+------------------+------------------+



In [22]:
# RMSE of the random forest on the test predictions, computed with the same
# evaluator used for the linear model.
rmse_rf = avaliar.evaluate(dataset=previsao_rf)

In [23]:
# Report the random forest's RMSE rounded to two decimals (lower is better).
print(f"rmse calculado: {round(rmse_rf, 2)} --> Quanto menor, melhor!")

rmse calculado: 23.05 --> Quanto menor, melhor!


In [24]:
# Side-by-side comparison of the unrounded scores: linear regression RMSE
# first, random forest RMSE second.
print(f"{rmse} vs {rmse_rf}")

28.399260424586565 vs 23.050281155990238
