# Replicando um Modelo de Regressão

In [1]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

In [2]:
# Load the cars dataset from a semicolon-delimited CSV whose first row holds
# the column names; column types are inferred from the data.
# NOTE(review): `spark` is not created in the visible cells -- presumably a
# SparkSession provided by the environment; confirm before Restart & Run All.
Carros_temp = spark.read.csv(
    path='./data/Carros.csv',
    sep=';',
    header=True,
    inferSchema=True,
)

In [3]:
# Preview the raw table; 20 rows is DataFrame.show's default, spelled out here.
Carros_temp.show(n=20)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        160|             39|2875| 1702|        0|          1|      4|          4|110|
|    228|        4|        108|            385| 232| 1861|        1|          1|      4|          1| 93|
|    214|        6|        258|            308|3215| 1944|        1|          0|      3|          1|110|
|    187|        8|        360|            315| 344| 1702|        0|          0|      3|          2|175|
|    181|        6|        225|            276| 346| 2022|        1|          0|      3|          1|105|
|    143|        8|        360|            321| 357| 15

In [4]:
# Narrow the table to the regression label (HP) and the three columns that
# are later assembled into the feature vector.
Carros = Carros_temp.select(
    ['consumo', 'cilindros', 'cilindradas', 'HP']
)
# Quick sanity check of the reduced frame (first four rows only).
Carros.show(n=4)

+-------+---------+-----------+---+
|consumo|cilindros|cilindradas| HP|
+-------+---------+-----------+---+
|     21|        6|        160|110|
|     21|        6|        160|110|
|    228|        4|        108| 93|
|    214|        6|        258|110|
+-------+---------+-----------+---+
only showing top 4 rows



In [5]:
# Assembler that packs the three predictor columns into a single vector
# column named 'caracteristicas', the column the regression reads as features.
veccaracteristicas = VectorAssembler(
    inputCols=['consumo', 'cilindros', 'cilindradas'],
    outputCol='caracteristicas',
)

In [6]:
# Apply the assembler: the result carries the original columns plus the
# 'caracteristicas' vector column.
vec_CarrosTreino = veccaracteristicas.transform(dataset=Carros)
# Inspect the assembled frame (20 rows is the default for show()).
vec_CarrosTreino.show(n=20)

In [7]:
# Linear regression estimator: predicts 'HP' from the assembled
# 'caracteristicas' vector. All other hyperparameters are left at defaults.
reglin = LinearRegression(
    labelCol='HP',
    featuresCol='caracteristicas',
)

In [8]:
# Fit the regression directly on the pre-assembled frame.
# Spark logs a "regParam is zero" warning here because no regularization
# was configured on the estimator.
modelo = reglin.fit(dataset=vec_CarrosTreino)

22/12/07 15:17:50 WARN Instrumentation: [a31e8fc0] regParam is zero, which might cause numerical instability and overfitting.
22/12/07 15:17:50 WARN InstanceBuilder$JavaBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


# Pipeline

In [9]:
from pyspark.ml import Pipeline

In [10]:
# Chain the same two steps used manually above so they run as one unit.
pipeline = Pipeline(
    stages=[
        veccaracteristicas,  # build the 'caracteristicas' feature vector
        reglin,              # regress HP on that vector
    ]
)

In [11]:
# Fit every pipeline stage on the un-assembled frame; the assembler stage
# produces the feature column that the regression stage consumes.
pipelineModel = pipeline.fit(dataset=Carros)

22/12/07 15:17:50 WARN Instrumentation: [e8313bb3] regParam is zero, which might cause numerical instability and overfitting.


In [12]:
# Score the same frame the pipeline was fitted on; the output adds the
# 'caracteristicas' and 'prediction' columns.
previsao = pipelineModel.transform(dataset=Carros)

In [13]:
# Display predictions next to the actual HP values (default 20 rows).
previsao.show(n=20)

+-------+---------+-----------+---+------------------+------------------+
|consumo|cilindros|cilindradas| HP|   caracteristicas|        prediction|
+-------+---------+-----------+---+------------------+------------------+
|     21|        6|        160|110|  [21.0,6.0,160.0]|162.32154816816646|
|     21|        6|        160|110|  [21.0,6.0,160.0]|162.32154816816646|
|    228|        4|        108| 93| [228.0,4.0,108.0]| 82.51715587712931|
|    214|        6|        258|110| [214.0,6.0,258.0]|141.86680518718754|
|    187|        8|        360|175| [187.0,8.0,360.0]|202.93528239714834|
|    181|        6|        225|105| [181.0,6.0,225.0]| 145.4980634611832|
|    143|        8|        360|245| [143.0,8.0,360.0]|   207.41448530972|
|    244|        4|       1467| 62|[244.0,4.0,1467.0]| 69.69282676584851|
|    228|        4|       1408| 95|[228.0,4.0,1408.0]| 71.80767356085781|
|    192|        6|       1676|123|[192.0,6.0,1676.0]|132.42483285541724|
|    178|        6|       1676|123|[17