In [1]:
# sc.stop()  # (disabled) presumably kept to stop a leftover SparkContext before re-running the notebook

In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkContext

# Build (or reuse) the Spark session. The chain is wrapped in parentheses so
# no backslash line continuations are needed.
# NOTE: getOrCreate() hands back an already-running session when one exists;
# the configuration dump in the next cell reports 'PySparkShell' on
# 'local[*]', i.e. the master/appName requested here were not the ones in effect.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("MLib Regresion Lineal")
    .getOrCreate()
)

# Handle to the underlying SparkContext, used for the RDD-based cells below.
sc = spark.sparkContext

In [3]:
# List the effective Spark configuration as (key, value) pairs.
# Uses the public SparkContext.getConf() accessor instead of reaching into
# the private `_conf` attribute.
sc.getConf().getAll()

[('spark.local.dir', '/home/marco/claseBigData/ProyectoBD/tmp'),
 ('spark.executor.id', 'driver'),
 ('spark.app.id', 'local-1638853246218'),
 ('spark.driver.memory', '16g'),
 ('spark.app.name', 'PySparkShell'),
 ('spark.driver.host', '192.168.3.5'),
 ('spark.app.startTime', '1638853245596'),
 ('spark.sql.catalogImplementation', 'hive'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.master', 'local[*]'),
 ('spark.submit.pyFiles', ''),
 ('spark.submit.deployMode', 'client'),
 ('spark.driver.port', '42709'),
 ('spark.ui.showConsoleProgress', 'true')]

In [4]:
# Read the raw CSV as an RDD of text lines and mark it as cached, since it is
# reused by the cells that follow. cache() returns the same RDD, so the call
# can be chained onto the read.
autoData = sc.textFile("clase/auto_miles_per_gallon.csv").cache()
autoData.take(5)

['MPG,CYLINDERS,DISPLACEMENT,HORSEPOWER,WEIGHT,ACCELERATION,MODELYEAR,NAME',
 '18,8,307,130,3504,12,70,chevrolet chevelle malibu',
 '15,8,350,165,3693,11.5,70,buick skylark 320',
 '18,8,318,150,3436,11,70,plymouth satellite',
 '16,8,304,150,3433,12,70,amc rebel sst']

In [5]:
# Drop the header: keep only the lines that do not contain the column-name
# token "CYLINDERS".
dataLines = autoData.filter(lambda line: not ("CYLINDERS" in line))
dataLines.take(5)

['18,8,307,130,3504,12,70,chevrolet chevelle malibu',
 '15,8,350,165,3693,11.5,70,buick skylark 320',
 '18,8,318,150,3436,11,70,plymouth satellite',
 '16,8,304,150,3433,12,70,amc rebel sst',
 '17,8,302,140,3449,10.5,70,ford torino']

In [6]:
# Count the data lines that remain after the header line was filtered out.
dataLines.count()

398

### Preprocesamiento de datos

In [7]:
from pyspark.sql import Row

# Default horsepower (HP) used when a record has no value ("?"); it is
# broadcast so the parsing function can read it on the executors.
# NOTE(review): 80.0 is hard-coded rather than computed from the data —
# confirm it really matches the dataset's average horsepower.
avgHP = sc.broadcast(80.0)

In [8]:
def CleanupData(inputStr):
    """Parse one CSV data line into a ``Row`` with typed fields.

    Expected column order: MPG, CYLINDERS, DISPLACEMENT, HORSEPOWER, WEIGHT,
    ACCELERATION, MODELYEAR, NAME.

    Args:
        inputStr: A single comma-separated data line (not the header).

    Returns:
        A ``Row`` whose numeric columns are floats and whose NAME is the raw
        string. A HORSEPOWER of "?" is replaced by the broadcast default
        ``avgHP``.

    Raises:
        IndexError: If the line has fewer than 8 fields.
        ValueError: If a numeric field cannot be converted to float.
    """
    # Split at most 7 times so that a comma inside NAME (the last column)
    # stays part of the name instead of silently truncating it.
    attList = inputStr.split(",", 7)

    # Missing horsepower is encoded as "?"; strip first so a padded "?" is
    # still recognised. `avgHP` is only read here, so no `global` is needed.
    hpValue = attList[3].strip()
    if hpValue == "?":
        hpValue = avgHP.value

    return Row(
        MPG=float(attList[0]),
        CYLINDERS=float(attList[1]),
        DISPLACEMENT=float(attList[2]),
        HORSEPOWER=float(hpValue),
        WEIGHT=float(attList[4]),
        ACCELERATION=float(attList[5]),
        MODELYEAR=float(attList[6]),
        NAME=attList[7],
    )

In [9]:
# Parse every data line into a Row and mark the result as cached, because it
# feeds both the DataFrame creation and the feature extraction further down.
autoMap = dataLines.map(CleanupData).cache()
autoMap.take(4)

[Row(MPG=18.0, CYLINDERS=8.0, DISPLACEMENT=307.0, HORSEPOWER=130.0, WEIGHT=3504.0, ACCELERATION=12.0, MODELYEAR=70.0, NAME='chevrolet chevelle malibu'),
 Row(MPG=15.0, CYLINDERS=8.0, DISPLACEMENT=350.0, HORSEPOWER=165.0, WEIGHT=3693.0, ACCELERATION=11.5, MODELYEAR=70.0, NAME='buick skylark 320'),
 Row(MPG=18.0, CYLINDERS=8.0, DISPLACEMENT=318.0, HORSEPOWER=150.0, WEIGHT=3436.0, ACCELERATION=11.0, MODELYEAR=70.0, NAME='plymouth satellite'),
 Row(MPG=16.0, CYLINDERS=8.0, DISPLACEMENT=304.0, HORSEPOWER=150.0, WEIGHT=3433.0, ACCELERATION=12.0, MODELYEAR=70.0, NAME='amc rebel sst')]

In [10]:
# Create a DataFrame from the preprocessed rows.
# NOTE: a separate DataFrame named `autoDF` (capital F) is built later for the
# label/features data — the two names differ only by case.
autoDf = spark.createDataFrame(autoMap)
autoDf

DataFrame[MPG: double, CYLINDERS: double, DISPLACEMENT: double, HORSEPOWER: double, WEIGHT: double, ACCELERATION: double, MODELYEAR: double, NAME: string]

### Análisis de Datos

In [11]:
# Show descriptive statistics (count, mean, stddev, min, max) for MPG and CYLINDERS.
autoDf.select("MPG","CYLINDERS").describe().show()

+-------+-----------------+------------------+
|summary|              MPG|         CYLINDERS|
+-------+-----------------+------------------+
|  count|              398|               398|
|   mean|23.51457286432161| 5.454773869346734|
| stddev|7.815984312565782|1.7010042445332125|
|    min|              9.0|               3.0|
|    max|             46.6|               8.0|
+-------+-----------------+------------------+



In [12]:
# Print the correlation between every non-string column and the target (MPG).
# Column types are read from the schema (`dtypes`) instead of sampling a value
# with `take(1)`: this avoids one Spark job per column, does not fail on
# `take(1)[0]` when the DataFrame is empty, and does not misclassify a string
# column whose first value happens to be null.
for i, colType in autoDf.dtypes:
    if colType != "string":
        print("Correlation to MPG for ",i,autoDf.stat.corr("MPG",i))

Correlation to MPG for  MPG 1.0
Correlation to MPG for  CYLINDERS -0.7753962854205548
Correlation to MPG for  DISPLACEMENT -0.8042028248058979
Correlation to MPG for  HORSEPOWER -0.7746308409203807
Correlation to MPG for  WEIGHT -0.8317409332443347
Correlation to MPG for  ACCELERATION 0.42028891210165004
Correlation to MPG for  MODELYEAR 0.5792671330833091


### Preparar datos para usar MLlib

In [13]:
from pyspark.ml.linalg import Vectors

def transformToLabeledPoint(row):
    """Convert a parsed row into a ``(label, features)`` pair.

    The label is the row's MPG value; the features are a dense vector holding
    ACCELERATION, DISPLACEMENT and WEIGHT, in that order.
    """
    label = row["MPG"]
    featureNames = ("ACCELERATION", "DISPLACEMENT", "WEIGHT")
    features = Vectors.dense([row[name] for name in featureNames])
    return (label, features)

In [14]:
# Turn each parsed row into a (label, features) pair, then load the pairs into
# a two-column DataFrame and display the first rows.
autoLp = autoMap.map(transformToLabeledPoint)
autoDF = spark.createDataFrame(autoLp, schema=["label", "features"])
autoDF.select("label", "features").show()

+-----+-------------------+
|label|           features|
+-----+-------------------+
| 18.0|[12.0,307.0,3504.0]|
| 15.0|[11.5,350.0,3693.0]|
| 18.0|[11.0,318.0,3436.0]|
| 16.0|[12.0,304.0,3433.0]|
| 17.0|[10.5,302.0,3449.0]|
| 15.0|[10.0,429.0,4341.0]|
| 14.0| [9.0,454.0,4354.0]|
| 14.0| [8.5,440.0,4312.0]|
| 14.0|[10.0,455.0,4425.0]|
| 15.0| [8.5,390.0,3850.0]|
| 15.0|[10.0,383.0,3563.0]|
| 14.0| [8.0,340.0,3609.0]|
| 15.0| [9.5,400.0,3761.0]|
| 14.0|[10.0,455.0,3086.0]|
| 24.0|[15.0,113.0,2372.0]|
| 22.0|[15.5,198.0,2833.0]|
| 18.0|[15.5,199.0,2774.0]|
| 21.0|[16.0,200.0,2587.0]|
| 27.0| [14.5,97.0,2130.0]|
| 26.0| [20.5,97.0,1835.0]|
+-----+-------------------+
only showing top 20 rows



### Realizar aprendizaje de máquina

In [15]:
# Split the data 90% / 10% into training and test sets. A fixed seed keeps the
# split (and therefore the model and its metrics) repeatable across runs
# instead of changing on every execution.
(trainingData, testData) = autoDF.randomSplit([0.9,0.1], seed=42)
print("trainingData.count(): ",trainingData.count())
print("testData.count(): ",testData.count())

trainingData.count():  358
testData.count():  40


In [16]:
# Fit a linear regression model on the training data.
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(maxIter=10)
lrModel = lr.fit(trainingData)

# Report the fitted parameters.
print("Coefficients:", lrModel.coefficients)
print("Intercept:", lrModel.intercept)

# Score the test data and show each prediction next to its true label.
predictions = lrModel.transform(testData)
predictions.select("prediction", "label", "features").show()

Coefficients: [0.1302720029022172,-0.012168217951111128,-0.006144290310241698]
Intercept: 42.038740767845916
+------------------+-----+-------------------+
|        prediction|label|           features|
+------------------+-----+-------------------+
|11.765102415457335| 12.0|[12.5,350.0,4499.0]|
| 8.655965361633413| 12.0|[12.5,400.0,4906.0]|
|12.159578901699945| 12.0|[13.5,350.0,4456.0]|
| 9.573915809820974| 13.0|[12.0,400.0,4746.0]|
| 7.153065427585744| 13.0|[12.0,400.0,5140.0]|
|11.876941547428828| 13.0|[13.5,350.0,4502.0]|
|14.947604206115361| 13.0|[14.0,307.0,4098.0]|
|16.788926946201357| 15.0|[10.0,383.0,3563.0]|
|18.928174181901305| 16.0|[18.0,258.0,3632.0]|
|24.725189563740326| 19.0|[13.0,232.0,2634.0]|
| 20.78520552538814| 19.0|[15.0,250.0,3282.0]|
| 22.71933311904275| 20.0|[16.5,198.0,3102.0]|
| 22.50946421184798| 21.0|[15.0,231.0,3039.0]|
|27.922994931216966| 22.0|[16.0,122.0,2395.0]|
| 28.04355556701031| 24.0|[15.0,113.0,2372.0]|
| 30.01065322324078| 24.0| [15.5,90.0,2108.0]

In [17]:
# Score the predictions with the R2 metric.
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(
    metricName="r2",
    labelCol="label",
    predictionCol="prediction",
)
evaluator.evaluate(predictions)

0.7023822815469987

In [18]:
# Stop the SparkContext now that the analysis is finished.
sc.stop()

In [19]:
import sys
# Report the interpreter path and version used by this kernel.
for detail in (sys.executable, sys.version, sys.version_info):
    print(detail)

/home/marco/anaconda3/bin/python
3.8.3 (default, Jul  2 2020, 16:21:59) 
[GCC 7.3.0]
sys.version_info(major=3, minor=8, micro=3, releaselevel='final', serial=0)
