In [93]:
import pyspark
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
spark = SparkSession.builder.appName("spark_airport").getOrCreate()

In [94]:
from pyspark.ml import Pipeline
from pyspark.mllib.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import RandomForestRegressionModel, RandomForestRegressor
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import GBTRegressionModel, GBTRegressor
import pyspark.ml.classification as cl
import pyspark.ml.feature as ft
import pyspark.sql.types as typ
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [95]:
midata = spark.read.csv('./datasets/comprar_alquilar.csv', header = 'true', inferSchema = 'true', sep = ','  ) 

In [96]:
midata.printSchema()

root
 |-- ingresos: integer (nullable = true)
 |-- gastos_comunes: integer (nullable = true)
 |-- pago_coche: integer (nullable = true)
 |-- gastos_otros: integer (nullable = true)
 |-- ahorros: integer (nullable = true)
 |-- vivienda: integer (nullable = true)
 |-- estado_civil: integer (nullable = true)
 |-- hijos: integer (nullable = true)
 |-- trabajo: integer (nullable = true)
 |-- comprar: integer (nullable = true)



In [97]:
featuresCreator =VectorAssembler( inputCols=["ingresos", "vivienda", "ahorros"],outputCol='features')

In [98]:
#ingresos","gastos_comunes","pago_coche","gastos_otros","ahorros","vivienda","estado_civil", "hijos","trabajo"

In [99]:
train_data,test_data=midata.randomSplit([0.80,0.20])

In [100]:
NB= cl.NaiveBayes(featuresCol='features',labelCol='comprar')

In [101]:
pipeline = Pipeline(stages=[featuresCreator, NB])

In [102]:
model = pipeline.fit(train_data)

In [103]:
predictions = model.transform(train_data)

In [104]:
predictions.select("prediction", "comprar", "features").show()

+----------+-------+--------------------+
|prediction|comprar|            features|
+----------+-------+--------------------+
|       1.0|      0|[2008.0,180374.0,...|
|       0.0|      0|[2022.0,291439.0,...|
|       1.0|      0|[2028.0,284984.0,...|
|       1.0|      0|[2036.0,224583.0,...|
|       1.0|      0|[2039.0,210701.0,...|
|       0.0|      0|[2077.0,310082.0,...|
|       0.0|      0|[2138.0,398611.0,...|
|       0.0|      0|[2151.0,339143.0,...|
|       0.0|      0|[2265.0,248465.0,...|
|       1.0|      0|[2274.0,196232.0,...|
|       1.0|      0|[2308.0,257224.0,...|
|       0.0|      0|[2369.0,203052.0,...|
|       0.0|      0|[2409.0,298841.0,...|
|       0.0|      0|[2448.0,362523.0,...|
|       0.0|      0|[2485.0,304018.0,...|
|       0.0|      0|[2516.0,262250.0,...|
|       0.0|      0|[2550.0,201278.0,...|
|       0.0|      0|[2593.0,275509.0,...|
|       0.0|      0|[2614.0,294601.0,...|
|       0.0|      0|[2825.0,257331.0,...|
+----------+-------+--------------

In [105]:
evaluator = MulticlassClassificationEvaluator(labelCol= "comprar",metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g" % accuracy)
 
evaluatorf1 = MulticlassClassificationEvaluator(labelCol="comprar",metricName="f1")
f1 = evaluatorf1.evaluate(predictions)
print("f1 = %g" % f1)
 
evaluatorwp = MulticlassClassificationEvaluator(labelCol="comprar",metricName="weightedPrecision")
wp = evaluatorwp.evaluate(predictions)
print("weightedPrecision = %g" % wp)
 
evaluatorwr = MulticlassClassificationEvaluator(labelCol="comprar", metricName="weightedRecall")
wr = evaluatorwr.evaluate(predictions)
print("weightedRecall = %g" % wr) 

Accuracy = 0.79375
f1 = 0.7976
weightedPrecision = 0.811905
weightedRecall = 0.79375


In [106]:
vaina = sqlContext.createDataFrame([
    (2000,5000,200000),
    (6000,34000,320000)], ["ingresos","ahorros","vivienda"])


In [107]:
vaina

DataFrame[ingresos: bigint, ahorros: bigint, vivienda: bigint]

In [108]:
predictions_final = model.transform(vaina)

In [109]:
predictions_final.select("prediction").show()

+----------+
|prediction|
+----------+
|       0.0|
|       1.0|
+----------+

