In [1]:
#sc.stop()

In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkContext

spark = SparkSession\
    .builder\
    .master("local[4]")\
    .appName("MLib Naive Bayes")\
    .getOrCreate()

sc = spark.sparkContext

In [3]:
sc._conf.getAll()

[('spark.local.dir', '/home/marco/claseBigData/ProyectoBD/tmp'),
 ('spark.app.id', 'local-1638919110372'),
 ('spark.executor.id', 'driver'),
 ('spark.driver.memory', '16g'),
 ('spark.app.startTime', '1638919109753'),
 ('spark.app.name', 'PySparkShell'),
 ('spark.driver.host', '192.168.3.5'),
 ('spark.sql.catalogImplementation', 'hive'),
 ('spark.rdd.compress', 'True'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.driver.port', '33787'),
 ('spark.master', 'local[*]'),
 ('spark.submit.pyFiles', ''),
 ('spark.submit.deployMode', 'client'),
 ('spark.ui.showConsoleProgress', 'true')]

### Carga de Datos

In [4]:
smsData = sc.textFile("clase/SMSSpamCollection.csv")
smsData.cache()
smsData.take(10)

['ham,Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...,,,,,,,,,',
 'ham,Ok lar... Joking wif u oni...,,,,,,,,,,',
 'ham,U dun say so early hor... U c already then say...,,,,,,,,,,',
 "ham,Nah I don't think he goes to usf, he lives around here though,,,,,,,,,",
 'ham,Even my brother is not like to speak with me. They treat me like aids patent.,,,,,,,,,,',
 "ham,As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune,,,,,,,,,,",
 "ham,I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today.,,,,,,,,,",
 "ham,I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.,,,,,,,,,,",
 'ham,I HAVE A DATE ON SUNDAY WITH WILL!!,,,,,,,,,,',
 "ham

### Preparar Datos para ML

In [5]:
def TransformToVector(inputStr):
    attList = inputStr.split(",")
    smsType = 0.0 if attList[0] == "ham" else 1.0
    return [smsType, attList[1]]

smsXformed = smsData.map(TransformToVector)

smsDf = spark.createDataFrame(smsXformed, ["label","message"])

smsDf.cache()
smsDf.select("label","message").show()

+-----+--------------------+
|label|             message|
+-----+--------------------+
|  0.0|Go until jurong p...|
|  0.0|Ok lar... Joking ...|
|  0.0|U dun say so earl...|
|  0.0|Nah I don't think...|
|  0.0|Even my brother i...|
|  0.0|As per your reque...|
|  0.0|I'm gonna be home...|
|  0.0|I've been searchi...|
|  0.0|I HAVE A DATE ON ...|
|  0.0|Oh k...i'm watchi...|
|  0.0|Eh u remember how...|
|  0.0|Fine if thats th...|
|  0.0|Is that seriously...|
|  0.0|I‘m going to try ...|
|  0.0|So ü pay first la...|
|  0.0|Aft i finish my l...|
|  0.0|Ffffffffff. Alrig...|
|  0.0|Just forced mysel...|
|  0.0|Lol your always s...|
|  0.0|Did you catch the...|
+-----+--------------------+
only showing top 20 rows



### Dividiendo datos en entrenamiento y test

In [6]:
(trainingData, testData) = smsDf.randomSplit([0.9,0.1])
print("trainingData.count(): ",trainingData.count())
print("testData.count(): ",testData.count())

trainingData.count():  900
testData.count():  100


In [7]:
# Setup pipeline
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, Tokenizer, IDF
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Dividir en palabras y luego construir TF-IDF
tokenizer = Tokenizer(inputCol="message", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),\
                     outputCol="tempfeatures")
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
nbClassifier = NaiveBayes()

pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, nbClassifier])

# Construir un modelo con una tuberia
nbModel = pipeline.fit(trainingData)
# Predict sobre datos de prueba
prediction = nbModel.transform(testData)

# Evaluar la precision
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction",\
                                              labelCol="label", metricName="accuracy")
evaluator.evaluate(prediction)

0.93

### Matriz de confusion

In [8]:
prediction.groupBy("label","prediction").count().show()

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|  1.0|       1.0|   45|
|  0.0|       1.0|    3|
|  1.0|       0.0|    4|
|  0.0|       0.0|   48|
+-----+----------+-----+



In [9]:
sc.stop()

In [10]:
import sys
print(sys.executable)
print(sys.version)
print(sys.version_info)

/home/marco/anaconda3/bin/python
3.8.3 (default, Jul  2 2020, 16:21:59) 
[GCC 7.3.0]
sys.version_info(major=3, minor=8, micro=3, releaselevel='final', serial=0)
