NLP com SPARK

In [0]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import Tokenizer, StringIndexer, Word2Vec
spark = SparkSession.builder.appName("nlp").getOrCreate()

In [0]:
#transformando a tabela spam em dataframe
spam = spark.sql("select * from spam")

In [0]:
spam.show(5)

+--------+--------------------+
|Category|             Message|
+--------+--------------------+
|     ham|Go until jurong p...|
|     ham|Ok lar... Joking ...|
|    spam|Free entry in 2 a...|
|     ham|U dun say so earl...|
|     ham|Nah I don't think...|
+--------+--------------------+
only showing top 5 rows



Transformação da variável Category

In [0]:
stringmodel = StringIndexer(inputCol="Category", outputCol="CategoryIndex")
new_spam = stringmodel.fit(spam).transform(spam)

In [0]:
new_spam.show(5)

+--------+--------------------+-------------+
|Category|             Message|CategoryIndex|
+--------+--------------------+-------------+
|     ham|Go until jurong p...|          0.0|
|     ham|Ok lar... Joking ...|          0.0|
|    spam|Free entry in 2 a...|          1.0|
|     ham|U dun say so earl...|          0.0|
|     ham|Nah I don't think...|          0.0|
+--------+--------------------+-------------+
only showing top 5 rows



Transformando a variável Mesasge para Tokens

In [0]:
tokens = Tokenizer(inputCol="Message", outputCol="MessageToken")
new_spam = tokens.transform(new_spam)

In [0]:
new_spam.show(5)

+--------+--------------------+-------------+--------------------+
|Category|             Message|CategoryIndex|        MessageToken|
+--------+--------------------+-------------+--------------------+
|     ham|Go until jurong p...|          0.0|[go, until, juron...|
|     ham|Ok lar... Joking ...|          0.0|[ok, lar..., joki...|
|    spam|Free entry in 2 a...|          1.0|[free, entry, in,...|
|     ham|U dun say so earl...|          0.0|[u, dun, say, so,...|
|     ham|Nah I don't think...|          0.0|[nah, i, don't, t...|
+--------+--------------------+-------------+--------------------+
only showing top 5 rows



In [0]:
new_spam.select("MessageToken").show()

+--------------------+
|        MessageToken|
+--------------------+
|[go, until, juron...|
|[ok, lar..., joki...|
|[free, entry, in,...|
|[u, dun, say, so,...|
|[nah, i, don't, t...|
|[freemsg, hey, th...|
|[even, my, brothe...|
|[as, per, your, r...|
|[winner!!, as, a,...|
|[had, your, mobil...|
|[i'm, gonna, be, ...|
|[six, chances, to...|
|[urgent!, you, ha...|
|[i've, been, sear...|
|[i, have, a, date...|
|[xxxmobilemoviecl...|
|[oh, k...i'm, wat...|
|[eh, u, remember,...|
|[fine, if, thats...|
|[england, v, mace...|
+--------------------+
only showing top 20 rows



Utilizando Word2vec para representação vetorial (embedding)

In [0]:
word2vec = Word2Vec(inputCol="MessageToken", outputCol="MessageW2V")
new_spam = word2vec.fit(new_spam).transform(new_spam)

In [0]:
new_spam.show(5)

+--------+--------------------+-------------+--------------------+--------------------+
|Category|             Message|CategoryIndex|        MessageToken|          MessageW2V|
+--------+--------------------+-------------+--------------------+--------------------+
|     ham|Go until jurong p...|          0.0|[go, until, juron...|[8.77219164976850...|
|     ham|Ok lar... Joking ...|          0.0|[ok, lar..., joki...|[0.03173802127518...|
|    spam|Free entry in 2 a...|          1.0|[free, entry, in,...|[-0.0297332426167...|
|     ham|U dun say so earl...|          0.0|[u, dun, say, so,...|[0.04455585625361...|
|     ham|Nah I don't think...|          0.0|[nah, i, don't, t...|[0.05426038640479...|
+--------+--------------------+-------------+--------------------+--------------------+
only showing top 5 rows



Treino e Teste

In [0]:
train, test = new_spam.randomSplit([0.7, 0.3]) #70% treino, 30% teste

In [0]:
#labelCol -> variável dependente
rf = RandomForestClassifier(labelCol="CategoryIndex", featuresCol="MessageW2V", numTrees=500)
model = rf.fit(train)

In [0]:
prev = model.transform(test)
prev.show(10)

+--------+--------------------+-------------+--------------------+--------------------+--------------------+--------------------+----------+
|Category|             Message|CategoryIndex|        MessageToken|          MessageW2V|       rawPrediction|         probability|prediction|
+--------+--------------------+-------------+--------------------+--------------------+--------------------+--------------------+----------+
|     ham|&lt;#&gt;  great ...|          0.0|[&lt;#&gt;, , gre...|[0.02178995571531...|[487.692479229877...|[0.97538495845975...|       0.0|
|     ham|&lt;#&gt;  w jett...|          0.0|[&lt;#&gt;, , w, ...|[0.03599234431749...|[486.208464589757...|[0.97241692917951...|       0.0|
|     ham|&lt;#&gt; %of ppl...|          0.0|[&lt;#&gt;, %of, ...|[0.01662711555730...|[487.296911138636...|[0.97459382227727...|       0.0|
|     ham|(No promises on w...|          0.0|[(no, promises, o...|[-0.0022781230974...|[485.337161199508...|[0.97067432239901...|       0.0|
|     ham|* A

Avaliando o modelo

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [0]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction", labelCol="CategoryIndex", metricName="areaUnderROC")

In [0]:
areaUnderRoc = evaluator.evaluate(prev)
areaUnderRoc

Out[21]: 0.8604629043474464