In [1]:
%%time
# Import the packages needed to run SQL on top of Spark.
from pyspark.sql import SQLContext
from pyspark.sql.types import *
# NOTE(review): `sc` is not defined anywhere in this notebook — presumably the
# SparkContext pre-created by the PySpark kernel/shell; TODO confirm.
sqlContext = SQLContext(sc)

CPU times: user 455 µs, sys: 260 µs, total: 715 µs
Wall time: 1.4 ms


In [2]:
%%time
# Read the CSV file into a DataFrame. The `header` and `inferSchema` options
# are both switched on, so column names come from the file's header row and
# column types are inferred from the values.
data = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferSchema='true')
    .load('./Data/base.csv')
)

CPU times: user 0 ns, sys: 2.98 ms, total: 2.98 ms
Wall time: 8.91 s


In [3]:
%%time
# Show the first two rows of the data set.
data.show(2)

+--------+--------------------+--------+--------+----------------+-------+----------+-------------------+---------------+--------------------+--------------+---------------------+-------------+-----+-----+----------+-----------+--------------------+
|   CAMIS|                 DBA|    BORO|BUILDING|          STREET|ZIPCODE|     PHONE|CUISINE DESCRIPTION|INSPECTION DATE|              ACTION|VIOLATION CODE|VIOLATION DESCRIPTION|CRITICAL FLAG|SCORE|GRADE|GRADE DATE|RECORD DATE|     INSPECTION TYPE|
+--------+--------------------+--------+--------+----------------+-------+----------+-------------------+---------------+--------------------+--------------+---------------------+-------------+-----+-----+----------+-----------+--------------------+
|50010005|BLOOMBERG QUEENS ...|  QUEENS|    6626|METROPOLITAN AVE|  11379|2126476565|           American|     02/21/2017|Violations were c...|           10B| Plumbing not prop...| Not Critical|    7|    A|02/21/2017| 05/09/2018|Cycle Inspection ...|


In [4]:
%%time
# Print the column names and data types of the data set.
data.printSchema()

root
 |-- CAMIS: integer (nullable = true)
 |-- DBA: string (nullable = true)
 |-- BORO: string (nullable = true)
 |-- BUILDING: string (nullable = true)
 |-- STREET: string (nullable = true)
 |-- ZIPCODE: string (nullable = true)
 |-- PHONE: string (nullable = true)
 |-- CUISINE DESCRIPTION: string (nullable = true)
 |-- INSPECTION DATE: string (nullable = true)
 |-- ACTION: string (nullable = true)
 |-- VIOLATION CODE: string (nullable = true)
 |-- VIOLATION DESCRIPTION: string (nullable = true)
 |-- CRITICAL FLAG: string (nullable = true)
 |-- SCORE: integer (nullable = true)
 |-- GRADE: string (nullable = true)
 |-- GRADE DATE: string (nullable = true)
 |-- RECORD DATE: string (nullable = true)
 |-- INSPECTION TYPE: string (nullable = true)

CPU times: user 0 ns, sys: 2.25 ms, total: 2.25 ms
Wall time: 20.9 ms


In [5]:
%%time
# Discard the unlabeled rows, i.e. those whose CRITICAL FLAG is 'Not Applicable'.
data = data.filter(data['CRITICAL FLAG'] != 'Not Applicable')
# Columns that are not used for classification; a later cell removes them
# from the table.
drop_list = [
    'CAMIS', 'DBA', 'BORO', 'BUILDING', 'STREET', 'ZIPCODE',
    'CUISINE DESCRIPTION', 'PHONE', 'INSPECTION DATE', 'ACTION',
    'VIOLATION CODE', 'SCORE', 'GRADE', 'GRADE DATE', 'RECORD DATE',
    'INSPECTION TYPE',
]

CPU times: user 2.97 ms, sys: 1.88 ms, total: 4.85 ms
Wall time: 85.4 ms


In [6]:
%%time
# Keep only the columns used for classification and preview them.
# CRITICAL FLAG is the column that holds the class labels.
data = data.drop(*drop_list)
data.show(n=5)

+---------------------+-------------+
|VIOLATION DESCRIPTION|CRITICAL FLAG|
+---------------------+-------------+
| Plumbing not prop...| Not Critical|
| Filth flies or fo...|     Critical|
| Hot food item not...|     Critical|
| Food not protecte...|     Critical|
| Food contact surf...|     Critical|
+---------------------+-------------+
only showing top 5 rows

CPU times: user 0 ns, sys: 5.39 ms, total: 5.39 ms
Wall time: 553 ms


In [7]:
%%time
# Print the column names and data types of the reduced data set.
data.printSchema()

root
 |-- VIOLATION DESCRIPTION: string (nullable = true)
 |-- CRITICAL FLAG: string (nullable = true)

CPU times: user 0 ns, sys: 1.52 ms, total: 1.52 ms
Wall time: 6.35 ms


In [8]:
%%time
# Count how many rows there are for each class label, largest class first.
from pyspark.sql.functions import col
(
    data.groupBy("CRITICAL FLAG")
    .count()
    .sort(col("count").desc())
    .show()
)

+-------------+------+
|CRITICAL FLAG| count|
+-------------+------+
|     Critical|203837|
| Not Critical|161513|
+-------------+------+

CPU times: user 5.67 ms, sys: 5.12 ms, total: 10.8 ms
Wall time: 2.86 s


In [9]:
%%time
# Count the occurrences of each violation description and show the 5 most frequent.
(
    data.groupBy("VIOLATION DESCRIPTION")
    .count()
    .sort(col("count").desc())
    .show(n=5)
)

+---------------------+-----+
|VIOLATION DESCRIPTION|count|
+---------------------+-----+
| Non-food contact ...|52751|
| Facility not verm...|38287|
| Evidence of mice ...|26791|
| Food not protecte...|25289|
| Food contact surf...|25199|
+---------------------+-----+
only showing top 5 rows

CPU times: user 5.11 ms, sys: 676 µs, total: 5.79 ms
Wall time: 1.71 s


In [10]:
%%time
# Import the packages used to tokenize, remove stop words and vectorize.
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression

# Regular-expression tokenizer: VIOLATION DESCRIPTION -> words.
regexTokenizer = RegexTokenizer(
    inputCol="VIOLATION DESCRIPTION",
    outputCol="words",
    pattern="\\W",
)

# Stop words: the list of 25 semantically non-selective words that are
# common in Reuters-RCV1.
add_stopwords = [
    "a", "an", "and", "are", "as", "at", "be", "by", "for", "from",
    "has", "he", "in", "is", "it", "its", "of", "on", "that", "the",
    "to", "was", "were", "will", "with",
]
stopwordsRemover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
    stopWords=add_stopwords,
)

# Bag of words: filtered tokens -> count vector in the "features" column.
countVectors = CountVectorizer(
    inputCol="filtered",
    outputCol="features",
    vocabSize=10000,
    minDF=5,
)

CPU times: user 18.6 ms, sys: 57.2 ms, total: 75.9 ms
Wall time: 193 ms


In [11]:
%%time
# Create numeric labels for the classes and assemble the preprocessing pipeline.
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

# Maps the CRITICAL FLAG strings to numeric indices stored in "label".
label_stringIdx = StringIndexer(inputCol="CRITICAL FLAG", outputCol="label")

# Stage order: tokenize -> remove stop words -> count-vectorize -> index labels.
pipeline = Pipeline().setStages([
    regexTokenizer,
    stopwordsRemover,
    countVectors,
    label_stringIdx,
])

CPU times: user 715 µs, sys: 1.75 ms, total: 2.46 ms
Wall time: 15.2 ms


In [12]:
%%time
# Fit the pipeline on the documents and apply it, producing the tokenized,
# filtered, vectorized and labeled columns; then preview 5 rows.
# NOTE(review): the pipeline is fit on the full `data`, before the train/test
# split done in a later cell, so anything learned during fitting (e.g. the
# CountVectorizer vocabulary) also sees rows that end up in the test set —
# confirm this is acceptable.
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
dataset.show(5)

+---------------------+-------------+--------------------+--------------------+--------------------+-----+
|VIOLATION DESCRIPTION|CRITICAL FLAG|               words|            filtered|            features|label|
+---------------------+-------------+--------------------+--------------------+--------------------+-----+
| Plumbing not prop...| Not Critical|[plumbing, not, p...|[plumbing, not, p...|(528,[0,2,5,12,13...|  1.0|
| Filth flies or fo...|     Critical|[filth, flies, or...|[filth, flies, or...|(528,[0,1,3,4,11,...|  0.0|
| Hot food item not...|     Critical|[hot, food, item,...|[hot, food, item,...|(528,[0,1,2,9,14,...|  0.0|
| Food not protecte...|     Critical|[food, not, prote...|[food, not, prote...|(528,[0,1,2,20,36...|  0.0|
| Food contact surf...|     Critical|[food, contact, s...|[food, contact, s...|(528,[1,2,5,6,7,3...|  0.0|
+---------------------+-------------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows

CPU times: u

In [13]:
%%time
# Split into 70% training / 30% test; the seed is fixed for reproducibility.
trainingData, testData = dataset.randomSplit([0.7, 0.3], seed=100)
# Report the size of each split (each count() runs a Spark job).
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 255865
Test Dataset Count: 109485
CPU times: user 4.44 ms, sys: 4.39 ms, total: 8.83 ms
Wall time: 17 s


In [14]:
%%time
# Compute TF-IDF on the training split.
from pyspark.ml.feature import HashingTF, IDF
# Hash the filtered tokens into a 10000-dimensional term-frequency vector ("rawTF").
hashingTF = HashingTF(inputCol="filtered", outputCol="rawTF", numFeatures=10000)
tf = hashingTF.transform(trainingData)
idf = IDF(inputCol="rawTF", outputCol="IDF", minDocFreq=5) # minDocFreq: removes sparse terms
# Fit the IDF weights on the training term frequencies, then apply them.
idfModel = idf.fit(tf) 
tfidf = idfModel.transform(tf)
# Rebinds `pipeline` to a HashingTF + IDF variant of the earlier pipeline.
# NOTE(review): in the visible part of this notebook neither `tfidf` nor this
# new `pipeline` is referenced again (the pipeline is never fit) — confirm
# whether the TF-IDF features were meant to feed the classifier.
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx])

CPU times: user 7.31 ms, sys: 15.2 ms, total: 22.5 ms
Wall time: 11.6 s


In [15]:
%%time
# Train a logistic regression on the training split and score the test split.
# Then show the first 5 rows predicted as class 0, sorted by the
# "probability" column in descending order, with each displayed value
# truncated to 30 characters.
# NOTE(review): no featuresCol is passed, so the model presumably reads the
# default "features" column (the CountVectorizer output) rather than the
# TF-IDF columns computed in the previous cell — confirm this is intended.
lr = LogisticRegression(
    maxIter=20,
    regParam=0.3,
    elasticNetParam=0,
)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
(
    predictions
    .where(predictions['prediction'] == 0)
    .select("VIOLATION DESCRIPTION","CRITICAL FLAG","probability","label","prediction")
    .sort("probability", ascending=False)
    .show(n=5, truncate=30)
)

+------------------------------+-------------+------------------------------+-----+----------+
|         VIOLATION DESCRIPTION|CRITICAL FLAG|                   probability|label|prediction|
+------------------------------+-------------+------------------------------+-----+----------+
|Filth flies or food/refuse/...|     Critical|[0.9660538168008225,0.03394...|  0.0|       0.0|
|Filth flies or food/refuse/...|     Critical|[0.9660538168008225,0.03394...|  0.0|       0.0|
|Filth flies or food/refuse/...|     Critical|[0.9660538168008225,0.03394...|  0.0|       0.0|
|Filth flies or food/refuse/...|     Critical|[0.9660538168008225,0.03394...|  0.0|       0.0|
|Filth flies or food/refuse/...|     Critical|[0.9660538168008225,0.03394...|  0.0|       0.0|
+------------------------------+-------------+------------------------------+-----+----------+
only showing top 5 rows

CPU times: user 8.01 ms, sys: 38.1 ms, total: 46.1 ms
Wall time: 19.8 s


In [16]:
%%time
# Evaluate the predictions on the test split.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# labelCol and metricName are spelled out (they match the evaluator's
# documented defaults) so the reader can see which metric is reported.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="f1",
)
# The recorded output of this cell showed only the timing lines, i.e. the bare
# `evaluator.evaluate(predictions)` expression was not echoed; print the score
# explicitly so it always appears in the notebook.
f1_score = evaluator.evaluate(predictions)
print("Test F1 score: " + str(f1_score))

CPU times: user 3.78 ms, sys: 12.6 ms, total: 16.3 ms
Wall time: 24.4 s


In [17]:
%%time
# Sum the feature vectors of each class.
from pyspark.ml.linalg import SparseVector, DenseVector

# Class-vector representation: one row per label holding the element-wise sum
# of all feature vectors carrying that label.
df = dataset.select('label', 'features')
vetor_de_classes = (
    df.rdd
    .mapValues(lambda vec: vec.toArray())      # vector -> numpy array
    .reduceByKey(lambda acc, arr: acc + arr)   # element-wise sum per label
    .mapValues(DenseVector)                    # back to a Spark ML vector
    .toDF(["label", "features_sum"])
)

vetor_de_classes.show(n=2)

+-----+--------------------+
|label|        features_sum|
+-----+--------------------+
|  0.0|[212689.0,296246....|
|  1.0|[355095.0,119057....|
+-----+--------------------+

CPU times: user 108 ms, sys: 56.4 ms, total: 165 ms
Wall time: 14.3 s


In [18]:
%%time
# Preparation for the Euclidean-distance step: bring the summed feature
# vector of class 0.0 to the driver.
array0 = (
    vetor_de_classes
    .where('label = 0.0')
    .collect()[0]['features_sum']
)

CPU times: user 0 ns, sys: 4.93 ms, total: 4.93 ms
Wall time: 330 ms


In [19]:
%%time
# Preparation for the Euclidean-distance step: bring the summed feature
# vector of class 1.0 to the driver.
array1 = (
    vetor_de_classes
    .where('label = 1.0')
    .collect()[0]['features_sum']
)

CPU times: user 828 µs, sys: 5.04 ms, total: 5.86 ms
Wall time: 137 ms


In [20]:
%%time
# Compute the Euclidean distance from every document vector to each class vector.
from scipy.spatial import distance

# Each row becomes (label, [distance to array0, distance to array1]).
# NOTE(review): array0/array1 are per-class *sums* of feature vectors, not
# means, so the distance is measured to a vector whose magnitude grows with
# the class size — confirm whether a centroid was intended.
df_com_distancias = (
    df.rdd
    .mapValues(lambda vec: vec.toArray())
    .mapValues(lambda arr: [distance.euclidean(arr, array0), distance.euclidean(arr, array1)])
    .toDF(["label","distances"])
)

CPU times: user 28.4 ms, sys: 5.38 ms, total: 33.8 ms
Wall time: 140 ms


In [21]:
%%time
# Preview the first 5 rows of the per-document distances to the two class vectors.
df_com_distancias.show(5)

+-----+--------------------+
|label|           distances|
+-----+--------------------+
|  1.0|[528328.845564010...|
|  0.0|[528321.991013056...|
|  0.0|[528329.913251180...|
|  0.0|[528329.815984295...|
|  0.0|[528329.948905038...|
+-----+--------------------+
only showing top 5 rows

CPU times: user 0 ns, sys: 1.63 ms, total: 1.63 ms
Wall time: 399 ms
