In [58]:
# Reference tutorial: https://towardsdatascience.com/multi-class-text-classification-with-pyspark-7d78d022ed35
# Import packages
from pyspark.sql import SQLContext
# NOTE(review): wildcard import; none of its names appear to be used in the visible cells -- confirm before narrowing.
from pyspark.sql.types import *
# `sc` is not defined anywhere in this notebook; presumably it is the SparkContext
# injected by the PySpark shell/kernel -- confirm before running on a fresh kernel.
sqlContext = SQLContext(sc)

In [59]:
# Open the CSV file through the SQL context.
# The options ask for a header row and for column types to be inferred.
csv_reader = sqlContext.read.format('com.databricks.spark.csv')
csv_reader = csv_reader.options(header='true', inferSchema='true')
data = csv_reader.load('file:///C:/Spark/projetos/Data/base.csv')

In [60]:
# Show the first 2 rows
data.show(2)

+--------+-------------+--------+--------+------------------+-------+----------+-------------------+---------------+--------------------+--------------+---------------------+-------------+-----+-----+----------+-----------+--------------------+
|   CAMIS|          DBA|    BORO|BUILDING|            STREET|ZIPCODE|     PHONE|CUISINE DESCRIPTION|INSPECTION DATE|              ACTION|VIOLATION CODE|VIOLATION DESCRIPTION|CRITICAL FLAG|SCORE|GRADE|GRADE DATE|RECORD DATE|     INSPECTION TYPE|
+--------+-------------+--------+--------+------------------+-------+----------+-------------------+---------------+--------------------+--------------+---------------------+-------------+-----+-----+----------+-----------+--------------------+
|50074025|    LILA CAFE|BROOKLYN|     911|        DEKALB AVE|  11221|3475292886|          Caribbean|     03/27/2018|Violations were c...|           10F| Non-food contact ...| Not Critical|   33| null|      null| 04/26/2018|Pre-permit (Non-o...|
|41573314|SABANA LOU

In [61]:
# Show the column names and data types
data.printSchema()

root
 |-- CAMIS: integer (nullable = true)
 |-- DBA: string (nullable = true)
 |-- BORO: string (nullable = true)
 |-- BUILDING: string (nullable = true)
 |-- STREET: string (nullable = true)
 |-- ZIPCODE: string (nullable = true)
 |-- PHONE: string (nullable = true)
 |-- CUISINE DESCRIPTION: string (nullable = true)
 |-- INSPECTION DATE: string (nullable = true)
 |-- ACTION: string (nullable = true)
 |-- VIOLATION CODE: string (nullable = true)
 |-- VIOLATION DESCRIPTION: string (nullable = true)
 |-- CRITICAL FLAG: string (nullable = true)
 |-- SCORE: integer (nullable = true)
 |-- GRADE: string (nullable = true)
 |-- GRADE DATE: string (nullable = true)
 |-- RECORD DATE: string (nullable = true)
 |-- INSPECTION TYPE: string (nullable = true)



In [62]:
# Remove the rows that carry no classification (CRITICAL FLAG == 'Not Applicable').
is_unclassified = data['CRITICAL FLAG'].isin(['Not Applicable'])
data = data.filter(~is_unclassified)
# Columns that are not used; they are dropped from the table in the next cell.
drop_list = [
    'CAMIS', 'DBA', 'BORO', 'BUILDING', 'STREET', 'ZIPCODE',
    'CUISINE DESCRIPTION', 'PHONE', 'INSPECTION DATE', 'ACTION',
    'VIOLATION CODE', 'SCORE', 'GRADE', 'GRADE DATE', 'RECORD DATE',
    'INSPECTION TYPE',
]

In [63]:
# Keep only the columns that are not listed in drop_list, then preview the filtered table.
kept_columns = [name for name in data.columns if name not in drop_list]
data = data.select(kept_columns)
data.show(5)

+---------------------+-------------+
|VIOLATION DESCRIPTION|CRITICAL FLAG|
+---------------------+-------------+
| Non-food contact ...| Not Critical|
| Non-food contact ...| Not Critical|
| Non-food contact ...| Not Critical|
| Non-food contact ...| Not Critical|
| Food contact surf...|     Critical|
+---------------------+-------------+
only showing top 5 rows



In [64]:
# Show the column names and data types of the filtered table
data.printSchema()

root
 |-- VIOLATION DESCRIPTION: string (nullable = true)
 |-- CRITICAL FLAG: string (nullable = true)



In [65]:
# Count the rows for each CRITICAL FLAG value, largest count first.
from pyspark.sql.functions import col
flag_counts = data.groupBy("CRITICAL FLAG").count()
flag_counts.orderBy(col("count").desc()).show()

+-------------+------+
|CRITICAL FLAG| count|
+-------------+------+
|     Critical|204546|
| Not Critical|162077|
+-------------+------+



In [66]:
# Count the rows for each violation description and show the 5 most frequent.
description_counts = data.groupBy("VIOLATION DESCRIPTION").count()
description_counts.orderBy(col("count").desc()).show(5)

+---------------------+-----+
|VIOLATION DESCRIPTION|count|
+---------------------+-----+
| Non-food contact ...|52814|
| Facility not verm...|38420|
| Evidence of mice ...|26858|
| Food not protecte...|25364|
| Food contact surf...|25240|
+---------------------+-----+
only showing top 5 rows



In [67]:
# Import the stages used to tokenize, remove stop words and build count vectors
# ("document-term vectors").
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression

# Regular-expression tokenizer: splits the description on the "\W" pattern.
regexTokenizer = RegexTokenizer(
    inputCol="VIOLATION DESCRIPTION",
    outputCol="words",
    pattern="\\W",
)

# Stop words: a list of 25 semantically non-selective words which are common in Reuters-RCV1.
add_stopwords = [
    "a", "an", "and", "are", "as", "at", "be", "by", "for", "from",
    "has", "he", "in", "is", "it", "its", "of", "on", "that", "the",
    "to", "was", "were", "will", "with",
]
stopwordsRemover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
    stopWords=add_stopwords,
)

# Bag-of-words counts over the filtered tokens.
countVectors = CountVectorizer(
    inputCol="filtered",
    outputCol="features",
    vocabSize=10000,
    minDF=5,
)

In [69]:
# StringIndexer turns the string label column into a column of label indices.
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
label_stringIdx = StringIndexer(inputCol="CRITICAL FLAG", outputCol="label")
# Stage order: tokenize -> remove stop words -> count vectors -> index the label.
count_vector_stages = [regexTokenizer, stopwordsRemover, countVectors, label_stringIdx]
pipeline = Pipeline(stages=count_vector_stages)

In [70]:
# Fit the pipeline on the documents, then add the words/filtered/features/label columns.
# NOTE(review): the pipeline is fit on all of `data`, and the train/test split is only
# taken afterwards from `dataset`, so the vocabulary and label index are learned from
# rows that later land in the test set. Consider splitting first and fitting on the
# training rows only.
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
dataset.show(5)

+---------------------+-------------+--------------------+--------------------+--------------------+-----+
|VIOLATION DESCRIPTION|CRITICAL FLAG|               words|            filtered|            features|label|
+---------------------+-------------+--------------------+--------------------+--------------------+-----+
| Non-food contact ...| Not Critical|[non, food, conta...|[non, food, conta...|(528,[0,1,2,4,5,6...|  1.0|
| Non-food contact ...| Not Critical|[non, food, conta...|[non, food, conta...|(528,[0,1,2,4,5,6...|  1.0|
| Non-food contact ...| Not Critical|[non, food, conta...|[non, food, conta...|(528,[0,1,2,4,5,6...|  1.0|
| Non-food contact ...| Not Critical|[non, food, conta...|[non, food, conta...|(528,[0,1,2,4,5,6...|  1.0|
| Food contact surf...|     Critical|[food, contact, s...|[food, contact, s...|(528,[1,2,5,6,7,3...|  0.0|
+---------------------+-------------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows



In [71]:
# 70/30 train/test split; the seed is fixed for reproducibility.
trainingData, testData = dataset.randomSplit([0.7, 0.3], seed=100)
training_count = trainingData.count()
test_count = testData.count()
print("Training Dataset Count: " + str(training_count))
print("Test Dataset Count: " + str(test_count))

Training Dataset Count: 257050
Test Dataset Count: 109573


In [72]:
# Logistic regression trained on the count-vector features.
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
# Show 10 test rows predicted as class 0, ordered by the probability column (descending).
predicted_zero = predictions.where(predictions['prediction'] == 0)
(predicted_zero
    .select("CRITICAL FLAG", "VIOLATION DESCRIPTION", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(n=10, truncate=30))

+-------------+------------------------------+------------------------------+-----+----------+
|CRITICAL FLAG|         VIOLATION DESCRIPTION|                   probability|label|prediction|
+-------------+------------------------------+------------------------------+-----+----------+
|     Critical|Filth flies or food/refuse/...|[0.9660886616772623,0.03391...|  0.0|       0.0|
|     Critical|Filth flies or food/refuse/...|[0.9660886616772623,0.03391...|  0.0|       0.0|
|     Critical|Filth flies or food/refuse/...|[0.9660886616772623,0.03391...|  0.0|       0.0|
|     Critical|Filth flies or food/refuse/...|[0.9660886616772623,0.03391...|  0.0|       0.0|
|     Critical|Filth flies or food/refuse/...|[0.9660886616772623,0.03391...|  0.0|       0.0|
|     Critical|Filth flies or food/refuse/...|[0.9660886616772623,0.03391...|  0.0|       0.0|
|     Critical|Filth flies or food/refuse/...|[0.9660886616772623,0.03391...|  0.0|       0.0|
|     Critical|Filth flies or food/refuse/...|[0.9

In [73]:
# Check the accuracy of the predictions on the test set.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# metricName is set explicitly because the evaluator's default metric is "f1";
# without it the value displayed below would be the F1 score, not accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
evaluator.evaluate(predictions)

1.0

In [74]:
# Build the TF-IDF feature pipeline.
from pyspark.ml.feature import HashingTF, IDF
hashingTF = HashingTF(
    inputCol="filtered",
    outputCol="rawFeatures",
    numFeatures=100000,
)
# minDocFreq: remove sparse terms
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features",
    minDocFreq=5,
)
# Stage order: tokenize -> remove stop words -> hashed term frequencies -> IDF -> index the label.
tfidf_stages = [regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx]
pipeline = Pipeline(stages=tfidf_stages)

In [75]:
# Split the raw rows first and fit the TF-IDF pipeline on the training rows only,
# so the IDF weights and the label index are not learned from rows that end up in
# the test set (fitting on all of `data` before splitting leaks test information).
(trainingRaw, testRaw) = data.randomSplit([0.7, 0.3], seed = 100)
pipelineFit = pipeline.fit(trainingRaw)
trainingData = pipelineFit.transform(trainingRaw)
testData = pipelineFit.transform(testRaw)
# `dataset` is still defined (full table transformed with the training-fitted pipeline)
# in case other cells refer to it.
dataset = pipelineFit.transform(data)

In [76]:
# Prediction using logistic regression on the TF-IDF features.
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)
# Show 10 test rows predicted as class 0, ordered by the probability column (descending).
predicted_zero = predictions.where(predictions['prediction'] == 0)
(predicted_zero
    .select("VIOLATION DESCRIPTION", "CRITICAL FLAG", "probability", "label", "prediction")
    .orderBy("probability", ascending=False)
    .show(n=10, truncate=30))

+------------------------------+-------------+------------------------------+-----+----------+
|         VIOLATION DESCRIPTION|CRITICAL FLAG|                   probability|label|prediction|
+------------------------------+-------------+------------------------------+-----+----------+
|Filth flies or food/refuse/...|     Critical|[0.966088661677262,0.033911...|  0.0|       0.0|
|Filth flies or food/refuse/...|     Critical|[0.966088661677262,0.033911...|  0.0|       0.0|
|Filth flies or food/refuse/...|     Critical|[0.966088661677262,0.033911...|  0.0|       0.0|
|Filth flies or food/refuse/...|     Critical|[0.966088661677262,0.033911...|  0.0|       0.0|
|Filth flies or food/refuse/...|     Critical|[0.966088661677262,0.033911...|  0.0|       0.0|
|Filth flies or food/refuse/...|     Critical|[0.966088661677262,0.033911...|  0.0|       0.0|
|Filth flies or food/refuse/...|     Critical|[0.966088661677262,0.033911...|  0.0|       0.0|
|Filth flies or food/refuse/...|     Critical|[0.9

In [77]:
# Check the accuracy of the logistic regression on the test set.
# metricName is set explicitly because the evaluator's default metric is "f1";
# without it the value displayed below would be the F1 score, not accuracy.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="accuracy",
)
evaluator.evaluate(predictions)

1.0

In [None]:
# TODO (not implemented yet): build the per-class vectors in Spark (by summing the vectors, adding together the entries that share the same index)

In [None]:
# TODO (not implemented yet): create the attributes by measuring the Euclidean distance from the original rows to the class vectors

In [None]:
# TODO (not implemented yet): show the attribute columns