In [None]:
# Environment needed by PySpark. This cell runs before the notebook's import
# cell, so it imports what it uses itself (a fresh kernel has neither name).
import os
import sys

# Location of the JDK that Spark's JVM is started from.
os.environ['JAVA_HOME'] = '/opt/jdk'
# Make the driver and the workers use the same interpreter as this kernel.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [None]:
from pyspark.sql import SparkSession

from pyspark.ml import Pipeline

from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import HashingTF,IDF
from pyspark.ml.feature import StringIndexer


from pyspark.ml.classification import DecisionTreeClassifier

from pyspark.ml.evaluation import MulticlassClassificationEvaluator


from pyspark.ml.tuning import ParamGridBuilder,CrossValidator

import pandas as pd

References (Spark 3.1.2):

- Spark documentation: https://spark.apache.org/docs/3.1.2/
- `DecisionTreeClassifier` (Python API): https://spark.apache.org/docs/3.1.2/api/python/reference/api/pyspark.ml.classification.DecisionTreeClassifier.html#pyspark.ml.classification.DecisionTreeClassifier
- `DecisionTreeClassifier` (Java API): https://spark.apache.org/docs/3.1.2/api/java/org/apache/spark/ml/classification/DecisionTreeClassifier.html
- `MulticlassClassificationEvaluator` (Python API): https://spark.apache.org/docs/3.1.2/api/python/reference/api/pyspark.ml.evaluation.MulticlassClassificationEvaluator.html#multiclassclassificationevaluator
- `MulticlassClassificationEvaluator` (Java API): https://spark.apache.org/docs/3.1.2/api/java/org/apache/spark/ml/evaluation/MulticlassClassificationEvaluator.html




In [None]:

# Alternative session configuration, kept for reference only (disabled).
# Besides the driver memory it would also set Arrow conversion, a UTC session
# time zone, console progress reporting and eager evaluation.
#spark = SparkSession.builder \
#        .appName('app_name') \
#        .master('local[*]') \
#        .config('spark.sql.execution.arrow.pyspark.enabled', True) \
#        .config('spark.sql.session.timeZone', 'UTC') \
#        .config('spark.driver.memory','12G') \
#        .config('spark.ui.showConsoleProgress', True) \
#        .config('spark.sql.repl.eagerEval.enabled', True) \
#        .getOrCreate()

#sc=spark.sparkContext

In [None]:
# Start (or reuse) a Spark session in local mode with 12 GB of driver memory.
spark = (
    SparkSession.builder
    .appName('my-cool-app')
    .master('local[*]')
    .config("spark.driver.memory", "12g")
    .getOrCreate()
)

# Handle to the underlying SparkContext of that session.
sc = spark.sparkContext

In [None]:
# Display the session object as the cell output.
spark

In [None]:
# Paths of the input CSV files (training set and test set).
caminho_df_training = 'data_training.csv'
caminho_df_test = 'data_test.csv'
# Alternative inputs (presumably smaller samples for quick runs; confirm):
#caminho_df_test = 'menordata_test.csv'
#caminho_df_training = 'menordata_training.csv'

# Both files carry a header row; column types are inferred from the data.
df_training = spark.read.csv(
    caminho_df_training,
    inferSchema=True,
    header=True,
)
df_test = spark.read.csv(
    caminho_df_test,
    inferSchema=True,
    header=True,
)

In [None]:
# Print the column names and inferred types of the test set.
df_test.printSchema()

In [None]:
# Print the column names and inferred types of the training set.
df_training.printSchema()

In [None]:
# Feature-engineering stages, declared here and chained into a pipeline later.

# "topico" (string) -> numeric "label"; label values not seen while fitting
# are kept in an extra index instead of raising an error.
label_stringIdx = StringIndexer(
    inputCol="topico",
    outputCol="label",
    handleInvalid='keep',
)

# "texto" -> list of tokens in "palavras".
tokenization = Tokenizer(outputCol="palavras", inputCol="texto")

# Drop stop words from the token list.
# NOTE(review): no stop-word list is passed, so the library default applies
# (English, as far as I know). If "texto" is Portuguese, a Portuguese list is
# probably wanted; confirm against the data.
remover_stopword = StopWordsRemover(
    outputCol="palavras_filtradas",
    inputCol="palavras",
)

# Hashed term frequencies, then inverse-document-frequency weighting.
hashingTF = HashingTF(outputCol="tf_features", inputCol="palavras_filtradas")
idf = IDF(outputCol="tf_idf_features", inputCol="tf_features")

In [None]:
# Decision tree trained on the TF-IDF vectors against the indexed label.
dt = DecisionTreeClassifier(labelCol='label', featuresCol='tf_idf_features')

# Full pipeline: label indexing -> text features -> classifier.
stages_dt = [
    label_stringIdx,
    tokenization,
    remover_stopword,
    hashingTF,
    idf,
    dt,
]
pipelineDT = Pipeline(stages=stages_dt)

In [None]:
# Hyper-parameter grid: the only tuned parameter is the maximum tree depth.
dtparamGrid = ParamGridBuilder().addGrid(dt.maxDepth, [2, 3, 5]).build()

In [None]:
# 10-fold cross-validation of the whole pipeline over the depth grid.
# Model selection uses the evaluator's F1 metric on the "label" column
# (these are the evaluator defaults, spelled out here for the reader).
# A fixed seed makes the fold assignment, and therefore the selected model,
# reproducible across kernel restarts.
dtcv = CrossValidator(
    estimator=pipelineDT,
    estimatorParamMaps=dtparamGrid,
    evaluator=MulticlassClassificationEvaluator(
        predictionCol="prediction",
        labelCol="label",
        metricName="f1",
    ),
    numFolds=10,
    seed=42,
)

In [None]:
# Run the cross-validated grid search on the training data and keep the
# resulting fitted model.
dtcvModel = dtcv.fit(df_training)

In [None]:
# Apply the fitted cross-validation model to the test set.
# NOTE(review): the "nb" in the variable name looks like a leftover; the model
# applied here is the decision-tree one (dtcvModel).
df_test_nbcvModel = dtcvModel.transform(df_test)

In [None]:
# Keep only the two columns the evaluation needs: predicted and true label.
df_predictionAndLabels = df_test_nbcvModel.select(['prediction', 'label'])

In [None]:
# Distinct true-label values present in the test predictions, as a Python list.
labels = [
    row['label']
    for row in df_predictionAndLabels.select('label').distinct().collect()
]

In [None]:
# Evaluator used for the test-set metrics; the metric name (and, for per-class
# metrics, the class) is supplied on each evaluate() call.
evaluator = MulticlassClassificationEvaluator(
    labelCol='label',
    predictionCol='prediction',
)

In [None]:
# Per-class and overall quality metrics of the tuned model on the test set.
#
# metric_list ends up with one row per class
#     (class, precision, recall, F1, None, None)
# followed by a single summary row
#     (None, None, None, None, accuracy, macroF1).

# Every evaluate() call below runs a Spark job over this DataFrame. Caching it
# avoids recomputing the whole prediction lineage for each of those jobs.
df_predictionAndLabels.cache()

# Per-class metrics, in the column order used by the rows of metric_list.
PER_CLASS_METRICS = ("precisionByLabel", "recallByLabel", "fMeasureByLabel")

metric_list = []
vec_fMeasureByLabel = []

for label in sorted(labels):
    precisionByLabel, recallByLabel, fMeasureByLabel = (
        evaluator.evaluate(
            df_predictionAndLabels,
            {evaluator.metricName: metric, evaluator.metricLabel: label},
        )
        for metric in PER_CLASS_METRICS
    )

    metric_tuple_one = (label, precisionByLabel, recallByLabel, fMeasureByLabel, None, None)
    metric_list.append(metric_tuple_one)

    vec_fMeasureByLabel.append(fMeasureByLabel)

qtdClasses = len(vec_fMeasureByLabel)

accuracy = evaluator.evaluate(df_predictionAndLabels, {evaluator.metricName: "accuracy"})
# Macro F1 is the unweighted mean of the per-class F1 scores; it is left
# undefined (None) when there are no classes instead of dividing by zero.
macroF1 = sum(vec_fMeasureByLabel) / qtdClasses if qtdClasses else None

metric_tuple_two = (None, None, None, None, accuracy, macroF1)
metric_list.append(metric_tuple_two)

In [None]:
# Turn the collected metric rows into a table and save it as CSV
# (without the pandas row index).
metric_columns = ['class', 'precision', 'recall', 'F1', 'accuracy', 'macroF1']
dt_metrics = pd.DataFrame(metric_list, columns=metric_columns)
dt_metrics.to_csv('metrics_decision_tree.csv', index=False)
