# Machine Learning - Classificação

## Imports

In [1]:
from pyspark.ml.feature import RFormula
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

### Data Import

In [2]:
churn = spark.read.csv('./data/Churn.csv', header = True, sep=';')

In [3]:
churn.show(3)

+-----------+---------+------+---+------+-------+-------------+---------+--------------+---------------+------+
|CreditScore|Geography|Gender|Age|Tenure|Balance|NumOfProducts|HasCrCard|IsActiveMember|EstimatedSalary|Exited|
+-----------+---------+------+---+------+-------+-------------+---------+--------------+---------------+------+
|        619|   France|Female| 42|     2|      0|            1|        1|             1|       10134888|     1|
|        608|    Spain|Female| 41|     1|8380786|            1|        0|             1|       11254258|     0|
|        502|   France|Female| 42|     8|1596608|            3|        1|             0|       11393157|     1|
+-----------+---------+------+---+------+-------+-------------+---------+--------------+---------------+------+
only showing top 3 rows



## Aplicando Classificação

In [4]:
formula = RFormula(formula='Exited ~ .', featuresCol='features', labelCol='label', handleInvalid='skip')

In [5]:
churn_trans = formula.fit(churn).transform(churn).select('features', 'label')

                                                                                

In [6]:
churn_trans.show(truncate=False)

+-----------------------------------------------------------------------------------------------+-----+
|features                                                                                       |label|
+-----------------------------------------------------------------------------------------------+-----+
|(16925,[23,459,475,531,541,6922,6925,6926,7021],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])         |1.0  |
|(16925,[28,472,532,6081,6922,6926,7651],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])                         |0.0  |
|(16925,[285,459,475,534,4752,6924,6925,7722],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])                |1.0  |
|(16925,[43,459,469,532,541,6923,16598],[1.0,1.0,1.0,1.0,1.0,1.0,1.0])                          |0.0  |
|(16925,[0,476,531,2655,6922,6925,6926,15729],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])                |0.0  |
|(16925,[17,461,478,534,1632,6923,6925,9741],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])                 |1.0  |
|(16925,[401,459,461,487,533,541,6923,6925,6926,6969],[1.0,1.0,1

### Separando os dados transformados em treino e teste

In [7]:
churnTreino, churnTeste = churn_trans.randomSplit([.7, .3])

In [8]:
churnTreino.count()

22/12/07 15:00:43 WARN DAGScheduler: Broadcasting large task binary with size 2.9 MiB


7014

In [9]:
churnTeste.count()

22/12/07 15:00:43 WARN DAGScheduler: Broadcasting large task binary with size 2.9 MiB


2986

### Aplicando o DecisionTree Classifier

In [10]:
dt = DecisionTreeClassifier(labelCol='label', featuresCol='features')

In [11]:
modelo = dt.fit(churnTreino)

22/12/07 15:00:45 WARN DAGScheduler: Broadcasting large task binary with size 2.9 MiB
22/12/07 15:00:45 WARN DAGScheduler: Broadcasting large task binary with size 2.9 MiB
22/12/07 15:00:46 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB


[Stage 44:>                                                         (0 + 1) / 1]

22/12/07 15:00:47 WARN MemoryStore: Not enough space to cache rdd_137_0 in memory! (computed 328.5 MiB so far)
22/12/07 15:00:47 WARN BlockManager: Persisting block rdd_137_0 to disk instead.
22/12/07 15:00:48 WARN MemoryStore: Not enough space to cache rdd_137_0 in memory! (computed 328.5 MiB so far)


                                                                                

22/12/07 15:00:49 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB
22/12/07 15:00:49 WARN MemoryStore: Not enough space to cache rdd_137_0 in memory! (computed 328.5 MiB so far)


                                                                                

22/12/07 15:00:49 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB
22/12/07 15:00:50 WARN MemoryStore: Not enough space to cache rdd_137_0 in memory! (computed 328.5 MiB so far)


                                                                                

22/12/07 15:00:51 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB
22/12/07 15:00:51 WARN MemoryStore: Not enough space to cache rdd_137_0 in memory! (computed 328.5 MiB so far)


                                                                                

22/12/07 15:00:52 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB
22/12/07 15:00:52 WARN MemoryStore: Not enough space to cache rdd_137_0 in memory! (computed 328.5 MiB so far)


                                                                                

In [12]:
previsao = modelo.transform(churnTeste)

In [13]:
previsao.show()

22/12/07 15:00:53 WARN DAGScheduler: Broadcasting large task binary with size 2.9 MiB
+--------------------+-----+--------------+--------------------+----------+
|            features|label| rawPrediction|         probability|prediction|
+--------------------+-----+--------------+--------------------+----------+
|(16925,[0,459,461...|  0.0| [861.0,413.0]|[0.67582417582417...|       0.0|
|(16925,[0,459,461...|  0.0| [1743.0,52.0]|[0.97103064066852...|       0.0|
|(16925,[0,459,461...|  0.0| [861.0,413.0]|[0.67582417582417...|       0.0|
|(16925,[0,459,461...|  0.0|[1158.0,203.0]|[0.85084496693607...|       0.0|
|(16925,[0,459,461...|  0.0|[1158.0,203.0]|[0.85084496693607...|       0.0|
|(16925,[0,459,461...|  0.0| [861.0,413.0]|[0.67582417582417...|       0.0|
|(16925,[0,459,461...|  0.0|[1158.0,203.0]|[0.85084496693607...|       0.0|
|(16925,[0,459,461...|  0.0| [861.0,413.0]|[0.67582417582417...|       0.0|
|(16925,[0,459,461...|  0.0|[1158.0,203.0]|[0.85084496693607...|       0.0|
|(

                                                                                

In [14]:
previsao.show(truncate=False)

22/12/07 15:00:55 WARN DAGScheduler: Broadcasting large task binary with size 2.9 MiB
+-----------------------------------------------------------------------------------------------+-----+--------------+-----------------------------------------+----------+
|features                                                                                       |label|rawPrediction |probability                              |prediction|
+-----------------------------------------------------------------------------------------------+-----+--------------+-----------------------------------------+----------+
|(16925,[0,459,461,462,537,2770,6922,6925,12307],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])         |0.0  |[861.0,413.0] |[0.6758241758241759,0.3241758241758242]  |0.0       |
|(16925,[0,459,461,462,539,541,6923,6925,7488],[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])           |0.0  |[1743.0,52.0] |[0.9710306406685236,0.028969359331476322]|0.0       |
|(16925,[0,459,461,463,533,5935,6922,8447],[1.0,1.0,1.

### Avaliação da Performance

In [15]:
avaliar = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='label', metricName='areaUnderROC')

In [16]:
areaUnderRoc = avaliar.evaluate(previsao)

22/12/07 15:00:55 WARN DAGScheduler: Broadcasting large task binary with size 2.9 MiB


In [17]:
print(areaUnderRoc)

0.6225945684965821
