In [1]:
# File location and type
file_location = "./dados_clientes.csv"
file_type = "csv"


In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.master('local[*]').appName('Churn').getOrCreate()

In [4]:
dados = spark.read.csv(file_location, sep =',', header=True, inferSchema=True)

In [5]:
dados.printSchema()

root
 |-- id: integer (nullable = true)
 |-- Churn: string (nullable = true)
 |-- Mais65anos: integer (nullable = true)
 |-- Conjuge: string (nullable = true)
 |-- Dependentes: string (nullable = true)
 |-- MesesDeContrato: integer (nullable = true)
 |-- TelefoneFixo: string (nullable = true)
 |-- MaisDeUmaLinhaTelefonica: string (nullable = true)
 |-- Internet: string (nullable = true)
 |-- SegurancaOnline: string (nullable = true)
 |-- BackupOnline: string (nullable = true)
 |-- SeguroDispositivo: string (nullable = true)
 |-- SuporteTecnico: string (nullable = true)
 |-- TVaCabo: string (nullable = true)
 |-- StreamingFilmes: string (nullable = true)
 |-- TipoContrato: string (nullable = true)
 |-- ContaCorreio: string (nullable = true)
 |-- MetodoPagamento: string (nullable = true)
 |-- MesesCobrados: double (nullable = true)



In [6]:
dados.groupBy('Churn').count().show()

+-----+-----+
|Churn|count|
+-----+-----+
|  Sim| 5174|
|  Nao| 5174|
+-----+-----+



In [7]:
colunasBinarias = [
    'Churn',
    'Conjuge',
    'Dependentes',
    'TelefoneFixo',
    'MaisDeUmaLinhaTelefonica',
    'SegurancaOnline',
   'SeguroDispositivo',
    'SuporteTecnico',
    'TVaCabo',
    'BackupOnline',
    'StreamingFilmes',
    'ContaCorreio']

In [8]:
from pyspark.sql import functions as f

In [9]:
todasColunas = [f.when(f.col(c)=='Sim', 1).otherwise(0).alias(c) for c in colunasBinarias]

In [10]:
for coluna in reversed(dados.columns):
    if coluna not in colunasBinarias:
        todasColunas.insert(0, coluna)
todasColunas

['id',
 'Mais65anos',
 'MesesDeContrato',
 'Internet',
 'TipoContrato',
 'MetodoPagamento',
 'MesesCobrados',
 Column<'CASE WHEN (Churn = Sim) THEN 1 ELSE 0 END AS Churn'>,
 Column<'CASE WHEN (Conjuge = Sim) THEN 1 ELSE 0 END AS Conjuge'>,
 Column<'CASE WHEN (Dependentes = Sim) THEN 1 ELSE 0 END AS Dependentes'>,
 Column<'CASE WHEN (TelefoneFixo = Sim) THEN 1 ELSE 0 END AS TelefoneFixo'>,
 Column<'CASE WHEN (MaisDeUmaLinhaTelefonica = Sim) THEN 1 ELSE 0 END AS MaisDeUmaLinhaTelefonica'>,
 Column<'CASE WHEN (SegurancaOnline = Sim) THEN 1 ELSE 0 END AS SegurancaOnline'>,
 Column<'CASE WHEN (SeguroDispositivo = Sim) THEN 1 ELSE 0 END AS SeguroDispositivo'>,
 Column<'CASE WHEN (SuporteTecnico = Sim) THEN 1 ELSE 0 END AS SuporteTecnico'>,
 Column<'CASE WHEN (TVaCabo = Sim) THEN 1 ELSE 0 END AS TVaCabo'>,
 Column<'CASE WHEN (BackupOnline = Sim) THEN 1 ELSE 0 END AS BackupOnline'>,
 Column<'CASE WHEN (StreamingFilmes = Sim) THEN 1 ELSE 0 END AS StreamingFilmes'>,
 Column<'CASE WHEN (ContaCorr

In [11]:
dados.select(todasColunas).show()

+---+----------+---------------+-----------+------------+----------------+-------------+-----+-------+-----------+------------+------------------------+---------------+-----------------+--------------+-------+------------+---------------+------------+
| id|Mais65anos|MesesDeContrato|   Internet|TipoContrato| MetodoPagamento|MesesCobrados|Churn|Conjuge|Dependentes|TelefoneFixo|MaisDeUmaLinhaTelefonica|SegurancaOnline|SeguroDispositivo|SuporteTecnico|TVaCabo|BackupOnline|StreamingFilmes|ContaCorreio|
+---+----------+---------------+-----------+------------+----------------+-------------+-----+-------+-----------+------------+------------------------+---------------+-----------------+--------------+-------+------------+---------------+------------+
|  0|         0|              1|        DSL| Mensalmente|BoletoEletronico|        29.85|    0|      1|          0|           0|                       0|              0|                0|             0|      0|           1|              0|      

In [12]:
dataset = dados.select(todasColunas)

In [13]:
dataset.printSchema()

root
 |-- id: integer (nullable = true)
 |-- Mais65anos: integer (nullable = true)
 |-- MesesDeContrato: integer (nullable = true)
 |-- Internet: string (nullable = true)
 |-- TipoContrato: string (nullable = true)
 |-- MetodoPagamento: string (nullable = true)
 |-- MesesCobrados: double (nullable = true)
 |-- Churn: integer (nullable = false)
 |-- Conjuge: integer (nullable = false)
 |-- Dependentes: integer (nullable = false)
 |-- TelefoneFixo: integer (nullable = false)
 |-- MaisDeUmaLinhaTelefonica: integer (nullable = false)
 |-- SegurancaOnline: integer (nullable = false)
 |-- SeguroDispositivo: integer (nullable = false)
 |-- SuporteTecnico: integer (nullable = false)
 |-- TVaCabo: integer (nullable = false)
 |-- BackupOnline: integer (nullable = false)
 |-- StreamingFilmes: integer (nullable = false)
 |-- ContaCorreio: integer (nullable = false)



In [14]:
dataset.select(['Internet', 'TipoContrato', 'MetodoPagamento']).show()

+-----------+------------+----------------+
|   Internet|TipoContrato| MetodoPagamento|
+-----------+------------+----------------+
|        DSL| Mensalmente|BoletoEletronico|
|        DSL|       UmAno|          Boleto|
|        DSL| Mensalmente|          Boleto|
|        DSL|       UmAno|   DebitoEmConta|
|FibraOptica| Mensalmente|BoletoEletronico|
|FibraOptica| Mensalmente|BoletoEletronico|
|FibraOptica| Mensalmente|   CartaoCredito|
|        DSL| Mensalmente|          Boleto|
|FibraOptica| Mensalmente|BoletoEletronico|
|        DSL|       UmAno|   DebitoEmConta|
|        DSL| Mensalmente|          Boleto|
|        Nao|    DoisAnos|   CartaoCredito|
|FibraOptica|       UmAno|   CartaoCredito|
|FibraOptica| Mensalmente|   DebitoEmConta|
|FibraOptica| Mensalmente|BoletoEletronico|
|FibraOptica|    DoisAnos|   CartaoCredito|
|        Nao|       UmAno|          Boleto|
|FibraOptica|    DoisAnos|   DebitoEmConta|
|        DSL| Mensalmente|   CartaoCredito|
|FibraOptica| Mensalmente|Boleto

In [15]:
dataset.groupBy('id').pivot('Internet').agg(f.lit(1)).na.fill(0).show()

+----+---+-----------+---+
|  id|DSL|FibraOptica|Nao|
+----+---+-----------+---+
|7982|  1|          0|  0|
|9465|  0|          1|  0|
|2122|  1|          0|  0|
|3997|  1|          0|  0|
|6654|  0|          1|  0|
|7880|  0|          1|  0|
|4519|  0|          1|  0|
|6466|  0|          1|  0|
| 496|  1|          0|  0|
|7833|  0|          1|  0|
|1591|  0|          0|  1|
|2866|  0|          1|  0|
|8592|  0|          1|  0|
|1829|  0|          1|  0|
| 463|  0|          1|  0|
|4900|  0|          1|  0|
|4818|  0|          1|  0|
|7554|  1|          0|  0|
|1342|  0|          0|  1|
|5300|  0|          1|  0|
+----+---+-----------+---+
only showing top 20 rows



In [16]:
Internet = dataset.groupBy('id').pivot('Internet').agg(f.lit(1)).na.fill(0)
TipoContrato = dataset.groupBy('id').pivot('TipoContrato').agg(f.lit(1)).na.fill(0)
MetodoPagamento = dataset.groupBy('id').pivot('MetodoPagamento').agg(f.lit(1)).na.fill(0)

In [17]:
dataset = dataset\
    .join(Internet, 'id', how='inner')\
    .join(TipoContrato, 'id', how = 'inner')\
    .join(MetodoPagamento, 'id', how = 'inner')\
    .select('*',
           f.col('DSL').alias('Internet_DSL'),\
           f.col('FibraOptica').alias('Internet_FibraOptica'),\
           f.col('Nao').alias('Internet_Nao'),\
            f.col('Mensalmente').alias('TipoContrato_Mensalmente'),\
            f.col('UmAno').alias('TipoContrato_UmAno'),\
            f.col('DoisAnos').alias('tipoContrato_DoisAnos'),\
            f.col('DebitoEmConta').alias('MetodoPagamento_DebitoEmConta'),\
            f.col('CartaoCredito').alias('MetodoPagamento_CartaoCredito'),\
            f.col('BoletoEletronico').alias('MetodoPagamento_BoletoEletronico'),\
            f.col('Boleto').alias('MetodoPagamento_Boleto') \
           ).drop('Internet', 'TipoContrato', 'MetodoPagamento', 'DSL', 'FibraOptica', 'Nao', 'Mensalmente','UmAno', 'DoisAnos', 'DebitoEmConta', 'CartaoCredito', 'BoletoEletronico', 'Boleto')

In [18]:
dataset.printSchema()

root
 |-- id: integer (nullable = true)
 |-- Mais65anos: integer (nullable = true)
 |-- MesesDeContrato: integer (nullable = true)
 |-- MesesCobrados: double (nullable = true)
 |-- Churn: integer (nullable = false)
 |-- Conjuge: integer (nullable = false)
 |-- Dependentes: integer (nullable = false)
 |-- TelefoneFixo: integer (nullable = false)
 |-- MaisDeUmaLinhaTelefonica: integer (nullable = false)
 |-- SegurancaOnline: integer (nullable = false)
 |-- SeguroDispositivo: integer (nullable = false)
 |-- SuporteTecnico: integer (nullable = false)
 |-- TVaCabo: integer (nullable = false)
 |-- BackupOnline: integer (nullable = false)
 |-- StreamingFilmes: integer (nullable = false)
 |-- ContaCorreio: integer (nullable = false)
 |-- Internet_DSL: integer (nullable = true)
 |-- Internet_FibraOptica: integer (nullable = true)
 |-- Internet_Nao: integer (nullable = true)
 |-- TipoContrato_Mensalmente: integer (nullable = true)
 |-- TipoContrato_UmAno: integer (nullable = true)
 |-- tipoContr

In [19]:
from pyspark.ml.feature import VectorAssembler

In [20]:
dataset = dataset.withColumnRenamed('Churn', 'label')

In [21]:
X = dataset.columns
X.remove('label')
X.remove('id')

In [22]:
assembler = VectorAssembler(inputCols =X, outputCol='features')

In [23]:
dataset_prep = assembler.transform(dataset).select('features', 'label')

In [24]:
dataset_prep.show(truncate = False)

+------------------------------------------------------------------------------------------------------------+-----+
|features                                                                                                    |label|
+------------------------------------------------------------------------------------------------------------+-----+
|(24,[1,2,10,12,13,14,17,22],[1.0,45.30540797610398,1.0,1.0,1.0,1.0,1.0,1.0])                                |1    |
|(24,[1,2,3,5,6,8,10,11,12,13,15,17,22],[60.0,103.6142230120257,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])|1    |
|(24,[1,2,5,6,9,10,12,13,14,18,23],[12.0,75.85,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])                         |0    |
|(24,[1,2,3,5,11,12,13,14,19,21],[69.0,61.45,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])                               |0    |
|(24,[1,2,3,5,6,10,13,15,17,22],[7.0,86.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])                                  |1    |
|(24,[1,2,5,6,12,13,15,17,22],[14.0,85.03742670311915,1.0,1.0,1.

In [25]:
SEED = 101

In [26]:
treino, teste = dataset_prep.randomSplit([0.7, 0.3], seed=SEED)

In [27]:
from pyspark.ml.classification import LogisticRegression

In [28]:
lr = LogisticRegression()

In [29]:
modelo_lr = lr.fit(treino)

In [30]:
previsoes_lr_teste = modelo_lr.transform(teste)

In [31]:
previsoes_lr_teste.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(24,[0,1,2,3,4,5,...|    0|[1.04773882412687...|[0.74034045500868...|       0.0|
|(24,[0,1,2,3,4,5,...|    0|[-0.1366605349276...|[0.46588793977598...|       1.0|
|(24,[0,1,2,3,4,5,...|    0|[0.72287161986035...|[0.67323905362807...|       0.0|
|(24,[0,1,2,3,4,5,...|    0|[0.86716499191714...|[0.70415544956888...|       0.0|
|(24,[0,1,2,3,4,5,...|    0|[-0.3327366549522...|[0.41757490222070...|       1.0|
|(24,[0,1,2,3,4,5,...|    1|[-0.0310089575015...|[0.49224838174889...|       1.0|
|(24,[0,1,2,3,4,5,...|    0|[0.03971699525400...|[0.50992794378674...|       0.0|
|(24,[0,1,2,3,4,5,...|    1|[0.17249223213928...|[0.54301645313354...|       0.0|
|(24,[0,1,2,3,4,5,...|    0|[0.70154269745002...|[0.66852971926204...|       0.0|
|(24,[0,1,2,3,4,

In [32]:
resumo_lr_treino = modelo_lr.summary

In [33]:
print("Acurácia: %f" % resumo_lr_treino.accuracy)
print("Precisao: %f" % resumo_lr_treino.precisionByLabel[1])
print("Recall: %f" % resumo_lr_treino.recallByLabel[1])
print("F1: %f" % resumo_lr_treino.fMeasureByLabel()[1])

Acurácia: 0.781293
Precisao: 0.763435
Recall: 0.810403
F1: 0.786218


In [34]:
def calcula_mostra_matriz_confusao(df_transform_modelo, normalize=False, percentagem = True):
    tp =  previsoes_lr_teste.select('label', 'prediction').where((f.col('label') == 1) & (f.col('prediction') ==1)).count()
    tn = previsoes_lr_teste.select('label', 'prediction').where((f.col('label') == 0) & (f.col('prediction') ==0)).count()
    fp =  previsoes_lr_teste.select('label', 'prediction').where((f.col('label') == 0) & (f.col('prediction') ==1)).count()
    fn = previsoes_lr_teste.select('label', 'prediction').where((f.col('label') == 1) & (f.col('prediction') ==0)).count()
    
    valorP = 1
    valorN = 1
    
    if normalize:
        valorP = tp + fn
        valorN = fp + tn
        
    if percentagem and normalize:
        valorP = valorP/100
        valorN = valorN/100
    print(' '*20, 'Previsto')
    print(' '*15, 'Churn', ' '*5, 'Não Churn')
    print(' '*4, 'Churn', ' '*6, int(tp/valorP), ' '*7, int(fn/valorP))
    print('Real')
    print(' '*4, 'Não-Churn', ' '*2, int(fp/valorN), ' '*7, int(tn/valorN))
    print(' ')
    print('Métricas:')
    print('Acurácia: %f' % resumo_lr_treino.accuracy)
    print('Precisão: %f' % resumo_lr_treino.precisionByLabel[1])
    print('Recall: %f ' % resumo_lr_treino.recallByLabel[1])
    print('F1:%f' % resumo_lr_treino.fMeasureByLabel()[1])


In [35]:
calcula_mostra_matriz_confusao (previsoes_lr_teste, normalize = False)

                     Previsto
                Churn       Não Churn
     Churn        1269         329
Real
     Não-Churn    355         1189
 
Métricas:
Acurácia: 0.781293
Precisão: 0.763435
Recall: 0.810403 
F1:0.786218


In [36]:
from pyspark.ml.classification import DecisionTreeClassifier

In [37]:
dtc = DecisionTreeClassifier(seed=SEED)

In [38]:
modelo_dtc = dtc.fit(treino)

In [39]:
previsoes_dtc_treino = modelo_dtc.transform(treino)

In [41]:
previsoes_dtc_treino.show()

+--------------------+-----+--------------+--------------------+----------+
|            features|label| rawPrediction|         probability|prediction|
+--------------------+-----+--------------+--------------------+----------+
|(24,[0,1,2,3,4,5,...|    0|[2073.0,323.0]|[0.86519198664440...|       0.0|
|(24,[0,1,2,3,4,5,...|    0|[2073.0,323.0]|[0.86519198664440...|       0.0|
|(24,[0,1,2,3,4,5,...|    0|[2073.0,323.0]|[0.86519198664440...|       0.0|
|(24,[0,1,2,3,4,5,...|    0|   [32.0,13.0]|[0.71111111111111...|       0.0|
|(24,[0,1,2,3,4,5,...|    0|   [32.0,13.0]|[0.71111111111111...|       0.0|
|(24,[0,1,2,3,4,5,...|    1|   [56.0,63.0]|[0.47058823529411...|       1.0|
|(24,[0,1,2,3,4,5,...|    1| [224.0,211.0]|[0.51494252873563...|       0.0|
|(24,[0,1,2,3,4,5,...|    0|[338.0,1807.0]|[0.15757575757575...|       1.0|
|(24,[0,1,2,3,4,5,...|    0|[2073.0,323.0]|[0.86519198664440...|       0.0|
|(24,[0,1,2,3,4,5,...|    0| [224.0,211.0]|[0.51494252873563...|       0.0|
|(24,[0,1,2,

In [42]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [43]:
evaluator = MulticlassClassificationEvaluator()

In [44]:
print('Métricas:')
print('Acurácia: %f' % evaluator.evaluate(previsoes_dtc_treino,{evaluator.metricName: 'accuracy'} ))
print('Precisão: %f' % evaluator.evaluate(previsoes_dtc_treino, {evaluator.metricName: "precisionByLabel", evaluator.metricLabel: 1}))
print('Recall: %f ' % evaluator.evaluate(previsoes_dtc_treino,{evaluator.metricName: "recallByLabel", evaluator.metricLabel: 1}))
print('F1:%f' % evaluator.evaluate(previsoes_dtc_treino,{evaluator.metricName: 'fMeasureByLabel', evaluator.metricLabel: 1}))


Métricas:
Acurácia: 0.789481
Precisão: 0.791726
Recall: 0.781320 
F1:0.786488


In [45]:

previsoes_dtc_teste = modelo_dtc.transform(teste)

In [46]:
previsoes_dtc_teste.show()

+--------------------+-----+--------------+--------------------+----------+
|            features|label| rawPrediction|         probability|prediction|
+--------------------+-----+--------------+--------------------+----------+
|(24,[0,1,2,3,4,5,...|    0|   [32.0,13.0]|[0.71111111111111...|       0.0|
|(24,[0,1,2,3,4,5,...|    0|  [54.0,116.0]|[0.31764705882352...|       1.0|
|(24,[0,1,2,3,4,5,...|    0|[338.0,1807.0]|[0.15757575757575...|       1.0|
|(24,[0,1,2,3,4,5,...|    0| [224.0,211.0]|[0.51494252873563...|       0.0|
|(24,[0,1,2,3,4,5,...|    0|   [62.0,26.0]|[0.70454545454545...|       0.0|
|(24,[0,1,2,3,4,5,...|    1| [224.0,211.0]|[0.51494252873563...|       0.0|
|(24,[0,1,2,3,4,5,...|    0| [224.0,211.0]|[0.51494252873563...|       0.0|
|(24,[0,1,2,3,4,5,...|    1| [224.0,211.0]|[0.51494252873563...|       0.0|
|(24,[0,1,2,3,4,5,...|    0|[2073.0,323.0]|[0.86519198664440...|       0.0|
|(24,[0,1,2,3,4,5,...|    0|[338.0,1807.0]|[0.15757575757575...|       1.0|
|(24,[0,1,2,

In [47]:
print('Métricas:')
print('Acurácia: %f' % evaluator.evaluate(previsoes_dtc_teste,{evaluator.metricName: 'accuracy'} ))
print('Precisão: %f' % evaluator.evaluate(previsoes_dtc_teste, {evaluator.metricName: "precisionByLabel", evaluator.metricLabel: 1}))
print('Recall: %f ' % evaluator.evaluate(previsoes_dtc_teste,{evaluator.metricName: "recallByLabel", evaluator.metricLabel: 1}))
print('F1:%f' % evaluator.evaluate(previsoes_dtc_teste,{evaluator.metricName: 'fMeasureByLabel', evaluator.metricLabel: 1}))

Métricas:
Acurácia: 0.784214
Precisão: 0.797158
Recall: 0.772215 
F1:0.784488


In [48]:
from pyspark.ml.classification import RandomForestClassifier

In [49]:
rf = RandomForestClassifier(seed=SEED)

In [50]:
modelo_rf = rf.fit(treino)

In [51]:
previsoes_rf_treino = modelo_rf.transform(treino)

In [52]:
previsoes_rf_treino.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(24,[0,1,2,3,4,5,...|    0|[15.8956480513630...|[0.79478240256815...|       0.0|
|(24,[0,1,2,3,4,5,...|    0|[15.335712979194,...|[0.7667856489597,...|       0.0|
|(24,[0,1,2,3,4,5,...|    0|[15.5470870566389...|[0.77735435283194...|       0.0|
|(24,[0,1,2,3,4,5,...|    0|[8.07482664587826...|[0.40374133229391...|       1.0|
|(24,[0,1,2,3,4,5,...|    0|[8.02162950825082...|[0.40108147541254...|       1.0|
|(24,[0,1,2,3,4,5,...|    1|[6.35811497422193...|[0.31790574871109...|       1.0|
|(24,[0,1,2,3,4,5,...|    1|[8.08895233212468...|[0.40444761660623...|       1.0|
|(24,[0,1,2,3,4,5,...|    0|[4.62751004701780...|[0.23137550235089...|       1.0|
|(24,[0,1,2,3,4,5,...|    0|[15.6588867761392...|[0.78294433880696...|       0.0|
|(24,[0,1,2,3,4,

In [53]:
previsoes_rf_teste = modelo_rf.transform(teste)

In [54]:
previsoes_rf_teste.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(24,[0,1,2,3,4,5,...|    0|[8.07482664587826...|[0.40374133229391...|       1.0|
|(24,[0,1,2,3,4,5,...|    0|[6.53094675169338...|[0.32654733758466...|       1.0|
|(24,[0,1,2,3,4,5,...|    0|[6.06371482942920...|[0.30318574147146...|       1.0|
|(24,[0,1,2,3,4,5,...|    0|[8.63261391913623...|[0.43163069595681...|       1.0|
|(24,[0,1,2,3,4,5,...|    0|[6.35811497422193...|[0.31790574871109...|       1.0|
|(24,[0,1,2,3,4,5,...|    1|[7.88045128168474...|[0.39402256408423...|       1.0|
|(24,[0,1,2,3,4,5,...|    0|[9.34747000174199...|[0.46737350008709...|       1.0|
|(24,[0,1,2,3,4,5,...|    1|[7.29234812961423...|[0.36461740648071...|       1.0|
|(24,[0,1,2,3,4,5,...|    0|[15.0673239058128...|[0.75336619529064...|       0.0|
|(24,[0,1,2,3,4,

In [55]:
print('Métricas:')
print('Acurácia: %f' % evaluator.evaluate(previsoes_rf_teste,{evaluator.metricName: 'accuracy'} ))
print('Precisão: %f' % evaluator.evaluate(previsoes_rf_teste, {evaluator.metricName: "precisionByLabel", evaluator.metricLabel: 1}))
print('Recall: %f ' % evaluator.evaluate(previsoes_rf_teste,{evaluator.metricName: "recallByLabel", evaluator.metricLabel: 1}))
print('F1:%f' % evaluator.evaluate(previsoes_rf_teste,{evaluator.metricName: 'fMeasureByLabel', evaluator.metricLabel: 1}))

Métricas:
Acurácia: 0.775939
Precisão: 0.766389
Recall: 0.804756 
F1:0.785104


In [56]:
print('Métricas:')
print('Acurácia: %f' % evaluator.evaluate(previsoes_rf_treino,{evaluator.metricName: 'accuracy'} ))
print('Precisão: %f' % evaluator.evaluate(previsoes_rf_treino, {evaluator.metricName: "precisionByLabel", evaluator.metricLabel: 1}))
print('Recall: %f ' % evaluator.evaluate(previsoes_rf_treino,{evaluator.metricName: "recallByLabel", evaluator.metricLabel: 1}))
print('F1:%f' % evaluator.evaluate(previsoes_rf_treino,{evaluator.metricName: 'fMeasureByLabel', evaluator.metricLabel: 1}))

Métricas:
Acurácia: 0.780183
Precisão: 0.754862
Recall: 0.824944 
F1:0.788348


In [57]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [58]:
dtc = DecisionTreeClassifier(seed= SEED)

In [59]:
grid = ParamGridBuilder()\
        .addGrid(dtc.maxDepth, [2,5,10])\
        .addGrid(dtc.maxBins, [10,32,45])\
        .build()

In [60]:
evaluator = MulticlassClassificationEvaluator()

In [61]:
dtc_cv = CrossValidator(
    estimator=dtc,
    estimatorParamMaps=grid,
    evaluator=evaluator,
    numFolds = 3,
    seed=SEED
)

In [62]:
modelo_dtc_cv = dtc_cv.fit(treino)

In [63]:
previsoes_dtc_cv_teste = modelo_dtc_cv.transform(teste)

In [64]:
print('Métricas:')
print('Acurácia: %f' % evaluator.evaluate(previsoes_dtc_cv_teste,{evaluator.metricName: 'accuracy'} ))
print('Precisão: %f' % evaluator.evaluate(previsoes_dtc_cv_teste, {evaluator.metricName: "precisionByLabel", evaluator.metricLabel: 1}))
print('Recall: %f ' % evaluator.evaluate(previsoes_dtc_cv_teste,{evaluator.metricName: "recallByLabel", evaluator.metricLabel: 1}))
print('F1:%f' % evaluator.evaluate(previsoes_dtc_cv_teste,{evaluator.metricName: 'fMeasureByLabel', evaluator.metricLabel: 1}))

Métricas:
Acurácia: 0.804583
Precisão: 0.792162
Recall: 0.834793 
F1:0.812919


In [65]:
rfc = RandomForestClassifier()

In [66]:
grid = ParamGridBuilder()\
        .addGrid(rfc.maxDepth, [2,5,10])\
        .addGrid(rfc.maxBins, [10,32,45])\
        .addGrid(rfc.numTrees, [10,20,30])\
        .build()

In [67]:
rfc_cv = CrossValidator(
    estimator=rfc,
    estimatorParamMaps=grid,
    evaluator=evaluator,
    numFolds=3
)

In [68]:
modelo_rfc_cv = rfc_cv.fit(treino)

In [69]:
previsoes_rfc_cv_teste = modelo_rfc_cv.transform(teste)

In [70]:
print('Métricas:')
print('Acurácia: %f' % evaluator.evaluate(previsoes_rfc_cv_teste,{evaluator.metricName: 'accuracy'} ))
print('Precisão: %f' % evaluator.evaluate(previsoes_rfc_cv_teste, {evaluator.metricName: "precisionByLabel", evaluator.metricLabel: 1}))
print('Recall: %f ' % evaluator.evaluate(previsoes_rfc_cv_teste,{evaluator.metricName: "recallByLabel", evaluator.metricLabel: 1}))
print('F1:%f' % evaluator.evaluate(previsoes_rfc_cv_teste,{evaluator.metricName: 'fMeasureByLabel', evaluator.metricLabel: 1}))

Métricas:
Acurácia: 0.822088
Precisão: 0.805409
Recall: 0.857322 
F1:0.830555


In [71]:
melhor_modelo_rfc = modelo_rfc_cv.bestModel

In [72]:
print(melhor_modelo_rfc.getMaxDepth())
print(melhor_modelo_rfc.getMaxBins())
print(melhor_modelo_rfc.getNumTrees)

10
45
30


In [73]:
rfc_tuning = RandomForestClassifier(maxDepth=10, maxBins=45, numTrees = 30,seed=SEED)

In [74]:
modelo_rfc_tuning = rfc_tuning.fit(dataset_prep)

In [75]:
novo_cliente = [{'Mais65anos':0,
 'MesesDeContrato':1,
 'MesesCobrados':45.30540797610398,
 'Conjuge':0,
 'Dependentes':0,
 'TelefoneFixo':0,
 'MaisDeUmaLinhaTelefonica':0,
 'SegurancaOnline':0,
 'SeguroDispositivo':0,
 'SuporteTecnico':0,
 'TVaCabo':1,
 'BackupOnline':0,
 'StreamingFilmes':1,
 'ContaCorreio':1,
 'Internet_DSL':1,
 'Internet_FibraOptica':0,
 'Internet_Nao':0,
 'TipoContrato_Mensalmente':1,
 'TipoContrato_UmAno':0,
 'tipoContrato_DoisAnos':0,
 'MetodoPagamento_DebitoEmConta':0,
 'MetodoPagamento_CartaoCredito':0,
 'MetodoPagamento_BoletoEletronico':0,
 'MetodoPagamento_Boleto':1}]

In [76]:
novo_cliente = spark.createDataFrame(novo_cliente)
novo_cliente.show()

+------------+-------+------------+-----------+------------+--------------------+------------+----------+------------------------+-----------------+---------------+----------------------+--------------------------------+-----------------------------+-----------------------------+---------------+-----------------+---------------+--------------+-------+------------+------------------------+------------------+---------------------+
|BackupOnline|Conjuge|ContaCorreio|Dependentes|Internet_DSL|Internet_FibraOptica|Internet_Nao|Mais65anos|MaisDeUmaLinhaTelefonica|    MesesCobrados|MesesDeContrato|MetodoPagamento_Boleto|MetodoPagamento_BoletoEletronico|MetodoPagamento_CartaoCredito|MetodoPagamento_DebitoEmConta|SegurancaOnline|SeguroDispositivo|StreamingFilmes|SuporteTecnico|TVaCabo|TelefoneFixo|TipoContrato_Mensalmente|TipoContrato_UmAno|tipoContrato_DoisAnos|
+------------+-------+------------+-----------+------------+--------------------+------------+----------+------------------------+----

In [77]:
assembler = VectorAssembler(inputCols= X, outputCol = 'features')

In [78]:
novo_cliente_prep = assembler.transform(novo_cliente).select('features')

In [79]:
novo_cliente_prep.show()

+--------------------+
|            features|
+--------------------+
|(24,[1,2,10,12,13...|
+--------------------+



In [80]:
modelo_rfc_tuning.transform(novo_cliente_prep).show()

+--------------------+--------------------+--------------------+----------+
|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+----------+
|(24,[1,2,10,12,13...|[6.94344004074805...|[0.23144800135826...|       1.0|
+--------------------+--------------------+--------------------+----------+

