In [1]:
import pandas as pd

from pyspark.sql import SparkSession
from pyspark.sql import Row

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.linalg import Vectors
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [2]:
# Build (or reuse) the local SparkSession shared by the rest of the notebook.
sps = (
    SparkSession.builder
    .master('local')
    .appName('customer_churn')
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [3]:
# Layout of one CSV data line after the leading index field is dropped:
# (column name, converter) pairs in file order.
_FIELDS = (
    ('state', str),
    ('account_length', int),
    ('area_code', str),
    ('international_plan', int),
    ('voice_mail_plan', int),
    ('number_vmail_messages', int),
    ('total_day_minutes', float),
    ('total_day_calls', int),
    ('total_day_charge', float),
    ('total_eve_minutes', float),
    ('total_eve_calls', int),
    ('total_eve_charge', float),
    ('total_night_minutes', float),
    ('total_night_calls', int),
    ('total_night_charge', float),
    ('total_intl_minutes', float),
    ('total_intl_calls', int),
    ('total_intl_charge', float),
    ('number_customer_service_calls', int),
    ('churn', int),
)


def _parse_line(line):
    """Turn one raw CSV data line into a typed Row.

    The quoted literals "yes"/"no" are rewritten to 1/0 before splitting so
    the plan and churn columns can be parsed as integers. The first
    comma-separated field is discarded.

    Raises IndexError if the line has too few fields and ValueError if a
    numeric field cannot be converted.
    """
    values = line.replace('"yes"', '1').replace('"no"', '0').split(',')[1:]
    # Index explicitly (rather than zip) so a short line fails loudly
    # instead of silently producing a Row with missing columns.
    return Row(**{name: convert(values[i])
                  for i, (name, convert) in enumerate(_FIELDS)})


def carregaDados(nome=''):
    """Load a telecom churn CSV file into a Spark DataFrame.

    Parameters
    ----------
    nome : str
        Path of the CSV file, passed to SparkContext.textFile.

    Returns
    -------
    DataFrame or None
        One row per data line with the columns listed in ``_FIELDS``.
        If anything fails while loading, the exception is printed and
        None is returned, so callers should check the result before use.
    """
    try:
        # Use the session's own context instead of relying on a global `sc`.
        rdd = sps.sparkContext.textFile(nome)
        header = rdd.first()
        # Drop the header by exact match; a substring test could also drop
        # data lines that merely contain the header text.
        rows = rdd.filter(lambda line: line != header).map(_parse_line)
        return sps.createDataFrame(rows)
    except Exception as e:
        print(e)
        return None
        

In [14]:
# Training set
dft1 = carregaDados(nome='projeto4_telecom_treino.csv')

In [15]:
# Test set
dft2 = carregaDados(nome='projeto4_telecom_teste.csv')

In [17]:
# Encode the string columns as numeric indices.
# Both indexers are fitted on the training frame only and then applied to
# the training and test frames alike.
ac_indexer = StringIndexer(inputCol="area_code", outputCol="area_code_t").fit(dft1)
st_indexer = StringIndexer(inputCol='state', outputCol='state_t').fit(dft1)

# Add the indexed columns, then discard the original string columns.
dft1 = st_indexer.transform(ac_indexer.transform(dft1)).drop('area_code', 'state')
dft2 = st_indexer.transform(ac_indexer.transform(dft2)).drop('area_code', 'state')

In [20]:
# Convert the training frame to pandas and display its summary statistics.
dft1.toPandas().describe()

Unnamed: 0,account_length,churn,international_plan,number_customer_service_calls,number_vmail_messages,total_day_calls,total_day_charge,total_day_minutes,total_eve_calls,total_eve_charge,total_eve_minutes,total_intl_calls,total_intl_charge,total_intl_minutes,total_night_calls,total_night_charge,total_night_minutes,voice_mail_plan,area_code_t,state_t
count,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0
mean,101.064806,0.144914,0.09691,1.562856,8.09901,100.435644,30.562307,179.775098,100.114311,17.08354,200.980348,4.479448,2.764581,10.237294,100.107711,9.039325,200.872037,0.276628,0.754875,22.536454
std,39.822106,0.352067,0.295879,1.315491,13.688365,20.069084,9.259435,54.467389,19.922625,4.310668,50.713844,2.461214,0.753773,2.79184,19.568609,2.275873,50.573847,0.447398,0.829515,14.544121
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33.0,1.04,23.2,0.0,0.0,0.0
25%,74.0,0.0,0.0,1.0,0.0,87.0,24.43,143.7,87.0,14.16,166.6,3.0,2.3,8.5,87.0,7.52,167.0,0.0,0.0,10.0
50%,101.0,0.0,0.0,1.0,0.0,101.0,30.5,179.4,100.0,17.12,201.4,4.0,2.78,10.3,100.0,9.05,201.2,0.0,1.0,22.0
75%,127.0,0.0,0.0,2.0,20.0,114.0,36.79,216.4,114.0,20.0,235.3,6.0,3.27,12.1,113.0,10.59,235.3,1.0,2.0,35.0
max,243.0,1.0,1.0,9.0,51.0,165.0,59.64,350.8,170.0,30.91,363.7,20.0,5.4,20.0,175.0,17.77,395.0,1.0,2.0,50.0


In [23]:
# Convert the training frame to pandas and display the pairwise correlation matrix.
dft1.toPandas().corr()

Unnamed: 0,account_length,churn,international_plan,number_customer_service_calls,number_vmail_messages,total_day_calls,total_day_charge,total_day_minutes,total_eve_calls,total_eve_charge,total_eve_minutes,total_intl_calls,total_intl_charge,total_intl_minutes,total_night_calls,total_night_charge,total_night_minutes,voice_mail_plan,area_code_t,state_t
account_length,1.0,0.016541,0.024735,-0.003796,-0.004628,0.03847,0.006214,0.006216,0.01926,-0.006745,-0.006757,0.020661,0.009546,0.009514,-0.013176,-0.00896,-0.008955,0.002918,0.006123,0.001255
churn,0.016541,1.0,0.259852,0.20875,-0.089728,0.018459,0.205151,0.205151,0.009233,0.092786,0.092796,-0.052844,0.068259,0.068239,0.006141,0.035496,0.035493,-0.102148,0.004517,-0.014718
international_plan,0.024735,0.259852,1.0,-0.024522,0.008745,0.003755,0.049398,0.049396,0.006114,0.019106,0.0191,0.017366,0.04578,0.045871,0.012451,-0.028913,-0.028905,0.006006,0.000214,-0.008039
number_customer_service_calls,-0.003796,0.20875,-0.024522,1.0,-0.013263,-0.018942,-0.013427,-0.013423,0.002423,-0.012987,-0.012985,-0.017561,-0.009675,-0.00964,-0.012802,-0.009277,-0.009288,-0.017824,-0.006091,-0.005434
number_vmail_messages,-0.004628,-0.089728,0.008745,-0.013263,1.0,-0.009548,0.000776,0.000778,-0.005864,0.017578,0.017562,0.013957,0.002884,0.002856,0.007123,0.007663,0.007681,0.956927,-0.020857,0.011956
total_day_calls,0.03847,0.018459,0.003755,-0.018942,-0.009548,1.0,0.006753,0.00675,0.006462,-0.021449,-0.021451,0.004574,0.021666,0.021565,-0.019557,0.022927,0.022938,-0.011086,-0.003282,0.004518
total_day_charge,0.006214,0.205151,0.049398,-0.013427,0.000776,0.006753,1.0,1.0,0.015769,0.007036,0.00705,0.008032,-0.010094,-0.010157,0.022972,0.004301,0.004324,-0.001686,-0.034458,-0.019652
total_day_minutes,0.006216,0.205151,0.049396,-0.013423,0.000778,0.00675,1.0,1.0,0.015769,0.007029,0.007043,0.008033,-0.010092,-0.010155,0.022972,0.0043,0.004323,-0.001684,-0.034452,-0.019655
total_eve_calls,0.01926,0.009233,0.006114,0.002423,-0.005864,0.006462,0.015769,0.015769,1.0,-0.011423,-0.01143,0.017434,0.008674,0.008703,0.00771,-0.002056,-0.002093,-0.006444,-0.016664,0.017616
total_eve_charge,-0.006745,0.092786,0.019106,-0.012987,0.017578,-0.021449,0.007036,0.007029,-0.011423,1.0,1.0,0.002541,-0.011074,-0.011043,0.007596,-0.012601,-0.012592,0.021559,0.005697,0.003639


In [24]:
# Correlation between churn and every column of the training frame,
# listed from the most negative to the most positive.

print('Correlations no treino:')
lst = [(nome, dft1.corr('churn', nome)) for nome in dft1.columns]
sorted(lst, key=lambda par: par[1])

Correlations no treino:


[('voice_mail_plan', -0.1021481406701469),
 ('number_vmail_messages', -0.08972796983506418),
 ('total_intl_calls', -0.052844335774137816),
 ('state_t', -0.014717725335615564),
 ('area_code_t', 0.004516661668833458),
 ('total_night_calls', 0.006141203007399843),
 ('total_eve_calls', 0.009233131913077921),
 ('account_length', 0.016540742243674286),
 ('total_day_calls', 0.018459311608577066),
 ('total_night_minutes', 0.03549285342127406),
 ('total_night_charge', 0.0354955562405066),
 ('total_intl_minutes', 0.06823877562717737),
 ('total_intl_charge', 0.06825863150391472),
 ('total_eve_charge', 0.09278603942871391),
 ('total_eve_minutes', 0.09279579031259168),
 ('total_day_charge', 0.20515074317015397),
 ('total_day_minutes', 0.2051508292613899),
 ('number_customer_service_calls', 0.20874999878379408),
 ('international_plan', 0.2598518473454819),
 ('churn', 1.0)]

In [41]:
def transforma(row, cols=None):
    """Convert a Row into a (label, feature vector) pair.

    Parameters
    ----------
    row : Row
        Must contain a "churn" field plus every column named in `cols`.
    cols : sequence of str, optional
        Names of the columns to pack into the feature vector, in order.
        When omitted, the notebook-level `cols_lst` is used, which keeps
        existing one-argument calls (e.g. `rdd.map(transforma)`) working;
        a NameError is raised if that variable does not exist.

    Returns
    -------
    tuple
        (row["churn"], dense vector of the selected column values).
    """
    if cols is None:
        # Legacy behaviour: fall back to the notebook-level variable.
        cols = cols_lst
    return (row["churn"], Vectors.dense([row[item] for item in cols]))

In [44]:
def preve_churn(cols=None):
    """Train a logistic regression on dft1 and return its accuracy on dft2.

    Parameters
    ----------
    cols : sequence of str, optional
        Names of the columns to use as features. When omitted or empty, the
        four columns with the highest positive correlation with churn in the
        training set are used: total_day_charge, total_day_minutes,
        number_customer_service_calls and international_plan.

    Returns
    -------
    float
        Accuracy of the fitted model on the test frame `dft2`.
    """
    feature_cols = list(cols) if cols else [
        'total_day_charge',
        'total_day_minutes',
        'number_customer_service_calls',
        'international_plan',
    ]

    def to_labeled(row):
        # (label, features) pair; closes over feature_cols so the selection
        # does not depend on any notebook-level variable.
        return (row["churn"], Vectors.dense([row[c] for c in feature_cols]))

    # LogisticRegression works on DataFrames, not RDDs: name the tuple
    # fields so that labelCol/featuresCol below can be resolved.
    schema = ["churn", "features"]
    df_train = sps.createDataFrame(dft1.rdd.map(to_labeled), schema)
    df_test = sps.createDataFrame(dft2.rdd.map(to_labeled), schema)

    modelo = LogisticRegression(labelCol="churn", featuresCol="features").fit(df_train)
    preds = modelo.transform(df_test)

    avaliador = MulticlassClassificationEvaluator(predictionCol="prediction",
                                                  labelCol="churn",
                                                  metricName="accuracy")
    return avaliador.evaluate(preds)

+-----+--------------------+
|churn|            features|
+-----+--------------------+
|    0|[45.07,265.1,1.0,...|
|    0|[27.47,161.6,1.0,...|
|    0|[41.38,243.4,0.0,...|
|    0|[50.9,299.4,2.0,1.0]|
|    0|[28.34,166.7,3.0,...|
|    0|[37.98,223.4,0.0,...|
|    0|[37.09,218.2,3.0,...|
|    0|[26.69,157.0,0.0,...|
|    0|[31.37,184.5,1.0,...|
|    0|[43.96,258.6,0.0,...|
+-----+--------------------+
only showing top 10 rows



0.8377281947261663