In [84]:
import findspark

In [85]:
findspark.add_jars('/app/postgresql-42.1.4.jar')
findspark.init()

In [86]:
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName("Stocks:ETL")
    .config("spark.driver.memory", "512m")
    .config("spark.driver.cores", "1")
    .config("spark.executor.memory", "512m")
    .config("spark.executor.cores", "1")
    .config("spark.sql.shuffle.partitions", "2")
    .getOrCreate()
)

In [87]:
from pathlib import Path
import pandas as pd

In [88]:
ds = spark.read.csv('/dataset/BankChurners.csv', header=True)

In [89]:
ds.printSchema()
ds.count()

root
 |-- CLIENTNUM: string (nullable = true)
 |-- Attrition_Flag: string (nullable = true)
 |-- Customer_Age: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Dependent_count: string (nullable = true)
 |-- Education_Level: string (nullable = true)
 |-- Marital_Status: string (nullable = true)
 |-- Income_Category: string (nullable = true)
 |-- Card_Category: string (nullable = true)
 |-- Months_on_book: string (nullable = true)
 |-- Total_Relationship_Count: string (nullable = true)
 |-- Months_Inactive_12_mon: string (nullable = true)
 |-- Contacts_Count_12_mon: string (nullable = true)
 |-- Credit_Limit: string (nullable = true)
 |-- Total_Revolving_Bal: string (nullable = true)
 |-- Avg_Open_To_Buy: string (nullable = true)
 |-- Total_Amt_Chng_Q4_Q1: string (nullable = true)
 |-- Total_Trans_Amt: string (nullable = true)
 |-- Total_Trans_Ct: string (nullable = true)
 |-- Total_Ct_Chng_Q4_Q1: string (nullable = true)
 |-- Avg_Utilization_Ratio: string (nullable = 

10127

In [90]:
ds=ds.drop('Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1')

In [91]:
ds=ds.drop('Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2')

In [92]:
ds.show()

+---------+-----------------+------------+------+---------------+---------------+--------------+---------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+--------------------+---------------+--------------+-------------------+---------------------+
|CLIENTNUM|   Attrition_Flag|Customer_Age|Gender|Dependent_count|Education_Level|Marital_Status|Income_Category|Card_Category|Months_on_book|Total_Relationship_Count|Months_Inactive_12_mon|Contacts_Count_12_mon|Credit_Limit|Total_Revolving_Bal|Avg_Open_To_Buy|Total_Amt_Chng_Q4_Q1|Total_Trans_Amt|Total_Trans_Ct|Total_Ct_Chng_Q4_Q1|Avg_Utilization_Ratio|
+---------+-----------------+------------+------+---------------+---------------+--------------+---------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+--------------------+---------

In [93]:
ds=ds.withColumnRenamed("CLIENTNUM", "client#")

In [94]:
import pyspark.sql.types as t

In [95]:
import pyspark.sql.functions as f

In [96]:
accounts_schema = t.StructType([
      t.StructField('client#', t.IntegerType(), True)
    , t.StructField('Attrition_Flag', t.StringType(), True)
    , t.StructField('Customer_Age', t.IntegerType(), True)
    , t.StructField('Gender', t.StringType(), True)
    , t.StructField('Dependent_count', t.IntegerType(), True)
    , t.StructField('Education_Level', t.StringType(), True)
    , t.StructField('Marital_Status', t.StringType(), True)
    , t.StructField('Income_Category', t.StringType(), True)
    , t.StructField('Card_Category', t.StringType(), True)
    , t.StructField('Months_on_book', t.IntegerType(), True)
    , t.StructField('Total_Relationship_Count', t.IntegerType(), True)
    , t.StructField('Months_Inactive_12_mon', t.IntegerType(), True)
    , t.StructField('Contacts_Count_12_mon', t.IntegerType(), True)
    , t.StructField('Credit_Limit', t.IntegerType(), True)
    , t.StructField('Total_Revolving_Bal', t.IntegerType(), True)
    , t.StructField('Avg_Open_To_Buy', t.IntegerType(), True)
    , t.StructField('Total_Amt_Chng_Q4_Q1', t.FloatType(), True)
    , t.StructField('Total_Trans_Amt', t.IntegerType(), True)
    , t.StructField('Total_Trans_Ct', t.IntegerType(), True)
    , t.StructField('Total_Ct_Chng_Q4_Q1', t.FloatType(), True)
    , t.StructField('Avg_Utilization_Ratio', t.FloatType(), True)
])

In [97]:
ds.groupBy('Attrition_Flag').count().orderBy('count')

DataFrame[Attrition_Flag: string, count: bigint]

In [98]:
churned = ds.filter(f.col('Attrition_Flag') == "Attrited Customer").count()
total = ds.count()
churned / total

0.1606596227905599

In [99]:
# Cálculo de proporcion de nulos

In [100]:
ds_nulos = ds.select([f.sum(f.col(c).isNull().cast('integer') / ds.count()).alias(c) for c in ds.columns])

In [101]:
ds_nulos.show()

+-------+--------------+------------+------+---------------+------------------+-------------------+-------------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+--------------------+---------------+--------------+-------------------+---------------------+
|client#|Attrition_Flag|Customer_Age|Gender|Dependent_count|   Education_Level|     Marital_Status|    Income_Category|Card_Category|Months_on_book|Total_Relationship_Count|Months_Inactive_12_mon|Contacts_Count_12_mon|Credit_Limit|Total_Revolving_Bal|Avg_Open_To_Buy|Total_Amt_Chng_Q4_Q1|Total_Trans_Amt|Total_Trans_Ct|Total_Ct_Chng_Q4_Q1|Avg_Utilization_Ratio|
+-------+--------------+------------+------+---------------+------------------+-------------------+-------------------+-------------+--------------+------------------------+----------------------+---------------------+------------+-------------------+---------------+---------

In [106]:
def _drop_nulos(ds, max_nulo_prop=0.5):
    ds_nulos = ds.select([f.sum(f.col(c).isNull().cast('integer') / ds.count()).alias(c) 
                          for c in ds.columns])
    null_cols = [c for c in ds_nulos.columns if ds_nulos.select(c).first()[0] > max_nulo_prop 
                 and c not in PROTECTED_COLS]
    df = ds.drop(*null_cols)
    return ds

In [107]:
PROTECTED_COLS = ['Attrition_Flag']
ds = _drop_nulos(ds)