In [60]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
from statistics import median
from plotly.subplots import make_subplots

In [2]:
churn_data = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [26]:
churn_data["TotalCharges"] = churn_data["TotalCharges"].replace(" ", np.nan)
churn_data["TotalCharges"] = pd.to_numeric(churn_data["TotalCharges"], errors = "coerce")

In [27]:
columns = ["tenure", "MonthlyCharges", "TotalCharges"]

for column in columns:
    print("-"* len(churn_data[column]))
    print(f"Estadísticas para la columna: {column}")
    print(f"Promedio de los datos: {churn_data[column].mean()}")
    print(f"Mediana de los datos: {churn_data[column].median()}")
    print(f"Moda de los datos: {churn_data[column].mode()}")

----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [72]:
fig = make_subplots(rows = 1, cols = 3, subplot_titles = columns)
colors = ["lightblue", "firebrick", "coral"]

for index, column in enumerate(columns):
    fig.add_trace(
        go.Histogram(
            x = churn_data[column],
            name = column,
            marker = dict(
                color = colors[index],
                line = dict(
                    color = "black",
                    width = 1
                )
            )
        ),
        row = 1,
        col = index + 1
    )
fig.update_layout(
    template = "plotly_dark"
)
fig.show()

Hay una cierta similaridad y simetría entre la media y la mediana. Tal vez haya outliers en los datos. Pero hay una distribución simétrica en los datos.

Hay clientes que duran poco, y clientes que tienen servicios básicos.

In [39]:
churn = churn_data.groupby("Churn").agg(sum)


The provided callable <built-in function sum> is currently using DataFrameGroupBy.sum. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "sum" instead.



In [76]:
print(f"Mediana de Tenure (No churn vs Churn): {median(churn_data[churn_data['Churn'] == 'No']['tenure'])} vs {median(churn_data[churn_data['Churn'] == 'Yes']['tenure'])}")
print(f"Mediana de MonthlyCharges (No churn vs Churn): {round(median(churn_data[churn_data['Churn'] == 'No']['MonthlyCharges']), 2)} vs {median(churn_data[churn_data['Churn'] == 'Yes']['MonthlyCharges'])}")

Mediana de Tenure (No churn vs Churn): 38.0 vs 10
Mediana de MonthlyCharges (No churn vs Churn): 64.43 vs 79.65


Los que duraron poco (10 meses) pagan más. Probablemente les estaban haciendo muchos cargos.

In [57]:
fig = go.Figure()

fig.add_trace(
    go.Box(
        x = churn_data["Churn"],
        y = churn_data["MonthlyCharges"],
        marker = dict(
            color = "red",
        ),
        name = "MonthlyCharges"
    )
)

fig.update_layout(
    template = "plotly_dark",
    xaxis = dict(
        title = "Churn"
    ),
    yaxis = dict(
        title = "MontlyCharges (dólares)"
    ),
    title = "Boxplot de MonthlyCharges vs Churn"
)