# Laboratorio 11 - Data Science
## Proyecto de Consultoría: Regresión Logística
- Nelson García Bravatti
- Christian Echeverría


In [1]:
!pip -q install pyspark

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.window import Window
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import DenseMatrix
from pyspark.ml.stat import ChiSquareTest
from pyspark.ml.stat import Correlation

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Start (or reuse) the Spark session used by the rest of the notebook and report its version.
session_builder = SparkSession.builder.appName("Lab DF — Abandono")
spark = session_builder.getOrCreate()
print("Spark version:", spark.version)

Spark version: 3.5.1


In [6]:
file_path = 'abandono_clientes.csv'

# Read the CSV treating the first line as a header and letting Spark infer column types.
csv_reader = spark.read.options(header=True, inferSchema=True)
df = csv_reader.csv(file_path)

# Preview the first five rows of the DataFrame.
df.show(n=5)

+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|           Names| Age|Total_Purchase|Account_Manager|Years|Num_Sites|       Onboard_date|            Location|             Company|Churn|
+----------------+----+--------------+---------------+-----+---------+-------------------+--------------------+--------------------+-----+
|Cameron Williams|42.0|       11066.8|              0| 7.22|      8.0|2013-08-30 07:00:40|10265 Elizabeth M...|          Harvey LLC|    1|
|   Kevin Mueller|41.0|      11916.22|              0|  6.5|     11.0|2013-08-13 00:38:46|6157 Frank Garden...|          Wilson PLC|    1|
|     Eric Lozano|38.0|      12884.75|              0| 6.67|     12.0|2016-06-29 06:20:07|1331 Keith Court ...|Miller, Johnson a...|    1|
|   Phillip White|42.0|       8010.76|              0| 6.71|     10.0|2014-04-22 12:43:12|13120 Daniel Moun...|           Smith Inc|    1|
|  Cynthia Norton|37.0|    

In [7]:
# Print the column names, inferred types and nullability of the loaded DataFrame.
df.printSchema()

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)



In [8]:
# ---------- 1) Dimensions and duplicates ----------
n_rows, n_cols = df.count(), len(df.columns)
print(f"Filas: {n_rows}  |  Columnas: {n_cols}")

# Number of distinct Company values that appear on more than one row.
rows_per_company = df.groupBy("Company").count()
dup_company = rows_per_company.where(F.col("count") > 1).count()
print(f"Empresas duplicadas por 'Company': {dup_company}")

Filas: 900  |  Columnas: 10
Empresas duplicadas por 'Company': 23


In [10]:
# ---------- 2) Nulls and types ----------
# Count the nulls of every column in ONE aggregation (a single Spark job) instead
# of running a separate filter + count job per column. count() ignores the NULLs
# produced by when() for non-null cells, so each aggregate is that column's null count.
null_counts = df.agg(
    *[F.count(F.when(F.col(c).isNull(), 1)).alias(c) for c in df.columns]
).first()
nulls = list(zip(df.columns, null_counts))
spark.createDataFrame(nulls, ["columna", "nulos"]).show(n=100, truncate=False)

# Column types: the section title announces them and the recorded cell output
# contains a schema dump, so print the schema explicitly here.
df.printSchema()

# Quick ranges (numeric columns): min, max and average, in that column order.
num_cols = ["Age","Total_Purchase","Years","Num_Sites"]
range_exprs = [
    agg_fn(c).alias(f"{prefix}_{c}")
    for prefix, agg_fn in (("min", F.min), ("max", F.max), ("avg", F.avg))
    for c in num_cols
]
df.select(range_exprs).show(truncate=False)



+---------------+-----+
|columna        |nulos|
+---------------+-----+
|Names          |0    |
|Age            |0    |
|Total_Purchase |0    |
|Account_Manager|0    |
|Years          |0    |
|Num_Sites      |0    |
|Onboard_date   |0    |
|Location       |0    |
|Company        |0    |
|Churn          |0    |
+---------------+-----+

root
 |-- Names: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- Total_Purchase: double (nullable = true)
 |-- Account_Manager: integer (nullable = true)
 |-- Years: double (nullable = true)
 |-- Num_Sites: double (nullable = true)
 |-- Onboard_date: timestamp (nullable = true)
 |-- Location: string (nullable = true)
 |-- Company: string (nullable = true)
 |-- Churn: integer (nullable = true)

+-------+------------------+---------+-------------+-------+------------------+---------+-------------+-----------------+------------------+----------------+-----------------+
|min_Age|min_Total_Purchase|min_Years|min_Num_Sites|max_Age|max_Total_Pur

In [11]:
# ---------- 3) Class balance ----------
# Row count per Churn value plus its share of the total row count (n_rows).
rows_per_class = df.groupBy("Churn").count()
class_balance = rows_per_class.withColumn("prop", F.col("count") / n_rows)
class_balance.show()


+-----+-----+-------------------+
|Churn|count|               prop|
+-----+-----+-------------------+
|    1|  150|0.16666666666666666|
|    0|  750| 0.8333333333333334|
+-----+-----+-------------------+



In [12]:
# ---------- 4) Per-class statistics ----------
# For every numeric column: the mean, the approximate median (p50_) and the
# approximate [25th, 75th] percentile pair. Note that the IQR_ columns hold the
# two quartiles themselves, not their difference.
mean_exprs = [F.mean(c).alias(f"mean_{c}") for c in num_cols]
median_exprs = [F.expr(f"percentile_approx({c}, 0.5)").alias(f"p50_{c}") for c in num_cols]
quartile_exprs = [
    F.expr(f"percentile_approx({c}, array(0.25,0.75))").alias(f"IQR_{c}")
    for c in num_cols
]

stats_by_churn = df.groupBy("Churn").agg(
    F.count("*").alias("n"),
    *mean_exprs,
    *median_exprs,
    *quartile_exprs,
)
stats_by_churn.show(truncate=False)

+-----+---+-----------------+-------------------+------------------+-----------------+-------+------------------+---------+-------------+------------+-------------------+------------+-------------+
|Churn|n  |mean_Age         |mean_Total_Purchase|mean_Years        |mean_Num_Sites   |p50_Age|p50_Total_Purchase|p50_Years|p50_Num_Sites|IQR_Age     |IQR_Total_Purchase |IQR_Years   |IQR_Num_Sites|
+-----+---+-----------------+-------------------+------------------+-----------------+-------+------------------+---------+-------------+------------+-------------------+------------+-------------+
|1    |150|42.99333333333333|10192.179933333337 |5.8835999999999995|10.66            |43.0   |10271.19          |5.79     |11.0         |[38.0, 47.0]|[8563.24, 11758.69]|[5.12, 6.68]|[10.0, 12.0] |
|0    |750|41.58133333333333|10036.952853333332 |5.1510666666666625|8.173333333333334|41.0   |9993.5            |5.08     |8.0          |[37.0, 46.0]|[8475.8, 11764.35] |[4.36, 5.99]|[7.0, 9.0]   |
+-----+---

In [14]:
# ---------- 5) Date engineering: tenure derived from Onboard_date ----------
# Approximate tenure in years, measured against the latest Onboard_date found in
# the data (used as a stand-in for "today").
max_date = df.select(F.max("Onboard_date").alias("maxd")).first()["maxd"]
days_since_onboard = F.datediff(F.lit(max_date), F.col("Onboard_date"))
df2 = df.withColumn(
    "Tenure_years_from_onboard",
    (days_since_onboard / 365.25).cast("double"),
)

# Compare the derived tenure with the Years column: mean absolute gap and correlation.
tenure_gap = F.abs(F.col("Tenure_years_from_onboard") - F.col("Years"))
df2.select(
    F.avg(tenure_gap).alias("MAE_Years_vs_Tenure"),
    F.corr("Tenure_years_from_onboard", "Years").alias("corr_Years_vs_Tenure"),
).show()

# ---------- 6) Sanity check: is Account_Manager assigned independently of Churn? ----------
# Contingency table with the share of each Churn value within every Account_Manager group.
per_manager = Window.partitionBy("Account_Manager")
ct = (df.groupBy("Account_Manager","Churn").count()
        .withColumn("prop", F.col("count")/F.sum("count").over(per_manager)))
ct.orderBy("Account_Manager","Churn").show()

# Chi-square independence test using the DataFrame-based pyspark.ml ChiSquareTest
# (not an RDD implementation). ChiSquareTest, StringIndexer and Pipeline are
# imported in the notebook's import cell rather than in the middle of this cell.
# Account_Manager is wrapped in a one-element feature vector; Churn is the label.
assembler = VectorAssembler(inputCols=["Account_Manager"], outputCol="features")
chi_df = assembler.setHandleInvalid("keep").transform(df.select("Account_Manager","Churn").dropna())
chi = ChiSquareTest.test(chi_df, "features", "Churn").head()
print(f"Chi2 p-value Account_Manager ~ Churn: {chi.pValues[0]}")

# ---------- 7) Numeric correlations ----------
# Pack the numeric columns into a single vector column, then compute the Pearson
# correlation matrix over it.
vec_assembler = VectorAssembler(inputCols=num_cols, outputCol="features_num")
numeric_rows = df.select(num_cols).dropna()
num_vec = vec_assembler.setHandleInvalid("skip").transform(numeric_rows)
corr_row = Correlation.corr(num_vec, "features_num", "pearson").head()
corr_mat = corr_row[0]  # DenseMatrix
# Print the matrix with row and column names
def pretty_corr(names, m: "DenseMatrix"):
    """Print a matrix as an aligned text table with labelled rows and columns.

    Args:
        names: Sequence of labels; ``names[i]`` labels both row ``i`` and column ``i``.
        m: Object exposing ``toArray().tolist()`` (e.g. a pyspark ``DenseMatrix``).
            The annotation is a string so defining the function does not
            require evaluating the ``DenseMatrix`` name.

    Returns:
        None. The table is written to stdout, values formatted with 3 decimals.
    """
    arr = m.toArray().tolist()
    # Derive both the row-label gutter and the value-column width from the longest
    # name. Previously the header used an 8-character gutter while rows used
    # 8 + 2, and labels longer than 8 characters (e.g. "Total_Purchase") pushed
    # their row to the right, so columns did not line up.
    longest = max((len(str(c)) for c in names), default=0)
    label_w = max(8, longest)
    col_w = max(14, longest)
    header = " " * (label_w + 2) + "  ".join([f"{c:>{col_w}}" for c in names])
    print(header)
    for i, row in enumerate(arr):
        print(f"{names[i]:>{label_w}}  " + "  ".join([f"{v:{col_w}.3f}" for v in row]))
# Print the correlation matrix computed above, labelled with the numeric column names.
pretty_corr(num_cols, corr_mat)



# ---------- 9) Cardinality of the categorical columns ----------
# Distinct-value count for each free-text / identifier-like column.
uniq_exprs = [
    F.countDistinct(c).alias(f"uniq_{c}")
    for c in ("Names", "Location", "Company")
]
df.agg(*uniq_exprs).show()

# Most frequent locations with their churn rate (to see whether region adds signal).
location_stats = df.groupBy("Location").agg(
    F.count("*").alias("n"),
    F.mean("Churn").alias("churn_rate"),
)
location_stats.orderBy(F.col("n").desc()).show(20, truncate=False)

# ---------- 10) Simple interactions (e.g. sites per year) ----------
# A tiny epsilon is added to Years so the denominator is never exactly zero.
sites_per_year = F.col("Num_Sites") / (F.col("Years") + F.lit(1e-6))
df3 = df2.withColumn("Sites_per_Year", sites_per_year.cast("double"))

# Average sites-per-year ratio for each Churn value.
mean_ratio = F.mean("Sites_per_Year").alias("mean_Sites_per_Year")
df3.groupBy("Churn").agg(mean_ratio).show()

+-------------------+--------------------+
|MAE_Years_vs_Tenure|corr_Years_vs_Tenure|
+-------------------+--------------------+
| 2.9261441630542295| 0.04491650416398004|
+-------------------+--------------------+

+---------------+-----+-----+-------------------+
|Account_Manager|Churn|count|               prop|
+---------------+-----+-----+-------------------+
|              0|    0|  401| 0.8586723768736617|
|              0|    1|   66|0.14132762312633834|
|              1|    0|  349| 0.8060046189376443|
|              1|    1|   84|0.19399538106235567|
+---------------+-----+-----+-------------------+

Chi2 p-value Account_Manager ~ Churn: 0.03414770918874699
                   Age  Total_Purchase           Years       Num_Sites
     Age           1.000          -0.037           0.006          -0.006
Total_Purchase          -0.037           1.000          -0.006          -0.003
   Years           0.006          -0.006           1.000           0.052
Num_Sites          -0.006    