# PrDS_2024__TelcoCustomerChurn
Telco Customer Churn. Focused customer retention programs

## Libraries

In [99]:
from typing import Any

import numpy as np
import pandas as pd

pd.set_option("display.max_columns", 30)

## Data Extraction

In [100]:
dtype: dict[str, Any] = {
    "customerID": str,
    "gender": "category",
    "SeniorCitizen": bool,
    "Partner": "category",
    "Dependents": "category",
    "tenure": int,
    "PhoneService": "category",
    "MultipleLines": "category",
    "InternetService": "category",
    "OnlineSecurity": "category",
    "OnlineBackup": "category",
    "DeviceProtection": "category",
    "TechSupport": "category",
    "StreamingTV": "category",
    "StreamingMovies": "category",
    "Contract": "category",
    "PaperlessBilling": "category",
    "PaymentMethod": "category",
    "MonthlyCharges": float,
    "Churn": "category",
}
raw_dataframe: pd.DataFrame = pd.read_csv(
    "../data/raw/WA_Fn-UseC_-Telco-Customer-Churn.csv", dtype=dtype
)

## Numerical Data Analysis

In [101]:
print(raw_dataframe.head())
print(raw_dataframe.shape)
print(raw_dataframe.dtypes)
raw_dataframe.info(memory_usage="deep")
print(raw_dataframe.memory_usage(deep=True))
print(raw_dataframe.describe(include="all"))

   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female          False     Yes         No       1           No   
1  5575-GNVDE    Male          False      No         No      34          Yes   
2  3668-QPYBK    Male          False      No         No       2          Yes   
3  7795-CFOCW    Male          False      No         No      45           No   
4  9237-HQITU  Female          False      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity OnlineBackup  \
0  No phone service             DSL             No          Yes   
1                No             DSL            Yes           No   
2                No             DSL            Yes          Yes   
3  No phone service             DSL            Yes           No   
4                No     Fiber optic             No           No   

  DeviceProtection TechSupport StreamingTV StreamingMovies        Contract  \
0               No          No        

## Missing values

In [102]:
missing_values: int = raw_dataframe.isnull().sum()
percent_missing: float = (missing_values / raw_dataframe.shape[0]) * 100
print(
    pd.DataFrame({"Missing Values": missing_values, "Percent": percent_missing})
)

                  Missing Values  Percent
customerID                     0      0.0
gender                         0      0.0
SeniorCitizen                  0      0.0
Partner                        0      0.0
Dependents                     0      0.0
tenure                         0      0.0
PhoneService                   0      0.0
MultipleLines                  0      0.0
InternetService                0      0.0
OnlineSecurity                 0      0.0
OnlineBackup                   0      0.0
DeviceProtection               0      0.0
TechSupport                    0      0.0
StreamingTV                    0      0.0
StreamingMovies                0      0.0
Contract                       0      0.0
PaperlessBilling               0      0.0
PaymentMethod                  0      0.0
MonthlyCharges                 0      0.0
TotalCharges                   0      0.0
Churn                          0      0.0


## Unique

In [103]:
for column in raw_dataframe.columns:
    print(raw_dataframe[column].value_counts())
    print(raw_dataframe[column].unique())
    print(raw_dataframe[column].value_counts(normalize=True) * 100)

customerID
7590-VHVEG    1
3791-LGQCY    1
6008-NAIXK    1
5956-YHHRX    1
5365-LLFYV    1
             ..
9796-MVYXX    1
2637-FKFSY    1
1552-AAGRX    1
4304-TSPVK    1
3186-AJIEK    1
Name: count, Length: 7043, dtype: int64
['7590-VHVEG' '5575-GNVDE' '3668-QPYBK' ... '4801-JZAZL' '8361-LTMKD'
 '3186-AJIEK']
customerID
7590-VHVEG    0.014198
3791-LGQCY    0.014198
6008-NAIXK    0.014198
5956-YHHRX    0.014198
5365-LLFYV    0.014198
                ...   
9796-MVYXX    0.014198
2637-FKFSY    0.014198
1552-AAGRX    0.014198
4304-TSPVK    0.014198
3186-AJIEK    0.014198
Name: proportion, Length: 7043, dtype: float64
gender
Male      3555
Female    3488
Name: count, dtype: int64
['Female', 'Male']
Categories (2, object): ['Female', 'Male']
gender
Male      50.47565
Female    49.52435
Name: proportion, dtype: float64
SeniorCitizen
False    5901
True     1142
Name: count, dtype: int64
[False  True]
SeniorCitizen
False    83.785319
True     16.214681
Name: proportion, dtype: float64
Partner

## Data Preprocessing

### Fixing data dtypes for missing values

In [104]:
dataframe: pd.DataFrame = raw_dataframe.copy()
dataframe['TotalCharges'] = pd.to_numeric(
    dataframe['TotalCharges'].replace(' ', np.nan), errors='coerce'
)
dataframe['TotalCharges'] = dataframe['TotalCharges'].fillna(0)

### Verification

In [105]:
print(dataframe.head)
print(dataframe.dtypes)
dataframe.info(memory_usage="deep")

<bound method NDFrame.head of       customerID  gender  SeniorCitizen Partner Dependents  tenure  \
0     7590-VHVEG  Female          False     Yes         No       1   
1     5575-GNVDE    Male          False      No         No      34   
2     3668-QPYBK    Male          False      No         No       2   
3     7795-CFOCW    Male          False      No         No      45   
4     9237-HQITU  Female          False      No         No       2   
...          ...     ...            ...     ...        ...     ...   
7038  6840-RESVB    Male          False     Yes        Yes      24   
7039  2234-XADUH  Female          False     Yes        Yes      72   
7040  4801-JZAZL  Female          False     Yes        Yes      11   
7041  8361-LTMKD    Male           True     Yes         No       4   
7042  3186-AJIEK    Male          False      No         No      66   

     PhoneService     MultipleLines InternetService OnlineSecurity  \
0              No  No phone service             DSL        

### Memory usage
*From 1.1MB to 719.9KB (34.55% improvement)*

### Binary columns

In [106]:
# Example categorical columns with two unique values
binary_categoricals: list[str] = [
    'gender',
    'SeniorCitizen',
    'Partner',
    'Dependents',
    'PhoneService',
    'PaperlessBilling',
    'Churn',
]

for col in binary_categoricals:
    # Map the categories to boolean
    dataframe[col] = (
        dataframe[col].map({'Yes': True, 'No': False}).astype('bool')
    )
dataframe.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   customerID        7043 non-null   object  
 1   gender            7043 non-null   bool    
 2   SeniorCitizen     7043 non-null   bool    
 3   Partner           7043 non-null   bool    
 4   Dependents        7043 non-null   bool    
 5   tenure            7043 non-null   int32   
 6   PhoneService      7043 non-null   bool    
 7   MultipleLines     7043 non-null   category
 8   InternetService   7043 non-null   category
 9   OnlineSecurity    7043 non-null   category
 10  OnlineBackup      7043 non-null   category
 11  DeviceProtection  7043 non-null   category
 12  TechSupport       7043 non-null   category
 13  StreamingTV       7043 non-null   category
 14  StreamingMovies   7043 non-null   category
 15  Contract          7043 non-null   category
 16  PaperlessBilling  7043 n

### Memory usage
*From 719.KB to 718.6KB (1.80% improvement)*

## Tasks

### EDA Multivariado
- Test de multicolinealidad
### EDA Bivariado 
- gender           
- Dependents       
- MultipleLines    
- OnlineBackup     
- StreamingTV      
- PaperlessBilling 
- TotalCharges     
- Graficos de dispersion de todos contra todos (ggally corrplot)
- Test de multicolinealidad
- Graficos para 3 variables: MonthlyCharges vs Gender vs Churn


In [107]:
# 2024-02-03
print("Hello, World!")

Hello, World!
