# Exploratory Data Analysis (EDA) for Customer Churn Prediction

In this notebook, we will perform an initial exploratory data analysis on the Telco Customer Churn dataset. The goal is to understand the data's structure, identify any potential issues, and get a first look at the variables.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the CSV at ../data/raw/train.csv into the working dataframe `df`
df = pd.read_csv(filepath_or_buffer='../data/raw/train.csv')

# Lift pandas' column display limit so wide frames are shown without truncation
pd.options.display.max_columns = None

## 1. Initial Data Inspection

In [None]:
# Preview the top five rows to see what the columns and their values look like
df.head(n=5)

In [None]:
# Report the dataframe's dimensions; df.shape is (row count, column count)
print("The dataset has {} rows and {} columns.".format(*df.shape))

In [None]:
# Print pandas' built-in summary of the dataframe via DataFrame.info()
# (used here as a quick check of column dtypes and non-null counts)
df.info()

## 2. Descriptive Statistics

In [None]:
# Get descriptive statistics via DataFrame.describe(); called with its default
# arguments on a mixed-dtype dataframe, pandas summarises only the numeric columns
df.describe()

## 3. Target Variable Analysis

In [None]:
# Count how many rows fall into each value of the target column 'Churn'
print(df.loc[:, 'Churn'].value_counts())

In [None]:
# Bar chart of the number of rows per 'Churn' value.
# Drawing on an explicit Axes and ending with plt.show() keeps the cell output
# to the figure alone; a trailing plt.title(...) call would also echo the
# returned Text object's repr underneath the chart.
fig, ax = plt.subplots()
sns.countplot(x='Churn', data=df, ax=ax)
ax.set_title('Churn Distribution')
plt.show()

## 4. Feature Analysis

### 4.1 Categorical Features

In [None]:
# Identify categorical and numerical columns by dtype: object columns are
# treated as categorical, NumPy numeric dtypes as numerical.

# Object-dtype columns that should not be analysed as categorical features,
# mapped to the reason each one is left out.
excluded_categorical = {
    'Customer ID': 'identifier, not a feature',
    'City': 'too many unique values',
    'Lat Long': 'not a categorical feature',
    'Churn Reason': 'too many unique values, and related to churn',
    'Churn Category': 'too many unique values, and related to churn',
}

object_cols = df.select_dtypes(include=['object']).columns.tolist()
# Filtering (rather than calling list.remove once per name) means a column that
# is absent, or that was not read as object dtype, no longer aborts the cell
# with "ValueError: list.remove(x): x not in list".
categorical_cols = [col for col in object_cols if col not in excluded_categorical]
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()

# Still surface exclusions that matched nothing, so a renamed or re-typed
# column is noticed instead of being silently ignored.
unmatched_exclusions = [col for col in excluded_categorical if col not in object_cols]
if unmatched_exclusions:
    print(f"Excluded columns not found among object columns: {unmatched_exclusions}")

print(f"Categorical columns: {categorical_cols}")
print(f"Numerical columns: {numerical_cols}")

In [None]:
# Analyze distribution of categorical features
for col in categorical_cols:
    plt.figure(figsize=(10, 5))
    sns.countplot(y=col, data=df, order=df[col].value_counts().index)
    plt.title(f'Distribution of {col}')
    plt.show()

### 4.2 Numerical Features

In [None]:
# Histogram grid for the numerical columns: 20 bins each, three plots per row
# (the -1 in `layout` lets pandas work out how many rows are needed)
df.hist(column=numerical_cols, figsize=(15, 20), bins=20, layout=(-1, 3))
plt.gcf().tight_layout()

## 5. Bivariate Analysis (Feature vs. Target)

In [None]:
# Relationship between Contract and Churn: row counts per contract type,
# split by 'Churn' value.
# An explicit Axes plus plt.show() keeps the cell output to the figure alone;
# a trailing plt.title(...) call would also echo the returned Text repr.
fig, ax = plt.subplots()
sns.countplot(x='Contract', hue='Churn', data=df, ax=ax)
ax.set_title('Churn by Contract Type')
plt.show()

In [None]:
# Relationship between Internet Type and Churn: row counts per internet type,
# split by 'Churn' value.
# The Axes is created explicitly (same 10x6 figure size as before) and the cell
# ends with plt.show(), so the output is the figure alone rather than the
# figure plus the Text repr returned by a trailing plt.title(...) call.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='Internet Type', hue='Churn', data=df, ax=ax)
ax.set_title('Churn by Internet Type')
plt.show()

In [None]:
# Relationship between Monthly Charge and Churn: one box of 'Monthly Charge'
# values per 'Churn' value.
# An explicit Axes plus plt.show() keeps the cell output to the figure alone;
# a trailing plt.title(...) call would also echo the returned Text repr.
fig, ax = plt.subplots()
sns.boxplot(x='Churn', y='Monthly Charge', data=df, ax=ax)
ax.set_title('Monthly Charge vs. Churn')
plt.show()

In [None]:
# Relationship between Tenure in Months and Churn: one box of
# 'Tenure in Months' values per 'Churn' value.
# An explicit Axes plus plt.show() keeps the cell output to the figure alone;
# a trailing plt.title(...) call would also echo the returned Text repr.
fig, ax = plt.subplots()
sns.boxplot(x='Churn', y='Tenure in Months', data=df, ax=ax)
ax.set_title('Tenure in Months vs. Churn')
plt.show()