# 1. Data Exploration (Keşifsel Veri Analizi - EDA)

Bu notebook'ta Telco Customer Churn veri setini keşfedeceğiz.

**Amaçlar:**
- Veri setinin yapısını anlamak
- Eksik değerleri tespit etmek
- Hedef değişken dağılımını incelemek
- Özelliklerin dağılımlarını görselleştirmek
- Churn ile ilişkili faktörleri keşfetmek

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Settings
# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.set_option('display.max_columns', None)
# NOTE(review): the 'seaborn-v0_8-*' style names appear to require
# matplotlib >= 3.6 -- confirm against the environment's pinned version.
plt.style.use('seaborn-v0_8-whitegrid')
# IPython magic: render figures inline in the notebook output.
%matplotlib inline

## 1.1 Veri Yükleme

In [None]:
# Load data from CSV or database.
# On success `df` holds the loaded data; if both sources fail, `df` is set to
# None and the reason for each failure is printed.
import sys

# Make the project-level `src` package importable from the notebook's
# directory; the membership check avoids stacking duplicates on re-runs.
if '..' not in sys.path:
    sys.path.append('..')

from src.data.loader import DataLoader

loader = DataLoader(data_dir='../data')

# Try loading from database first, fallback to CSV.
# `except Exception` (instead of a bare `except:`) keeps the best-effort
# fallback but no longer swallows KeyboardInterrupt/SystemExit, and the
# failure reason is reported rather than silently discarded.
try:
    df = loader.load_customer_360()
    print("Loaded from database")
except Exception as db_error:
    print(f"Database load failed ({type(db_error).__name__}: {db_error}); trying CSV")
    # Load from CSV if available
    try:
        df = loader.load_telco_churn()
        print("Loaded from CSV")
    except Exception as csv_error:
        print(f"CSV load failed ({type(csv_error).__name__}: {csv_error})")
        print("Please place the Telco-Customer-Churn.csv in data/raw/")
        df = None

In [None]:
# Report the dataset's dimensions and column names.
# Skipped entirely when `df` is None.
if df is not None:
    column_names = df.columns.tolist()
    print(f"Dataset Shape: {df.shape}")
    print(f"\nColumns ({len(column_names)}):")
    print(column_names)

In [None]:
# First few rows
# Displays the leading rows of the frame as the cell output.
# NOTE: unlike the "Basic info" cell, this call is not guarded against
# `df` being None, so it raises AttributeError in that case.
df.head()

In [None]:
# Data types and non-null counts
# Prints a per-column summary; a quick first check for columns that were
# read as `object` or that contain missing entries.
df.info()

## 1.2 Eksik Değer Analizi

In [None]:
# Per-column missing-value report: absolute count and share of all rows (%).
missing = df.isna().sum()
missing_pct = missing.div(len(df)).mul(100)

# Columns with the most gaps come first.
missing_df = pd.DataFrame(
    {'Missing Count': missing, 'Missing %': missing_pct}
).sort_values(by='Missing Count', ascending=False)

# Display only the columns that actually contain missing values.
missing_df.loc[missing_df['Missing Count'] > 0]

## 1.3 Hedef Değişken Analizi

In [None]:
# Churn distribution
# The target column is 'churned' when present, otherwise 'Churn'.
churn_col = 'churned' if 'churned' in df.columns else 'Churn'

# Count the classes once and reuse the result for both charts.
# value_counts() orders by frequency, so 'green' is applied to the most
# frequent class and 'red' to the next one.
churn_counts = df[churn_col].value_counts()

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Count plot
churn_counts.plot(kind='bar', ax=axes[0], color=['green', 'red'])
axes[0].set_title('Churn Count')
axes[0].set_xlabel('Churned')
axes[0].set_ylabel('Count')

# Pie chart
churn_counts.plot(kind='pie', ax=axes[1], autopct='%1.1f%%', colors=['green', 'red'])
axes[1].set_title('Churn Distribution')

plt.tight_layout()
plt.show()

# The rate needs a numeric/boolean view of the target: calling .mean() on a
# column of text labels raises. Numeric and boolean columns are used as-is;
# anything else is mapped to True for 'yes'/'true'/'1' (case-insensitive).
# NOTE(review): presumably the raw CSV stores 'Yes'/'No' labels -- confirm
# what the loader actually returns.
churn_flag = df[churn_col]
if not pd.api.types.is_numeric_dtype(churn_flag):
    churn_flag = churn_flag.astype(str).str.strip().str.lower().isin(['yes', 'true', '1'])

print(f"\nChurn Rate: {churn_flag.mean() * 100:.2f}%")

## 1.4 Sayısal Değişken Analizi

In [None]:
# Descriptive statistics for every numeric column.
# `numerical_cols` is kept as a notebook-level name for the correlation heatmap.
numerical_cols = list(df.select_dtypes(include=[np.number]).columns)
df.loc[:, numerical_cols].describe()

In [None]:
# Histograms of the three core numeric features, one panel per feature.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

hist_features = ['tenure_months', 'monthly_charges', 'total_charges']
for panel, col in zip(axes, hist_features):
    if col not in df.columns:
        continue  # the panel stays empty when the column is absent
    # Coerce to numeric so non-numeric entries become NaN and are dropped below.
    data = pd.to_numeric(df[col], errors='coerce')
    panel.hist(data.dropna(), bins=30, edgecolor='black', alpha=0.7)
    panel.set_title(f'{col} Distribution')
    panel.set_xlabel(col)

plt.tight_layout()
plt.show()

## 1.5 Kategorik Değişken Analizi

In [None]:
# Collect the names of all object- and bool-typed columns and print them.
categorical_cols = list(df.select_dtypes(include=['object', 'bool']).columns)
print(f"Categorical columns: {categorical_cols}")

In [None]:
# Frequency table for each key categorical feature that exists in the frame.
key_cats = ['contract_type', 'payment_method', 'internet_service']

for col in key_cats:
    if col not in df.columns:
        continue  # silently skip features missing from this data source
    print(f"\n{col}:")
    print(df[col].value_counts())

## 1.6 Churn vs Features Analizi

In [None]:
# Churn rate (%) per contract type, drawn as an annotated bar chart.
# Relies on `churn_col` from the churn-distribution cell.
if 'contract_type' in df.columns:
    churn_by_contract = df.groupby('contract_type')[churn_col].mean().mul(100)

    fig, ax = plt.subplots(figsize=(10, 5))
    churn_by_contract.plot.bar(ax=ax, color=['#ff6b6b', '#feca57', '#48dbfb'])
    ax.set_title('Churn Rate by Contract Type')
    ax.set_ylabel('Churn Rate (%)')
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45)

    # Write each bar's value one unit above its top.
    for position, rate in enumerate(churn_by_contract):
        ax.text(position, rate + 1, f'{rate:.1f}%', ha='center')

    plt.tight_layout()
    plt.show()

In [None]:
# Churn by tenure groups
# Adds a categorical 'tenure_group' column to `df` and plots the churn rate
# (%) of each group. Relies on `churn_col` from the churn-distribution cell.
if 'tenure_months' in df.columns:
    # include_lowest=True keeps tenure 0 inside '0-12': with the default
    # right-closed bins, 0 lies outside (0, 12] and would become NaN.
    # The open upper edge lets '49+' match its label instead of turning
    # tenures above 72 into NaN.
    # Tenure is coerced to numeric, as in the histogram cell.
    df['tenure_group'] = pd.cut(
        pd.to_numeric(df['tenure_months'], errors='coerce'),
        bins=[0, 12, 24, 48, np.inf],
        labels=['0-12', '13-24', '25-48', '49+'],
        include_lowest=True,
    )

    # observed=False is spelled out: empty groups stay on the chart, and
    # recent pandas versions warn when it is omitted for a categorical key.
    churn_by_tenure = df.groupby('tenure_group', observed=False)[churn_col].mean() * 100

    fig, ax = plt.subplots(figsize=(10, 5))
    churn_by_tenure.plot(kind='bar', ax=ax, color='coral')
    ax.set_title('Churn Rate by Tenure Group')
    ax.set_ylabel('Churn Rate (%)')
    ax.set_xlabel('Tenure (months)')

    plt.tight_layout()
    plt.show()

In [None]:
# Pairwise correlations between the numeric features, shown as an annotated
# heatmap. Relies on `numerical_cols` from the numerical-statistics cell.
numerical_df = df[numerical_cols].apply(pd.to_numeric, errors='coerce')
corr_matrix = numerical_df.corr()

plt.figure(figsize=(10, 8))
heatmap_ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
heatmap_ax.set_title('Correlation Heatmap')
plt.tight_layout()
plt.show()

## 1.7 Key Insights

**Bulgular:**
1. Month-to-month sözleşmeli müşteriler daha yüksek churn oranına sahip
2. Kısa süreli müşteriler (0-12 ay) daha fazla churn yapıyor
3. Fiber optik internet kullananlar daha yüksek churn gösteriyor
4. Electronic check ödeme yöntemi daha riskli
5. Tech support ve online security hizmetleri churn'ü azaltıyor

In [None]:
# Summary statistics
# Prints the headline numbers of the exploration. Relies on `churn_col`
# from the churn-distribution cell.
print("=" * 50)
print("DATA EXPLORATION SUMMARY")
print("=" * 50)
print(f"Total Customers: {len(df)}")

# .mean() needs a numeric/boolean target; text labels are mapped to True
# for 'yes'/'true'/'1' (case-insensitive) so the rate does not raise.
churn_flag = df[churn_col]
if not pd.api.types.is_numeric_dtype(churn_flag):
    churn_flag = churn_flag.astype(str).str.strip().str.lower().isin(['yes', 'true', '1'])
print(f"Churn Rate: {churn_flag.mean() * 100:.2f}%")

# The earlier cells only use these columns when they exist; do the same
# here so the summary degrades gracefully instead of raising KeyError.
if 'tenure_months' in df.columns:
    avg_tenure = pd.to_numeric(df['tenure_months'], errors='coerce').mean()
    print(f"Average Tenure: {avg_tenure:.1f} months")
else:
    print("Average Tenure: N/A ('tenure_months' column not found)")

if 'monthly_charges' in df.columns:
    avg_monthly = pd.to_numeric(df['monthly_charges'], errors='coerce').mean()
    print(f"Average Monthly Charges: ${avg_monthly:.2f}")
else:
    print("Average Monthly Charges: N/A ('monthly_charges' column not found)")