In [32]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

from scipy import stats

df = pd.read_csv("C:\\Users\\Dell\\Downloads\\customer_churn (3).csv")
df.head()

df.info()
df.describe()
print("\nColumns in Dataset:")
print(df.columns)

print("\nDataset Info:")
print(df.info())

print("\nMissing Values:")
print(df.isnull().sum())

print("\nChurn Distribution:")
print(df['Churn'].value_counts())


# Convert TotalCharges to numeric
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# FIXED (no inplace warning)
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())


# 5. TARGET VARIABLE ENCODING
# --------------------------------------------
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

# --------------------------------------------
# 6. CATEGORICAL ENCODING
# --------------------------------------------

# Label Encoding for Binary Columns
le = LabelEncoder()

binary_cols = [
    'Gender',           # FIXED COLUMN NAME
    'Partner',
    'Dependents',
    'PhoneService',
    'PaperlessBilling'
]

for col in binary_cols:
    if col in df.columns:
        df[col] = le.fit_transform(df[col])

# One-Hot Encoding for Remaining Categorical Columns
df = pd.get_dummies(df, drop_first=True)

print("\nCategorical Encoding Completed")

# 7. FEATURE SCALING
# --------------------------------------------
X = df.drop('Churn', axis=1)
y = df['Churn']

# Min-Max Scaling
minmax_scaler = MinMaxScaler()
X_minmax = minmax_scaler.fit_transform(X)

# Standard Scaling
standard_scaler = StandardScaler()
X_standard = standard_scaler.fit_transform(X)

print("\nFeature Scaling Completed")

# --------------------------------------------
# 8. OUTLIER DETECTION & HANDLING
# --------------------------------------------

Q1 = df['MonthlyCharges'].quantile(0.25)
Q3 = df['MonthlyCharges'].quantile(0.75)
IQR = Q3 - Q1

df = df[
    (df['MonthlyCharges'] >= Q1 - 1.5 * IQR) &
    (df['MonthlyCharges'] <= Q3 + 1.5 * IQR)
]

df = df[(np.abs(stats.zscore(df['MonthlyCharges'])) < 3)]

print("\nOutliers Handled")


# Ensure column names are clean
df.columns = df.columns.str.strip()

# Convert only if columns exist
if 'TotalCharges' in df.columns:
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

if 'MonthlyCharges' in df.columns:
    df['MonthlyCharges'] = pd.to_numeric(df['MonthlyCharges'], errors='coerce')

if 'tenure' in df.columns:
    df['tenure'] = pd.to_numeric(df['tenure'], errors='coerce')



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   CustomerID        500 non-null    object
 1   Tenure            500 non-null    int64 
 2   MonthlyCharges    500 non-null    int64 
 3   TotalCharges      500 non-null    int64 
 4   Contract          500 non-null    object
 5   PaymentMethod     500 non-null    object
 6   PaperlessBilling  500 non-null    object
 7   SeniorCitizen     500 non-null    int64 
 8   Churn             500 non-null    int64 
dtypes: int64(5), object(4)
memory usage: 35.3+ KB

Columns in Dataset:
Index(['CustomerID', 'Tenure', 'MonthlyCharges', 'TotalCharges', 'Contract',
       'PaymentMethod', 'PaperlessBilling', 'SeniorCitizen', 'Churn'],
      dtype='object')

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column            Non-Nu