In [None]:
import io

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from google.colab import files
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import OrdinalEncoder
# File names the two upload dialogs are expected to deliver.
TRAIN_CSV = "customer_churn_dataset-training-master.csv"
TEST_CSV = "customer_churn_dataset-testing-master.csv"

# Two upload prompts (training file, then testing file). The second result is
# merged into the first instead of overwriting it.
uploaded = files.upload()
uploaded.update(files.upload())


def _read_uploaded_csv(name):
    """Load ``name`` from the bytes just uploaded, else from a file on disk.

    Colab stores a re-uploaded file under a de-duplicated name such as
    "name (2).csv", so opening the original path could silently read an older
    copy. Parsing the returned bytes always uses the data from this session.
    """
    if name in uploaded:
        return pd.read_csv(io.BytesIO(uploaded[name]))
    # Upload skipped or cancelled: fall back to a copy already on disk.
    return pd.read_csv(name)


df_train = _read_uploaded_csv(TRAIN_CSV)
df_test = _read_uploaded_csv(TEST_CSV)

# Only a cell's last expression is rendered, so show both previews explicitly.
display(df_train.head(), df_test.head())

Saving customer_churn_dataset-training-master.csv to customer_churn_dataset-training-master (2).csv


Saving customer_churn_dataset-testing-master.csv to customer_churn_dataset-testing-master.csv


Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
0,1,22,Female,25,14,4,27,Basic,Monthly,598,9,1
1,2,41,Female,28,28,7,13,Standard,Monthly,584,20,0
2,3,47,Male,27,10,2,29,Premium,Annual,757,21,0
3,4,35,Male,9,12,5,17,Premium,Quarterly,232,18,0
4,5,53,Female,58,24,9,2,Standard,Annual,533,18,0


In [None]:
# Report the dimensions of both splits first, then preview the leading rows of
# each one as plain text without the index column.
for split in (df_train, df_test):
    print(split.shape)
for split in (df_train, df_test):
    print(split.head().to_string(index=False))

(440833, 12)
(64374, 12)
 CustomerID  Age Gender  Tenure  Usage Frequency  Support Calls  Payment Delay Subscription Type Contract Length  Total Spend  Last Interaction  Churn
        2.0 30.0 Female    39.0             14.0            5.0           18.0          Standard          Annual        932.0              17.0    1.0
        3.0 65.0 Female    49.0              1.0           10.0            8.0             Basic         Monthly        557.0               6.0    1.0
        4.0 55.0 Female    14.0              4.0            6.0           18.0             Basic       Quarterly        185.0               3.0    1.0
        5.0 58.0   Male    38.0             21.0            7.0            7.0          Standard         Monthly        396.0              29.0    1.0
        6.0 23.0   Male    32.0             20.0            5.0            8.0             Basic         Monthly        617.0              20.0    1.0
 CustomerID  Age Gender  Tenure  Usage Frequency  Support Calls  Paym

In [None]:
# Show the dtype pandas inferred for every column of the training frame.
print(df_train.dtypes)

CustomerID           float64
Age                  float64
Gender                object
Tenure               float64
Usage Frequency      float64
Support Calls        float64
Payment Delay        float64
Subscription Type     object
Contract Length       object
Total Spend          float64
Last Interaction     float64
Churn                float64
dtype: object


In [None]:
def clean_data(df, target_col='Churn', id_col='CustomerID'):
    """Return a new frame without unlabeled rows and without the ID column.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame; it is not modified.
    target_col : str, default 'Churn'
        Label column. Rows whose label is missing are dropped.
    id_col : str, default 'CustomerID'
        Identifier column to discard; skipped when the column is absent.

    Returns
    -------
    pandas.DataFrame
        The remaining rows with ``id_col`` removed.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of ``df``.
    """
    labeled = df.dropna(subset=[target_col])
    # errors='ignore' turns the drop into a no-op when the ID column is absent.
    return labeled.drop(columns=[id_col], errors='ignore')

# Run the same cleaning on both splits so their columns stay aligned.
df_train, df_test = clean_data(df_train), clean_data(df_test)

In [None]:
# Per-column count of missing entries in the training frame.
print(df_train.isna().sum())

Age                  0
Gender               0
Tenure               0
Usage Frequency      0
Support Calls        0
Payment Delay        0
Subscription Type    0
Contract Length      0
Total Spend          0
Last Interaction     0
Churn                0
dtype: int64


In [None]:
# Number of training rows that exactly repeat an earlier row.
print(df_train.duplicated().sum())

0


In [None]:
# Tally the target labels; dropna=False keeps a NaN bucket in the tally, so any
# label that is still missing would show up here.
print(df_train['Churn'].value_counts(dropna=False))


Churn
1.0    249999
0.0    190833
Name: count, dtype: int64


In [None]:
# Preview every categorical column of the training frame: its cardinality and
# up to ten of its distinct values. The sample values are read from df_train;
# the cell previously read them from an undefined `df`, which only worked while
# a leftover variable was still alive in the kernel.
c_cols = df_train.select_dtypes(include=['object','category']).columns
for c in c_cols:
    print(f"\n{c} -> {df_train[c].nunique()} unique, sample values: {df_train[c].unique()[:10]}")



Gender -> 2 unique, sample values: ['Female' 'Male']

Subscription Type -> 3 unique, sample values: ['Standard' 'Basic' 'Premium']

Contract Length -> 3 unique, sample values: ['Annual' 'Monthly' 'Quarterly']


In [None]:
# Partition the training columns by dtype. The target column is still part of
# the frame at this point, so it is reported among the numerical columns.
num_cols = df_train.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df_train.select_dtypes(include=['object']).columns

for label, cols in (("Categorical columns:", cat_cols), ("Numerical columns:", num_cols)):
    print(label, cols.tolist())


Categorical columns: ['Gender', 'Subscription Type', 'Contract Length']
Numerical columns: ['Age', 'Tenure', 'Usage Frequency', 'Support Calls', 'Payment Delay', 'Total Spend', 'Last Interaction', 'Churn']


In [None]:
# One consolidated report on the cleaned training frame.
print("=== Dataset Overview ===")
print(f"Shape: {df_train.shape}")

# Sections that are a heading followed by a single printed summary object.
for heading, summary in (
    ("\n--- Data Types ---", df_train.dtypes),
    ("\n--- Missing Values ---", df_train.isnull().sum()),
    ("\n--- Duplicate Rows ---", df_train.duplicated().sum()),
):
    print(heading)
    print(summary)

# Distinct (non-missing) values per column, computed in one pass.
print("\n--- Unique Values per Column ---")
for col, unique_vals in df_train.nunique().items():
    print(f"{col}: {unique_vals} unique values")

print("\n--- Sample Data ---")
print(df_train.head().to_string(index=False))


=== Dataset Overview ===
Shape: (440832, 11)

--- Data Types ---
Age                  float64
Gender                object
Tenure               float64
Usage Frequency      float64
Support Calls        float64
Payment Delay        float64
Subscription Type     object
Contract Length       object
Total Spend          float64
Last Interaction     float64
Churn                float64
dtype: object

--- Missing Values ---
Age                  0
Gender               0
Tenure               0
Usage Frequency      0
Support Calls        0
Payment Delay        0
Subscription Type    0
Contract Length      0
Total Spend          0
Last Interaction     0
Churn                0
dtype: int64

--- Duplicate Rows ---
0

--- Unique Values per Column ---
Age: 48 unique values
Gender: 2 unique values
Tenure: 60 unique values
Usage Frequency: 30 unique values
Support Calls: 11 unique values
Payment Delay: 31 unique values
Subscription Type: 3 unique values
Contract Length: 3 unique values
Total Spend: 68

In [None]:
# Target column analysis
# Target column analysis: distinct labels, absolute counts, and class shares.
churn = df_train['Churn']
print("Unique values in 'Churn':", churn.unique())

print("\nValue counts:")
print(churn.value_counts())

print("\nPercentage distribution:")
# Shares as percentages rounded to two decimals, rendered as text with a "%".
churn_pct = churn.value_counts(normalize=True).mul(100).round(2)
print(churn_pct.astype(str) + "%")

Unique values in 'Churn': [1. 0.]

Value counts:
Churn
1.0    249999
0.0    190833
Name: count, dtype: int64

Percentage distribution:
Churn
1.0    56.71%
0.0    43.29%
Name: proportion, dtype: object


In [None]:
# ====== numeric & categorical columns ======
# Plain Python lists of column names, grouped by dtype, for the preprocessing
# pipelines defined below. The target is still included among the numeric
# names here and is removed in the next cell.
num_features = list(df_train.select_dtypes(include=['int64','float64']).columns)
cat_features = list(df_train.select_dtypes(include=['object','category']).columns)

print("Numeric features:", num_features)
print("Categorical features:", cat_features)


Numeric features: ['Age', 'Tenure', 'Usage Frequency', 'Support Calls', 'Payment Delay', 'Total Spend', 'Last Interaction', 'Churn']
Categorical features: ['Gender', 'Subscription Type', 'Contract Length']


The Churn column is our target variable. We must remove it from the list of numeric features because preprocessing steps like scaling or imputation should only be applied to input features, not the target. Applying transformations to the target would corrupt the labels and lead to incorrect model training.

In [None]:
# ======  remove target from numeric features ======
target_col = 'Churn'
# Take the target out of the numeric feature list in place. A ValueError just
# means it was already removed (for example when this cell is re-run).
try:
    num_features.remove(target_col)
except ValueError:
    pass
print("Updated numeric features:", num_features)


Updated numeric features: ['Age', 'Tenure', 'Usage Frequency', 'Support Calls', 'Payment Delay', 'Total Spend', 'Last Interaction']


In [None]:
# Numeric branch: fill gaps with the column median, then apply StandardScaler.
numeric_imputer = SimpleImputer(strategy='median')
num_pipeline = Pipeline(steps=[
    ('imputer', numeric_imputer),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the literal 'Missing', then map every
# category to an integer code. A category not seen during fit is encoded as -1.
cat_imputer = SimpleImputer(strategy='constant', fill_value='Missing')
cat_pipeline = Pipeline(steps=[
    ('imputer', cat_imputer),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)),
])

# Route each column group to its branch; the transformer names 'num' and 'cat'
# are looked up by name in print_preprocessing_summary.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_features),
    ('cat', cat_pipeline, cat_features),
])


In [None]:
def _encoded_width(cat_transformer, position):
    """Approximate output-column count for the categorical column at ``position``.

    A one-hot encoder contributes one column per learned category. It is
    recognized either by a pipeline step named 'onehot' or by its class name,
    so a OneHotEncoder registered under another step name (such as 'encoder')
    is counted correctly. Any other encoder is assumed to yield one column.
    """
    named = getattr(cat_transformer, 'named_steps', None) or {}
    onehot = named.get('onehot')
    if onehot is None:
        # Not a pipeline, or no step called 'onehot': look for the encoder by type.
        candidates = list(named.values()) or [cat_transformer]
        onehot = next(
            (step for step in candidates if type(step).__name__ == 'OneHotEncoder'),
            None,
        )
    if onehot is None:
        return 1
    return len(onehot.categories_[position])


def print_preprocessing_summary(preprocessor, df, num_features, cat_features, target_col=None):
    """Print a before/after overview of a fitted column preprocessor.

    Parameters
    ----------
    preprocessor : fitted ColumnTransformer-like object
        Must expose ``get_feature_names_out()`` and, when categorical columns
        are present, ``named_transformers_['cat']``.
    df : pandas.DataFrame
        Frame whose columns are listed in the "before" section.
    num_features : list of str
        Columns handled by the numeric branch (one output column each).
    cat_features : list of str
        Columns handled by the categorical branch, in the order given to the
        preprocessor.
    target_col : str, optional
        Column reported as the target instead of as a feature.

    Returns
    -------
    list of str
        Output feature names reported by ``preprocessor``.
    """
    print("=== Before preprocessing ===")
    cat_transformer = None  # fetched on first use, so it is only required when needed
    for col in df.columns:
        dtype = df[col].dtype
        if col == target_col:
            print(f"{col:20} | dtype: {dtype} | approx columns after encoding: (target)")
            continue
        if col in num_features:
            n_after = 1
        elif col in cat_features:
            if cat_transformer is None:
                cat_transformer = preprocessor.named_transformers_['cat']
            n_after = _encoded_width(cat_transformer, cat_features.index(col))
        else:
            # Column is in neither feature list, so it is reported as contributing 0 columns.
            n_after = 0
        print(f"{col:20} | dtype: {dtype} | approx columns after encoding: {n_after}")

    feature_names = preprocessor.get_feature_names_out().tolist()
    print("\n=== After preprocessing (X_t) ===")
    for i, col in enumerate(feature_names, start=1):
        print(f"{i:2d}: {col}")
    print(f"\nTotal columns after preprocessing: {len(feature_names)}")
    return feature_names


In [None]:
# Separate the features from the label, using target_col throughout so the
# split and the summary can never disagree about which column is the target.
X_train = df_train.drop(columns=[target_col])
y_train = df_train[target_col]

# Fit on the training features only; the test split is transformed later with
# these fitted statistics.
preprocessor.fit(X_train)
feature_names = print_preprocessing_summary(
    preprocessor,
    df_train,
    num_features,
    cat_features,
    target_col=target_col
)

=== Before preprocessing ===
Age                  | dtype: float64 | approx columns after encoding: 1
Gender               | dtype: object | approx columns after encoding: 1
Tenure               | dtype: float64 | approx columns after encoding: 1
Usage Frequency      | dtype: float64 | approx columns after encoding: 1
Support Calls        | dtype: float64 | approx columns after encoding: 1
Payment Delay        | dtype: float64 | approx columns after encoding: 1
Subscription Type    | dtype: object | approx columns after encoding: 1
Contract Length      | dtype: object | approx columns after encoding: 1
Total Spend          | dtype: float64 | approx columns after encoding: 1
Last Interaction     | dtype: float64 | approx columns after encoding: 1
Churn                | dtype: float64 | approx columns after encoding: (target)

=== After preprocessing (X_t) ===
 1: num__Age
 2: num__Tenure
 3: num__Usage Frequency
 4: num__Support Calls
 5: num__Payment Delay
 6: num__Total Spend
 7: num_

In [None]:
# Pull the test labels, then run both feature sets through the preprocessor
# that was fitted on the training features.
y_test = df_test[target_col]
X_train_t = preprocessor.transform(X_train)
X_test_t = preprocessor.transform(df_test.drop(columns=target_col))

In [None]:
# Plain column names for the transformed matrices: numeric features first,
# then categorical ones, matching the order of the ColumnTransformer branches.
# NOTE(review): this relies on each input feature producing exactly one output
# column, which holds for the ordinal encoder used here.
feature_names = num_features + cat_features

# Rebuild labelled frames; the target is appended as the last column from the
# raw label values, so the original row index is not carried over.
train_clean = pd.DataFrame(X_train_t, columns=feature_names).assign(
    **{target_col: y_train.values}
)
test_clean = pd.DataFrame(X_test_t, columns=feature_names).assign(
    **{target_col: y_test.values}
)

In [64]:
# Write both cleaned splits to CSV (without the index), then hand each file to
# the browser for download.
exports = (
    ("train_cleaned.csv", train_clean),
    ("test_cleaned.csv", test_clean),
)
for csv_path, frame in exports:
    frame.to_csv(csv_path, index=False)
for csv_path, frame in exports:
    files.download(csv_path)

print("Train cleaned CSV")
print("Test cleaned CSV")

print("Train shape:", train_clean.shape)
print("Test shape:", test_clean.shape)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Train cleaned CSV
Test cleaned CSV
Train shape: (440832, 11)
Test shape: (64374, 11)
