In [2]:
import numpy as np
import pandas as pd

def make_practice_dataset(n=15000, seed=41, save_path="telecom_churn_15k.csv"):
    rng = np.random.default_rng(seed)

    # categorical columns
    gender = rng.choice(["male", "female"], size=n, p=[0.52, 0.48])
    city = rng.choice(
        ["mumbai", "delhi", "bangalore", "hyderabad", "pune", "chennai", "kolkata", "ahmedabad"],
        size=n,
        p=[0.15, 0.14, 0.13, 0.12, 0.12, 0.11, 0.12, 0.11]
    )
    plan = rng.choice(["basic", "standard", "premium"], size=n, p=[0.50, 0.35, 0.15])
    payment_method = rng.choice(["upi", "card", "netbanking", "cod"], size=n, p=[0.45, 0.30, 0.20, 0.05])
    has_internet = rng.choice(["yes", "no"], size=n, p=[0.82, 0.18])

    # numeric columns
    age = rng.integers(18, 71, size=n)
    tenure_months = rng.integers(0, 73, size=n)
    monthly_bill = rng.normal(loc=550, scale=180, size=n).clip(100, 1500)
    support_calls = rng.poisson(lam=1.3, size=n).clip(0, 10)
    data_gb = rng.gamma(shape=2.2, scale=6.0, size=n).clip(0, 80)

    # create a realistic target with signal (churn-like)
    # (this is "ground truth" logic; you learn pipelines, not magic)
    plan_weight = np.where(plan == "basic", 0.45, np.where(plan == "standard", 0.10, -0.25))
    pay_weight = np.where(payment_method == "cod", 0.35, np.where(payment_method == "upi", -0.08, 0.02))
    net_weight = np.where(has_internet == "no", 0.25, -0.05)

    z = (
        -1.2
        + 0.015 * (monthly_bill - 500)
        - 0.020 * tenure_months
        + 0.18 * support_calls
        + 0.010 * (data_gb - 12)
        + 0.20 * (age < 25)
        + plan_weight
        + pay_weight
        + net_weight
        + rng.normal(0, 0.6, size=n)  # noise
    )

    p = 1 / (1 + np.exp(-z))
    churn = (rng.random(n) < p).astype(int)

    df = pd.DataFrame({
        "customer_id": np.arange(100000, 100000 + n),
        "gender": gender,
        "age": age,
        "city": city,
        "plan": plan,
        "tenure_months": tenure_months,
        "monthly_bill": np.round(monthly_bill, 2),
        "support_calls": support_calls,
        "data_gb": np.round(data_gb, 2),
        "payment_method": payment_method,
        "has_internet": has_internet,
        "churn": churn
    })

    # inject missing values (practice imputation)
    def inject_missing(col, frac):
        idx = rng.choice(df.index, size=int(frac * n), replace=False)
        df.loc[idx, col] = np.nan

    inject_missing("age", 0.06)
    inject_missing("city", 0.04)
    inject_missing("plan", 0.03)
    inject_missing("monthly_bill", 0.05)
    inject_missing("has_internet", 0.02)

    # also inject some "dirty" category values to test handle_unknown
    dirty_idx = rng.choice(df.index, size=120, replace=False)
    df.loc[dirty_idx, "city"] = rng.choice(["unknown_city", "n/a", "??"], size=len(dirty_idx))

    df.to_csv(save_path, index=False)
    return df

df = make_practice_dataset()
df.head(), df.shape


(   customer_id  gender   age       city      plan  tenure_months  \
 0       100000  female  46.0     mumbai     basic             50   
 1       100001  female  65.0  bangalore     basic             61   
 2       100002    male  68.0     mumbai   premium             11   
 3       100003  female  64.0  hyderabad     basic             30   
 4       100004  female  62.0     mumbai  standard             64   
 
    monthly_bill  support_calls  data_gb payment_method has_internet  churn  
 0        599.66              3    14.05            upi          yes      1  
 1        539.78              1    23.10     netbanking          yes      0  
 2        542.47              3     3.43     netbanking          yes      0  
 3        682.10              0     7.00           card           no      1  
 4        314.00              2    22.73           card          yes      0  ,
 (15000, 12))

In [4]:
df.sample(15)

Unnamed: 0,customer_id,gender,age,city,plan,tenure_months,monthly_bill,support_calls,data_gb,payment_method,has_internet,churn
9524,109524,male,46.0,ahmedabad,standard,70,303.49,2,12.97,upi,yes,0
13679,113679,female,28.0,delhi,standard,57,810.13,0,8.66,card,yes,1
14072,114072,female,63.0,kolkata,standard,0,,2,16.35,upi,yes,1
14642,114642,male,55.0,bangalore,premium,32,986.41,1,5.13,upi,yes,1
8601,108601,female,67.0,delhi,standard,37,424.88,3,20.1,upi,yes,0
5559,105559,male,30.0,pune,standard,30,517.14,0,16.81,upi,no,0
4097,104097,male,,delhi,premium,25,491.66,1,3.66,cod,yes,0
11289,111289,female,36.0,bangalore,,3,357.59,0,6.15,cod,yes,1
12873,112873,male,,ahmedabad,basic,12,321.79,1,8.64,netbanking,yes,0
14209,114209,female,56.0,bangalore,standard,4,195.59,0,21.42,netbanking,yes,0


In [5]:
df = df.iloc[:, 1:]
df.head()

Unnamed: 0,gender,age,city,plan,tenure_months,monthly_bill,support_calls,data_gb,payment_method,has_internet,churn
0,female,46.0,mumbai,basic,50,599.66,3,14.05,upi,yes,1
1,female,65.0,bangalore,basic,61,539.78,1,23.1,netbanking,yes,0
2,male,68.0,mumbai,premium,11,542.47,3,3.43,netbanking,yes,0
3,female,64.0,hyderabad,basic,30,682.1,0,7.0,card,no,1
4,female,62.0,mumbai,standard,64,314.0,2,22.73,card,yes,0


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier

In [7]:
x = df.iloc[:, :-1]
y = df.iloc[:, -1]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=41)

In [8]:
x_train

Unnamed: 0,gender,age,city,plan,tenure_months,monthly_bill,support_calls,data_gb,payment_method,has_internet
11286,male,19.0,hyderabad,basic,44,869.00,2,9.00,netbanking,yes
14327,male,25.0,delhi,basic,8,450.59,0,7.10,upi,yes
10187,male,27.0,mumbai,standard,66,,0,9.34,upi,yes
8294,female,33.0,hyderabad,,25,798.71,2,14.93,upi,yes
14246,female,39.0,hyderabad,standard,47,366.61,1,11.04,netbanking,yes
...,...,...,...,...,...,...,...,...,...,...
5200,female,68.0,hyderabad,premium,35,933.77,1,4.49,card,yes
4066,male,44.0,chennai,premium,7,639.81,1,7.34,upi,yes
12172,female,58.0,pune,basic,14,447.95,1,14.63,upi,yes
931,female,20.0,mumbai,standard,10,521.19,1,8.73,card,yes


In [12]:
df['support_calls'].value_counts()

Unnamed: 0_level_0,count
support_calls,Unnamed: 1_level_1
1,5379
0,4109
2,3405
3,1470
4,478
5,124
6,28
7,7


In [9]:
df.isnull().sum()

Unnamed: 0,0
gender,0
age,900
city,592
plan,450
tenure_months,0
monthly_bill,750
support_calls,0
data_gb,0
payment_method,0
has_internet,300


In [10]:
# simple imputation -> age, monthly_bill (avg)
# simple imputation -> has_internet, plan, city (most_frequent)

tnf1 = ColumnTransformer([
    ('impute_age_bill', SimpleImputer(), [1, 5]),
    ('impute_net_city_plan', SimpleImputer(strategy='most_frequent'), [2, 3, 9])
], remainder='passthrough')


In [13]:
# one hot encoding on -> gender(0), cities(2), plan(3), payment_method(8), has_internet(9)

tnf2 = ColumnTransformer([
    ('ohe_gender_city_plan', OneHotEncoder(sparse_output = False, handle_unknown= 'ignore'), [0, 2, 3, 8, 9])
], remainder='passthrough')

In [14]:
# scaling
tnf3 = ColumnTransformer([
    ('scale', MinMaxScaler(), slice(0, 27))
])

In [15]:
# feature selection
tnf4 = SelectKBest(score_func= chi2, k= 20)

In [16]:
# train the model
tnf5 = DecisionTreeClassifier()

In [17]:
pipe = make_pipeline(tnf1, tnf2, tnf3, tnf4, tnf5)

In [18]:
pipe.fit(x_train, y_train)

In [19]:
from sklearn.metrics import accuracy_score

y_pred = pipe.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.5605


In [20]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# -----------------------------
# 1) data generator (same dataset idea, different names)
# -----------------------------
def build_dataset(rows=15000, seed=7, out_csv="telecom_churn_15k.csv"):
    gen = np.random.default_rng(seed)

    col_gender = gen.choice(["male", "female"], size=rows, p=[0.52, 0.48])
    col_city = gen.choice(
        ["mumbai", "delhi", "bangalore", "hyderabad", "pune", "chennai", "kolkata", "ahmedabad"],
        size=rows,
        p=[0.15, 0.14, 0.13, 0.12, 0.12, 0.11, 0.12, 0.11]
    )
    col_plan = gen.choice(["basic", "standard", "premium"], size=rows, p=[0.50, 0.35, 0.15])
    col_pay = gen.choice(["upi", "card", "netbanking", "cod"], size=rows, p=[0.45, 0.30, 0.20, 0.05])
    col_net = gen.choice(["yes", "no"], size=rows, p=[0.82, 0.18])

    col_age = gen.integers(18, 71, size=rows)
    col_tenure = gen.integers(0, 73, size=rows)
    col_bill = gen.normal(loc=550, scale=180, size=rows).clip(100, 1500)
    col_calls = gen.poisson(lam=1.3, size=rows).clip(0, 10)
    col_gb = gen.gamma(shape=2.2, scale=6.0, size=rows).clip(0, 80)

    w_plan = np.where(col_plan == "basic", 0.45, np.where(col_plan == "standard", 0.10, -0.25))
    w_pay = np.where(col_pay == "cod", 0.35, np.where(col_pay == "upi", -0.08, 0.02))
    w_net = np.where(col_net == "no", 0.25, -0.05)

    score = (
        -1.2
        + 0.015 * (col_bill - 500)
        - 0.020 * col_tenure
        + 0.18 * col_calls
        + 0.010 * (col_gb - 12)
        + 0.20 * (col_age < 25)
        + w_plan + w_pay + w_net
        + gen.normal(0, 0.6, size=rows)
    )
    prob = 1 / (1 + np.exp(-score))
    target = (gen.random(rows) < prob).astype(int)

    data = pd.DataFrame({
        "cust_id": np.arange(100000, 100000 + rows),
        "gender": col_gender,
        "age": col_age,
        "city": col_city,
        "plan": col_plan,
        "tenure_months": col_tenure,
        "monthly_bill": np.round(col_bill, 2),
        "support_calls": col_calls,
        "data_gb": np.round(col_gb, 2),
        "payment_method": col_pay,
        "has_internet": col_net,
        "churn": target
    })

    # missing values
    def add_nan(col, frac):
        ids = gen.choice(data.index, size=int(frac * rows), replace=False)
        data.loc[ids, col] = np.nan

    add_nan("age", 0.06)
    add_nan("city", 0.04)
    add_nan("plan", 0.03)
    add_nan("monthly_bill", 0.05)
    add_nan("has_internet", 0.02)

    # dirty categories (to test handle_unknown)
    bad = gen.choice(data.index, size=120, replace=False)
    data.loc[bad, "city"] = gen.choice(["unknown_city", "n/a", "??"], size=len(bad))

    data.to_csv(out_csv, index=False)
    return data

frame = build_dataset()
frame = frame.drop(columns=["cust_id"])  # like PassengerId drop
print(frame.shape)
print(frame.isnull().sum())

# -----------------------------
# 2) train/test split
# -----------------------------
feat = frame.drop(columns=["churn"])
label = frame["churn"]

A_train, A_test, b_train, b_test = train_test_split(
    feat, label, test_size=0.25, random_state=41, stratify=label
)

# -----------------------------
# 3) one clean ColumnTransformer (NO index confusion)
# -----------------------------
num_cols = ["age", "tenure_months", "monthly_bill", "support_calls", "data_gb"]
cat_cols = ["gender", "city", "plan", "payment_method", "has_internet"]

num_pipe = Pipeline([
    ("num_impute", SimpleImputer(strategy="median")),
    ("num_scale", MinMaxScaler())
])

cat_pipe = Pipeline([
    ("cat_impute", SimpleImputer(strategy="most_frequent")),
    ("cat_ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

prep = ColumnTransformer([
    ("nums", num_pipe, num_cols),
    ("cats", cat_pipe, cat_cols)
], remainder="drop")

# -----------------------------
# 4) model pipeline (chi2 + logistic regression)
# -----------------------------
model_pipe = Pipeline([
    ("prep", prep),
    ("kbest", SelectKBest(score_func=chi2, k=20)),
    ("clf", LogisticRegression(max_iter=2000))
])

model_pipe.fit(A_train, b_train)

pred = model_pipe.predict(A_test)

print("Accuracy:", round(accuracy_score(b_test, pred), 4))
print(classification_report(b_test, pred))


(15000, 11)
gender              0
age               900
city              593
plan              450
tenure_months       0
monthly_bill      750
support_calls       0
data_gb             0
payment_method      0
has_internet      300
churn               0
dtype: int64
Accuracy: 0.816
              precision    recall  f1-score   support

           0       0.83      0.87      0.85      2221
           1       0.80      0.74      0.77      1529

    accuracy                           0.82      3750
   macro avg       0.81      0.80      0.81      3750
weighted avg       0.82      0.82      0.81      3750



In [21]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# -----------------------------
# 1) make synthetic dataset (different variable names)
# -----------------------------
def create_big_table(rows=15000, seed=7, out_csv="telecom_churn_15k.csv"):
    r = np.random.default_rng(seed)

    g = r.choice(["male", "female"], size=rows, p=[0.52, 0.48])
    c = r.choice(
        ["mumbai", "delhi", "bangalore", "hyderabad", "pune", "chennai", "kolkata", "ahmedabad"],
        size=rows,
        p=[0.15, 0.14, 0.13, 0.12, 0.12, 0.11, 0.12, 0.11]
    )
    pl = r.choice(["basic", "standard", "premium"], size=rows, p=[0.50, 0.35, 0.15])
    pay = r.choice(["upi", "card", "netbanking", "cod"], size=rows, p=[0.45, 0.30, 0.20, 0.05])
    net = r.choice(["yes", "no"], size=rows, p=[0.82, 0.18])

    a = r.integers(18, 71, size=rows)
    ten = r.integers(0, 73, size=rows)
    bill = r.normal(loc=550, scale=180, size=rows).clip(100, 1500)
    calls = r.poisson(lam=1.3, size=rows).clip(0, 10)
    gb = r.gamma(shape=2.2, scale=6.0, size=rows).clip(0, 80)

    w_pl = np.where(pl == "basic", 0.45, np.where(pl == "standard", 0.10, -0.25))
    w_pay = np.where(pay == "cod", 0.35, np.where(pay == "upi", -0.08, 0.02))
    w_net = np.where(net == "no", 0.25, -0.05)

    s = (
        -1.2
        + 0.015 * (bill - 500)
        - 0.020 * ten
        + 0.18 * calls
        + 0.010 * (gb - 12)
        + 0.20 * (a < 25)
        + w_pl + w_pay + w_net
        + r.normal(0, 0.6, size=rows)
    )
    pr = 1 / (1 + np.exp(-s))
    y = (r.random(rows) < pr).astype(int)

    big = pd.DataFrame({
        "cust_id": np.arange(100000, 100000 + rows),
        "gender": g,
        "age": a,
        "city": c,
        "plan": pl,
        "tenure_months": ten,
        "monthly_bill": np.round(bill, 2),
        "support_calls": calls,
        "data_gb": np.round(gb, 2),
        "payment_method": pay,
        "has_internet": net,
        "churn": y
    })

    def put_nan(col, frac):
        idx = r.choice(big.index, size=int(frac * rows), replace=False)
        big.loc[idx, col] = np.nan

    put_nan("age", 0.06)
    put_nan("city", 0.04)
    put_nan("plan", 0.03)
    put_nan("monthly_bill", 0.05)
    put_nan("has_internet", 0.02)

    dirty = r.choice(big.index, size=120, replace=False)
    big.loc[dirty, "city"] = r.choice(["unknown_city", "n/a", "??"], size=len(dirty))

    big.to_csv(out_csv, index=False)
    return big

table = create_big_table()
table = table.drop(columns=["cust_id"])

# -----------------------------
# 2) split
# -----------------------------
X_all = table.drop(columns=["churn"])
y_all = table["churn"]

X_tr, X_te, y_tr, y_te = train_test_split(
    X_all, y_all, test_size=0.25, random_state=41, stratify=y_all
)

# -----------------------------
# 3) preprocessing with column NAMES (no index confusion)
# -----------------------------
num_features = ["age", "tenure_months", "monthly_bill", "support_calls", "data_gb"]
cat_features = ["gender", "city", "plan", "payment_method", "has_internet"]

num_steps = Pipeline([
    ("num_fill", SimpleImputer(strategy="median"))
])

cat_steps = Pipeline([
    ("cat_fill", SimpleImputer(strategy="most_frequent")),
    ("cat_ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

prep_steps = ColumnTransformer([
    ("nums", num_steps, num_features),
    ("cats", cat_steps, cat_features)
], remainder="drop")

# -----------------------------
# 4) decision tree pipeline
# -----------------------------
dt_pipe = Pipeline([
    ("prep", prep_steps),
    ("tree", DecisionTreeClassifier(
        random_state=41,
        max_depth=6,
        min_samples_leaf=20
    ))
])

dt_pipe.fit(X_tr, y_tr)

out = dt_pipe.predict(X_te)

print("Accuracy:", round(accuracy_score(y_te, out), 4))
print(classification_report(y_te, out))


Accuracy: 0.8061
              precision    recall  f1-score   support

           0       0.82      0.87      0.84      2221
           1       0.79      0.72      0.75      1529

    accuracy                           0.81      3750
   macro avg       0.80      0.79      0.80      3750
weighted avg       0.81      0.81      0.80      3750



In [25]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# -----------------------------
# data (same as yours)
# -----------------------------
df = make_practice_dataset()
df = df.iloc[:, 1:]  # drop customer_id like you did

X = df.iloc[:, :-1]
Y = df.iloc[:, -1]

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=41, stratify=Y
)

# -----------------------------
# IMPORTANT CHANGE 1:
# use column NAMES instead of index positions (no mismatch after passthrough)
# -----------------------------
num_cols = ["age", "monthly_bill", "tenure_months", "support_calls", "data_gb"]
cat_cols = ["gender", "city", "plan", "payment_method", "has_internet"]

# -----------------------------
# Numeric pipeline: Imputation (median) + Scaling (MinMaxScaler)
# -----------------------------
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())
])

# -----------------------------
# Categorical pipeline: Imputation (most frequent) + One-hot encoding
# -----------------------------
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# -----------------------------
# Create the preprocessor using a single ColumnTransformer
# -----------------------------
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols)
    ],
    remainder='drop' # Drop any columns not specified
)

# feature selection (optional; also not needed for tree, but ok to practice)
tnf4 = SelectKBest(score_func=chi2, k=20)

# model
tnf5 = DecisionTreeClassifier(random_state=41, max_depth=6, min_samples_leaf=20)

# pipeline (same step-by-step style)
pipe = make_pipeline(preprocessor, tnf4, tnf5)

pipe.fit(X_train, Y_train)

pred = pipe.predict(X_test)
print("Accuracy:", round(accuracy_score(Y_test, pred), 4))


Accuracy: 0.8005
