In [7]:
import os

import pandas as pd

# Location of the Telco churn CSV. The default is the original local path;
# set the TELCO_CHURN_CSV environment variable to point the notebook at a
# copy of the file on another machine without editing this cell.
DATA_PATH = os.environ.get(
    "TELCO_CHURN_CSV",
    "C:/Users/anton/Downloads/archive/Telco-Customer-Churn.csv",
)

# Load dataset
df = pd.read_csv(DATA_PATH)

# Quick exploration
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that appended a stray "None" line).
df.info()
print(df.describe())
# Per-column count of missing values.
print(df.isnull().sum())


   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

In [8]:
# Convert TotalCharges to a numeric dtype; errors="coerce" turns any value
# that cannot be parsed as a number into NaN instead of raising.
total_charges = pd.to_numeric(df["TotalCharges"], errors="coerce")

# Fill the resulting NaNs with the column median and assign the result back.
# The previous `df["TotalCharges"].fillna(..., inplace=True)` was a chained
# in-place update on a column selection: pandas 2.x emits a FutureWarning for
# it and, with Copy-on-Write enabled, it no longer modifies `df` at all.
# NOTE(review): the median is taken over the whole dataset, i.e. before the
# train/test split done later in the notebook, so test rows influence the
# imputed value.
df["TotalCharges"] = total_charges.fillna(total_charges.median())


In [18]:
# Raw feature frame: every column except the target ("Churn") and the
# row identifier ("customerID").
X = df.drop(columns=["Churn", "customerID"])

# Binary target: 1 where Churn is exactly "Yes", 0 for every other value.
y = (df["Churn"] == "Yes").astype("int64")


In [19]:
# Columns that get scaled as numeric features.
num_cols = ["tenure", "MonthlyCharges", "TotalCharges"]

# Every remaining feature column, in its original order, is treated as
# categorical (this includes any non-listed numeric-looking columns of X).
is_categorical = ~X.columns.isin(num_cols)
cat_cols = X.columns[is_categorical].tolist()


In [20]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Per-column preprocessing: standardize the numeric columns and one-hot
# encode the categorical ones.
numeric_step = ("num", StandardScaler(), num_cols)
# NOTE(review): with drop="first" together with handle_unknown="ignore", a
# category unseen during fit is encoded as all zeros, which is the same
# encoding as the dropped first category -- confirm this is acceptable.
categorical_step = (
    "cat",
    OneHotEncoder(drop="first", handle_unknown="ignore"),
    cat_cols,
)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Classifier: 200-tree random forest with balanced class weights and a fixed
# seed so repeated runs build the same forest.
forest = RandomForestClassifier(
    n_estimators=200,
    class_weight="balanced",
    random_state=42,
)

# Full model: preprocessing followed by the classifier, fitted and applied
# as a single estimator.
pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", forest)])


In [21]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
# NOTE(review): the split is not stratified on y -- consider stratify=y if the
# class ratio should be preserved exactly in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit preprocessing and classifier on the training part only.
pipeline.fit(X_train, y_train)


In [22]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Predict labels for the held-out rows.
y_pred = pipeline.predict(X_test)

# Report overall accuracy, then the confusion matrix and the per-class
# precision / recall / F1 table, each on its own print call.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
for report in (
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
):
    print(report)


Accuracy: 0.7934705464868701
[[946  90]
 [201 172]]
              precision    recall  f1-score   support

           0       0.82      0.91      0.87      1036
           1       0.66      0.46      0.54       373

    accuracy                           0.79      1409
   macro avg       0.74      0.69      0.70      1409
weighted avg       0.78      0.79      0.78      1409



In [23]:
import pickle

# Persist the fitted pipeline (preprocessing + model) to disk so it can be
# reloaded for prediction without retraining.
model_path = "churn_pipeline_v3.pkl"
with open(model_path, mode="wb") as model_file:
    pickle.dump(pipeline, model_file)


In [24]:
# Take the first row of X_test (position 0 -- a fixed row, not a random draw).
# The double brackets keep the selection a one-row DataFrame, which is the
# input shape the pipeline was fitted on.
sample = X_test.iloc[[0]]

# Hard label for the single row.
prediction = pipeline.predict(sample)[0]
# Second column of predict_proba; presumably the probability of class 1
# (churn), given the 0/1 target encoding -- confirm against pipeline.classes_.
probability = pipeline.predict_proba(sample)[0, 1]

if prediction == 1:
    label = "Churn"
else:
    label = "No Churn"

print("Prediction:", label)
print("Probability of Churn:", probability)


Prediction: Churn
Probability of Churn: 0.685


In [26]:
import pandas as pd

# Load saved pipeline
import pickle

# Open the file in a context manager so the handle is closed as soon as the
# pipeline has been read (the previous `pickle.load(open(...))` never closed it).
# NOTE: unpickling can execute arbitrary code -- only load pickle files that
# come from a trusted source.
with open("churn_pipeline_v3.pkl", "rb") as f:
    pipeline = pickle.load(f)

# --- Manually input a single customer ---
# One value per feature column, each wrapped in a list so the dict converts
# to a one-row DataFrame. The keys are the feature column names used for
# training (all dataset columns except "Churn" and "customerID").
customer_data = {
    "gender": ["Female"],
    "SeniorCitizen": [0],
    "Partner": ["Yes"],
    "Dependents": ["No"],
    "tenure": [12],
    "PhoneService": ["Yes"],
    "MultipleLines": ["No"],
    "InternetService": ["DSL"],
    "OnlineSecurity": ["No"],
    "OnlineBackup": ["Yes"],
    "DeviceProtection": ["No"],
    "TechSupport": ["No"],
    "StreamingTV": ["Yes"],
    "StreamingMovies": ["No"],
    "Contract": ["Month-to-month"],
    "PaperlessBilling": ["Yes"],
    "PaymentMethod": ["Electronic check"],
    "MonthlyCharges": [75.5],
    "TotalCharges": [905.5]
}

# Convert to DataFrame
input_df = pd.DataFrame(customer_data)

# --- Make prediction ---
# Hard label for the single row, and the second column of predict_proba
# (presumably the churn class, given the 0/1 target encoding -- confirm
# against pipeline.classes_).
prediction = pipeline.predict(input_df)[0]
probability = pipeline.predict_proba(input_df)[0][1]

print("Prediction:", "Churn" if prediction==1 else "No Churn")
print("Probability of Churn:", probability)


Prediction: No Churn
Probability of Churn: 0.23
