In [1]:
import pandas as pd

# Value -> integer code tables for the columns that are encoded in place.
YES_NO_CODES = {"Yes": 1, "No": 0}
COLUMN_ENCODINGS = {
    "Partner": YES_NO_CODES,
    "Dependents": YES_NO_CODES,
    "PhoneService": YES_NO_CODES,
    # "No phone service" is folded into "No".
    "MultipleLines": {"No phone service": 0, "No": 0, "Yes": 1},
    "Contract": {"Month-to-month": 0, "One year": 1, "Two year": 2},
    "PaperlessBilling": YES_NO_CODES,
    # 0 = check payments, 1 = automatic payments.
    "PaymentMethod": {
        "Electronic check": 0,
        "Mailed check": 0,
        "Bank transfer (automatic)": 1,
        "Credit card (automatic)": 1,
    },
    "Churn": {"No": 0, "Yes": 1},
}

# Add-on service columns that are collapsed into the single PremiumServices flag.
PREMIUM_SERVICE_COLUMNS = [
    "OnlineSecurity", "OnlineBackup", "DeviceProtection",
    "TechSupport", "StreamingTV", "StreamingMovies",
]

# Columns of the derived dataset, in output order.
FINAL_COLUMNS = [
    "Gender", "SeniorCitizen", "Partner", "Dependents",
    "PhoneService", "MultipleLines", "InternetService", "DSL",
    "Fiber", "PremiumServices", "Contract", "PaperlessBilling",
    "PaymentMethod", "tenure", "MonthlyCharges", "Churn",
]


def preprocess_telco_churn(original_df):
    """Build the encoded, derived dataset from the raw Telco churn frame.

    The input frame is not modified. The result is a new DataFrame with the
    columns listed in ``FINAL_COLUMNS``:

    * ``Gender`` is ``gender`` encoded as Female=0 / Male=1.
    * The columns in ``COLUMN_ENCODINGS`` are mapped to integer codes; a value
      that is missing from a mapping becomes NaN (``Series.map`` behaviour).
    * ``PremiumServices`` is 1 when any of ``PREMIUM_SERVICE_COLUMNS`` equals
      "Yes" for that row, else 0.
    * ``DSL`` / ``Fiber`` are 0/1 indicators of the ``InternetService`` value
      and ``InternetService`` is their sum (1 when the row is DSL or fiber).
    * ``SeniorCitizen`` and ``tenure`` are passed through unchanged and
      ``MonthlyCharges`` is rounded to an integer.

    Args:
        original_df: Raw frame containing ``gender``, ``SeniorCitizen``,
            ``tenure``, ``MonthlyCharges``, ``InternetService`` and every
            column named in ``COLUMN_ENCODINGS`` and
            ``PREMIUM_SERVICE_COLUMNS``.

    Returns:
        A new DataFrame sharing ``original_df``'s index.

    Raises:
        KeyError: If a required column is missing from ``original_df``.
    """
    internet_service = original_df["InternetService"]
    dsl = (internet_service == "DSL").astype(int)
    fiber = (internet_service == "Fiber optic").astype(int)

    derived = {
        "Gender": original_df["gender"].map({"Female": 0, "Male": 1}),
        "SeniorCitizen": original_df["SeniorCitizen"],
        "DSL": dsl,
        "Fiber": fiber,
        "InternetService": dsl + fiber,
        # Vectorized "any add-on is Yes" check; replaces a per-row Python
        # lambda and also works on an empty frame.
        "PremiumServices": (
            original_df[PREMIUM_SERVICE_COLUMNS].eq("Yes").any(axis=1).astype(int)
        ),
        "tenure": original_df["tenure"],
        "MonthlyCharges": original_df["MonthlyCharges"].round(0).astype(int),
    }
    for column, codes in COLUMN_ENCODINGS.items():
        derived[column] = original_df[column].map(codes)

    # Constructing a fresh frame (instead of assigning into a column subset of
    # another frame) avoids pandas' chained-assignment SettingWithCopyWarning.
    return pd.DataFrame(derived, columns=FINAL_COLUMNS)


# True when run as a script or inside a notebook cell; keeps a plain import
# of this code free of Drive I/O.
if __name__ == "__main__":
    # Load the original dataset
    original_dataset_path = '/content/drive/MyDrive/DM_Project_Churn_rate/WA_Fn-UseC_-Telco-Customer-Churn.csv'
    original_df = pd.read_csv(original_dataset_path)

    processed_df = preprocess_telco_churn(original_df)

    # Save the final dataset to a CSV file
    output_path = '/content/drive/MyDrive/DM_Project_Churn_rate/final_derived_dataset1.csv'
    processed_df.to_csv(output_path, index=False)

    # Confirm the file is ready
    print(f"The final derived dataset has been saved to: {output_path}")


The final derived dataset has been saved to: /content/drive/MyDrive/DM_Project_Churn_rate/final_derived_dataset1.csv
