### Preprocess the data

In [None]:
import pandas as pd
import joblib

# Read the raw churn dataset from the CSV file (path is relative to the
# notebook's working directory)
csv_path = 'Telco-Customer-Churn-3.csv'
data = pd.read_csv(csv_path)
# Preview a single row to confirm the columns were parsed as expected
data.head(1)


In [None]:
# Print a summary of the dataframe: each column's dtype and non-null count,
# which is where missing values and mis-typed columns become visible.
data.info()

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

# Build the feature matrix and the target WITHOUT mutating `data`.
# 'customerID' is an identifier, not a useful feature, so it is excluded
# together with the target column. A non-mutating drop keeps this cell
# re-runnable: an in-place drop raises KeyError on the second run because
# the column is already gone.
# X: features, y: target
X = data.drop(columns=["customerID", "Churn"])
y = data["Churn"]

binary_features = [
    "gender",
    "SeniorCitizen",
    "Partner",
    "Dependents",
    "PhoneService",
    "PaperlessBilling",
]
multi_category_features = [
    "MultipleLines",
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
    "Contract",
    "PaymentMethod",
]
# NOTE(review): StandardScaler needs these columns to be numeric. This assumes
# 'TotalCharges' was parsed as a number in this CSV - confirm with data.info().
numerical_features = ["tenure", "MonthlyCharges", "TotalCharges"]

# Apply Label Encoding for binary features - Transform values to 0 and 1.
# A fresh encoder is used per column so no fitted state is shared between
# features.
for feature in binary_features:
    X[feature] = LabelEncoder().fit_transform(X[feature])

# Apply Label Encoding for the target variable - Transform values into 0 and 1.
# `le` is left holding the encoder fitted on the target.
le = LabelEncoder()
y = le.fit_transform(y)

# Split the dataset into training set and test set (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Define the transformations.
# NOTE(review): only the multi-category and numerical columns are listed here,
# and remainder="drop" (the default, spelled out for visibility) discards every
# other column. The label-encoded binary features above therefore never reach
# the model. Feeding them in would change the shape of the saved transformer's
# output, so the training and inference code must be updated together and both
# pickles regenerated.
transformer = ColumnTransformer(
    [
        ("multi_category", OneHotEncoder(drop="first"), multi_category_features),
        ("numerical", StandardScaler(), numerical_features),
    ],
    remainder="drop",
)
# Fit on the training split only, then apply the same fitted transform to the
# test split so no test-set statistics leak into preprocessing.
X_train = transformer.fit_transform(X_train)
X_test = transformer.transform(X_test)
# Persist the fitted transformer so inference applies identical preprocessing
joblib.dump(transformer, "column_transformer.pkl")

print("X_train shape: ", X_train.shape)

### Train the model

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with balanced class weights and a fixed seed, built and
# trained in one expression (fit returns the estimator itself)
lr = LogisticRegression(random_state=42, class_weight="balanced").fit(
    X_train, y_train
)

# Persist the fitted model to disk
joblib.dump(lr, "logistic_regression_model.pkl")

### Evaluate the model

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Score the held-out split: hard labels, plus the predicted probability taken
# from column 1 of predict_proba
y_pred_lr = lr.predict(X_test)
y_prob_lr = lr.predict_proba(X_test)[:, 1]

# Every scalar metric shares the (y_true, y_pred) signature, so compute them
# in one pass and unpack into the individual result names
accuracy_lr, precision_lr, recall_lr, f1_lr = (
    metric(y_test, y_pred_lr)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
confusion_lr = confusion_matrix(y_test, y_pred_lr)

# Label/value pairs in display order; flattened into a single print call so
# the separator handling matches passing them all as positional arguments
labelled_results = (
    ("accuracy_score: ", accuracy_lr),
    ("\nprecision_score: ", precision_lr),
    ("\nrecall_score: ", recall_lr),
    ("\nf1_score: ", f1_lr),
    ("\nconfusion_matrix: \n", confusion_lr),
)
print(*(part for pair in labelled_results for part in pair))

### Inference on new data

In [1]:
import pandas as pd
import joblib
from sklearn.preprocessing import LabelEncoder

# Restore the fitted preprocessing transformer and the trained classifier
# from the joblib files on disk
transformer = joblib.load("column_transformer.pkl")
lr = joblib.load("logistic_regression_model.pkl")

In [6]:
# Inference data: a request-style payload whose records sit under
# Inputs -> input1
data = {
    "Inputs": {
        "input1": [
            {
                "gender": "Female",
                "SeniorCitizen": "No",
                "Partner": "Yes",
                "Dependents": "No",
                "tenure": 2,
                "PhoneService": "No",
                "MultipleLines": "No phone service",
                "InternetService": "DSL",
                "OnlineSecurity": "Yes",
                "OnlineBackup": "No",
                "DeviceProtection": "Yes",
                "TechSupport": "No",
                "StreamingTV": "No",
                "StreamingMovies": "No",
                "Contract": "Month-to-month",
                "PaperlessBilling": "Yes",
                "PaymentMethod": "Credit card (automatic)",
                "MonthlyCharges": 29.85,
                "TotalCharges": 59.7,
            },
        ]
    },
    "GlobalParameters": {},
}

# Extract the 'input1' list from the dictionary
input1 = data["Inputs"]["input1"]

# Convert the list of records to a DataFrame (one row per record)
input_df = pd.DataFrame(input1)

# NOTE: do not fit a LabelEncoder on the inference payload. LabelEncoder
# assigns codes from the sorted distinct values it is fitted on, so fitting on
# the request itself yields codes unrelated to the training-time mapping (a
# one-row payload maps every value to 0). The ColumnTransformer built in the
# preprocessing section lists only the multi-category and numerical columns
# and drops the rest, so the binary columns are not consumed here and can be
# passed through untouched. If the transformer is ever changed to use them,
# their encoding must come from an encoder fitted at training time.
df = transformer.transform(input_df)
print(df.shape)

# Hard label and the predicted probability from column 1 of predict_proba
y_pred_lr = lr.predict(df)
y_prob_lr = lr.predict_proba(df)[:, 1]

print(y_pred_lr)
print(y_prob_lr)

(1, 24)
[1]
[0.65309426]
