In [1]:
pip install -r requirements.txt


Collecting streamlit (from -r requirements.txt (line 7))
  Downloading streamlit-1.34.0-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting pyarrow (from -r requirements.txt (line 8))
  Downloading pyarrow-16.0.0-cp312-cp312-win_amd64.whl.metadata (3.1 kB)
Collecting jolib (from -r requirements.txt (line 9))
  Using cached joLib-0.0.1-py3-none-any.whl.metadata (554 bytes)
Collecting altair<6,>=4.0 (from streamlit->-r requirements.txt (line 7))
  Downloading altair-5.3.0-py3-none-any.whl.metadata (9.2 kB)
Collecting cachetools<6,>=4.0 (from streamlit->-r requirements.txt (line 7))
  Downloading cachetools-5.3.3-py3-none-any.whl.metadata (5.3 kB)
Collecting toml<2,>=0.10.1 (from streamlit->-r requirements.txt (line 7))
  Downloading toml-0.10.2-py2.py3-none-any.whl.metadata (7.1 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit->-r requirements.txt (line 7))
  Downloading pydeck-0.9.0-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting watchdog>=2.1.5 (from streamlit->-r requirements.txt (l

In [2]:
import numpy as np
import pandas as pd
import os

from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.metrics import (
    accuracy_score, classification_report, recall_score, confusion_matrix,
    roc_auc_score, precision_score, f1_score, roc_curve, auc
)
from sklearn.preprocessing import OrdinalEncoder

from catboost import CatBoostClassifier, Pool

In [3]:
data_path = "data.csv"

In [4]:
df = pd.read_csv(data_path)

In [5]:
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,Yes


In [6]:
df.TotalCharges.describe()

count     7043
unique    6531
top           
freq        11
Name: TotalCharges, dtype: object

In [7]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'].fillna(df['tenure'] * df['MonthlyCharges'], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['TotalCharges'].fillna(df['tenure'] * df['MonthlyCharges'], inplace=True)


In [8]:
df.TotalCharges.describe()

count    7043.000000
mean     2279.734304
std      2266.794470
min         0.000000
25%       398.550000
50%      1394.550000
75%      3786.600000
max      8684.800000
Name: TotalCharges, dtype: float64

In [9]:
df.SeniorCitizen.describe()

count    7043.000000
mean        0.162147
std         0.368612
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: SeniorCitizen, dtype: float64

In [10]:
df.MultipleLines.value_counts()

MultipleLines
No                  3390
Yes                 2971
No phone service     682
Name: count, dtype: int64

In [11]:
df['MultipleLines'] = df['MultipleLines'].replace('No phone service', 'No')

In [12]:
columns_to_replace = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']

In [13]:
for x in columns_to_replace:
    df[x] = df[x].replace('No internet service', 'No')

In [14]:
df['Churn'] = df['Churn'].replace({'No': 0, 'Yes': 1})

  df['Churn'] = df['Churn'].replace({'No': 0, 'Yes': 1})


In [15]:
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.50,0
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,7795-CFOCW,Male,0,No,No,45,No,No,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,0
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.50,0
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.90,0
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,0
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.60,1


In [16]:
df.InternetService.value_counts()

InternetService
Fiber optic    3096
DSL            2421
No             1526
Name: count, dtype: int64

In [17]:
df.PaymentMethod.value_counts()

PaymentMethod
Electronic check             2365
Mailed check                 1612
Bank transfer (automatic)    1544
Credit card (automatic)      1522
Name: count, dtype: int64

In [18]:
X= df.drop(columns = ['Churn'])
y = df['Churn']

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()

In [21]:
categorical_columns

['customerID',
 'gender',
 'Partner',
 'Dependents',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod']

In [22]:
cat_model = CatBoostClassifier(verbose=True, random_state=0, scale_pos_weight=3)
cat_model.fit(X_train, y_train, cat_features=categorical_columns, eval_set=(X_test, y_test))

Learning rate set to 0.048569
0:	learn: 0.6738828	test: 0.6726456	best: 0.6726456 (0)	total: 203ms	remaining: 3m 22s
1:	learn: 0.6550831	test: 0.6523851	best: 0.6523851 (1)	total: 260ms	remaining: 2m 9s
2:	learn: 0.6418068	test: 0.6380715	best: 0.6380715 (2)	total: 286ms	remaining: 1m 34s
3:	learn: 0.6272434	test: 0.6219899	best: 0.6219899 (3)	total: 316ms	remaining: 1m 18s
4:	learn: 0.6152535	test: 0.6093077	best: 0.6093077 (4)	total: 347ms	remaining: 1m 8s
5:	learn: 0.6049905	test: 0.5981503	best: 0.5981503 (5)	total: 380ms	remaining: 1m 2s
6:	learn: 0.5963607	test: 0.5886843	best: 0.5886843 (6)	total: 450ms	remaining: 1m 3s
7:	learn: 0.5880757	test: 0.5798546	best: 0.5798546 (7)	total: 491ms	remaining: 1m
8:	learn: 0.5807857	test: 0.5714693	best: 0.5714693 (8)	total: 516ms	remaining: 56.8s
9:	learn: 0.5719231	test: 0.5620651	best: 0.5620651 (9)	total: 551ms	remaining: 54.5s
10:	learn: 0.5648340	test: 0.5539977	best: 0.5539977 (10)	total: 590ms	remaining: 53.1s
11:	learn: 0.5584295	t

<catboost.core.CatBoostClassifier at 0x274ae552e10>

In [23]:
# Predict on test set
y_pred = cat_model.predict(X_test)

# Calculate evaluation metrics
accuracy, recall, roc_auc, precision = [round(metric(y_test, y_pred), 4) for metric in [accuracy_score, recall_score, roc_auc_score, precision_score]]

In [24]:
model_names = ['CatBoost_Model']
result = pd.DataFrame({'Accuracy': accuracy, 'Recall': recall, 'Roc_Auc': roc_auc, 'Precision': precision}, index=model_names)

# Print results
print(result)

                Accuracy  Recall  Roc_Auc  Precision
CatBoost_Model    0.7502  0.8445   0.7804     0.5172


In [25]:
cat_model.save_model('catboost.cbm')

In [26]:
import pickle

In [27]:
with open('X_train.pkl', 'wb') as f:
    pickle.dump(X_train, f)

In [28]:
with open('y_train.pkl', 'wb') as f:
    pickle.dump(y_train, f)

In [29]:
with open('X_test.pkl', 'wb') as f:
    pickle.dump(X_test, f)

In [30]:
with open('y_test.pkl', 'wb') as f:
    pickle.dump(y_test, f)

In [31]:
df.to_parquet('data.parquet', index=False)