In [1]:
import os
import psycopg
from dotenv import load_dotenv

import pandas as pd
import mlflow
import numpy as np
from autofeat import AutoFeatClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score, 
    f1_score, 
    precision_score, 
    recall_score,
    confusion_matrix,
    log_loss)

In [2]:
load_dotenv()

True

In [3]:
TABLE_NAME = 'users_churn'
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

EXPERIMENT_NAME = "Data_Exploration" # название эксперимента
RUN_NAME = "feature_generation" 

In [4]:
connection = {'sslmode' : 'require', 'target_session_attrs' : "read-write"}

postgres_credetials = {
    'dbname' : os.getenv('DB_DESTINATION_NAME'),
    'host' : os.getenv('DB_DESTINATION_HOST'),
    'port' : os.getenv('DB_DESTINATION_PORT'),
    'user' : os.getenv('DB_DESTINATION_USER'),
    'password' : os.getenv('DB_DESTINATION_PASSWORD')
}

connection.update(postgres_credetials)

In [5]:
with psycopg.connect(**connection) as conn:

    with conn.cursor() as cur:
        cur.execute(f'SELECT * FROM {TABLE_NAME}')
        data = cur.fetchall()
        columns = [col[0] for col in cur.description]

        df = pd.DataFrame(data, columns=columns)


In [6]:
columns_without_datetime = df.select_dtypes(exclude='datetime').columns
df = df.dropna(subset=columns_without_datetime)

In [7]:
df.head()

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
1,2,5575-GNVDE,2017-04-01,NaT,One year,No,Mailed check,56.95,1889.5,DSL,...,Yes,No,No,No,Male,0,No,No,No,0
2,3,3668-QPYBK,2019-10-01,2019-12-01,Month-to-month,Yes,Mailed check,53.85,108.15,DSL,...,No,No,No,No,Male,0,No,No,No,1
4,5,9237-HQITU,2019-09-01,2019-11-01,Month-to-month,Yes,Electronic check,70.7,151.65,Fiber optic,...,No,No,No,No,Female,0,No,No,No,1
5,6,9305-CDSKC,2019-03-01,2019-11-01,Month-to-month,Yes,Electronic check,99.65,820.5,Fiber optic,...,Yes,No,Yes,Yes,Female,0,No,No,Yes,1
6,7,1452-KIOVK,2018-04-01,NaT,Month-to-month,Yes,Credit card (automatic),89.1,1949.4,Fiber optic,...,No,No,Yes,No,Male,0,No,Yes,Yes,0


In [8]:
X = df.drop(columns=['target', 'end_date'])
y = df.target

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

In [9]:
cat_features = [
    'paperless_billing',
    'payment_method',
    'internet_service',
    'online_security',
    'online_backup',
    'device_protection',
    'tech_support',
    'streaming_tv',
    'streaming_movies',
    'gender',
    'senior_citizen',
    'partner',
    'dependents',
    'multiple_lines',
]
num_features = ["monthly_charges", "total_charges"]

features = cat_features + num_features

transformations = ["log", "sqrt", "abs", "1/"]

afc = AutoFeatClassifier(
    categorical_cols=cat_features, 
    transformations=transformations, 
    feateng_steps=1, 
    n_jobs=-1)

X_train_features = afc.fit_transform(X_train[features], y_train)
X_test_features = afc.transform(X_test[features])


In [10]:
mlflow.set_tracking_uri(f'http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}')
mlflow.set_registry_uri(f'http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}')

In [11]:
artifact_path = 'afc'

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(experiment_id=experiment_id, run_name=RUN_NAME) as run:
    run_id = run.info.run_id

    afc_info = mlflow.sklearn.log_model(afc, artifact_path=artifact_path) 

2025-02-10 09:42:50,516 INFO: Found credentials in environment variables.


In [12]:
display(experiment_id)
display(run_id)

'4'

'dc0eccdb2efd4eb48cb9d5d1e028140d'

In [13]:
model = LogisticRegression()

In [14]:
model.fit(X_train_features, y_train)

In [15]:
prediction = model.predict(X_test_features)
proba = model.predict_proba(X_test_features)[:, 1]

In [16]:
metrics = {}

_, err1, _, err2 = confusion_matrix(y_test, prediction, normalize='all').ravel()
auc = roc_auc_score(y_test, proba)
precision = precision_score(y_test, prediction)
recall = recall_score(y_test, prediction)
f1 = f1_score(y_test, prediction)
logloss = log_loss(y_test, prediction)

metrics["err1"] = err1
metrics["err2"] = err2
metrics["auc"] = auc
metrics["precision"] = precision
metrics["recall"] = recall
metrics["f1"] = f1
metrics["logloss"] = logloss

In [17]:
metrics

{'err1': 0.09100310237849017,
 'err2': 0.19751809720785937,
 'auc': 0.8307134048003036,
 'precision': 0.6845878136200717,
 'recall': 0.5753012048192772,
 'f1': 0.6252045826513911,
 'logloss': 8.535673863606855}

In [18]:
EXPERIMENT_NAME = 'churn_fio'
RUN_NAME = 'feature_generation'
REGISTRY_MODEL_NAME = 'churn_model_maximpetrov'

In [19]:
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

if experiment:
    experiment_id = experiment.experiment_id
else:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)

In [20]:
with mlflow.start_run(experiment_id=experiment_id, run_name=RUN_NAME) as run:
    run_id = run.info.run_id
    mlflow.log_metrics(metrics)

    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="models",
        registered_model_name=REGISTRY_MODEL_NAME
    )

Registered model 'churn_model_maximpetrov' already exists. Creating a new version of this model...
2025/02/10 09:43:19 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: churn_model_maximpetrov, version 9
Created version '9' of model 'churn_model_maximpetrov'.


In [21]:
run_id

'798e1ce82e234b3fb975f5a1e08c359a'