In [1]:
import os
import psycopg
import pandas as pd
import mlflow
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder, 
    SplineTransformer, 
    QuantileTransformer, 
    RobustScaler,
    PolynomialFeatures,
    KBinsDiscretizer,
)

from dotenv import load_dotenv
import os
load_dotenv()

# --- MLflow configuration -----------------------------------------------------
# Names under which this notebook's experiment, runs and model are tracked.
EXPERIMENT_NAME = "churn_marselkamilov_EDA"  # MLflow experiment name
RUN_NAME = "eda"
REGISTRY_MODEL_NAME = "churn_marselkamilov_EDA_train"  # name of the registered model

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"

# The S3 credentials are read from the process environment (load_dotenv() has
# already been called). Copying os.getenv(k) back into os.environ[k] is a no-op
# when the variable is set and raises an opaque TypeError when it is not, so
# check for presence explicitly and fail with a readable message instead.
_missing_aws_vars = [
    name
    for name in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
    if name not in os.environ
]
if _missing_aws_vars:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_aws_vars)
    )

# Tracking server and model registry live at the same address.
_tracking_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"
mlflow.set_tracking_uri(_tracking_uri)
mlflow.set_registry_uri(_tracking_uri)

# set_experiment() activates the experiment and creates it when it does not
# exist yet, so no separate create_experiment() fallback is needed.
experiment_id = mlflow.set_experiment(EXPERIMENT_NAME).experiment_id

# Open (and immediately close) a run so that `run` and `run_id` are defined for
# the cells below; later cells start their own runs and overwrite both names.
with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id


* 'schema_extra' has been renamed to 'json_schema_extra'


In [2]:
TABLE_NAME = "users_churn"  # Postgres table that holds the data

# Database credentials are taken from environment variables; the mapping below
# pairs each connection keyword with the variable that supplies its value.
postgres_credentials = {
    keyword: os.getenv(env_name)
    for keyword, env_name in (
        ("host", "DB_DESTINATION_HOST"),
        ("port", "DB_DESTINATION_PORT"),
        ("dbname", "DB_DESTINATION_NAME"),
        ("user", "DB_DESTINATION_USER"),
        ("password", "DB_DESTINATION_PASSWORD"),
    )
}

# Stricter alternative for the SSL mode: "sslmode": "verify-full"
connection = {
    "sslmode": "require",
    "target_session_attrs": "read-write",
    **postgres_credentials,
}

# Read the whole table; column names come from the cursor description.
with psycopg.connect(**connection) as conn, conn.cursor() as cur:
    cur.execute(f"SELECT * FROM {TABLE_NAME}")
    data = cur.fetchall()
    columns = [description[0] for description in cur.description]

df = pd.DataFrame(data, columns=columns)

df.head(2)

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,1,7590-VHVEG,2020-01-01,NaT,Month-to-month,Yes,Electronic check,29.85,29.85,DSL,...,No,No,No,No,Female,0,Yes,No,,0
1,2,5575-GNVDE,2017-04-01,NaT,One year,No,Mailed check,56.95,1889.5,DSL,...,Yes,No,No,No,Male,0,No,No,No,0


In [None]:
# Categorical columns that will be one-hot encoded.
cat_columns = ["type", "payment_method", "internet_service", "gender"]

# OneHotEncoder settings:
#   categories='auto'        - categories are determined from the data
#   handle_unknown='ignore'  - do not raise when an unknown category is encountered
#   max_categories=10        - upper limit on the number of categories per column
#   sparse_output=False      - return a regular (dense) array instead of a sparse matrix
#   drop='first'             - drop the first category to avoid the multicollinearity trap
encoder_oh = OneHotEncoder(
    categories='auto',
    handle_unknown='ignore',
    max_categories=10,
    sparse_output=False,
    drop='first',
)

# Fit the encoder and turn the categorical columns into an array of indicators.
encoded_features = encoder_oh.fit_transform(df[cat_columns].to_numpy())

# The encoder is fitted on a NumPy array, so it does not know the column names;
# pass them explicitly to get "type_One year"-style names instead of "x0_One year".
# Reusing df.index keeps the rows aligned in the concat below.
encoded_df = pd.DataFrame(
    encoded_features,
    columns=encoder_oh.get_feature_names_out(cat_columns),
    index=df.index,
)

# Append the encoded columns to the original frame (axis=1 -> column-wise).
obj_df = pd.concat([df, encoded_df], axis=1)

obj_df.head(2)

In [None]:
num_columns = ["monthly_charges", "total_charges"]

# Hyperparameters of the numeric transformers.
n_knots = 3
degree_spline = 4
n_quantiles=100
degree = 3
n_bins = 5
encode = 'ordinal'
strategy = 'uniform'
subsample = None

# Drop only the rows that miss a numeric input, and do it on a copy:
#  * a plain df.dropna() would also discard every row with a missing value in
#    any other column (e.g. end_date is NaT in the preview rows above);
#  * drop=True keeps reset_index() from adding a spurious "index" column;
#  * `df` itself is left untouched so later cells still see the full data.
num_source = df.dropna(subset=num_columns).reset_index(drop=True)
num_values = num_source[num_columns].to_numpy()


def _transformed_frame(encoder, suffix=None):
    """Fit ``encoder`` on the numeric values and return its output as a DataFrame.

    Column names are taken from ``encoder.get_feature_names_out(num_columns)``
    so that their count always matches the number of generated features.
    When ``suffix`` is given it is appended to every generated name.
    """
    encoded = encoder.fit_transform(num_values)
    names = encoder.get_feature_names_out(num_columns)
    if suffix:
        names = [f"{name}{suffix}" for name in names]
    return pd.DataFrame(encoded, columns=names)


encoder_spl = SplineTransformer(n_knots=n_knots, degree=degree_spline)
encoder_q = QuantileTransformer(n_quantiles=n_quantiles)
encoder_rb = RobustScaler()
encoder_pol = PolynomialFeatures(degree=degree)
encoder_kbd = KBinsDiscretizer(n_bins=n_bins, encode=encode, strategy=strategy, subsample=subsample)

# Original numeric columns followed by the output of every transformer.
# PolynomialFeatures emits more columns than it receives (bias, powers and
# interaction terms), so its names must come from the encoder rather than
# from num_columns.
num_df = pd.concat(
    [
        num_source[num_columns],
        _transformed_frame(encoder_spl),
        _transformed_frame(encoder_q, suffix=f"_q_{n_quantiles}"),
        _transformed_frame(encoder_rb, suffix="_robust"),
        _transformed_frame(encoder_pol, suffix="_pol"),
        _transformed_frame(encoder_kbd, suffix="_bin"),
    ],
    axis=1,
)

num_df.head(2)

In [3]:
cat_columns = ["type", "payment_method", "internet_service", "gender"]

num_columns = ["monthly_charges", "total_charges"]

# Impute missing numeric values with the column mean. The result is assigned
# back to the frame: calling fillna(inplace=True) on df[col] is chained
# assignment, which does not modify df under pandas copy-on-write.
for column in num_columns:
    df[column] = df[column].fillna(df[column].mean())

# Hyperparameters of the numeric transformers.
n_knots = 3
degree_spline = 4
n_quantiles=100
degree = 3
n_bins = 5
encode = 'ordinal'
strategy = 'uniform'
subsample = None

# Every numeric transformer is applied to the same two columns; the outputs
# are concatenated side by side.
numeric_transformer = ColumnTransformer(
    transformers=[
        ('spl', SplineTransformer(n_knots=n_knots, degree=degree_spline), num_columns),
        ('q', QuantileTransformer(n_quantiles=n_quantiles), num_columns),
        ('rb', RobustScaler(), num_columns),
        ('pol', PolynomialFeatures(degree=degree), num_columns),
        ('kbd', KBinsDiscretizer(n_bins=n_bins, encode=encode, strategy=strategy, subsample=subsample), num_columns),
    ]
)

categorical_transformer = Pipeline(
    steps=[
        ('encoder', OneHotEncoder(categories='auto', handle_unknown='ignore', max_categories=10, sparse_output=False, drop='first')),
    ]
)

# Top-level preprocessor: numeric block + categorical block; columns that are
# not listed are not part of the output.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_columns),
        ('cat', categorical_transformer, cat_columns),
    ],
    n_jobs=-1,
)

encoded_features = preprocessor.fit_transform(df)

# Reuse df.index so the generated columns stay aligned with their source rows.
transformed_df = pd.DataFrame(
    encoded_features,
    columns=preprocessor.get_feature_names_out(),
    index=df.index,
)

# Remove columns left over from a previous execution of this cell before
# appending, so re-running it does not produce duplicated column names.
df = pd.concat(
    [df.drop(columns=transformed_df.columns, errors="ignore"), transformed_df],
    axis=1,
)
df.head(2)

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,num__kbd__monthly_charges,num__kbd__total_charges,cat__type_One year,cat__type_Two year,cat__payment_method_Credit card (automatic),cat__payment_method_Electronic check,cat__payment_method_Mailed check,cat__internet_service_Fiber optic,cat__internet_service_None,cat__gender_Male
0,1,7590-VHVEG,2020-01-01,NaT,Month-to-month,Yes,Electronic check,29.85,29.85,DSL,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,2,5575-GNVDE,2017-04-01,NaT,One year,No,Mailed check,56.95,1889.5,DSL,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [4]:
# Start a new MLflow run and store the preprocessor as a scikit-learn model
# artifact under "column_transformer"; `run_id` is updated to this run.
with mlflow.start_run(experiment_id=experiment_id, run_name=RUN_NAME) as run:
    run_id = run.info.run_id
    mlflow.sklearn.log_model(
        sk_model=preprocessor,
        artifact_path="column_transformer",
    )



In [5]:
# Display the run ID assigned by the most recently executed run cell.
run_id

'f72c585af4614422b4b6088564f515cf'

In [6]:
# Model training
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

RANDOM_STATE = 42  # fixed seed so the train/validation split is reproducible

# The raw categorical and numeric columns must stay in the frame: the
# preprocessor selects them by name inside the pipeline. (Earlier versions of
# this cell called df.drop(cat_columns) / df.drop(num_columns) inside a bare
# `except`; those calls targeted row labels, always raised KeyError and were
# silently ignored, so removing them does not change the data.)
# The target is excluded from the feature frame so it can never leak into X.
features = df.drop(columns=["target"])
target = df["target"]

X_tr, X_val, y_tr, y_val = train_test_split(
    features,
    target,
    stratify=target,
    random_state=RANDOM_STATE,
)
y_test = y_val  # backward-compatible alias for the previous (misleading) name

model = CatBoostClassifier(auto_class_weights='Balanced')

# Build the pipeline: preprocessing followed by the classifier.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# Fit the pipeline on the training part.
pipeline.fit(X_tr, y_tr)

# Predictions and class probabilities for the validation part.
prediction = pipeline.predict(X_val)
probas = pipeline.predict_proba(X_val)

Learning rate set to 0.020969
0:	learn: 0.6843476	total: 69.7ms	remaining: 1m 9s
1:	learn: 0.6761355	total: 77.5ms	remaining: 38.7s
2:	learn: 0.6680747	total: 84.9ms	remaining: 28.2s
3:	learn: 0.6603421	total: 91.9ms	remaining: 22.9s
4:	learn: 0.6528592	total: 99.1ms	remaining: 19.7s
5:	learn: 0.6452555	total: 106ms	remaining: 17.6s
6:	learn: 0.6388230	total: 113ms	remaining: 16s
7:	learn: 0.6327765	total: 120ms	remaining: 14.8s
8:	learn: 0.6274465	total: 127ms	remaining: 14s
9:	learn: 0.6217884	total: 134ms	remaining: 13.3s
10:	learn: 0.6159349	total: 141ms	remaining: 12.7s
11:	learn: 0.6106892	total: 148ms	remaining: 12.2s
12:	learn: 0.6055460	total: 155ms	remaining: 11.8s
13:	learn: 0.6000700	total: 163ms	remaining: 11.5s
14:	learn: 0.5953831	total: 171ms	remaining: 11.2s
15:	learn: 0.5913001	total: 178ms	remaining: 10.9s
16:	learn: 0.5866708	total: 186ms	remaining: 10.7s
17:	learn: 0.5821763	total: 193ms	remaining: 10.5s
18:	learn: 0.5779236	total: 200ms	remaining: 10.3s
19:	learn:

In [9]:
# Fetch the run that `run_id` points to and print the identifiers that
# locate it on the tracking server.
run = mlflow.get_run(run_id)

for label, value in (
    ("EXPERIMENT_NAME: ", EXPERIMENT_NAME),
    ("experiment_id: ", experiment_id),
    ("run_id: ", run_id),
):
    print(label, value)

EXPERIMENT_NAME:  churn_marselkamilov_EDA
experiment_id:  6
run_id:  c9009a3bc5534e4b91488db39a6156f2


In [8]:

# Start a new MLflow run, log the CatBoost step of the pipeline under
# "models" and register it in the model registry as REGISTRY_MODEL_NAME.
with mlflow.start_run(experiment_id=experiment_id, run_name=RUN_NAME) as run:
    run_id = run.info.run_id

    model_info = mlflow.catboost.log_model(
        cb_model=pipeline.named_steps['model'],
        artifact_path="models",
        registered_model_name=REGISTRY_MODEL_NAME,
    )

Successfully registered model 'churn_marselkamilov_EDA_train'.
2024/10/25 11:42:41 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: churn_marselkamilov_EDA_train, version 1
Created version '1' of model 'churn_marselkamilov_EDA_train'.


In [10]:
# Fetch the run that `run_id` points to and print the identifiers that
# locate it on the tracking server.
run = mlflow.get_run(run_id)

for label, value in (
    ("EXPERIMENT_NAME: ", EXPERIMENT_NAME),
    ("experiment_id: ", experiment_id),
    ("run_id: ", run_id),
):
    print(label, value)

EXPERIMENT_NAME:  churn_marselkamilov_EDA
experiment_id:  6
run_id:  c9009a3bc5534e4b91488db39a6156f2
