In [None]:
import featuretools as ft

es = ft.demo.load_mock_customer(return_entityset=True)
print(es)

In [2]:
import featuretools as ft
from woodwork.logical_types import Categorical, PostalCode

data = ft.demo.load_mock_customer()
transactions_df = data["transactions"].merge(data["sessions"]).merge(data["customers"])
products_df = data["products"]

In [None]:
es

In [None]:
es = es.add_dataframe(
    dataframe_name="transactions",
    dataframe=transactions_df,
    index="transaction_id",
    time_index="transaction_time",
    logical_types={
        "product_id": Categorical,
        "zip_code": PostalCode,
    },
)

In [None]:
es = es.add_dataframe(
    dataframe_name="products", dataframe=products_df, index="product_id"
)
es = es.add_relationship("products", "product_id", "transactions", "product_id")

In [None]:
print(es)

In [None]:
import featuretools as ft

es = ft.demo.load_mock_customer(return_entityset=True)

feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="customers",
    agg_primitives=["count"],
    trans_primitives=["month"],
    max_depth=1,
) 

In [None]:
ft.list_primitives()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from autofeat import AutoFeatRegressor
import pandas as pd
import numpy as np

# Загрузка данных
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
y = raw_df.values[1::2, 2]

# Разделение данных на обучающий и тестовый наборы
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Напишите код ниже #
af = AutoFeatRegressor(feateng_steps = 1,
                       max_gb = 16,
                       transformations = ["log"])
X_train_af = af.fit_transform(X_train, y_train)
X_test_af = af.transform(X_test)

lr = LinearRegression()
lr.fit(X_train_af, y_train)
lr.predict(X_test_af)

In [2]:
import os
import psycopg
import pandas as pd
import mlflow
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    OneHotEncoder, 
    SplineTransformer, 
    QuantileTransformer, 
    RobustScaler,
    PolynomialFeatures,
    KBinsDiscretizer,
)
from sklearn.model_selection import train_test_split
from autofeat import AutoFeatClassifier

from dotenv import load_dotenv
import os
load_dotenv()

TABLE_NAME = "users_churn" # таблица с данными в postgres 

# connection = {"sslmode": "verify-full", "target_session_attrs": "read-write"}
connection = {"sslmode": "require", "target_session_attrs": "read-write"}
postgres_credentials = {
    "host": os.getenv("DB_DESTINATION_HOST"),
    "port": os.getenv("DB_DESTINATION_PORT"),
    "dbname": os.getenv("DB_DESTINATION_NAME"),
    "user": os.getenv("DB_DESTINATION_USER"),
    "password": os.getenv("DB_DESTINATION_PASSWORD"),
}

connection.update(postgres_credentials)

with psycopg.connect(**connection) as conn:

    with conn.cursor() as cur:
        cur.execute(f"SELECT * FROM {TABLE_NAME}")
        data = cur.fetchall()
        columns = [col[0] for col in cur.description]

df = pd.DataFrame(data, columns=columns)

df.head(2)


* 'schema_extra' has been renamed to 'json_schema_extra'


Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,1,7590-VHVEG,2020-01-01,NaT,Month-to-month,Yes,Electronic check,29.85,29.85,DSL,...,No,No,No,No,Female,0,Yes,No,,0
1,2,5575-GNVDE,2017-04-01,NaT,One year,No,Mailed check,56.95,1889.5,DSL,...,Yes,No,No,No,Male,0,No,No,No,0


In [3]:
cat_features = [
    'paperless_billing',
    'payment_method',
    'internet_service',
    'online_security',
    'online_backup',
    'device_protection',
    'tech_support',
    'streaming_tv',
    'streaming_movies',
    'gender',
    'senior_citizen',
    'partner',
    'dependents',
    'multiple_lines',
]
num_features = ["monthly_charges", "total_charges"]

features = cat_features + num_features

target = 'target' # колонка с таргетом вашей модели

split_column = "begin_date"
test_size = 0.2

df = df.sort_values(by=[split_column])

# df["monthly_charges"].fillna(value=df["monthly_charges"].mean(), inplace=True)
# df["total_charges"].fillna(value=df["total_charges"].mean(), inplace=True)
# df = df.dropna()

X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df['target'],
    test_size=test_size,
    shuffle=False,
)

# transformations = ["1/", "exp", "log", "abs", "sqrt", "^2", "^3", "1+", "1-", "sin", "cos", "exp-", "2^"] 
transformations = ('1/', 'log', 'abs', 'sqrt')

afc = AutoFeatClassifier(categorical_cols=cat_features, 
                         transformations=transformations, 
                         feateng_steps=1, 
                         n_jobs=-1)

X_train_features = afc.fit_transform(X_train, y_train)
X_test_features = afc.transform(X_test)

In [15]:
EXPERIMENT_NAME = "churn_marselkamilov_EDA" # напишите название вашего эксперимента
RUN_NAME = "fe"
REGISTRY_MODEL_NAME = "churn_marselkamilov_FE_train" # название зарегистрированной модели 

TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"
os.environ['AWS_ACCESS_KEY_ID'] = os.getenv('AWS_ACCESS_KEY_ID')
os.environ['AWS_SECRET_ACCESS_KEY'] = os.getenv('AWS_SECRET_ACCESS_KEY')

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

artifact_path = "afc"
experiment_id = mlflow.set_experiment(EXPERIMENT_NAME).experiment_id
if not experiment_id: 
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
# experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    
    afc_info = mlflow.sklearn.log_model(afc, artifact_path=artifact_path) 

2024-10-26 16:13:12,855 INFO: Found credentials in environment variables.


In [16]:
print("EXPERIMENT_NAME: ", EXPERIMENT_NAME)
print("experiment_id: ", experiment_id)
print("run_id: ", run_id)

EXPERIMENT_NAME:  churn_marselkamilov_EDA
experiment_id:  6
run_id:  fb6deb9c9495435b9b0268616c13b0a3


In [18]:
from catboost import CatBoostClassifier
model = CatBoostClassifier(auto_class_weights='Balanced')
model.fit(X_train_features,y_train)

Learning rate set to 0.021554
0:	learn: 0.6856508	total: 59.1ms	remaining: 59s
1:	learn: 0.6775725	total: 63.5ms	remaining: 31.7s
2:	learn: 0.6701278	total: 67.7ms	remaining: 22.5s
3:	learn: 0.6632739	total: 71.8ms	remaining: 17.9s
4:	learn: 0.6575876	total: 75.7ms	remaining: 15.1s
5:	learn: 0.6505159	total: 79.6ms	remaining: 13.2s
6:	learn: 0.6439733	total: 83.6ms	remaining: 11.9s
7:	learn: 0.6388063	total: 87.6ms	remaining: 10.9s
8:	learn: 0.6332314	total: 91.6ms	remaining: 10.1s
9:	learn: 0.6271204	total: 95.5ms	remaining: 9.45s
10:	learn: 0.6216112	total: 99.3ms	remaining: 8.93s
11:	learn: 0.6170797	total: 103ms	remaining: 8.47s
12:	learn: 0.6119723	total: 107ms	remaining: 8.1s
13:	learn: 0.6078693	total: 111ms	remaining: 7.81s
14:	learn: 0.6029009	total: 116ms	remaining: 7.61s
15:	learn: 0.5983712	total: 120ms	remaining: 7.36s
16:	learn: 0.5935566	total: 124ms	remaining: 7.14s
17:	learn: 0.5896281	total: 128ms	remaining: 6.97s
18:	learn: 0.5866642	total: 132ms	remaining: 6.81s
19:

<catboost.core.CatBoostClassifier at 0x7f98ea263b50>

In [19]:
with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    model_info = mlflow.catboost.log_model(cb_model=model,
        artifact_path="models",
        registered_model_name=REGISTRY_MODEL_NAME,
		)

Successfully registered model 'churn_marselkamilov_FE_train'.
2024/10/26 16:16:09 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: churn_marselkamilov_FE_train, version 1
Created version '1' of model 'churn_marselkamilov_FE_train'.


In [20]:
run = mlflow.get_run(run_id) # ваш код здесь

print("EXPERIMENT_NAME: ", EXPERIMENT_NAME)
print("experiment_id: ", experiment_id)
print("run_id: ", run_id)

EXPERIMENT_NAME:  churn_marselkamilov_EDA
experiment_id:  6
run_id:  4d70154790dd43329218e83b27a70743


In [None]:
model_info.