In [10]:
# Импорт необходимых библиотек
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression
import numpy as np

# Загрузка датасета с данными о недвижимости в Калифорнии
california_housing = fetch_california_housing()
X = california_housing.data
y = california_housing.target

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lr = LinearRegression()

# Инициализация Sequential Feature Selector для backward selection
sbs = SequentialFeatureSelector(lr,
    k_features=6,       
    forward=False,      
    floating=False,     
    scoring='neg_mean_squared_error', 
    cv=2,               
    n_jobs=-1
)
# Обучение SBS на обучающем наборе
sbs = sbs.fit(X_train,y_train)

# Выбор признаков с помощью SBS
selected_features_sbs = sbs.k_feature_idx_

# Инициализация Sequential Feature Selector с включённым параметром floating для SFFS
sbfs1 = SequentialFeatureSelector(lr,    
    k_features=6,       
    forward=False,      
    floating=True,     
    scoring='neg_mean_squared_error', 
    cv=2,               
    n_jobs=-1                
)
# Обучение SBFS на обучающем наборе
sbfs = sbfs1.fit(X_train,y_train)

# Выбор признаков с помощью SBFS
selected_features_sbfs = sbfs1.k_feature_idx_

# Объединение индексов выбранных признаков из SBS и SBFS для обучения
selected_features = list(set(selected_features_sbs + selected_features_sbfs))
X_train_selected = sbfs.transform(X_train)
X_test_selected = X_test[:, selected_features]

# Обучение модели линейной регрессии на выбранных признаках
model = LinearRegression()
model.fit(X_train_selected, y_train)

In [11]:
import os

import psycopg
import pandas as pd
import mlflow
from catboost import CatBoostClassifier
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

TABLE_NAME = "users_churn"
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

EXPERIMENT_NAME = "churn_marselkamilov_FEATURE_SELECTION2"
RUN_NAME = "feature_selection"
REGISTRY_MODEL_NAME = "churn_marselkamilov_FS_train"
FS_ASSETS = "fs_assets" 

In [12]:
from dotenv import load_dotenv
import os
load_dotenv()

# connection = {"sslmode": "verify-full", "target_session_attrs": "read-write"}
connection = {"sslmode": "require", "target_session_attrs": "read-write"}
postgres_credentials = {
    "host": os.getenv("DB_DESTINATION_HOST"),
    "port": os.getenv("DB_DESTINATION_PORT"),
    "dbname": os.getenv("DB_DESTINATION_NAME"),
    "user": os.getenv("DB_DESTINATION_USER"),
    "password": os.getenv("DB_DESTINATION_PASSWORD"),
}

connection.update(postgres_credentials)

with psycopg.connect(**connection) as conn:

    with conn.cursor() as cur:
        cur.execute(f"SELECT * FROM {TABLE_NAME}")
        data = cur.fetchall()
        columns = [col[0] for col in cur.description]

df = pd.DataFrame(data, columns=columns)

df.head(2)

Unnamed: 0,id,customer_id,begin_date,end_date,type,paperless_billing,payment_method,monthly_charges,total_charges,internet_service,...,device_protection,tech_support,streaming_tv,streaming_movies,gender,senior_citizen,partner,dependents,multiple_lines,target
0,1,7590-VHVEG,2020-01-01,NaT,Month-to-month,Yes,Electronic check,29.85,29.85,DSL,...,No,No,No,No,Female,0,Yes,No,,0
1,2,5575-GNVDE,2017-04-01,NaT,One year,No,Mailed check,56.95,1889.5,DSL,...,Yes,No,No,No,Male,0,No,No,No,0


In [13]:
from sklearn.model_selection import train_test_split

cat_features = [
    'paperless_billing',
    'payment_method',
    'internet_service',
    'online_security',
    'online_backup',
    'device_protection',
    'tech_support',
    'streaming_tv',
    'streaming_movies',
    'gender',
    'senior_citizen',
    'partner',
    'dependents',
    'multiple_lines',
]

num_features = ["monthly_charges", "total_charges"]

target = 'target'

from sklearn.preprocessing import OneHotEncoder
encoder_oh = OneHotEncoder(categories='auto', handle_unknown='ignore', max_categories=10, sparse_output=False, drop='first')
encoded_features = encoder_oh.fit_transform(df[cat_features].to_numpy())
encoded_df = pd.DataFrame(encoded_features,columns=encoder_oh.get_feature_names_out())


In [14]:
obj_df = pd.concat([df[num_features + [target]], encoded_df], axis=1)
obj_df

Unnamed: 0,monthly_charges,total_charges,target,x0_Yes,x1_Credit card (automatic),x1_Electronic check,x1_Mailed check,x2_Fiber optic,x2_None,x3_Yes,...,x7_Yes,x7_None,x8_Yes,x8_None,x9_Male,x10_1,x11_Yes,x12_Yes,x13_Yes,x13_None
0,29.85,29.85,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,56.95,1889.50,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,53.85,108.15,1,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,42.30,1840.75,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,70.70,151.65,1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,84.80,1990.50,0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0
7039,103.20,7362.90,0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
7040,29.60,346.45,0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
7041,74.40,306.60,1,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0


In [15]:
df = obj_df

df["monthly_charges"].fillna(value=df["monthly_charges"].mean(), inplace=True)
df["total_charges"].fillna(value=df["total_charges"].mean(), inplace=True)
df = df.dropna()

X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=[target]),
    df['target'],
    test_size=0.2,
    shuffle=False,
)

# from autofeat import AutoFeatClassifier
# transformations = ('1/', 'log', 'abs', 'sqrt')
# afc = AutoFeatClassifier(categorical_cols=cat_features, 
#                          transformations=transformations, 
#                          feateng_steps=1, 
#                          n_jobs=-1)

# X_train_features = afc.fit_transform(X_train, y_train)
# X_test_features = afc.transform(X_test)

In [27]:
estimator = RandomForestClassifier()

sfs = SFS(estimator,    
    k_features=10,       
    forward=True,      
    floating=False,     
    scoring='roc_auc', 
    cv=4,               
    n_jobs=-1                
)

sbs = SFS(estimator,    
    k_features=10,       
    forward=False,      
    floating=False,     
    scoring='roc_auc', 
    cv=4,               
    n_jobs=-1                
)
sfs = sfs.fit(X_train, y_train)
sbs = sbs.fit(X_train, y_train)

top_sfs = sfs.k_feature_names_
top_sbs = sbs.k_feature_names_

print('\nSequential Forward Selection (k=10)')
print('CV Score:')
print(sfs.k_score_)

print('\nSequential Backward Selection')
print('CV Score:')
print(sbs.k_score_)


Sequential Forward Selection (k=10)
CV Score:
0.7914538102801685

Sequential Backward Selection
CV Score:
None



STOPPING EARLY DUE TO KEYBOARD INTERRUPT...

In [29]:
sfs_df = pd.DataFrame.from_dict(sfs.get_metric_dict()).T
# sbs_df = pd.DataFrame.from_dict(sbs.get_metric_dict()).T 
sbs_df = sfs_df

In [None]:
os.mkdir(FS_ASSETS)

sfs_df.to_csv(f"{FS_ASSETS}/sfs.csv")
sbs_df.to_csv(f"{FS_ASSETS}/sbs.csv") 

In [None]:
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

fig = plot_sfs(sfs.get_metric_dict(), kind='std_dev')

plt.title('Sequential Forward Selection (w. StdDev)')
plt.grid()
plt.show()

plt.savefig("fs_assets/sfs.png") 

In [None]:
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

fig = plot_sfs(sbs.get_metric_dict(), kind='std_dev')

plt.title('Sequential Backward Selection (w. StdDev)')
plt.grid()
plt.show()

plt.savefig("fs_assets/sbs.png") 

In [31]:
top_sfs = sfs.k_feature_names_
top_sbs = sbs.k_feature_names_

In [33]:
top_sfs

('x1_Electronic check',
 'x1_Mailed check',
 'x2_Fiber optic',
 'x3_Yes',
 'x3_None',
 'x5_None',
 'x6_Yes',
 'x7_None',
 'x8_None',
 'x11_Yes')

In [35]:
top_sbs = top_sfs

In [36]:
interc_features = list(set(top_sbs) & set(top_sfs))
union_features = list(set(top_sbs) | set(top_sfs))

In [47]:
import os
import mlflow
from dotenv import load_dotenv

load_dotenv()

TABLE_NAME = "users_churn"
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

EXPERIMENT_NAME = "churn_marselkamilov_FEATURE_SELECTION2"
RUN_NAME = "feature_selection"
REGISTRY_MODEL_NAME = "churn_marselkamilov_FS_train"
FS_ASSETS = "fs_assets" 

os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"
os.environ['AWS_ACCESS_KEY_ID'] = os.getenv('AWS_ACCESS_KEY_ID')
os.environ['AWS_SECRET_ACCESS_KEY'] = os.getenv('AWS_SECRET_ACCESS_KEY')

mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")
mlflow.set_registry_uri(f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}")

experiment_id = mlflow.set_experiment(EXPERIMENT_NAME).experiment_id
if not experiment_id: 
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)

with mlflow.start_run(run_name=f"{RUN_NAME}_intersection_and_union", experiment_id=experiment_id) as run:
    run_id = run.info.run_id
   
    mlflow.log_artifacts(FS_ASSETS) 

In [None]:
run = mlflow.get_run(run_id) # ваш код здесь

print("EXPERIMENT_NAME: ", EXPERIMENT_NAME)
print("experiment_id: ", experiment_id)
print("run_id: ", run_id)

In [45]:
from catboost import CatBoostClassifier

model_interc = CatBoostClassifier(auto_class_weights='Balanced')
model_union = CatBoostClassifier(auto_class_weights='Balanced')

model_interc.fit(X_train[interc_features],y_train)
model_union.fit(X_train[union_features],y_train)

Learning rate set to 0.021554
0:	learn: 0.6863977	total: 59ms	remaining: 59s
1:	learn: 0.6796438	total: 61.1ms	remaining: 30.5s
2:	learn: 0.6735018	total: 63.1ms	remaining: 21s
3:	learn: 0.6679367	total: 64.9ms	remaining: 16.2s
4:	learn: 0.6632757	total: 66.5ms	remaining: 13.2s
5:	learn: 0.6595355	total: 68ms	remaining: 11.3s
6:	learn: 0.6549673	total: 70ms	remaining: 9.93s
7:	learn: 0.6504260	total: 72.1ms	remaining: 8.95s
8:	learn: 0.6468395	total: 73.9ms	remaining: 8.14s
9:	learn: 0.6422486	total: 75.9ms	remaining: 7.52s
10:	learn: 0.6381217	total: 77.9ms	remaining: 7s
11:	learn: 0.6338749	total: 80ms	remaining: 6.58s
12:	learn: 0.6311648	total: 81.4ms	remaining: 6.18s
13:	learn: 0.6282173	total: 83.1ms	remaining: 5.85s
14:	learn: 0.6244951	total: 85ms	remaining: 5.58s
15:	learn: 0.6213168	total: 87.3ms	remaining: 5.37s
16:	learn: 0.6179055	total: 89.4ms	remaining: 5.17s
17:	learn: 0.6145475	total: 91.6ms	remaining: 5s
18:	learn: 0.6114723	total: 93.5ms	remaining: 4.83s
19:	learn: 0

<catboost.core.CatBoostClassifier at 0x7ff674a1e620>

In [69]:
with mlflow.start_run(run_name=f"{RUN_NAME}_intersection", experiment_id=experiment_id) as run:
    run_id = run.info.run_id

    model_info = mlflow.catboost.log_model(cb_model=model_interc,
        artifact_path="models",
        registered_model_name="feature_selection",
		)
    
with mlflow.start_run(run_name=f"{RUN_NAME}_union", experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    model_info = mlflow.catboost.log_model(cb_model=model_union,
        artifact_path="models",
        registered_model_name="feature_selection",
        )

Successfully registered model 'feature_selection'.
2024/10/29 10:43:50 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: feature_selection, version 1
Created version '1' of model 'feature_selection'.
Registered model 'feature_selection' already exists. Creating a new version of this model...
2024/10/29 10:43:51 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: feature_selection, version 2
Created version '2' of model 'feature_selection'.


In [65]:
run = mlflow.get_run(run_id) # ваш код здесь

print("EXPERIMENT_NAME: ", EXPERIMENT_NAME)
print("experiment_id: ", experiment_id)
print("run_id: ", run_id)

EXPERIMENT_NAME:  churn_marselkamilov_FEATURE_SELECTION2
experiment_id:  9
run_id:  625b1783c7ca4d0aaa9d11ed9908cf18
