In [1]:
import pandas as pd
from pathlib import Path
import sys
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Получаем абсолютный путь к src
parent_path = Path.cwd().parent
sys.path.append(str(parent_path))

In [2]:

from sklearn.linear_model import LogisticRegression


# Метрики для оценки классификации
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.metrics import classification_report

# Утилиты для подготовки данных
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler

In [3]:
from scripts.config import get_processed_data_path
from scripts.config import get_seed


SEED = int(get_seed('../configs/data_config.yaml'))

In [4]:
df = pd.read_csv(get_processed_data_path(version= 0, path = '../configs/data_config.yaml'))
df.drop(['Unnamed: 0'], axis = 1, inplace= True)

In [5]:
SEED

42

In [6]:


X = df.drop(['Churn'], axis = 1)
y = df.Churn


test_size = 0.2


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= test_size, random_state=SEED)
scaler = StandardScaler()
scale_columns = ['ClientPeriod', 'MonthlySpending', 'TotalSpent', 'Spending_Change_Ratio']

X_train[scale_columns] = scaler.fit_transform(X_train[scale_columns])
X_test[scale_columns] = scaler.transform(X_test[scale_columns])

In [7]:
X_train.shape, X_test.shape

((4214, 33), (1054, 33))

# Бэйзлайн на логистической регрессии


In [8]:
X_train

Unnamed: 0,ClientPeriod,MonthlySpending,TotalSpent,IsSeniorCitizen,Spending_Change_Ratio,Sex_Male,HasPartner_Yes,HasChild_Yes,HasPhoneService_Yes,HasMultiplePhoneNumbers_No phone service,...,HasMovieSubscription_No internet service,HasMovieSubscription_Yes,HasContractPhone_One year,HasContractPhone_Two year,IsBillingPaperless_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,random_number,random_normal
298,-1.123668,0.671828,-0.829853,0,0.165704,False,False,False,True,False,...,False,True,False,False,True,True,False,False,0.251281,0.460067
68,1.163759,0.486222,1.158731,0,-0.496389,True,True,False,True,False,...,False,True,True,False,True,False,True,False,0.371306,-2.200691
2372,1.000371,0.721544,1.169562,0,-0.491513,True,False,False,True,False,...,False,False,True,False,True,False,True,False,0.645234,-1.472074
2856,-1.001127,-1.502416,-0.947625,0,-0.110247,False,True,True,True,False,...,True,False,True,False,False,False,True,False,0.230859,0.407647
150,-0.674352,-1.475901,-0.872226,0,-0.330613,True,False,False,True,False,...,True,False,False,False,True,True,False,False,0.211078,-1.955710
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3092,-0.470117,1.301563,-0.043815,0,-0.384247,False,True,False,True,False,...,False,True,False,False,True,False,False,False,0.548526,0.375107
3772,1.572228,-1.366526,-0.298031,0,-0.501928,True,True,True,True,False,...,True,False,False,True,False,True,False,False,0.223977,-1.450934
5191,1.082065,1.139158,1.590745,1,-0.494031,False,False,False,True,False,...,False,True,True,False,True,False,True,False,0.581688,0.817437
5226,1.327147,1.004925,1.738397,1,-0.499620,False,False,False,True,False,...,False,False,False,False,True,False,False,False,0.166937,0.257413


# Вход в wandb

In [9]:
import os
from dotenv import load_dotenv
import wandb

load_dotenv()  

api_key = os.getenv('WANDB_API_KEY')
if api_key:
    wandb.login(key=api_key)
    print("Successfully logged in to W&B")
else:
    print("Please set WANDB_API_KEY in .env file")

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\user\_netrc
[34m[1mwandb[0m: Currently logged in as: [33msutormin-p[0m ([33msutormin[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Successfully logged in to W&B


In [10]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
 
print(classification_report(y_pred = y_pred, y_true = y_test))
print('roc_auc = ', roc_auc_score(y_test, logreg.predict_proba(X_test)[:,1]))


from scripts.train import log_wandb_sklearn

log_wandb_sklearn(logreg, X_train, y_train, X_test, y_test)

              precision    recall  f1-score   support

           0       0.84      0.91      0.87       773
           1       0.68      0.51      0.58       281

    accuracy                           0.80      1054
   macro avg       0.76      0.71      0.73      1054
weighted avg       0.79      0.80      0.79      1054

roc_auc =  0.8495532035375415




# Были добавлены фичи со случайными числами и ещё одна фича комбинация двух других. Была обучена логистическая регрессия. Данные немного не сбалансированы и поэтому на разных классах разные метрики

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [12]:
X_train

Unnamed: 0,ClientPeriod,MonthlySpending,TotalSpent,IsSeniorCitizen,Spending_Change_Ratio,Sex_Male,HasPartner_Yes,HasChild_Yes,HasPhoneService_Yes,HasMultiplePhoneNumbers_No phone service,...,HasMovieSubscription_No internet service,HasMovieSubscription_Yes,HasContractPhone_One year,HasContractPhone_Two year,IsBillingPaperless_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,random_number,random_normal
298,-1.123668,0.671828,-0.829853,0,0.165704,False,False,False,True,False,...,False,True,False,False,True,True,False,False,0.251281,0.460067
68,1.163759,0.486222,1.158731,0,-0.496389,True,True,False,True,False,...,False,True,True,False,True,False,True,False,0.371306,-2.200691
2372,1.000371,0.721544,1.169562,0,-0.491513,True,False,False,True,False,...,False,False,True,False,True,False,True,False,0.645234,-1.472074
2856,-1.001127,-1.502416,-0.947625,0,-0.110247,False,True,True,True,False,...,True,False,True,False,False,False,True,False,0.230859,0.407647
150,-0.674352,-1.475901,-0.872226,0,-0.330613,True,False,False,True,False,...,True,False,False,False,True,True,False,False,0.211078,-1.955710
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3092,-0.470117,1.301563,-0.043815,0,-0.384247,False,True,False,True,False,...,False,True,False,False,True,False,False,False,0.548526,0.375107
3772,1.572228,-1.366526,-0.298031,0,-0.501928,True,True,True,True,False,...,True,False,False,True,False,True,False,False,0.223977,-1.450934
5191,1.082065,1.139158,1.590745,1,-0.494031,False,False,False,True,False,...,False,True,True,False,True,False,True,False,0.581688,0.817437
5226,1.327147,1.004925,1.738397,1,-0.499620,False,False,False,True,False,...,False,False,False,False,True,False,False,False,0.166937,0.257413


In [13]:
from scripts.train import train_and_log

In [14]:
logreg = LogisticRegression()
tree = DecisionTreeClassifier(max_depth = 5)
rf = RandomForestClassifier()
grad_boosting = GradientBoostingClassifier()
knn = KNeighborsClassifier()
lgbm = LGBMClassifier()
xgb = XGBClassifier()
catboost = CatBoostClassifier()


models = [logreg, tree, rf, grad_boosting, knn, lgbm, xgb, catboost ]
names = ['logreg', 'tree', 'rf', 'grad_boosting', 'knn', 'lgbm', 'xgb', 'catboost']
reports = {}
roc_aucs = {}
experiments_dir = '..\configs\experiments'

for idx,( name, model )in enumerate(zip(names, models)):
    
    train_and_log(name, model, X_train, y_train, X_test, y_test, experiments_dir, reports, roc_aucs, )
    break


              precision    recall  f1-score   support

           0       0.84      0.91      0.87       773
           1       0.68      0.51      0.58       281

    accuracy                           0.80      1054
   macro avg       0.76      0.71      0.73      1054
weighted avg       0.79      0.80      0.79      1054

roc_auc = 0.8495532035375415


  experiments_dir = '..\configs\experiments'


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.




✅ Эксперимент сохранен: ..\configs\experiments\exp5.yaml


In [15]:
for name in reports:
    print("=" * 30, "model: ", name ,"=" * 30, '\n')
    print(pd.DataFrame(reports[name]).T)
    print('roc auc score =', roc_aucs[name], '\n')


              precision    recall  f1-score      support
0              0.835308  0.912031  0.871985   773.000000
1              0.676190  0.505338  0.578411   281.000000
accuracy       0.803605  0.803605  0.803605     0.803605
macro avg      0.755749  0.708685  0.725198  1054.000000
weighted avg   0.792887  0.803605  0.793717  1054.000000
roc auc score = 0.8495532035375415 



# Судя по метрикам лучше всех себя показало, что удивительно самая наша первая итерация, а именно логистическая регрессия. Значит с ней и продолжим, но посмотрим на важность фичей у например кэтбуста

In [16]:
# Получаем коэффициенты
coefficients = logreg.coef_[0]
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'coefficient': coefficients,
    'abs_coefficient': np.abs(coefficients)
}).sort_values('abs_coefficient', ascending=False)

print(feature_importance)


                                         feature  coefficient  abs_coefficient
26                     HasContractPhone_Two year    -1.435201         1.435201
11                HasInternetService_Fiber optic     1.131659         1.131659
25                     HasContractPhone_One year    -0.693372         0.693372
0                                   ClientPeriod    -0.603664         0.603664
10                   HasMultiplePhoneNumbers_Yes     0.534457         0.534457
8                            HasPhoneService_Yes    -0.478890         0.478890
24                      HasMovieSubscription_Yes     0.395089         0.395089
4                          Spending_Change_Ratio     0.390278         0.390278
1                                MonthlySpending    -0.379879         0.379879
22                               HasOnlineTV_Yes     0.378988         0.378988
27                        IsBillingPaperless_Yes     0.364149         0.364149
3                                IsSeniorCitizen    

# Сделаем отсечку фичей по порогу 0.2

In [17]:
thresh = 0.2
to_drop = list(feature_importance[feature_importance.abs_coefficient < thresh].feature)


X_train_without_unimportant = X_train.drop(to_drop, axis = 1)
X_test_without_unimportant = X_test.drop(to_drop, axis = 1)



In [18]:
X_test_without_unimportant

Unnamed: 0,ClientPeriod,MonthlySpending,IsSeniorCitizen,Spending_Change_Ratio,HasPhoneService_Yes,HasMultiplePhoneNumbers_No phone service,HasMultiplePhoneNumbers_Yes,HasInternetService_Fiber optic,HasOnlineSecurityService_Yes,HasTechSupportAccess_Yes,HasOnlineTV_Yes,HasMovieSubscription_Yes,HasContractPhone_One year,HasContractPhone_Two year,IsBillingPaperless_Yes,PaymentMethod_Electronic check
4793,0.060892,-0.128599,0,-0.444318,False,True,False,False,True,True,True,True,True,False,False,False
5020,-0.837740,0.295644,0,-0.249847,True,False,True,False,False,True,True,True,True,False,False,False
655,-0.551811,0.812690,0,-0.371823,True,False,True,True,False,False,False,True,False,False,False,False
1874,0.714443,1.475569,0,-0.483757,True,False,True,True,True,True,True,True,False,True,True,True
2318,-1.246209,0.186269,1,1.086832,True,False,False,True,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4154,1.449687,0.658570,0,-0.500266,True,False,False,False,True,True,True,True,False,True,True,False
5085,0.510208,-0.347349,0,-0.477837,True,False,False,False,True,True,False,False,False,True,False,False
295,-1.164515,0.191240,0,0.378367,True,False,False,True,False,False,False,False,False,False,True,False
1751,-0.470117,-0.965484,0,-0.388933,False,True,False,False,False,True,False,False,False,False,True,False


In [19]:
from scripts import config
processed_path = config.get_processed_data_path(version= 'processed_path_processed_path_without_unimportant', path = '../configs/data_config.yaml')

cols = list(X_train_without_unimportant.columns)
cols.append('Churn')
df[cols].to_csv(processed_path)

In [20]:
logreg = LogisticRegression()
train_and_log('logreg_without_unimportant' , logreg, X_train_without_unimportant, y_train, X_test_without_unimportant, y_test, experiments_dir, reports, roc_aucs)




              precision    recall  f1-score   support

           0       0.83      0.91      0.87       773
           1       0.67      0.50      0.57       281

    accuracy                           0.80      1054
   macro avg       0.75      0.71      0.72      1054
weighted avg       0.79      0.80      0.79      1054

roc_auc = 0.8497005243700884




✅ Эксперимент сохранен: ..\configs\experiments\exp6.yaml


In [21]:
print(pd.DataFrame(reports['logreg']).T)
print(roc_aucs['logreg'])

              precision    recall  f1-score      support
0              0.835308  0.912031  0.871985   773.000000
1              0.676190  0.505338  0.578411   281.000000
accuracy       0.803605  0.803605  0.803605     0.803605
macro avg      0.755749  0.708685  0.725198  1054.000000
weighted avg   0.792887  0.803605  0.793717  1054.000000
0.8495532035375415


In [22]:
print(pd.DataFrame(reports['logreg_without_unimportant']).T)
print(roc_aucs['logreg_without_unimportant'])

              precision    recall  f1-score      support
0              0.833333  0.912031  0.870908   773.000000
1              0.673077  0.498221  0.572597   281.000000
accuracy       0.801708  0.801708  0.801708     0.801708
macro avg      0.753205  0.705126  0.721753  1054.000000
weighted avg   0.790608  0.801708  0.791377  1054.000000
0.8497005243700884


#

# roc auc стал совсееееем чуть лучше

In [23]:
logreg.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'deprecated',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [24]:
logregl1 = LogisticRegression(penalty='l1')

logreg3 = LogisticRegression(penalty= 'elasticnet')

train_and_log('logreg_l1', logreg, X_train_without_unimportant, y_train, X_test_without_unimportant, y_test, experiments_dir, reports, roc_aucs)
train_and_log('logreg_elastic', logreg, X_train_without_unimportant, y_train, X_test_without_unimportant, y_test, experiments_dir, reports, roc_aucs)

              precision    recall  f1-score   support

           0       0.83      0.91      0.87       773
           1       0.67      0.50      0.57       281

    accuracy                           0.80      1054
   macro avg       0.75      0.71      0.72      1054
weighted avg       0.79      0.80      0.79      1054

roc_auc = 0.8497005243700884


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.




✅ Эксперимент сохранен: ..\configs\experiments\exp7.yaml
              precision    recall  f1-score   support

           0       0.83      0.91      0.87       773
           1       0.67      0.50      0.57       281

    accuracy                           0.80      1054
   macro avg       0.75      0.71      0.72      1054
weighted avg       0.79      0.80      0.79      1054

roc_auc = 0.8497005243700884




✅ Эксперимент сохранен: ..\configs\experiments\exp8.yaml


# От пенальти не зависит

# class weight не улучшает


In [29]:

logreg = LogisticRegression(class_weight = {0: 1, 1: 3})

train_and_log('logreg_class_weight', logreg, X_train_without_unimportant, y_train, X_test_without_unimportant, y_test, experiments_dir, reports, roc_aucs)

              precision    recall  f1-score   support

           0       0.90      0.74      0.81       773
           1       0.52      0.78      0.63       281

    accuracy                           0.75      1054
   macro avg       0.71      0.76      0.72      1054
weighted avg       0.80      0.75      0.76      1054

roc_auc = 0.8492999958566017




✅ Эксперимент сохранен: ..\configs\experiments\exp12.yaml


In [34]:

logreg = LogisticRegression(C = 2)

train_and_log('logreg_C', logreg, X_train_without_unimportant, y_train, X_test_without_unimportant, y_test, experiments_dir, reports, roc_aucs)

logreg = LogisticRegression(C = 0.5)

train_and_log('logreg_C', logreg, X_train_without_unimportant, y_train, X_test_without_unimportant, y_test, experiments_dir, reports, roc_aucs)

              precision    recall  f1-score   support

           0       0.83      0.91      0.87       773
           1       0.67      0.50      0.57       281

    accuracy                           0.80      1054
   macro avg       0.75      0.71      0.72      1054
weighted avg       0.79      0.80      0.79      1054

roc_auc = 0.849686713042037


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.




✅ Эксперимент сохранен: ..\configs\experiments\exp15.yaml
              precision    recall  f1-score   support

           0       0.83      0.91      0.87       773
           1       0.68      0.50      0.57       281

    accuracy                           0.80      1054
   macro avg       0.75      0.71      0.72      1054
weighted avg       0.79      0.80      0.79      1054

roc_auc = 0.8497051281461054




✅ Эксперимент сохранен: ..\configs\experiments\exp16.yaml


# Лучше не стало Оставим классическую логистическую регрессию без параметорв


In [26]:
wandb.finish()


# Выводы. Были исследованы данные с дисбалансом классов, churn = 1 встречается реже. Были обучены 8 различных моделей. Лучше всех себя показала логистичекая регрессия без параметров. Вероятно улучшений можно достичь стакингом логистической регрессии и ещё каких то моделей.