In [14]:
import pandas as pd
from pathlib import Path
import sys
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Получаем абсолютный путь к src
parent_path = Path.cwd().parent
sys.path.append(str(parent_path))

In [15]:

from sklearn.linear_model import LogisticRegression


# Метрики для оценки классификации
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.metrics import classification_report

# Утилиты для подготовки данных
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler

In [16]:
from scripts.config import get_processed_data_path
from scripts.config import get_seed


SEED = int(get_seed('../configs/data_config.yaml'))

In [17]:
df = pd.read_csv(get_processed_data_path(version= 0, path = '../configs/data_config.yaml'))
df.drop(['Unnamed: 0'], axis = 1, inplace= True)

In [18]:
SEED

42

In [19]:


X = df.drop(['Churn'], axis = 1)
y = df.Churn


test_size = 0.2


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= test_size, random_state=SEED)
scaler = StandardScaler()
scale_columns = ['ClientPeriod', 'MonthlySpending', 'TotalSpent', 'Spending_Change_Ratio']

X_train[scale_columns] = scaler.fit_transform(X_train[scale_columns])
X_test[scale_columns] = scaler.transform(X_test[scale_columns])

In [20]:
X_train.shape, X_test.shape

((4214, 33), (1054, 33))

# Бэйзлайн на логистической регрессии


In [21]:
X_train

Unnamed: 0,ClientPeriod,MonthlySpending,TotalSpent,IsSeniorCitizen,Spending_Change_Ratio,Sex_Male,HasPartner_Yes,HasChild_Yes,HasPhoneService_Yes,HasMultiplePhoneNumbers_No phone service,...,HasMovieSubscription_No internet service,HasMovieSubscription_Yes,HasContractPhone_One year,HasContractPhone_Two year,IsBillingPaperless_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,random_number,random_normal
298,-1.123668,0.671828,-0.829853,0,0.165704,False,False,False,True,False,...,False,True,False,False,True,True,False,False,0.282700,-1.603129
68,1.163759,0.486222,1.158731,0,-0.496389,True,True,False,True,False,...,False,True,True,False,True,False,True,False,0.966673,1.213148
2372,1.000371,0.721544,1.169562,0,-0.491513,True,False,False,True,False,...,False,False,True,False,True,False,True,False,0.282253,1.635603
2856,-1.001127,-1.502416,-0.947625,0,-0.110247,False,True,True,True,False,...,True,False,True,False,False,False,True,False,0.590101,-0.838572
150,-0.674352,-1.475901,-0.872226,0,-0.330613,True,False,False,True,False,...,True,False,False,False,True,True,False,False,0.328831,0.650161
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3092,-0.470117,1.301563,-0.043815,0,-0.384247,False,True,False,True,False,...,False,True,False,False,True,False,False,False,0.748672,0.619844
3772,1.572228,-1.366526,-0.298031,0,-0.501928,True,True,True,True,False,...,True,False,False,True,False,True,False,False,0.561232,-0.451321
5191,1.082065,1.139158,1.590745,1,-0.494031,False,False,False,True,False,...,False,True,True,False,True,False,True,False,0.689110,-1.400315
5226,1.327147,1.004925,1.738397,1,-0.499620,False,False,False,True,False,...,False,False,False,False,True,False,False,False,0.216336,0.546769


# Вход в wandb

In [22]:
import os
from dotenv import load_dotenv
import wandb

load_dotenv()  

api_key = os.getenv('WANDB_API_KEY')
if api_key:
    wandb.login(key=api_key)
    print("Successfully logged in to W&B")
else:
    print("Please set WANDB_API_KEY in .env file")

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\user\_netrc
[34m[1mwandb[0m: Currently logged in as: [33msutormin-p[0m ([33msutormin[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Successfully logged in to W&B


In [23]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
 
print(classification_report(y_pred = y_pred, y_true = y_test))
print('roc_auc = ', roc_auc_score(y_test, logreg.predict_proba(X_test)[:,1]))


from scripts.train import log_wandb_sklearn

log_wandb_sklearn(logreg, X_train, y_train, X_test, y_test)

  def quick_save_model(model, feature_names=None, metrics=None,  save_dir= '..\configs\experiments'):
  def quick_load_model(exp_name,  save_dir= '..\configs\experiments'):


              precision    recall  f1-score   support

           0       0.83      0.91      0.87       773
           1       0.67      0.50      0.57       281

    accuracy                           0.80      1054
   macro avg       0.75      0.71      0.72      1054
weighted avg       0.79      0.80      0.79      1054

roc_auc =  0.8495854299696611




# Были добавлены фичи со случайными числами и ещё одна фича комбинация двух других. Была обучена логистическая регрессия. Данные немного не сбалансированы и поэтому на разных классах разные метрики

In [24]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [25]:
X_train

Unnamed: 0,ClientPeriod,MonthlySpending,TotalSpent,IsSeniorCitizen,Spending_Change_Ratio,Sex_Male,HasPartner_Yes,HasChild_Yes,HasPhoneService_Yes,HasMultiplePhoneNumbers_No phone service,...,HasMovieSubscription_No internet service,HasMovieSubscription_Yes,HasContractPhone_One year,HasContractPhone_Two year,IsBillingPaperless_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,random_number,random_normal
298,-1.123668,0.671828,-0.829853,0,0.165704,False,False,False,True,False,...,False,True,False,False,True,True,False,False,0.282700,-1.603129
68,1.163759,0.486222,1.158731,0,-0.496389,True,True,False,True,False,...,False,True,True,False,True,False,True,False,0.966673,1.213148
2372,1.000371,0.721544,1.169562,0,-0.491513,True,False,False,True,False,...,False,False,True,False,True,False,True,False,0.282253,1.635603
2856,-1.001127,-1.502416,-0.947625,0,-0.110247,False,True,True,True,False,...,True,False,True,False,False,False,True,False,0.590101,-0.838572
150,-0.674352,-1.475901,-0.872226,0,-0.330613,True,False,False,True,False,...,True,False,False,False,True,True,False,False,0.328831,0.650161
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3092,-0.470117,1.301563,-0.043815,0,-0.384247,False,True,False,True,False,...,False,True,False,False,True,False,False,False,0.748672,0.619844
3772,1.572228,-1.366526,-0.298031,0,-0.501928,True,True,True,True,False,...,True,False,False,True,False,True,False,False,0.561232,-0.451321
5191,1.082065,1.139158,1.590745,1,-0.494031,False,False,False,True,False,...,False,True,True,False,True,False,True,False,0.689110,-1.400315
5226,1.327147,1.004925,1.738397,1,-0.499620,False,False,False,True,False,...,False,False,False,False,True,False,False,False,0.216336,0.546769


In [26]:
from scripts.train import train_and_log

In [27]:
logreg = LogisticRegression()
tree = DecisionTreeClassifier(max_depth = 5)
rf = RandomForestClassifier()
grad_boosting = GradientBoostingClassifier()
knn = KNeighborsClassifier()
lgbm = LGBMClassifier()
xgb = XGBClassifier()
catboost = CatBoostClassifier()


models = [logreg, tree, rf, grad_boosting, knn, lgbm, xgb, catboost ]
names = ['logreg', 'tree', 'rf', 'grad_boosting', 'knn', 'lgbm', 'xgb', 'catboost']
reports = {}
roc_aucs = {}


for idx,( name, model )in enumerate(zip(names, models)):
    
    train_and_log(name, model, X_train, y_train, X_test, y_test, reports, roc_aucs)
    break


              precision    recall  f1-score   support

           0       0.83      0.91      0.87       773
           1       0.67      0.50      0.57       281

    accuracy                           0.80      1054
   macro avg       0.75      0.71      0.72      1054
weighted avg       0.79      0.80      0.79      1054

roc_auc = 0.8495854299696611


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.




In [28]:
for name in reports:
    print("=" * 30, "model: ", name ,"=" * 30, '\n')
    print(pd.DataFrame(reports[name]).T)
    print('roc auc score =', roc_aucs[name], '\n')


              precision    recall  f1-score      support
0              0.833926  0.909444  0.870050   773.000000
1              0.668246  0.501779  0.573171   281.000000
accuracy       0.800759  0.800759  0.800759     0.800759
macro avg      0.751086  0.705612  0.721610  1054.000000
weighted avg   0.789756  0.800759  0.790901  1054.000000
roc auc score = 0.8495854299696611 



# Судя по метрикам лучше всех себя показало, что удивительно самая наша первая итерация, а именно логистическая регрессия. Значит с ней и продолжим, но посмотрим на важность фичей у например кэтбуста

In [29]:
# Получаем коэффициенты
coefficients = logreg.coef_[0]
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'coefficient': coefficients,
    'abs_coefficient': np.abs(coefficients)
}).sort_values('abs_coefficient', ascending=False)

print(feature_importance)


                                         feature  coefficient  abs_coefficient
26                     HasContractPhone_Two year    -1.428417         1.428417
11                HasInternetService_Fiber optic     1.138661         1.138661
25                     HasContractPhone_One year    -0.692482         0.692482
0                                   ClientPeriod    -0.602351         0.602351
10                   HasMultiplePhoneNumbers_Yes     0.534925         0.534925
8                            HasPhoneService_Yes    -0.461802         0.461802
24                      HasMovieSubscription_Yes     0.395730         0.395730
1                                MonthlySpending    -0.388966         0.388966
4                          Spending_Change_Ratio     0.387696         0.387696
22                               HasOnlineTV_Yes     0.381997         0.381997
27                        IsBillingPaperless_Yes     0.371116         0.371116
3                                IsSeniorCitizen    

# Сделаем отсечку фичей по порогу 0.2

In [30]:
thresh = 0.2
to_drop = list(feature_importance[feature_importance.abs_coefficient < thresh].feature)


X_train_without_unimportant = X_train.drop(to_drop, axis = 1)
X_test_without_unimportant = X_test.drop(to_drop, axis = 1)



In [31]:
X_test_without_unimportant

Unnamed: 0,ClientPeriod,MonthlySpending,IsSeniorCitizen,Spending_Change_Ratio,HasPhoneService_Yes,HasMultiplePhoneNumbers_No phone service,HasMultiplePhoneNumbers_Yes,HasInternetService_Fiber optic,HasOnlineSecurityService_Yes,HasTechSupportAccess_Yes,HasOnlineTV_Yes,HasMovieSubscription_Yes,HasContractPhone_One year,HasContractPhone_Two year,IsBillingPaperless_Yes,PaymentMethod_Electronic check
4793,0.060892,-0.128599,0,-0.444318,False,True,False,False,True,True,True,True,True,False,False,False
5020,-0.837740,0.295644,0,-0.249847,True,False,True,False,False,True,True,True,True,False,False,False
655,-0.551811,0.812690,0,-0.371823,True,False,True,True,False,False,False,True,False,False,False,False
1874,0.714443,1.475569,0,-0.483757,True,False,True,True,True,True,True,True,False,True,True,True
2318,-1.246209,0.186269,1,1.086832,True,False,False,True,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4154,1.449687,0.658570,0,-0.500266,True,False,False,False,True,True,True,True,False,True,True,False
5085,0.510208,-0.347349,0,-0.477837,True,False,False,False,True,True,False,False,False,True,False,False
295,-1.164515,0.191240,0,0.378367,True,False,False,True,False,False,False,False,False,False,True,False
1751,-0.470117,-0.965484,0,-0.388933,False,True,False,False,False,True,False,False,False,False,True,False


In [33]:
from scripts import config
processed_path = config.get_processed_data_path(version= 'processed_path_processed_path_without_unimportant', path = '../configs/data_config.yaml')

cols = list(X_train_without_unimportant.columns)
cols.append('Churn')
df[cols].to_csv(processed_path)

In [34]:
logreg = LogisticRegression()
train_and_log('logreg_without_unimportant' , logreg, X_train_without_unimportant, y_train, X_test_without_unimportant, y_test, reports, roc_aucs)




              precision    recall  f1-score   support

           0       0.83      0.91      0.87       773
           1       0.67      0.50      0.57       281

    accuracy                           0.80      1054
   macro avg       0.75      0.71      0.72      1054
weighted avg       0.79      0.80      0.79      1054

roc_auc = 0.8497005243700884




In [35]:
print(pd.DataFrame(reports['logreg']).T)
print(roc_aucs['logreg'])

              precision    recall  f1-score      support
0              0.833926  0.909444  0.870050   773.000000
1              0.668246  0.501779  0.573171   281.000000
accuracy       0.800759  0.800759  0.800759     0.800759
macro avg      0.751086  0.705612  0.721610  1054.000000
weighted avg   0.789756  0.800759  0.790901  1054.000000
0.8495854299696611


In [36]:
print(pd.DataFrame(reports['logreg_without_unimportant']).T)
print(roc_aucs['logreg_without_unimportant'])

              precision    recall  f1-score      support
0              0.833333  0.912031  0.870908   773.000000
1              0.673077  0.498221  0.572597   281.000000
accuracy       0.801708  0.801708  0.801708     0.801708
macro avg      0.753205  0.705126  0.721753  1054.000000
weighted avg   0.790608  0.801708  0.791377  1054.000000
0.8497005243700884


#

# roc auc стал совсееееем чуть лучше

In [37]:
logreg.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'deprecated',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [38]:
logregl1 = LogisticRegression(penalty='l1')

logreg3 = LogisticRegression(penalty= 'elasticnet')

train_and_log('logreg_l1', logreg, X_train_without_unimportant, y_train, X_test_without_unimportant, y_test, reports, roc_aucs)
train_and_log('logreg_elastic', logreg, X_train_without_unimportant, y_train, X_test_without_unimportant, y_test, reports, roc_aucs)

              precision    recall  f1-score   support

           0       0.83      0.91      0.87       773
           1       0.67      0.50      0.57       281

    accuracy                           0.80      1054
   macro avg       0.75      0.71      0.72      1054
weighted avg       0.79      0.80      0.79      1054

roc_auc = 0.8497005243700884


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.




              precision    recall  f1-score   support

           0       0.83      0.91      0.87       773
           1       0.67      0.50      0.57       281

    accuracy                           0.80      1054
   macro avg       0.75      0.71      0.72      1054
weighted avg       0.79      0.80      0.79      1054

roc_auc = 0.8497005243700884




# От пенальти не зависит

# class weight тоже не влияет


In [40]:

LogisticRegression(class_weight = {0: 1, 1: 10})

train_and_log('logreg_l1', logreg, X_train_without_unimportant, y_train, X_test_without_unimportant, y_test, reports, roc_aucs)

              precision    recall  f1-score   support

           0       0.83      0.91      0.87       773
           1       0.67      0.50      0.57       281

    accuracy                           0.80      1054
   macro avg       0.75      0.71      0.72      1054
weighted avg       0.79      0.80      0.79      1054

roc_auc = 0.8497005243700884




In [41]:
wandb.finish()


[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.
