In [1]:
#!pip install pycaret

In [2]:
from lightgbm import LGBMRegressor
from pycaret.classification import *

In [3]:
import os
import seaborn as sns
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

from kaggle.api.kaggle_api_extended import KaggleApi

import warnings

In [4]:
# Игнорирование предупреждений
warnings.filterwarnings('ignore')

In [5]:
root_dir = os.getcwd()

kaggle_data_set = 'arjunbhasin2013/ccdata'
dataset_dir = 'dataset'
dataset_name = 'CC GENERAL.csv'
dataset_path = os.path.join(root_dir, dataset_dir, dataset_name)

separator = f'\n\b{"-" * 60}\n\b'

In [6]:
def load_data_csv(file_path):
    """
    Загрузка данных из CSV файла.
    :param file_path: Путь к CSV файлу.
    :return: DataFrame с загруженными данными.
    """
    try:
        return pd.read_csv(file_path)
    except Exception as e:
        raise Exception(e)

In [7]:
def load_kaggle_data_set(dataset):
    """
    Загрузка датасета из kaggle.

    Важно: Необходимо предварительно получить API Token.
    Переместите файл kaggle.json в папку ~/.kaggle/ (для Linux и macOS)
    или в C:/Users/<Ваше_имя_пользователя>/.kaggle/ (для Windows).
    Если папка .kaggle не существует, нужно ее создать.
    :param dataset:
    :return:
    """

    api = KaggleApi()
    api.authenticate()
    api.dataset_download_files(dataset=dataset,
                               path=f'./{dataset_dir}',
                               force=True,
                               unzip=True)  # Загрузка и распаковка датасета

In [8]:
try:
    if not os.path.exists(dataset_path):
        load_kaggle_data_set(kaggle_data_set)  # Загрузка файла датасета из kaggle если он не загружен
    df = load_data_csv(dataset_path)  # Загрузка файла в датафрейм
except Exception as e:
    raise Exception(f'При выполнении произошла ошибка: {e}')

In [9]:
# Экспорт датасета в Exel если не был создан ранее (для удобства просмотра датасета)
dataset_exel = os.path.join(root_dir, 'dataset.xlsx')
if not os.path.isfile(dataset_exel):
    df.to_excel(dataset_exel, index=False)

In [10]:
df

Unnamed: 0,CUST_ID,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,C10001,40.900749,0.818182,95.40,0.00,95.40,0.000000,0.166667,0.000000,0.083333,0.000000,0,2,1000.0,201.802084,139.509787,0.000000,12
1,C10002,3202.467416,0.909091,0.00,0.00,0.00,6442.945483,0.000000,0.000000,0.000000,0.250000,4,0,7000.0,4103.032597,1072.340217,0.222222,12
2,C10003,2495.148862,1.000000,773.17,773.17,0.00,0.000000,1.000000,1.000000,0.000000,0.000000,0,12,7500.0,622.066742,627.284787,0.000000,12
3,C10004,1666.670542,0.636364,1499.00,1499.00,0.00,205.788017,0.083333,0.083333,0.000000,0.083333,1,1,7500.0,0.000000,,0.000000,12
4,C10005,817.714335,1.000000,16.00,16.00,0.00,0.000000,0.083333,0.083333,0.000000,0.000000,0,1,1200.0,678.334763,244.791237,0.000000,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8945,C19186,28.493517,1.000000,291.12,0.00,291.12,0.000000,1.000000,0.000000,0.833333,0.000000,0,6,1000.0,325.594462,48.886365,0.500000,6
8946,C19187,19.183215,1.000000,300.00,0.00,300.00,0.000000,1.000000,0.000000,0.833333,0.000000,0,6,1000.0,275.861322,,0.000000,6
8947,C19188,23.398673,0.833333,144.40,0.00,144.40,0.000000,0.833333,0.000000,0.666667,0.000000,0,5,1000.0,81.270775,82.418369,0.250000,6
8948,C19189,13.457564,0.833333,0.00,0.00,0.00,36.558778,0.000000,0.000000,0.000000,0.166667,2,0,500.0,52.549959,55.755628,0.250000,6


In [14]:
# Столбец "CUST_ID" - это id покупателя. Он не нужен, поэтому его удаляю.
df = df.drop(['CUST_ID'], axis=1)

KeyError: "['CUST_ID'] not found in axis"

In [16]:
s = setup(df, normalize = True)

Unnamed: 0,Description,Value
0,Session id,1434
1,Target,TENURE
2,Target type,Multiclass
3,Target mapping,"6: 0, 7: 1, 8: 2, 9: 3, 10: 4, 11: 5, 12: 6"
4,Original data shape,"(8950, 17)"
5,Transformed data shape,"(8950, 17)"
6,Transformed train set shape,"(6265, 17)"
7,Transformed test set shape,"(2685, 17)"
8,Numeric features,16
9,Rows with missing values,3.5%


In [17]:
# Сравнение моделей
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.969,0.9939,0.969,0.9696,0.9669,0.8775,0.8836,0.347
gbc,Gradient Boosting Classifier,0.9462,0.0,0.9462,0.9475,0.9408,0.7733,0.7902,0.753
rf,Random Forest Classifier,0.8975,0.9564,0.8975,0.8871,0.8751,0.5054,0.5555,0.06
dt,Decision Tree Classifier,0.8913,0.8241,0.8913,0.8932,0.8913,0.6106,0.6112,0.01
et,Extra Trees Classifier,0.8905,0.9646,0.8905,0.8861,0.8607,0.4331,0.5111,0.035
ridge,Ridge Classifier,0.8474,0.0,0.8474,0.7181,0.7774,0.0,0.0,0.006
dummy,Dummy Classifier,0.8474,0.5,0.8474,0.7181,0.7774,0.0,0.0,0.005
lr,Logistic Regression,0.8472,0.0,0.8472,0.7276,0.7812,0.0374,0.0853,0.168
svm,SVM - Linear Kernel,0.8469,0.0,0.8469,0.7217,0.778,0.0069,0.0182,0.01
lda,Linear Discriminant Analysis,0.8461,0.0,0.8461,0.7287,0.7789,0.0176,0.0467,0.005


In [18]:
lgbm = create_model('lightgbm')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9697,0.9894,0.9697,0.9707,0.9674,0.8805,0.8869
1,0.9681,0.997,0.9681,0.9685,0.9661,0.8749,0.8805
2,0.9697,0.9975,0.9697,0.9707,0.9675,0.8806,0.887
3,0.9745,0.9968,0.9745,0.9744,0.9734,0.9024,0.9053
4,0.9537,0.9868,0.9537,0.9555,0.9499,0.8089,0.8231
5,0.9728,0.9892,0.9728,0.9729,0.9712,0.8942,0.8981
6,0.9696,0.9978,0.9696,0.9707,0.9672,0.8793,0.8858
7,0.9696,0.9971,0.9696,0.9702,0.9672,0.8799,0.8858
8,0.9633,0.9927,0.9633,0.9634,0.9604,0.8523,0.8602
9,0.9792,0.9948,0.9792,0.9788,0.9783,0.9219,0.9236


In [19]:
print(lgbm)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=1434, reg_alpha=0.0, reg_lambda=0.0, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)
