In [1]:
#!pip install pycaret

In [2]:
from pycaret.classification import *

In [3]:
import os
import seaborn as sns
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

from kaggle.api.kaggle_api_extended import KaggleApi

import warnings

In [4]:
# Suppress warnings for the rest of the session.
# NOTE(review): no category or module is given, so this filter matches every
# warning, including deprecation notices from the imported libraries.
warnings.filterwarnings('ignore')

In [5]:
# --- Dataset identity -------------------------------------------------------
# Dataset identifier handed to the Kaggle API, and the local folder / file
# name under which the CSV is expected.
kaggle_data_set = 'arjunbhasin2013/ccdata'
dataset_dir = 'dataset'
dataset_name = 'CC GENERAL.csv'

# --- Paths ------------------------------------------------------------------
# Everything is anchored at the working directory the kernel was started in.
root_dir = os.getcwd()
dataset_path = os.path.join(root_dir, dataset_dir, dataset_name)

# --- Output formatting ------------------------------------------------------
# 60 dashes wrapped in newline + '\b' on both sides.
# NOTE(review): '\b' is the backspace control character - confirm it is intended.
separator = f'\n\b{"-" * 60}\n\b'

In [6]:
def load_data_csv(file_path):
    """
    Load a CSV file into a pandas DataFrame.

    Errors raised by ``pd.read_csv`` propagate unchanged, so callers can tell
    failures apart by exception type (e.g. ``FileNotFoundError`` for a missing
    file) instead of receiving a generic ``Exception``.

    :param file_path: Path to the CSV file.
    :return: DataFrame with the loaded data.
    :raises Exception: Whatever ``pd.read_csv`` raises, with its original type
        and traceback.
    """
    return pd.read_csv(file_path)

In [7]:
def load_kaggle_data_set(dataset, path=None):
    """
    Download a dataset from Kaggle and unzip it into a local directory.

    Important: a Kaggle API token must be obtained beforehand.
    Move the kaggle.json file to ~/.kaggle/ (Linux and macOS)
    or to C:/Users/<your_user_name>/.kaggle/ (Windows).
    If the .kaggle folder does not exist, create it.

    :param dataset: Dataset identifier passed to the Kaggle API.
    :param path: Directory to download into. When omitted, ``./<dataset_dir>``
        is used, where ``dataset_dir`` is the notebook-level setting.
    :return: None.
    """
    if path is None:
        # Default target: the notebook-wide dataset folder, relative to the
        # current working directory.
        path = f'./{dataset_dir}'

    api = KaggleApi()
    api.authenticate()
    # Download and unzip the dataset
    api.dataset_download_files(dataset=dataset,
                               path=path,
                               force=True,
                               unzip=True)

In [8]:
try:
    if not os.path.exists(dataset_path):
        # Fetch the dataset from Kaggle only when the CSV is not on disk yet.
        load_kaggle_data_set(kaggle_data_set)
    df = load_data_csv(dataset_path)  # Read the CSV into a DataFrame
except Exception as e:
    # Chain explicitly so the traceback reports the original error as the
    # direct cause of this one.
    raise Exception(f'При выполнении произошла ошибка: {e}') from e

In [9]:
# Export the dataset to Excel for convenient viewing; skipped when the
# workbook has already been created.
dataset_exel = os.path.join(root_dir, 'dataset.xlsx')
excel_already_exported = os.path.isfile(dataset_exel)
if not excel_already_exported:
    df.to_excel(dataset_exel, index=False)

In [10]:
# Show the loaded DataFrame as the cell output.
df

Unnamed: 0,CUST_ID,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,C10001,40.900749,0.818182,95.40,0.00,95.40,0.000000,0.166667,0.000000,0.083333,0.000000,0,2,1000.0,201.802084,139.509787,0.000000,12
1,C10002,3202.467416,0.909091,0.00,0.00,0.00,6442.945483,0.000000,0.000000,0.000000,0.250000,4,0,7000.0,4103.032597,1072.340217,0.222222,12
2,C10003,2495.148862,1.000000,773.17,773.17,0.00,0.000000,1.000000,1.000000,0.000000,0.000000,0,12,7500.0,622.066742,627.284787,0.000000,12
3,C10004,1666.670542,0.636364,1499.00,1499.00,0.00,205.788017,0.083333,0.083333,0.000000,0.083333,1,1,7500.0,0.000000,,0.000000,12
4,C10005,817.714335,1.000000,16.00,16.00,0.00,0.000000,0.083333,0.083333,0.000000,0.000000,0,1,1200.0,678.334763,244.791237,0.000000,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8945,C19186,28.493517,1.000000,291.12,0.00,291.12,0.000000,1.000000,0.000000,0.833333,0.000000,0,6,1000.0,325.594462,48.886365,0.500000,6
8946,C19187,19.183215,1.000000,300.00,0.00,300.00,0.000000,1.000000,0.000000,0.833333,0.000000,0,6,1000.0,275.861322,,0.000000,6
8947,C19188,23.398673,0.833333,144.40,0.00,144.40,0.000000,0.833333,0.000000,0.666667,0.000000,0,5,1000.0,81.270775,82.418369,0.250000,6
8948,C19189,13.457564,0.833333,0.00,0.00,0.00,36.558778,0.000000,0.000000,0.000000,0.166667,2,0,500.0,52.549959,55.755628,0.250000,6


In [11]:
# The "CUST_ID" column is the customer id. It is not needed, so it is removed.
# errors='ignore' keeps the cell re-runnable: once the column is gone, a second
# execution is a no-op instead of raising KeyError.
df = df.drop(columns=['CUST_ID'], errors='ignore')

In [12]:
# Initialise the PyCaret classification experiment on the prepared frame.
# - target is named explicitly instead of being picked implicitly
#   (the recorded run reports TENURE as the target).
# - session_id fixes the random seed to the value reported by the recorded run,
#   so a fresh "Restart & Run All" reproduces the same split and scores.
s = setup(df, target='TENURE', normalize=True, session_id=6451)

Unnamed: 0,Description,Value
0,Session id,6451
1,Target,TENURE
2,Target type,Multiclass
3,Target mapping,"6: 0, 7: 1, 8: 2, 9: 3, 10: 4, 11: 5, 12: 6"
4,Original data shape,"(8950, 17)"
5,Transformed data shape,"(8950, 17)"
6,Transformed train set shape,"(6265, 17)"
7,Transformed test set shape,"(2685, 17)"
8,Numeric features,16
9,Rows with missing values,3.5%


In [13]:
# Model comparison; the value returned by compare_models() is kept in `best`.
best = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9676,0.9955,0.9676,0.9677,0.9651,0.8721,0.8779,0.423
gbc,Gradient Boosting Classifier,0.9456,0.0,0.9456,0.9457,0.9401,0.7732,0.7877,0.786
rf,Random Forest Classifier,0.8993,0.96,0.8993,0.8868,0.8777,0.5188,0.5653,0.07
dt,Decision Tree Classifier,0.8982,0.8409,0.8982,0.9003,0.8978,0.6343,0.6348,0.01
et,Extra Trees Classifier,0.8907,0.9668,0.8907,0.8868,0.8618,0.4394,0.513,0.038
lr,Logistic Regression,0.8498,0.0,0.8498,0.7337,0.7856,0.0661,0.1426,0.243
svm,SVM - Linear Kernel,0.8477,0.0,0.8477,0.7259,0.7787,0.0089,0.0401,0.014
ridge,Ridge Classifier,0.8474,0.0,0.8474,0.7181,0.7774,0.0,0.0,0.006
lda,Linear Discriminant Analysis,0.8474,0.0,0.8474,0.731,0.7822,0.0428,0.1008,0.006
dummy,Dummy Classifier,0.8474,0.5,0.8474,0.7181,0.7774,0.0,0.0,0.005
