In [1]:
import catboost 
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve

In [3]:
# Load the wine-quality table and discard every row that has at least one missing value.
data = pd.read_csv('.././data/winequalityN.csv', sep=',').dropna()

In [4]:
# Check the number of non-null values per column.
# Every column should report the same non-null count as the number of rows
# (6463 in the recorded run), i.e. no missing values are left after dropna().
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6463 entries, 0 to 6496
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   type                  6463 non-null   object 
 1   fixed acidity         6463 non-null   float64
 2   volatile acidity      6463 non-null   float64
 3   citric acid           6463 non-null   float64
 4   residual sugar        6463 non-null   float64
 5   chlorides             6463 non-null   float64
 6   free sulfur dioxide   6463 non-null   float64
 7   total sulfur dioxide  6463 non-null   float64
 8   density               6463 non-null   float64
 9   pH                    6463 non-null   float64
 10  sulphates             6463 non-null   float64
 11  alcohol               6463 non-null   float64
 12  quality               6463 non-null   int64  
dtypes: float64(11), int64(1), object(1)
memory usage: 706.9+ KB


In [5]:
# Target column to predict
target_col = 'quality'

# Numeric features: names of the float64 columns in alphabetical order, target excluded
num_cols = [
    col
    for col in sorted(data.select_dtypes(include=['float64']).columns.to_list())
    if col != target_col
]

# Categorical features: names of the object-dtype columns in alphabetical order, target excluded
cat_cols = [
    col
    for col in sorted(data.select_dtypes(include=['object']).columns.to_list())
    if col != target_col
]

# Full feature list: numeric names first, categorical names after
feature_cols = [*num_cols, *cat_cols]

In [6]:
# Feature matrix: every column of `data` except the target, in the original column order.
X = data.loc[:, ~data.columns.isin([target_col])]
X

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,white,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8
1,white,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5
2,white,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1
3,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9
4,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9
...,...,...,...,...,...,...,...,...,...,...,...,...
6491,red,6.8,0.620,0.08,1.9,0.068,28.0,38.0,0.99651,3.42,0.82,9.5
6492,red,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5
6494,red,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0
6495,red,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2


In [7]:
# Target: selected through a column mask, so `y` stays a one-column DataFrame (not a Series).
y = data.loc[:, data.columns.isin([target_col])]
y

Unnamed: 0,quality
0,6
1,6
2,6
3,6
4,6
...,...
6491,6
6492,5
6494,6
6495,5


In [13]:
# CatBoost can handle categorical features on its own, so the raw frame is split
# without any manual encoding. Note that train_size=0.99 leaves only 1% of the rows for testing.
X_train_origin, X_test_origin, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.99,
    random_state=42,
)

In [11]:
# Inspect the container type of the training target (a pandas DataFrame in the recorded run).
type(y_train)

pandas.core.frame.DataFrame

In [254]:
# Gradient-boosted classifier with 200 trees. The columns listed in `cat_cols` are
# declared categorical; 'Accuracy' is requested as an extra metric via custom_loss.
model = catboost.CatBoostClassifier(
    n_estimators=200,
    cat_features=cat_cols,
    custom_loss=['Accuracy'],
)

# Fit on the training split; as the last expression, the fitted model is also displayed.
model.fit(X_train_origin, y_train)

Learning rate set to 0.331695
0:	learn: 1.5248635	total: 17.1ms	remaining: 3.39s
1:	learn: 1.3395866	total: 31.7ms	remaining: 3.14s
2:	learn: 1.2395199	total: 48.5ms	remaining: 3.18s
3:	learn: 1.1777848	total: 65.4ms	remaining: 3.21s
4:	learn: 1.1348594	total: 83.2ms	remaining: 3.24s
5:	learn: 1.0945977	total: 98.6ms	remaining: 3.19s
6:	learn: 1.0679357	total: 114ms	remaining: 3.13s
7:	learn: 1.0443271	total: 128ms	remaining: 3.07s
8:	learn: 1.0280243	total: 144ms	remaining: 3.06s
9:	learn: 1.0134532	total: 164ms	remaining: 3.11s
10:	learn: 1.0021272	total: 181ms	remaining: 3.11s
11:	learn: 0.9901078	total: 210ms	remaining: 3.29s
12:	learn: 0.9778035	total: 235ms	remaining: 3.37s
13:	learn: 0.9690484	total: 270ms	remaining: 3.59s
14:	learn: 0.9560613	total: 295ms	remaining: 3.64s
15:	learn: 0.9512865	total: 326ms	remaining: 3.75s
16:	learn: 0.9478803	total: 347ms	remaining: 3.73s
17:	learn: 0.9444769	total: 374ms	remaining: 3.78s
18:	learn: 0.9350106	total: 401ms	remaining: 3.81s
19:	l

<catboost.core.CatBoostClassifier at 0x1318f362fa0>

In [261]:
# Persist the fitted model to a file in the current working directory
# (relative path; presumably CatBoost's binary format, given the .cbm extension).
model.save_model("catboost_classifier_weights.cbm")

In [258]:
# Predict labels for the training split first, then for the held-out split.
y_train_predicted, y_test_predicted = [
    model.predict(features) for features in (X_train_origin, X_test_origin)
]

In [259]:
# The metric computed here is classification accuracy (share of exactly matching labels),
# not ROC AUC, so the results are stored under accurately named variables.
train_accuracy = accuracy_score(y_train, y_train_predicted)
test_accuracy = accuracy_score(y_test, y_test_predicted)

# Backward-compatible aliases: the historical *_auc names are kept so that any
# existing reference to them keeps working. They hold accuracy, not AUC.
train_auc, test_auc = train_accuracy, test_accuracy

In [260]:
# Uncomment to display the (train, test) scores computed in the previous cell:
# train_auc, test_auc