<a href="https://colab.research.google.com/github/morrowbord/Spark/blob/main/credit_scoring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Кредитный скоринг
При принятии решения о выдаче кредита или займа учитывается т.н. «Кредитный скоринг» — рейтинг платежеспособности клиента. ИИ на основе модели, которую просчитывает машинное обучение — в ней много параметров — возраст, зарплата, кредитная история, наличие недвижимости, автомобиля, судимости и других признаков, после обработки которых выносится положительное или отрицательное решение

In [1]:
# # Импортируем библиотеки
# %matplotlib inline
# %config InlineBackend.figure_format = 'svg'

# import warnings
# warnings.filterwarnings('ignore')
# import numpy as np
# import pandas as pd   
# import matplotlib.pyplot as plt
# import seaborn as sns
# import itertools

# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score
# from sklearn.linear_model import LogisticRegression
# from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

In [2]:
from pyspark.sql import SparkSession

In [3]:
# from pyspark.sql import functions as F
from pyspark.sql.types import *

In [4]:
spark=SparkSession.builder.appName("scoring").getOrCreate()

# spark.stop()

# Данные:
[скачать](https://drive.google.com/file/d/1MuAyZiIm3b_r-AgQSj78tsRPqZpvv_2s/view?usp=sharing)

**application_record.csv**
*   Feature name	Explanation	Remarks
*   ID	Client number	
*   CODE_GENDER	Gender	
*   FLAG_OWN_CAR	Is there a car	
*   FLAG_OWN_REALTY	Is there a property	
*   CNT_CHILDREN	Number of children	
*   AMT_INCOME_TOTAL	Annual income	
*   NAME_INCOME_TYPE	Income category	
*   NAME_EDUCATION_TYPE	Education level	
*   NAME_FAMILY_STATUS	Marital status	
*   NAME_HOUSING_TYPE	Way of living	
*   DAYS_BIRTH	Birthday	Count backwards from current day (0), -1 means yesterday
*   DAYS_EMPLOYED	Start date of employment	Count backwards from current day(0). If positive, it means the person currently unemployed.
FLAG_MOBIL	Is there a mobile phone	
*   FLAG_WORK_PHONE	Is there a work phone	
*   FLAG_PHONE	Is there a phone	
*   FLAG_EMAIL	Is there an email	
*   OCCUPATION_TYPE	Occupation	
*   CNT_FAM_MEMBERS	Family size	

**credit_record.csv**
*   Feature name	Explanation	Remarks
*   ID	Client number	
*   MONTHS_BALANCE	Record month	The month of the extracted data is the starting point, backwards, 0 is the current month, -1 is the previous month, and so on
*   STATUS	Status	
   *   0: 1-29 days past due
   *   1: 30-59 days past due 
   *   2: 60-89 days overdue 
   *   3: 90-119 days overdue 
   *   4: 120-149 days overdue 
    *   5: Overdue or bad debts, write-offs for more than 150 days
    *   C: paid off that month X: No loan for the month


## Считываем данные

In [5]:
# data = pd.read_csv("application_record.csv", encoding = 'utf-8') 
# record = pd.read_csv("credit_record.csv", encoding = 'utf-8') 

In [6]:
data = spark.read.csv('Archive 2/application_record.csv',inferSchema=True, header=True)

In [7]:
record = spark.read.csv('Archive 2/credit_record.csv',inferSchema=True, header=True)

In [8]:
record.show()

+-------+--------------+------+
|     ID|MONTHS_BALANCE|STATUS|
+-------+--------------+------+
|5001711|             0|     X|
|5001711|            -1|     0|
|5001711|            -2|     0|
|5001711|            -3|     0|
|5001712|             0|     C|
|5001712|            -1|     C|
|5001712|            -2|     C|
|5001712|            -3|     C|
|5001712|            -4|     C|
|5001712|            -5|     C|
|5001712|            -6|     C|
|5001712|            -7|     C|
|5001712|            -8|     C|
|5001712|            -9|     0|
|5001712|           -10|     0|
|5001712|           -11|     0|
|5001712|           -12|     0|
|5001712|           -13|     0|
|5001712|           -14|     0|
|5001712|           -15|     0|
+-------+--------------+------+
only showing top 20 rows



In [9]:
# # Ниже, мы для тех, у кого хоть раз были просрчоки больше 60 дней, ставим в таргет 1.
# # # Добавляем срок кредита к параметрам выдачи кредита
# begin_month = pd.DataFrame(record.groupby(["ID"])["MONTHS_BALANCE"].agg(min) * - 1)
# begin_month = begin_month.rename(columns={'MONTHS_BALANCE':'begin_month'}) 
# new_data = pd.merge(data, begin_month,how="left",on="ID") 

In [10]:
sdf=record.groupBy('id').min('MONTHS_BALANCE')

### умножим на -1 для получения положительных чисел в 'MONTHS_BALANCE'

In [11]:
sdf=sdf.withColumn('begin_month',sdf['min(MONTHS_BALANCE)'] * -1).drop(sdf['min(MONTHS_BALANCE)'])

In [12]:
sdf.show(5)

+-------+-----------+
|     id|begin_month|
+-------+-----------+
|5001812|         22|
|5001849|          8|
|5001921|         19|
|5003338|         33|
|5003386|          7|
+-------+-----------+
only showing top 5 rows



In [13]:
new_data=data.join(sdf,on='id',how="left")

In [14]:
#data test
#new_data.select('id','begin_month').where(new_data['id']=='5008804').show()
new_data.show(5)

+-------+-----------+------------+---------------+------------+----------------+--------------------+--------------------+--------------------+-----------------+----------+-------------+----------+---------------+----------+----------+---------------+---------------+-----------+
|     ID|CODE_GENDER|FLAG_OWN_CAR|FLAG_OWN_REALTY|CNT_CHILDREN|AMT_INCOME_TOTAL|    NAME_INCOME_TYPE| NAME_EDUCATION_TYPE|  NAME_FAMILY_STATUS|NAME_HOUSING_TYPE|DAYS_BIRTH|DAYS_EMPLOYED|FLAG_MOBIL|FLAG_WORK_PHONE|FLAG_PHONE|FLAG_EMAIL|OCCUPATION_TYPE|CNT_FAM_MEMBERS|begin_month|
+-------+-----------+------------+---------------+------------+----------------+--------------------+--------------------+--------------------+-----------------+----------+-------------+----------+---------------+----------+----------+---------------+---------------+-----------+
|5008804|          M|           Y|              Y|           0|        427500.0|             Working|    Higher education|      Civil marriage| Rented apartment

In [15]:
# # # Больше 60, то это просрочка, ставим - Yes, если просрочка есть за срок кредита,то так же ставим Yes
# record['dep_value'] = None
# record['dep_value'][record['STATUS'] == '2'] = 'Yes'
# record['dep_value'][record['STATUS'] == '3'] = 'Yes'
# record['dep_value'][record['STATUS'] == '4'] = 'Yes'
# record['dep_value'][record['STATUS'] == '5'] = 'Yes'
# cpunt = record.groupby('ID').count()
# cpunt['dep_value'][cpunt['dep_value'] > 0] = 'Yes' 
# cpunt['dep_value'][cpunt['dep_value'] == 0] = 'No'

### сделаем фильтрацию по полю STATUS: string

In [16]:
cpunt=record.filter("STATUS=='0' or STATUS=='2' or STATUS=='3' or STATUS=='4' or STATUS=='5'")

In [17]:
# record.filter("STATUS=='0' or STATUS=='2' or STATUS=='3' or STATUS=='4' or STATUS=='5'").groupby('ID').count()

### проверим значения, которые попали в колонку 'STATUS' - от 0 до 5

In [18]:
cpunt.describe().show()

+-------+-----------------+-------------------+-------------------+
|summary|               ID|     MONTHS_BALANCE|             STATUS|
+-------+-----------------+-------------------+-------------------+
|  count|           386224|             386224|             386224|
|   mean|5068504.870748581|-21.842368677244295|0.03120727867765856|
| stddev|45859.31637436183| 15.040049897593708|0.36646649920086827|
|    min|          5001711|                -60|                  0|
|    max|          5150485|                  0|                  5|
+-------+-----------------+-------------------+-------------------+



### проверим значения, которые попали в колонку 'STATUS' - от 0 до 5 и тут не должно быть '1'

In [19]:
cpunt.filter(cpunt["STATUS"]=='1').show()

+---+--------------+------+
| ID|MONTHS_BALANCE|STATUS|
+---+--------------+------+
+---+--------------+------+



In [20]:
# from pyspark.sql import Window
from pyspark.sql import functions as F

In [21]:
# windSpec=Window()\
# .partitionBy('ID')\
# .orderBy('STATUS')

In [22]:
# cpunt=cpunt.groupBy('ID').count()

In [23]:
cpunt.show()

+-------+--------------+------+
|     ID|MONTHS_BALANCE|STATUS|
+-------+--------------+------+
|5001711|            -1|     0|
|5001711|            -2|     0|
|5001711|            -3|     0|
|5001712|            -9|     0|
|5001712|           -10|     0|
|5001712|           -11|     0|
|5001712|           -12|     0|
|5001712|           -13|     0|
|5001712|           -14|     0|
|5001712|           -15|     0|
|5001712|           -16|     0|
|5001712|           -17|     0|
|5001712|           -18|     0|
|5001717|            -5|     0|
|5001717|            -6|     0|
|5001717|            -7|     0|
|5001717|            -8|     0|
|5001717|            -9|     0|
|5001717|           -10|     0|
|5001717|           -11|     0|
+-------+--------------+------+
only showing top 20 rows



In [24]:
cpunt=cpunt.select('ID','STATUS',F\
             .when(cpunt['STATUS']>0,'Yes')\
             .when(cpunt['STATUS']==0,'No')\
             .otherwise('no_data')\
             .alias('dep_value'))

### посмотрим на новую колонку и данные в ней:

In [25]:
cpunt.describe().show()

+-------+-----------------+-------------------+---------+
|summary|               ID|             STATUS|dep_value|
+-------+-----------------+-------------------+---------+
|  count|           386224|             386224|   386224|
|   mean|5068504.870748581|0.03120727867765856|     null|
| stddev|45859.31637436183|0.36646649920086827|     null|
|    min|          5001711|                  0|       No|
|    max|          5150485|                  5|      Yes|
+-------+-----------------+-------------------+---------+



In [26]:
cpunt=cpunt.drop('STATUS')

In [27]:
cpunt=cpunt.select('ID','dep_value').groupby('ID','dep_value').count()

In [28]:
# cpunt=cpunt.select('ID','dep_value').groupby('ID','dep_value').count()

In [29]:
cpunt.describe().show()

+-------+-----------------+---------+-----------------+
|summary|               ID|dep_value|            count|
+-------+-----------------+---------+-----------------+
|  count|            40647|    40647|            40647|
|   mean|5070456.520358206|     null| 9.50190665977809|
| stddev|45352.54866237441|     null|8.130273870026793|
|    min|          5001711|       No|                1|
|    max|          5150485|      Yes|               61|
+-------+-----------------+---------+-----------------+



In [30]:
# # # Джойним всё вместе,заменяем Yes и No на 1 и 0
# cpunt = cpunt[['dep_value']]
# new_data = pd.merge(new_data, cpunt, how='inner', on='ID')
# new_data['target'] = new_data['dep_value']
# new_data.loc[new_data['target'] == 'Yes','target'] = 1
# new_data.loc[new_data['target'] == 'No','target'] = 0

In [31]:
new_data=new_data.join(cpunt,on='ID')

In [32]:
new_data.show(5,vertical=True)

-RECORD 0-----------------------------------
 ID                  | 5009033              
 CODE_GENDER         | F                    
 FLAG_OWN_CAR        | N                    
 FLAG_OWN_REALTY     | N                    
 CNT_CHILDREN        | 0                    
 AMT_INCOME_TOTAL    | 255150.0             
 NAME_INCOME_TYPE    | Pensioner            
 NAME_EDUCATION_TYPE | Incomplete higher    
 NAME_FAMILY_STATUS  | Civil marriage       
 NAME_HOUSING_TYPE   | Rented apartment     
 DAYS_BIRTH          | -18682               
 DAYS_EMPLOYED       | 365243               
 FLAG_MOBIL          | 1                    
 FLAG_WORK_PHONE     | 0                    
 FLAG_PHONE          | 0                    
 FLAG_EMAIL          | 0                    
 OCCUPATION_TYPE     | null                 
 CNT_FAM_MEMBERS     | 2.0                  
 begin_month         | 16                   
 dep_value           | No                   
 count               | 4                    
-RECORD 1-

In [33]:
new_data=new_data.select('*',F\
             .when(new_data['dep_value']=='Yes',1)\
             .when(new_data['dep_value']=='No',0)\
             .otherwise('no_data')\
             .alias('target'))

In [34]:
# #  В итоге к анкетным данным мы добавили таргет
# new_data.head()

In [35]:
new_data.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- CODE_GENDER: string (nullable = true)
 |-- FLAG_OWN_CAR: string (nullable = true)
 |-- FLAG_OWN_REALTY: string (nullable = true)
 |-- CNT_CHILDREN: integer (nullable = true)
 |-- AMT_INCOME_TOTAL: double (nullable = true)
 |-- NAME_INCOME_TYPE: string (nullable = true)
 |-- NAME_EDUCATION_TYPE: string (nullable = true)
 |-- NAME_FAMILY_STATUS: string (nullable = true)
 |-- NAME_HOUSING_TYPE: string (nullable = true)
 |-- DAYS_BIRTH: integer (nullable = true)
 |-- DAYS_EMPLOYED: integer (nullable = true)
 |-- FLAG_MOBIL: integer (nullable = true)
 |-- FLAG_WORK_PHONE: integer (nullable = true)
 |-- FLAG_PHONE: integer (nullable = true)
 |-- FLAG_EMAIL: integer (nullable = true)
 |-- OCCUPATION_TYPE: string (nullable = true)
 |-- CNT_FAM_MEMBERS: double (nullable = true)
 |-- begin_month: integer (nullable = true)
 |-- dep_value: string (nullable = false)
 |-- count: long (nullable = false)
 |-- target: string (nullable = false)



сменим таргет на int

In [36]:
new_data=new_data.withColumn("target", new_data['target'].cast('int'))

In [37]:
# # Упростим себе задачу и оставим только часть признаков
# features = ['AMT_INCOME_TOTAL', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN']	
# target = ['target',]
# dataset = new_data[features + target]
# dataset[target[0]] = pd.to_numeric(dataset[target[0]])

### переведем категориальные фичи в численные

In [38]:
features = ['AMT_INCOME_TOTAL', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN']

In [39]:
from pyspark.ml.feature import StringIndexer

In [40]:
indexer=StringIndexer(inputCols=['CODE_GENDER','FLAG_OWN_CAR', 'FLAG_OWN_REALTY'],outputCols=['CODE_GENDER_index','FLAG_OWN_CAR_index', 'FLAG_OWN_REALTY_index'])

In [41]:
new_features=['CODE_GENDER_index','FLAG_OWN_CAR_index', 'FLAG_OWN_REALTY_index']

In [42]:
dataset=indexer.fit(new_data).transform(new_data)

### добавим Таргет

In [43]:
dataset.show(1)

+-------+-----------+------------+---------------+------------+----------------+--------------------+--------------------+--------------------+-----------------+----------+-------------+----------+---------------+----------+----------+---------------+---------------+-----------+---------+-----+------+-----------------+------------------+---------------------+
|     ID|CODE_GENDER|FLAG_OWN_CAR|FLAG_OWN_REALTY|CNT_CHILDREN|AMT_INCOME_TOTAL|    NAME_INCOME_TYPE| NAME_EDUCATION_TYPE|  NAME_FAMILY_STATUS|NAME_HOUSING_TYPE|DAYS_BIRTH|DAYS_EMPLOYED|FLAG_MOBIL|FLAG_WORK_PHONE|FLAG_PHONE|FLAG_EMAIL|OCCUPATION_TYPE|CNT_FAM_MEMBERS|begin_month|dep_value|count|target|CODE_GENDER_index|FLAG_OWN_CAR_index|FLAG_OWN_REALTY_index|
+-------+-----------+------------+---------------+------------+----------------+--------------------+--------------------+--------------------+-----------------+----------+-------------+----------+---------------+----------+----------+---------------+---------------+---------

In [44]:
dataset=dataset.select(new_features+['target','AMT_INCOME_TOTAL','CNT_CHILDREN'])

### посмотрим что вышло

In [69]:
dataset.filter(dataset['target']>0).count()

616

In [46]:
dataset.columns

['CODE_GENDER_index',
 'FLAG_OWN_CAR_index',
 'FLAG_OWN_REALTY_index',
 'target',
 'AMT_INCOME_TOTAL',
 'CNT_CHILDREN']

In [47]:
# from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler

In [48]:
faetureassembler=VectorAssembler(inputCols=new_features+['AMT_INCOME_TOTAL','CNT_CHILDREN'],outputCol='ind_features')

In [49]:
output=faetureassembler.transform(dataset)

посмотрим список фичей

In [50]:
output.select('ind_features').show(truncate=False)

+--------------------------+
|ind_features              |
+--------------------------+
|[1.0,1.0,0.0,427500.0,0.0]|
|[1.0,1.0,0.0,427500.0,0.0]|
|[1.0,1.0,0.0,112500.0,0.0]|
|(5,[3],[270000.0])        |
|(5,[3],[270000.0])        |
|(5,[3],[270000.0])        |
|(5,[3],[283500.0])        |
|(5,[3],[283500.0])        |
|(5,[3],[283500.0])        |
|[1.0,1.0,0.0,270000.0,0.0]|
|[1.0,1.0,0.0,270000.0,0.0]|
|[1.0,1.0,0.0,135000.0,0.0]|
|[1.0,1.0,0.0,135000.0,0.0]|
|[1.0,1.0,0.0,135000.0,0.0]|
|[1.0,1.0,0.0,135000.0,0.0]|
|[1.0,1.0,0.0,135000.0,0.0]|
|[0.0,1.0,1.0,130500.0,0.0]|
|[0.0,1.0,1.0,130500.0,0.0]|
|(5,[3],[157500.0])        |
|(5,[3],[157500.0])        |
+--------------------------+
only showing top 20 rows



In [51]:
from pyspark.ml.feature import MinMaxScaler, StandardScaler
from pyspark.ml import Pipeline

In [52]:
# MinMaxScaler(inputCol="ind_features", outputCol="scaled")

In [56]:
scaler = MinMaxScaler(inputCol="ind_features", outputCol="scaled")

In [74]:
pipeline = Pipeline(stages=[scaler])

In [75]:
scalerModel = pipeline.fit(output)

In [76]:
scaledData = scalerModel.transform(output)

In [77]:
finalized_data=scaledData.select('ind_features','target')

In [104]:
finalized_data.show(truncate=False)

+--------------------------+------+
|ind_features              |target|
+--------------------------+------+
|[1.0,1.0,0.0,427500.0,0.0]|0     |
|[1.0,1.0,0.0,427500.0,0.0]|0     |
|[1.0,1.0,0.0,112500.0,0.0]|0     |
|(5,[3],[270000.0])        |0     |
|(5,[3],[270000.0])        |0     |
|(5,[3],[270000.0])        |0     |
|(5,[3],[283500.0])        |0     |
|(5,[3],[283500.0])        |0     |
|(5,[3],[283500.0])        |0     |
|[1.0,1.0,0.0,270000.0,0.0]|0     |
|[1.0,1.0,0.0,270000.0,0.0]|0     |
|[1.0,1.0,0.0,135000.0,0.0]|0     |
|[1.0,1.0,0.0,135000.0,0.0]|0     |
|[1.0,1.0,0.0,135000.0,0.0]|0     |
|[1.0,1.0,0.0,135000.0,0.0]|0     |
|[1.0,1.0,0.0,135000.0,0.0]|0     |
|[0.0,1.0,1.0,130500.0,0.0]|0     |
|[0.0,1.0,1.0,130500.0,0.0]|0     |
|(5,[3],[157500.0])        |0     |
|(5,[3],[157500.0])        |0     |
+--------------------------+------+
only showing top 20 rows



У нас есть выборка, где указаны параметры клиента, и вышел ли он на просрочку или нет.

In [None]:
# # Разделим выборку на трейн и тест, на трейн будем обучать модель, на тест валидировать.
# X_train, X_test, y_train, y_test = train_test_split(dataset[features], pd.to_numeric(dataset[target[0]]), test_size=0.3, random_state=42)

In [93]:
train_data,test_data=finalized_data.randomSplit([0.75,0.25])

In [94]:
# # Создадим простейшую модель, которая покажет через линейные коэффиценты связь переменных и таргета
# model = LogisticRegression()
# model.fit(X_train, y_train)

In [95]:
# # Отскалируем численные
# mms = MinMaxScaler()
# mms.fit(X_train[['AMT_INCOME_TOTAL', 'CNT_CHILDREN']])
# X_train_scaled = mms.transform(X_train[['AMT_INCOME_TOTAL', 'CNT_CHILDREN']])
# X_test_scaled = mms.transform(X_test[['AMT_INCOME_TOTAL', 'CNT_CHILDREN']])

# X_train_scaled = pd.DataFrame(X_train_scaled, columns=['AMT_INCOME_TOTAL', 'CNT_CHILDREN'])
# X_test_scaled = pd.DataFrame(X_test_scaled, columns=['AMT_INCOME_TOTAL', 'CNT_CHILDREN'])

#  Модель

In [96]:
from pyspark.ml.classification import LogisticRegression

In [97]:
log_reg=LogisticRegression(featuresCol='ind_features', labelCol='target')

In [98]:
log_reg=log_reg.fit(train_data)

In [99]:
log_reg.coefficients

DenseVector([0.2766, -0.195, 0.3493, 0.0, -0.0635])

In [100]:
pred_test=log_reg.evaluate(test_data)

In [101]:
pred_train=log_reg.evaluate(train_data)

### смотрим предсказание модели

In [123]:
pred_train.predictions.show()

+--------------------+------+--------------------+--------------------+----------+
|        ind_features|target|       rawPrediction|         probability|prediction|
+--------------------+------+--------------------+--------------------+----------+
|(5,[0,3],[1.0,301...|     0|[3.79057359818567...|[0.97791606880206...|       0.0|
|(5,[0,3],[1.0,301...|     0|[3.79057359818567...|[0.97791606880206...|       0.0|
|(5,[0,3],[1.0,315...|     0|[3.79047044551245...|[0.97791384098326...|       0.0|
|(5,[0,3],[1.0,315...|     0|[3.79047044551245...|[0.97791384098326...|       0.0|
|(5,[0,3],[1.0,369...|     0|[3.79005783481955...|[0.97790492751119...|       0.0|
|(5,[0,3],[1.0,369...|     0|[3.79005783481955...|[0.97790492751119...|       0.0|
|(5,[0,3],[1.0,423...|     0|[3.78964522412665...|[0.97789601052315...|       0.0|
|(5,[0,3],[1.0,469...|     0|[3.78929003508852...|[0.97788833166555...|       0.0|
|(5,[0,3],[1.0,469...|     0|[3.78929003508852...|[0.97788833166555...|       0.0|
|(5,

In [103]:
print(f'Точность модели на трейне {pred_train.accuracy}, на тесте {pred_test.accuracy}')

Точность модели на трейне 0.9806872232245366, на тесте 0.9822976437553412


In [None]:
# # Превращаем категориальные факторы в численные
# ohe = OneHotEncoder()
# ohe.fit(X_train[['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']])
# X_train_ohe = ohe.transform(X_train[['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']])
# X_test_ohe = ohe.transform(X_test[['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']])

# X_train_ohe = pd.DataFrame(X_train_ohe.toarray(), columns=[item for sublist in ohe.categories_ for item in sublist])
# X_test_ohe = pd.DataFrame(X_test_ohe.toarray(), columns=[item for sublist in ohe.categories_ for item in sublist])

In [None]:
# X_train = pd.concat([X_train_scaled, X_train_ohe,], axis=1)
# X_test = pd.concat([X_test_scaled, X_test_ohe, ], axis=1)

In [None]:
# train_score, test_score = accuracy_score(model.predict(X_train), y_train), accuracy_score(model.predict(X_test), y_test)
# print(f'Точность модели на трейне {train_score}, на тесте {test_score}')