In [49]:
# !cp /content/drive/MyDrive/Colab/Data/'Готовые задачи'/Калининград/participants/train/train.csv ./ 
# !cp /content/drive/MyDrive/Colab/Data/'Готовые задачи'/Калининград/participants/test/test.csv ./ 

In [50]:
from google.colab import drive
drive.mount('/content/drive')
train_path = '/content/drive/My Drive/hackathon_tula/train_dataset_train.csv'
test_path = '/content/drive/My Drive/hackathon_tula/test_dataset_test.csv'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [51]:
#Установка catboost
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [52]:
#import необходимых модулей

import pandas as pd
import numpy as np
import math
import statistics as stat
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import seaborn as sns

In [53]:
#Считывание данных в DataFrame 

train = pd.read_csv(
   train_path,
     sep=';', index_col=None,
      dtype={'PATIENT_SEX':str, 'MKB_CODE':str,
             'ADRES':str, 'VISIT_MONTH_YEAR':str,
             'AGE_CATEGORY':str, 'PATIENT_ID_COUNT':int})
test = pd.read_csv(test_path,
                   sep=';', index_col=None,
                   dtype={'PATIENT_SEX':str,
                          'MKB_CODE':str, 'ADRES':str,
                          'VISIT_MONTH_YEAR':str, 'AGE_CATEGORY':str})

In [54]:
train_data = train.copy()
test_data = test.copy()

In [55]:
print('shape of initial train dataset: ', train_data.shape)

shape of initial train dataset:  (2212393, 6)


In [56]:
# transform the date column in train dataset
date_time_list = train_data['VISIT_MONTH_YEAR'].tolist()
year_list = []
month_list = []
for i in date_time_list:
  splt = i.split('.')
  month = int(splt[0])
  month_list.append(month)
  year = int(splt[1])
  year_list.append(year)
month_ser = pd.Series(month_list, index=train_data.index, name='VISIT_MONTH')
year_ser = pd.Series(year_list, index=train_data.index, name='VISIT_YEAR')
exp_train_data = train_data.copy()
exp_train_data.drop('VISIT_MONTH_YEAR', axis=1, inplace=True)
exp_train_data = exp_train_data.merge(month_ser, left_index=True, right_index=True)
exp_train_data = exp_train_data.merge(year_ser, left_index=True, right_index=True)
exp_train_data.sort_values(by=['VISIT_YEAR', 'VISIT_MONTH'], ascending=True, inplace=True)
month_list = exp_train_data['VISIT_MONTH'].tolist()
year_list = exp_train_data['VISIT_YEAR'].tolist()
str_month_list = [str(each) + '.' for each in month_list]
str_year_list = [str(each) for each in year_list]
date_list = []
for i in range(len(str_month_list)):
  str_date = str_month_list[i] + str_year_list[i]
  date_list.append(str_date)
date_series = pd.Series(date_list, index=exp_train_data.index, name='VISIT_MONTH_YEAR')
exp_train_data.drop(['VISIT_MONTH', 'VISIT_YEAR'], axis=1, inplace=True)
f_train_data = exp_train_data.merge(date_series, right_index=True, left_index=True)
print('shape of final train dataset: ', f_train_data.shape)


shape of final train dataset:  (2212393, 6)


In [57]:
#Отделение меток от данных

X = train_data[['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'VISIT_MONTH_YEAR', 'AGE_CATEGORY']]
y = train_data[['PATIENT_ID_COUNT']]

In [58]:
support_x_train = X.copy()
for column in support_x_train.columns:
  support_x_train[column] = support_x_train[column].astype('category').cat.codes
print('shape of supporting dataset: ', support_x_train.shape)

shape of supporting dataset:  (2212393, 5)


In [59]:
# конструирование признаков
X['PATIENT_PERSANALITY'] = support_x_train['PATIENT_SEX'] / len(support_x_train['PATIENT_SEX'].tolist())  + support_x_train['AGE_CATEGORY'] / len(support_x_train['PATIENT_SEX'].tolist()) * 10**5

In [60]:
X['PATIENT_PERSANALITY']

0          0.2260
1          0.0452
2          0.0452
3          0.0452
4          0.0452
            ...  
2212388    0.0452
2212389    0.2260
2212390    0.0452
2212391    0.0452
2212392    0.0452
Name: PATIENT_PERSANALITY, Length: 2212393, dtype: float64

In [61]:
#Разделение на train/test для локального тестирования

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, 
                                                    random_state=234)

In [62]:
#Создание объекта данных Pool, плюсы: возможность указать какие признаки являются категориальными

pool_train = Pool(X_train, y_train, cat_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'VISIT_MONTH_YEAR', 'AGE_CATEGORY'])
pool_test = Pool(X_test, y_test, cat_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'VISIT_MONTH_YEAR', 'AGE_CATEGORY'])

In [36]:
#Объявление CatBoostRegressor и обучение
catboost_params = {
  'iterations': 1200,
 'l2_leaf_reg': 4,
 'random_strength': 1.2,
 'use_best_model': True,
 'random_seed': 234,
 'bagging_temperature': 1,
 'od_type': 'Iter',
 'od_wait': 20,
 'loss_function': 'RMSE',
 'task_type': 'GPU'
}
tuned_model = CatBoostRegressor(**catboost_params)
tuned_model.fit(pool_train,
                eval_set=pool_test)

0:	learn: 59.5029383	test: 59.6923877	best: 59.6923877 (0)	total: 166ms	remaining: 3m 19s
1:	learn: 58.8146137	test: 59.1372561	best: 59.1372561 (1)	total: 281ms	remaining: 2m 48s
2:	learn: 57.8187623	test: 58.1452005	best: 58.1452005 (2)	total: 534ms	remaining: 3m 32s
3:	learn: 57.0223747	test: 57.3583878	best: 57.3583878 (3)	total: 724ms	remaining: 3m 36s
4:	learn: 56.2361014	test: 56.6888569	best: 56.6888569 (4)	total: 945ms	remaining: 3m 45s
5:	learn: 55.5379240	test: 55.7965292	best: 55.7965292 (5)	total: 1.17s	remaining: 3m 52s
6:	learn: 54.9763044	test: 55.3474309	best: 55.3474309 (6)	total: 1.34s	remaining: 3m 48s
7:	learn: 54.2696150	test: 54.7489896	best: 54.7489896 (7)	total: 1.6s	remaining: 3m 58s
8:	learn: 53.4640986	test: 53.9331573	best: 53.9331573 (8)	total: 1.85s	remaining: 4m 4s
9:	learn: 52.9635738	test: 53.5349857	best: 53.5349857 (9)	total: 2.03s	remaining: 4m 1s
10:	learn: 52.4988941	test: 53.1845556	best: 53.1845556 (10)	total: 2.16s	remaining: 3m 53s
11:	learn: 

<catboost.core.CatBoostRegressor at 0x7fafa6b96790>

In [37]:
#Получение ответов модели на тестовой выборке в локальном тестировании 

y_pred = tuned_model.predict(pool_test)

In [38]:
#На локальном тестировании модель выдаёт такой результат

print("Значение метрики R2 на test: ", r2_score(y_test, y_pred))

Значение метрики R2 на test:  0.8111790430536051


In [39]:
support_x_test = test_data.copy()
for column in support_x_test.columns:
  support_x_test[column] = support_x_test[column].astype('category').cat.codes
print('shape of supporting dataset: ', support_x_test.shape)
# конструирование признаков
test_data['PATIENT_PERSANALITY'] = support_x_test['PATIENT_SEX'] / len(support_x_test['PATIENT_SEX'].tolist()) + support_x_test['AGE_CATEGORY'] / len(support_x_test['PATIENT_SEX'].tolist()) * 10**3

shape of supporting dataset:  (39373, 5)


In [40]:
test_data['PATIENT_PERSANALITY']

0        0.025398
1        0.050796
2        0.076194
3        0.126991
4        0.076194
           ...   
39368    0.050822
39369    0.076220
39370    0.127016
39371    0.025424
39372    0.025424
Name: PATIENT_PERSANALITY, Length: 39373, dtype: float64

In [41]:
#Формируем sample_solution. В обучении используется весь train, ответы получаем на test

pool_train_solution = Pool(X, y, cat_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'VISIT_MONTH_YEAR', 'AGE_CATEGORY'])
pool_test_solution = Pool(test_data, cat_features = ['PATIENT_SEX', 'MKB_CODE', 'ADRES', 'VISIT_MONTH_YEAR', 'AGE_CATEGORY'])
tuned_catboost_params = catboost_params = {
  'iterations': 1200 * 1.2,
 'l2_leaf_reg': 4,
 'random_strength': 1.2,
 'use_best_model': False,
 'random_seed': 234,
 'bagging_temperature': 1,
 'od_type': 'Iter',
 'od_wait': 20,
 'loss_function': 'RMSE',
 'task_type': 'GPU'
}

model_solution = CatBoostRegressor(**tuned_catboost_params)
model_solution.fit(pool_train_solution)

0:	learn: 59.5241181	total: 180ms	remaining: 4m 19s
1:	learn: 58.9007775	total: 300ms	remaining: 3m 35s
2:	learn: 57.9264896	total: 458ms	remaining: 3m 39s
3:	learn: 57.1481254	total: 615ms	remaining: 3m 40s
4:	learn: 56.2414985	total: 770ms	remaining: 3m 40s
5:	learn: 55.5078951	total: 927ms	remaining: 3m 41s
6:	learn: 54.6749682	total: 1.09s	remaining: 3m 42s
7:	learn: 53.8778555	total: 1.24s	remaining: 3m 42s
8:	learn: 53.1772602	total: 1.44s	remaining: 3m 48s
9:	learn: 52.5385631	total: 1.6s	remaining: 3m 48s
10:	learn: 51.9003467	total: 1.79s	remaining: 3m 52s
11:	learn: 51.3182395	total: 1.95s	remaining: 3m 51s
12:	learn: 50.6561723	total: 2.1s	remaining: 3m 50s
13:	learn: 50.1049834	total: 2.26s	remaining: 3m 49s
14:	learn: 49.4876267	total: 2.42s	remaining: 3m 49s
15:	learn: 48.9947944	total: 2.58s	remaining: 3m 49s
16:	learn: 48.4138372	total: 2.73s	remaining: 3m 48s
17:	learn: 47.9422190	total: 2.89s	remaining: 3m 48s
18:	learn: 47.5070642	total: 3.05s	remaining: 3m 47s
19:	l

<catboost.core.CatBoostRegressor at 0x7faf8204a110>

In [42]:
#Получение ответов

y_pred_solution = model_solution.predict(pool_test_solution)

In [43]:
#Вот так они выглядят

y_pred_solution.astype(int)

array([10,  9, 10, ..., 10,  8,  9])

In [44]:
min(y_pred_solution.astype(int))

-156

In [45]:
#Формируем sample_solution для отправки на платформу

test['PATIENT_ID_COUNT'] = y_pred_solution.astype(int)

In [46]:
for value in test['PATIENT_ID_COUNT'].tolist():
  if value < 0:
    test['PATIENT_ID_COUNT'].replace(value, 0, inplace=True)

In [47]:
test

Unnamed: 0,PATIENT_SEX,MKB_CODE,ADRES,VISIT_MONTH_YEAR,AGE_CATEGORY,PATIENT_ID_COUNT
0,0,A00,Калининград,04.22,children,10
1,0,A00,Калининград,04.22,elderly,9
2,0,A00,Калининград,04.22,middleage,10
3,0,A00,Калининград,04.22,young,10
4,0,A01,Калининград,04.22,middleage,10
...,...,...,...,...,...,...
39368,1,Z96.6,Балтийск,04.22,elderly,9
39369,1,Z96.6,Гусев,04.22,middleage,9
39370,1,Z96.7,Гусев,04.22,young,10
39371,1,Z98.8,Озерск,04.22,children,8


In [48]:
#Сохраняем в csv файл
 
test.to_csv('sample_solution.csv', sep=';', index=None)