In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
test_df = pd.read_parquet('test.parquet')

In [3]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20000 non-null  int64 
 1   dates   20000 non-null  object
 2   values  20000 non-null  object
dtypes: int64(1), object(2)
memory usage: 468.9+ KB


In [4]:
test_df.shape

(20000, 3)

In [5]:
test_df.head()

Unnamed: 0,id,dates,values
0,6125,"[2016-01-01, 2016-02-01, 2016-03-01, 2016-04-0...","[1.85, -0.04, 0.19, -0.45, -0.75, -0.95, -2.91..."
1,26781,"[2016-01-01, 2016-02-01, 2016-03-01, 2016-04-0...","[-0.41, 0.39, -0.47, -0.9, -1.46, -0.51, 0.51,..."
2,13333,"[2016-06-01, 2016-07-01, 2016-08-01, 2016-09-0...","[-0.29, -1.26, 0.17, -1.22, 0.45, -0.94, 0.16,..."
3,53218,"[2016-01-01, 2016-02-01, 2016-03-01, 2016-04-0...","[-1.47, 1.55, -0.03, 0.57, -0.57, 0.6, 0.27, 1..."
4,84204,"[2016-01-01, 2016-02-01, 2016-03-01, 2016-04-0...","[2.33, 1.39, -1.03, -2.64, 1.89, 1.77, 1.43, 1..."


In [6]:
test_df.isnull().sum()

Unnamed: 0,0
id,0
dates,0
values,0


In [7]:
test_df['dates'] = test_df['dates'].apply(lambda x: [pd.to_datetime(d) for d in x])
test_df.head()

Unnamed: 0,id,dates,values
0,6125,"[2016-01-01 00:00:00, 2016-02-01 00:00:00, 201...","[1.85, -0.04, 0.19, -0.45, -0.75, -0.95, -2.91..."
1,26781,"[2016-01-01 00:00:00, 2016-02-01 00:00:00, 201...","[-0.41, 0.39, -0.47, -0.9, -1.46, -0.51, 0.51,..."
2,13333,"[2016-06-01 00:00:00, 2016-07-01 00:00:00, 201...","[-0.29, -1.26, 0.17, -1.22, 0.45, -0.94, 0.16,..."
3,53218,"[2016-01-01 00:00:00, 2016-02-01 00:00:00, 201...","[-1.47, 1.55, -0.03, 0.57, -0.57, 0.6, 0.27, 1..."
4,84204,"[2016-01-01 00:00:00, 2016-02-01 00:00:00, 201...","[2.33, 1.39, -1.03, -2.64, 1.89, 1.77, 1.43, 1..."


In [8]:
test_expanded = test_df.explode(['dates', 'values'])
test_expanded.head(10)

Unnamed: 0,id,dates,values
0,6125,2016-01-01,1.85
0,6125,2016-02-01,-0.04
0,6125,2016-03-01,0.19
0,6125,2016-04-01,-0.45
0,6125,2016-05-01,-0.75
0,6125,2016-06-01,-0.95
0,6125,2016-07-01,-2.91
0,6125,2016-08-01,-1.54
0,6125,2016-09-01,0.34
0,6125,2016-10-01,3.12


In [9]:
test_expanded.dtypes

Unnamed: 0,0
id,int64
dates,datetime64[ns]
values,object


In [10]:
test_expanded['values'] = pd.to_numeric(test_expanded['values'])

In [11]:
test_expanded.dtypes

Unnamed: 0,0
id,int64
dates,datetime64[ns]
values,float64


In [12]:
test_expanded.isna().sum()

Unnamed: 0,0
id,0
dates,0
values,1079


In [13]:
test_expanded['values'].fillna(test_expanded['values'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_expanded['values'].fillna(test_expanded['values'].mean(), inplace=True)


In [14]:
test_expanded.isna().sum()

Unnamed: 0,0
id,0
dates,0
values,0


In [15]:
import numpy as np
test_expanded['year'] = test_expanded['dates'].dt.year
test_expanded['month'] = test_expanded['dates'].dt.month
test_expanded['day'] = test_expanded['dates'].dt.day
test_expanded['day_of_week'] = test_expanded['dates'].dt.dayofweek
test_expanded['is_weekend'] = np.where(test_expanded['day_of_week'] >= 5, 1, 0)

test_expanded['lag_1'] = test_expanded.groupby('id')['values'].shift(1)
test_expanded['lag_2'] = test_expanded.groupby('id')['values'].shift(2)
test_expanded['lag_3'] = test_expanded.groupby('id')['values'].shift(3)

test_expanded['mean_value'] = test_expanded.groupby('id')['values'].transform('mean')
test_expanded['std_value'] = test_expanded.groupby('id')['values'].transform('std')
test_expanded['min_value'] = test_expanded.groupby('id')['values'].transform('min')
test_expanded['max_value'] = test_expanded.groupby('id')['values'].transform('max')


In [16]:
test_expanded.isna().sum()

Unnamed: 0,0
id,0
dates,0
values,0
year,0
month,0
day,0
day_of_week,0
is_weekend,0
lag_1,20000
lag_2,40000


In [17]:
test_expanded['lag_1'].fillna(method='ffill', inplace=True)
test_expanded['lag_2'].fillna(method='ffill', inplace=True)
test_expanded['lag_3'].fillna(method='ffill', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_expanded['lag_1'].fillna(method='ffill', inplace=True)
  test_expanded['lag_1'].fillna(method='ffill', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_expanded['lag_2'].fillna(method='ffill', inplace=True)
  test_expanded['lag_2'].fillna(method='ffill', in

In [18]:
test_expanded.isna().sum()

Unnamed: 0,0
id,0
dates,0
values,0
year,0
month,0
day,0
day_of_week,0
is_weekend,0
lag_1,1
lag_2,2


In [19]:
test_expanded

Unnamed: 0,id,dates,values,year,month,day,day_of_week,is_weekend,lag_1,lag_2,lag_3,mean_value,std_value,min_value,max_value
0,6125,2016-01-01,1.85,2016,1,1,4,0,,,,0.037895,1.519372,-3.14,3.13
0,6125,2016-02-01,-0.04,2016,2,1,0,0,1.85,,,0.037895,1.519372,-3.14,3.13
0,6125,2016-03-01,0.19,2016,3,1,1,0,-0.04,1.85,,0.037895,1.519372,-3.14,3.13
0,6125,2016-04-01,-0.45,2016,4,1,4,0,0.19,-0.04,1.85,0.037895,1.519372,-3.14,3.13
0,6125,2016-05-01,-0.75,2016,5,1,6,1,-0.45,0.19,-0.04,0.037895,1.519372,-3.14,3.13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19999,73528,2023-02-01,-0.53,2023,2,1,2,0,2.44,0.35,-0.26,0.020667,1.316218,-2.99,2.44
19999,73528,2023-03-01,1.65,2023,3,1,2,0,-0.53,2.44,0.35,0.020667,1.316218,-2.99,2.44
19999,73528,2023-04-01,0.07,2023,4,1,5,1,1.65,-0.53,2.44,0.020667,1.316218,-2.99,2.44
19999,73528,2023-05-01,0.62,2023,5,1,0,0,0.07,1.65,-0.53,0.020667,1.316218,-2.99,2.44


In [24]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [25]:
from catboost import CatBoostClassifier

In [27]:
model = CatBoostClassifier()
model.load_model("catboost_model.cbm")

<catboost.core.CatBoostClassifier at 0x789505dd1030>

In [28]:
ids=test_df['id'].unique()
ids.shape

(20000,)

In [29]:
predictions_proba = model.predict_proba(test_expanded)
#probs для класса 1
proba_class_1 = predictions_proba[:, 1]
test_expanded['score'] = proba_class_1
score_by_id = test_expanded.groupby('id', as_index=False)['score'].mean()
score_by_id['score'] = score_by_id['score'].apply(lambda x: f"{x:.17f}".replace('.', ','))
results = pd.DataFrame({
    'id': ids,
    'score': score_by_id['score']
})

results.head(10)

Unnamed: 0,id,score
0,6125,56698827654244544
1,26781,31633807762000371
2,13333,3220935191895943
3,53218,28542416704241202
4,84204,5704125503781809
5,69997,17024363225304581
6,99301,2014040675498498
7,4361,10931249829953109
8,46607,15309930097961236
9,29836,53035185058211065


In [30]:
results.to_csv('submission.csv', index=False)