In [1]:
import pandas as pd
import numpy as np

from catboost import CatBoostClassifier

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

train['completion_date'] = pd.to_datetime(train['completion_date'])
test['completion_date'] = pd.to_datetime(test['completion_date'])

train['dayofweek'] = train['completion_date'].map(lambda x: x.dayofweek)
train['dayofmonth'] = train['completion_date'].map(lambda x: x.day)
train['dayofyear'] = train['completion_date'].map(lambda x: x.dayofyear)

test['dayofweek'] = test['completion_date'].map(lambda x: x.dayofweek)
test['dayofmonth'] = test['completion_date'].map(lambda x: x.day)
test['dayofyear'] = test['completion_date'].map(lambda x: x.dayofyear)

weekly_N = 3
weekly_P = 7
for n in range(1, weekly_N + 1):
    col_name = 'weekly_' + str(n)
    train[col_name + '_sin'] = np.sin((2*np.pi*n*train['dayofweek']) / weekly_P)
    train[col_name + '_cos'] = np.cos((2*np.pi*n*train['dayofweek']) / weekly_P)
    test[col_name + '_sin'] = np.sin((2*np.pi*n*test['dayofweek']) / weekly_P)
    test[col_name + '_cos'] = np.cos((2*np.pi*n*test['dayofweek']) / weekly_P)

monthly_N = 5
monthly_P = 30.5
for n in range(1, monthly_N + 1):
    col_name = 'monthly_' + str(n)
    train[col_name + '_sin'] = np.sin((2*np.pi*n*train['dayofmonth']) / monthly_P)
    train[col_name + '_cos'] = np.cos((2*np.pi*n*train['dayofmonth']) / monthly_P)
    test[col_name + '_sin'] = np.sin((2*np.pi*n*test['dayofmonth']) / monthly_P)
    test[col_name + '_cos'] = np.cos((2*np.pi*n*test['dayofmonth']) / monthly_P)
    
yearly_N = 10
yearly_P = 365.25
for n in range(1, yearly_N + 1):
    col_name = 'yearly_' + str(n)
    train[col_name + '_sin'] = np.sin((2*np.pi*n*train['dayofyear']) / yearly_P)
    train[col_name + '_cos'] = np.cos((2*np.pi*n*train['dayofyear']) / yearly_P)
    test[col_name + '_sin'] = np.sin((2*np.pi*n*test['dayofyear']) / yearly_P)
    test[col_name + '_cos'] = np.cos((2*np.pi*n*test['dayofyear']) / yearly_P)

In [3]:
train['speed_load'] = train['speed_category'] * train['load_category']
train['speed_floor'] = train['speed_category'] * train['floors_category']
train['load_floor'] = train['load_category'] * train['floors_category']
train['load_floor'] = train['load_category'] * train['speed_category'] * train['floors_category']

test['speed_load'] = test['speed_category'] * test['load_category']
test['speed_floor'] = test['speed_category'] * test['floors_category']
test['load_floor'] = test['load_category'] * test['floors_category']
test['load_floor'] = test['load_category'] * test['speed_category'] * test['floors_category']

In [4]:
from catboost import CatBoostClassifier

cat_cols = ['case_id', 'equipment_id', 'action_recommendation_id', 'action_recommendation_type', 
            'action_recommendation_category', 'equipment_area', 'usage_type', 'equipment_category']

model = CatBoostClassifier(cat_features=cat_cols, iterations=400, random_seed=101)

In [5]:
X = train.drop(['feedback', 'completion_date'], axis=1)
y = train['feedback']

In [6]:
model.fit(X, y)

Learning rate set to 0.181354
0:	learn: 0.5362398	total: 466ms	remaining: 3m 5s
1:	learn: 0.4514366	total: 927ms	remaining: 3m 4s
2:	learn: 0.3876858	total: 1.42s	remaining: 3m 8s
3:	learn: 0.3488934	total: 1.83s	remaining: 3m 1s
4:	learn: 0.3209993	total: 2.34s	remaining: 3m 4s
5:	learn: 0.3006956	total: 2.79s	remaining: 3m 3s
6:	learn: 0.2888049	total: 3.3s	remaining: 3m 5s
7:	learn: 0.2798699	total: 3.56s	remaining: 2m 54s
8:	learn: 0.2715114	total: 3.92s	remaining: 2m 50s
9:	learn: 0.2659242	total: 4.23s	remaining: 2m 45s
10:	learn: 0.2617415	total: 4.66s	remaining: 2m 44s
11:	learn: 0.2588999	total: 5.08s	remaining: 2m 44s
12:	learn: 0.2559641	total: 5.58s	remaining: 2m 46s
13:	learn: 0.2535151	total: 6.66s	remaining: 3m 3s
14:	learn: 0.2517659	total: 7.31s	remaining: 3m 7s
15:	learn: 0.2504035	total: 7.89s	remaining: 3m 9s
16:	learn: 0.2493282	total: 8.49s	remaining: 3m 11s
17:	learn: 0.2482870	total: 8.97s	remaining: 3m 10s
18:	learn: 0.2473202	total: 9.32s	remaining: 3m 6s
19:	

157:	learn: 0.2182356	total: 1m 29s	remaining: 2m 16s
158:	learn: 0.2181142	total: 1m 29s	remaining: 2m 16s
159:	learn: 0.2180556	total: 1m 30s	remaining: 2m 15s
160:	learn: 0.2180037	total: 1m 30s	remaining: 2m 14s
161:	learn: 0.2179257	total: 1m 31s	remaining: 2m 13s
162:	learn: 0.2178070	total: 1m 31s	remaining: 2m 13s
163:	learn: 0.2177183	total: 1m 32s	remaining: 2m 12s
164:	learn: 0.2176863	total: 1m 32s	remaining: 2m 11s
165:	learn: 0.2175457	total: 1m 33s	remaining: 2m 11s
166:	learn: 0.2174513	total: 1m 34s	remaining: 2m 11s
167:	learn: 0.2173027	total: 1m 34s	remaining: 2m 10s
168:	learn: 0.2172257	total: 1m 35s	remaining: 2m 10s
169:	learn: 0.2171311	total: 1m 35s	remaining: 2m 9s
170:	learn: 0.2170189	total: 1m 36s	remaining: 2m 8s
171:	learn: 0.2169256	total: 1m 36s	remaining: 2m 8s
172:	learn: 0.2167869	total: 1m 37s	remaining: 2m 7s
173:	learn: 0.2167301	total: 1m 37s	remaining: 2m 6s
174:	learn: 0.2166694	total: 1m 38s	remaining: 2m 6s
175:	learn: 0.2165813	total: 1m 38

311:	learn: 0.2054716	total: 2m 50s	remaining: 48s
312:	learn: 0.2053371	total: 2m 50s	remaining: 47.5s
313:	learn: 0.2053281	total: 2m 51s	remaining: 46.9s
314:	learn: 0.2052852	total: 2m 51s	remaining: 46.4s
315:	learn: 0.2052104	total: 2m 52s	remaining: 45.8s
316:	learn: 0.2051486	total: 2m 52s	remaining: 45.2s
317:	learn: 0.2050771	total: 2m 52s	remaining: 44.6s
318:	learn: 0.2050202	total: 2m 53s	remaining: 44s
319:	learn: 0.2049726	total: 2m 53s	remaining: 43.4s
320:	learn: 0.2049249	total: 2m 54s	remaining: 42.9s
321:	learn: 0.2048187	total: 2m 54s	remaining: 42.3s
322:	learn: 0.2047572	total: 2m 54s	remaining: 41.7s
323:	learn: 0.2046563	total: 2m 55s	remaining: 41.1s
324:	learn: 0.2045682	total: 2m 55s	remaining: 40.6s
325:	learn: 0.2044903	total: 2m 56s	remaining: 40s
326:	learn: 0.2044183	total: 2m 56s	remaining: 39.4s
327:	learn: 0.2043383	total: 2m 56s	remaining: 38.8s
328:	learn: 0.2042881	total: 2m 57s	remaining: 38.3s
329:	learn: 0.2042041	total: 2m 57s	remaining: 37.7s

<catboost.core.CatBoostClassifier at 0x29ef20fc248>

In [7]:
test = test.drop(['completion_date'], axis=1)
mode = test['action_recommendation_id'].mode().values[0]
test['action_recommendation_id'] = test['action_recommendation_id'].fillna(mode)
mode = test['equipment_area'].mode().values[0]
test['equipment_area'] = test['equipment_area'].fillna(mode)

In [8]:
model.save_model('catboost_model')

In [9]:
pred_proba = model.predict_proba(test)[:,1]

In [23]:
pred_class = (pred_proba > 0.45).astype('int')

In [24]:
test = pd.read_csv('test.csv')

In [25]:
submission = test[['case_id', 'action_recommendation_id']]
submission['feedback'] = pred_class

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [28]:
submission.to_csv('submission.csv', index=False)