In [34]:
import numpy as np
import pandas as pd
import seaborn as sns
from datetime import datetime

# Dataset

In [35]:
train_set = pd.read_csv('train_set.csv', delimiter = ';')
train_set.head()

Unnamed: 0,client_id,target
0,75063019,0
1,86227647,1
2,6506523,0
3,50615998,0
4,95213230,0


In [36]:
test_set = pd.read_csv('test_set.csv', delimiter = ';')
test_set.head()

Unnamed: 0,client_id
0,61240380
1,34114030
2,15926722
3,50598019
4,76475250


In [37]:
codes = pd.read_csv('codes.csv', delimiter = ';')
codes.head()

Unnamed: 0,code,code_description
0,5944,"Магазины по продаже часов, ювелирных изделий и..."
1,5621,Готовые сумочные изделия
2,5697,"Услуги по переделке, починке и пошиву одежды"
3,7995,Транзакции по азартным играм
4,5137,"Мужская, женская и детская спец-одежда"


In [38]:
types = pd.read_csv('types.csv', delimiter = ';')
types.head()

Unnamed: 0,type,type_description
0,8001,Установление расх. лимита по карте
1,2411,Перевод с карты на счет др.лица в одном тер. б...
2,4035,н/д(нет данных)
3,3001,Комиссия за обслуживание ссудного счета
4,2420,Перевод с карты на счет физ.лица в другом тер....


In [39]:
transactions = pd.read_csv('transactions.csv', delimiter = ';')
transactions.head()

Unnamed: 0,client_id,datetime,code,type,sum
0,96372458,421 06:33:15,6011,2010,-561478.94
1,24567813,377 17:20:40,6011,7010,67377.47
2,21717441,55 13:38:47,6011,2010,-44918.32
3,14331004,263 12:57:08,6011,2010,-3368873.66
4,85302434,151 10:34:12,4814,1030,-3368.87


# 3. Feature engineering

# Filling missing values

In [40]:
types.loc[types['type_description']=='н/д(нет данных)',['type_description']]=np.nan

In [41]:
types.head()

Unnamed: 0,type,type_description
0,8001,Установление расх. лимита по карте
1,2411,Перевод с карты на счет др.лица в одном тер. б...
2,4035,
3,3001,Комиссия за обслуживание ссудного счета
4,2420,Перевод с карты на счет физ.лица в другом тер....


# Transactions sum

In [42]:
transcactions_sum=transactions.groupby("client_id").sum().drop(columns = ['code','type'])
transcactions_count=aggregated_1 = transactions.groupby("client_id").count().drop(columns = ['code','type', 'datetime']).rename(columns={'sum': 'count_transactions'})
transcactions_sum_count = pd.concat([transcactions_sum, transcactions_count], axis=1, join="outer")
transcactions_sum_count.head()

Unnamed: 0_level_0,sum,count_transactions
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1
22899,50847.54,9
27914,74115.21,4
28753,-2589800.29,13
31385,-83525.38,13
38084,693495.66,26


# Features 

In [43]:
time = transactions.drop(columns = ['code', 'type'])
time['day_number'] = transactions['datetime'].str[:3]
time['time'] = transactions['datetime'].str[-8:].str[:2]
time['time'] = time['time'].astype(int)
time.head()

Unnamed: 0,client_id,datetime,sum,day_number,time
0,96372458,421 06:33:15,-561478.94,421,6
1,24567813,377 17:20:40,67377.47,377,17
2,21717441,55 13:38:47,-44918.32,55,13
3,14331004,263 12:57:08,-3368873.66,263,12
4,85302434,151 10:34:12,-3368.87,151,10


In [44]:
buffer = transactions.groupby("client_id").median().drop(columns = 'sum')
predictors = pd.concat([transcactions_sum_count, buffer], axis=1, join="outer")
predictors = predictors.rename(columns={'sum': 'sum_of_transactions', 'code': 'Code_mode', 'type': 'Type_mode'})
predictors['Code_mode'] = predictors['Code_mode'].astype("category")
predictors['Type_mode'] = predictors['Type_mode'].astype("category")
buffer = time.drop(columns = ['sum', 'day_number', 'datetime']).groupby("client_id").mean()
buffer= buffer.rename(columns={'time': 'hour_mean'})
predictors = pd.merge(predictors, buffer_1, on = 'client_id', how ="left")
buffer = transactions.groupby('client_id')['code'].apply(lambda x: (x == 4814).sum()).reset_index(name='transactions_calls')
predictors = pd.merge(predictors, buffer, on = 'client_id', how ="left")
predictors.head()

Unnamed: 0,client_id,sum_of_transactions,count_transactions,Code_mode,Type_mode,mean_hour,transactions_calls
0,22899,50847.54,9,6010.0,4010.0,13.555556,1
1,27914,74115.21,4,5412.0,4020.0,12.25,2
2,28753,-2589800.29,13,5661.0,1030.0,7.0,3
3,31385,-83525.38,13,5411.0,1030.0,14.538462,4
4,38084,693495.66,26,5411.0,1210.0,13.0,3


# Train dataset after new features

In [45]:
train_set = pd.merge(train_set, predictors, on = 'client_id', how ="left")
train_set

Unnamed: 0,client_id,target,sum_of_transactions,count_transactions,Code_mode,Type_mode,mean_hour,transactions_calls
0,75063019,0,89032.60,29,5411.0,1110.0,15.068966,5
1,86227647,1,-606058.60,27,5411.0,1110.0,11.222222,7
2,6506523,0,2635753.74,53,6010.0,7030.0,13.735849,0
3,50615998,0,-42672.40,7,4829.0,2370.0,14.142857,3
4,95213230,0,214292.66,34,5812.0,1030.0,9.617647,12
...,...,...,...,...,...,...,...,...
5995,71577803,0,-114601.90,5,7230.0,1010.0,10.800000,0
5996,8128547,1,-262708.36,7,6010.0,2010.0,11.000000,2
5997,26055781,0,-42863.31,6,5747.0,1560.0,16.666667,0
5998,73504380,1,-75992.84,19,5411.0,1030.0,10.473684,3
