In [1]:
import pandas as pd
import numpy as np

## Загрузка данных

In [2]:
transactions = pd.read_csv('data/transactions.csv')
customers_gender_train = pd.read_csv('data/customers_gender_train.csv')

## Формирование таблицы

In [3]:
transactions['num_day'] = transactions['tr_datetime'].apply(lambda x: int(x.split(' ')[0]))
transactions['datetime'] = transactions['tr_datetime'].apply(lambda x: x.split(' ')[1])
transactions['hour'] = transactions['datetime'].apply(lambda x: int(x.split(':')[0]))
transactions['week_day'] = transactions['num_day'].apply(lambda x: (x + 4) % 7)
transactions.drop('tr_datetime', axis=1, inplace=True)
transactions.drop('datetime', axis=1, inplace=True)

active_days = transactions.groupby('customer_id').apply(lambda x: (max(x[['num_day']].values) - min(x[['num_day']].values))[0])
active_days = active_days.reset_index()
active_days.columns = ['customer_id', 'active_period']

transactions = pd.merge(transactions, active_days, on='customer_id', how='outer')
del active_days

transactions[:5]

Unnamed: 0,customer_id,mcc_code,tr_type,amount,term_id,num_day,hour,week_day,active_period
0,39026145,4814,1030,-2245.92,,0,10,4,445
1,39026145,6011,7010,56147.89,,1,10,5,445
2,39026145,4829,2330,-56147.89,,1,10,5,445
3,39026145,5499,1010,-1392.47,,1,10,5,445
4,39026145,5499,1010,-920.83,,2,15,6,445


## Формирование выборки

###### Создание выборки

In [4]:
ids = transactions.customer_id.unique()
X = pd.DataFrame(index=ids)
X.reset_index(inplace=True)
X.columns = ['customer_id']

X = pd.merge(X, customers_gender_train, on='customer_id', how='left')
del customers_gender_train

###### Количество трат по дням недели за активный период

In [5]:
tmp = transactions.groupby(['customer_id', 'week_day']).apply(
    lambda x: x.amount.size / (x.active_period.max() + 1)
).unstack().reset_index()

X = pd.merge(X, tmp, on='customer_id', how='left')
del tmp
X[:5]

Unnamed: 0,customer_id,gender,0,1,2,3,4,5,6
0,39026145,1.0,0.320628,0.376682,0.318386,0.318386,0.325112,0.414798,0.2713
1,52220754,1.0,0.287611,0.303097,0.219027,0.252212,0.283186,0.219027,0.188053
2,78029866,,0.354486,0.308534,0.286652,0.286652,0.468271,0.269147,0.133479
3,79780256,1.0,0.038095,0.057143,0.042857,0.040476,0.054762,0.035714,0.045238
4,1585299,1.0,0.269147,0.234136,0.207877,0.229759,0.380744,0.330416,0.218818


###### Промежуток дня, в котором больше всего транзакций

In [6]:
def daylight(df):
    hour = df.hour.mode()
    if hour.size == 0:
        tmp = np.unique(df.hour.values, return_counts=True)
        hour = tmp[0][tmp[1].argmax()]
    else:
        hour = hour[0]

    m, d, e, n = 0, 0, 0, 0
    if 6 <= hour <= 10:
        m = 1
    
    elif 10 < hour <= 18:
        d = 1
    
    elif 18 < hour <= 23:
        e = 1
    
    else:
        n = 1
    
    return pd.Series({'morning': m, 'day': d, 'evening': e, 'night': n})

In [7]:
# tmp = transactions.groupby('customer_id').apply(daylight).reset_index()

# X = pd.merge(X, tmp, on='customer_id', how='left')
# del tmp
# X[:5]

###### Нормированные суммы положительные и отрицательные транзакции

In [8]:
tmp = transactions.groupby('customer_id').apply(
    lambda x: pd.Series({'pos': x[x.amount > 0].amount.sum(), 'neg': np.abs(x[x.amount < 0].amount.sum())})
).reset_index()
tmp.neg = tmp.neg / tmp.neg.max()
tmp.pos = tmp.pos / tmp.pos.max()

X = pd.merge(X, tmp, on='customer_id', how='left')
del tmp
X[:5]

Unnamed: 0,customer_id,gender,0,1,2,3,4,5,6,neg,pos
0,39026145,1.0,0.320628,0.376682,0.318386,0.318386,0.325112,0.414798,0.2713,0.0021,0.000421
1,52220754,1.0,0.287611,0.303097,0.219027,0.252212,0.283186,0.219027,0.188053,0.009795,0.002873
2,78029866,,0.354486,0.308534,0.286652,0.286652,0.468271,0.269147,0.133479,0.002368,0.000939
3,79780256,1.0,0.038095,0.057143,0.042857,0.040476,0.054762,0.035714,0.045238,0.000842,8.3e-05
4,1585299,1.0,0.269147,0.234136,0.207877,0.229759,0.380744,0.330416,0.218818,0.003616,0.001258


###### Количество транзакций по кажому mcc_code и tr_type

In [9]:
tmp = transactions.groupby('customer_id').apply(
    lambda x: x[['mcc_code', 'tr_type']].unstack().value_counts()
).unstack().fillna(0).reset_index()

X = pd.merge(X, tmp, on='customer_id', how='left')
del tmp
X[:5]

Unnamed: 0,customer_id,gender,0,1,2,3,4,5,6,neg,...,8299,8398,8641,8699,8999,9211,9222,9311,9399,9402
0,39026145,1.0,0.320628,0.376682,0.318386,0.318386,0.325112,0.414798,0.2713,0.0021,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,52220754,1.0,0.287611,0.303097,0.219027,0.252212,0.283186,0.219027,0.188053,0.009795,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,78029866,,0.354486,0.308534,0.286652,0.286652,0.468271,0.269147,0.133479,0.002368,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,79780256,1.0,0.038095,0.057143,0.042857,0.040476,0.054762,0.035714,0.045238,0.000842,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1585299,1.0,0.269147,0.234136,0.207877,0.229759,0.380744,0.330416,0.218818,0.003616,...,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0


# Классификация

In [10]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_predict

In [11]:
import xgboost as xgb



In [12]:
X_train = X[~X.gender.isnull()].copy()
X_test = X[X.gender.isnull()].copy()

In [13]:
X_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

In [14]:
X_train, y_train = X_train.values[:, 2:], X_train.values[:, 1]
X_test = X_test.values[:, 2:]

"period of a day" feature didn't work on the public data, so I didn't choose it as a final ans. My bad :'(

In [15]:
param = { 
    'max_depth': 6,
    'gamma': 6,
    'colsample_bytree': 0.1,
    'min_child_weight': 12,
    'subsample': 0.7,
    'n_estimators': 500,
}
# 0.88161715443875743; with a period of a day
# 0.8813132834609162; without a period of a day

In [16]:
clf = xgb.XGBClassifier(**param)

In [17]:
tmp = cross_val_predict(clf, X_train, y_train, cv=10, method='predict_proba')
roc_auc_score(y_train, tmp[:, 1])

0.8813132834609162

## Predict на тестовой выборке

In [18]:
clf = xgb.XGBClassifier(**param)

In [19]:
clf.fit(X_train, y_train, eval_metric='auc')

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.1,
       gamma=6, learning_rate=0.1, max_delta_step=0, max_depth=6,
       min_child_weight=12, missing=None, n_estimators=500, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.7)

In [20]:
result = X[X.gender.isnull()][['customer_id']].copy()
result['gender'] = clf.predict_proba(X_test)[:, 1]

In [21]:
result.sort_values(by='customer_id', inplace=True)

In [22]:
result.to_csv('data/myans.csv', index=False)