In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import *
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read the public train and test transaction files and stack them into a single
# frame so the feature engineering below is applied to both in the same way.
train_df = pd.read_csv("data/public_train_trx.csv")
test_df = pd.read_csv("data/public_test_trx.csv")
# Pass `sort` explicitly: older pandas sorted the columns (with a FutureWarning)
# when the two frames' columns were not aligned, newer pandas does not.
# sort=True pins the column order of the original run on every pandas version.
alldf = pd.concat([train_df, test_df], sort=True)
# Order the click stream within each session, then rebuild a clean 0..n-1 index.
alldf = alldf.sort_values(['session_id','duration_of_session','click_num'])
alldf = alldf.reset_index(drop=True)
# Name of the label column used by the rest of the notebook.
target = 'TARGET_successful_purchase'

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


In [3]:
# Every missing value in the combined frame is replaced by zero.
alldf = alldf.fillna(0)
# alldf['lifetime_customer_account'] = alldf['lifetime_customer_account'].replace(-1, 0)

# Columns to aggregate per session: everything except the bookkeeping columns
# below. list.remove raises ValueError if one of them is unexpectedly absent.
aggregalando_valtozok = alldf.columns.tolist()
for excluded_column in ('test_or_train_flag',
                        target,
                        'start_date_of_session',
                        'start_time_of_session',
                        'session_id'):
    aggregalando_valtozok.remove(excluded_column)

In [4]:
# One row per session. The first three columns are session_id, the target and
# the train/test flag (each reduced with 'min' over the session's rows).
cust_df = alldf.groupby('session_id', as_index=False).agg({target: 'min',
                                                           'test_or_train_flag': 'min'})

# Append "<aggregation>_<column>" features for every aggregated column.
for aggregalos_modszer in ['min', 'max', 'mean']:
    # Rename by column *name* instead of assigning a positional list of labels,
    # so a feature can never be mislabelled if the aggregation output order
    # differs from the order of aggregalando_valtozok.
    new_names = {v: f"{aggregalos_modszer}_{v}" for v in aggregalando_valtozok}
    stat = (alldf.groupby('session_id', as_index=False)[aggregalando_valtozok]
                 .agg(aggregalos_modszer)
                 .rename(columns=new_names))
    cust_df = cust_df.merge(stat, on='session_id', how='left')

In [5]:
# Per-duration rates: session maxima divided by the maximum session duration.
cust_df['click_per_durr'] = cust_df['max_click_num'] / cust_df['max_duration_of_session']
cust_df['vis_price_per_durr'] = cust_df['max_sum_price_of_visited_products'] / cust_df['max_duration_of_session']
cust_df['basket_elem_per_durr'] = cust_df['max_basket_element_number'] / cust_df['max_duration_of_session']
cust_df['basket_sum_per_durr'] = cust_df['max_sum_price_of_products_in_basket'] / cust_df['max_duration_of_session']

# Basket size / basket value relative to the total price of visited products.
cust_df['bought_elem_visited_price_ratio'] = cust_df['max_basket_element_number'] / cust_df['max_sum_price_of_visited_products']

cust_df['bought_price_visited_price_ratio'] = cust_df['max_sum_price_of_products_in_basket']  / cust_df['max_sum_price_of_visited_products']

# Previous payments per unit of account lifetime.
cust_df['payment_per_cust_existance'] = cust_df['max_num_of_previous_payments'] / cust_df['max_lifetime_customer_account']

# NOTE(review): the / 365 presumes lifetime_customer_account is in days and
# customer_age in years - confirm against the data dictionary.
cust_df['max_registration_age'] = cust_df['max_customer_age'] - (cust_df['max_lifetime_customer_account'] / 365)

# Division by zero yields +inf, -inf (negative numerator) or NaN (0 / 0).
# Replacing only +inf would leave -inf behind, which scikit-learn estimators
# reject, so both infinities and NaN are mapped to 0.
cust_df = cust_df.replace([np.inf, -np.inf], 0).fillna(0)

In [6]:
# The first three columns of cust_df (session_id, target, train/test flag) are
# not model inputs; every column after them is.
bemeno_valtozok = cust_df.columns[3:].tolist()

# Rows flagged 0 are used for training; rows flagged 1 are the ones to predict.
ismert_df = cust_df.loc[cust_df['test_or_train_flag'] == 0].copy()
x_train, y_train = ismert_df[bemeno_valtozok], ismert_df[target]
x_pred = cust_df.loc[cust_df['test_or_train_flag'] == 1, bemeno_valtozok]

# The scaler is created but feature scaling is currently switched off.
scaler = StandardScaler()
# x_train = scaler.fit_transform(x_train)
# x_pred = scaler.transform(x_pred)
print(f"Training set size: {x_train.shape}")

Training set size: (24584, 59)


In [7]:
%%time
cv_results = cross_validate(RandomForestClassifier(max_depth = 14, n_estimators = 100),
                            x_train, y_train,
                            scoring=['roc_auc', 'accuracy'], cv=3, n_jobs=4)
auc = cv_results['test_roc_auc'].mean()
print(f"\t{auc},\t{cv_results['test_accuracy'].mean()}")

	0.8790137871045206,	0.8012933316435796
Wall time: 9.47 s


In [8]:
def rounding_score_decorator(score):
    """Adapt a label-based metric so it can be fed predicted scores.

    Args:
        score: callable ``score(y_true, y_pred)``.

    Returns:
        A callable ``(y_true, y_pred)`` that evaluates ``y_pred > 0.5`` and
        passes the resulting boolean labels to ``score``.
    """
    def thresholded_score(y_true, y_pred):
        hard_labels = y_pred > 0.5
        return score(y_true, hard_labels)

    return thresholded_score

def text2score(optimalization):
    """Map a metric name to a scoring callable ``score(y_true, y_pred)``.

    Args:
        optimalization: one of ``'AUC'``, ``'Precision'``, ``'Recall'`` or
            ``'Accuracy'`` (case-sensitive).

    Returns:
        ``roc_auc_score`` for ``'AUC'``; for the other names the matching
        metric wrapped by ``rounding_score_decorator`` so that predicted
        scores are thresholded before being scored.

    Raises:
        ValueError: if ``optimalization`` is not one of the supported names.
    """
    if optimalization == 'AUC':
        return roc_auc_score
    if optimalization == 'Precision':
        return rounding_score_decorator(precision_score)
    if optimalization == 'Recall':
        return rounding_score_decorator(recall_score)
    if optimalization == 'Accuracy':
        return rounding_score_decorator(accuracy_score)
    # Fail loudly with a helpful message instead of hitting an unbound local.
    raise ValueError(
        f"Unknown optimalization {optimalization!r}; "
        "expected one of 'AUC', 'Precision', 'Recall', 'Accuracy'"
    )

def modell_evaluator(data, input_attributes, target_attribute):
    """Greedy forward feature selection scored by cross-validated ROC AUC.

    Starting from an empty feature set, each round tries every remaining
    candidate on top of the already accepted features, using a 3-fold
    cross-validated random forest, and accepts the candidate with the highest
    mean ROC AUC. Selection stops as soon as the best score of a round is lower
    than the previous round's score, or when no candidates remain.

    Args:
        data: DataFrame holding both the candidate columns and the target.
        input_attributes: iterable of candidate column names. It is copied,
            so the caller's collection is never modified.
        target_attribute: name of the label column in ``data``.

    Returns:
        ``(best_score, selected_inputs)`` - the mean ROC AUC of the last
        accepted round (``-1`` if nothing was accepted) and the accepted column
        names in the order they were chosen.
    """
    def test_attributes(fix_input, possible_inputs):
        """Return the candidate that best extends ``fix_input`` and its mean AUC."""
        best_score = -1
        best_input = None
        for possible_input in possible_inputs:
            cv_results = cross_validate(RandomForestClassifier(max_depth = 14, n_estimators = 100),
                            data[fix_input + [possible_input]], data[target_attribute],
                            scoring=['roc_auc'], cv=3, n_jobs=4)
            s = cv_results['test_roc_auc'].mean()
            if s > best_score:
                best_score = s
                best_input = possible_input
        return best_input, best_score

    good_inputs = []
    # Work on a private copy: removing accepted features must not shrink the
    # list the caller passed in.
    in_race_inputs = list(input_attributes)
    best_s = -1
    while in_race_inputs:
        i_to_accept, s = test_attributes(good_inputs, in_race_inputs)
        print(i_to_accept, s)
        # i_to_accept is None when no candidate produced a comparable score
        # (e.g. every CV score was NaN); there is nothing sensible to add.
        if i_to_accept is None or s < best_s:
            return best_s, good_inputs

        best_s = s
        good_inputs.append(i_to_accept)
        in_race_inputs.remove(i_to_accept)
    return best_s, good_inputs

In [10]:
# Run the greedy feature selection on the labelled sessions; the returned
# (score, selected columns) pair is displayed as the cell output.
modell_evaluator(data=ismert_df, input_attributes=bemeno_valtozok, target_attribute=target)

max_level_of_purchasing_process
mean_max_val
max_basket_element_number
mean_sum_price_of_products_in_basket
vis_price_per_durr
min_duration_of_session
mean_minimum_price_of_visited_products
basket_elem_per_durr
min_last_order_of_customer


(0.8686583731349793,
 ['max_level_of_purchasing_process',
  'mean_max_val',
  'max_basket_element_number',
  'mean_sum_price_of_products_in_basket',
  'vis_price_per_durr',
  'min_duration_of_session',
  'mean_minimum_price_of_visited_products',
  'basket_elem_per_durr'])

In [None]:
# Final model: a 1000-tree forest fitted on the full training matrix
# (fit returns the estimator itself, so construction and fitting are chained).
model = RandomForestClassifier(n_estimators=1000, max_depth=14).fit(x_train, y_train)
# Class-probability estimates for the rows to predict, one column per class.
pred = model.predict_proba(x_pred)
# accuracy_score(ismert_df[target],model.predict(ismert_df[bemeno_valtozok]))

In [None]:
# Submission file: session ids of the rows flagged 1, paired with the second
# predict_proba column (presumably the probability of a successful purchase -
# verify against model.classes_).
test_session_ids = cust_df.loc[cust_df['test_or_train_flag'] == 1, 'session_id']
res_df = pd.DataFrame({'session_id': test_session_ids, 'prob': pred[:, 1]})
res_df.to_csv('res.csv', index=False)