In [None]:
!pip install pandas
!pip install missingno
!pip install sklearn
!pip install scikit-learn

In [1]:
import pandas as pd
import numpy as np
import pickle
import missingno as msno

from sklearn.model_selection import train_test_split

## Загрузка

In [6]:
with open('data/ga_hits-002.pkl', 'rb') as f:
    df_hits = pickle.load(f)
with open('data/ga_sessions.pkl', 'rb') as f:
    df_sessions = pickle.load(f)

df = pd.merge(df_sessions, df_hits, on=['session_id','session_id']) #объединили две таблицы

target = ['sub_car_claim_click', 
          'sub_car_claim_submit_click',
          'sub_open_dialog_click', 
          'sub_custom_question_submit_click',
          'sub_call_number_click', 
          'sub_callback_submit_click', 
          'sub_submit_success',
          'sub_car_request_submit_click'
         ]

#колонка с таргетом
df['target'] = df.apply(lambda x: 1 if x.event_action in target else 0, axis=1)
df = df.drop(columns='event_action')

#Разделим датасет на тренировочную и валидационную выборку
dftrain, dfvalidate = train_test_split(df, stratify=df['target'], test_size=0.3, random_state=42)

## Data Understanding

In [7]:
#столбцы выглядят бесполезными
dftrain = dftrain.drop(columns=['session_id', 'client_id', 'utm_keyword', 'event_label', 'hit_referer', 'hit_type', 'event_category'])

#косвенно связанно с target - event_category

#Пропущено много данных
dftrain = dftrain.drop(columns=['device_model', 'hit_time', 'device_os', 'event_value', 'device_brand']) 

#преобразование к дате
dftrain.visit_date = pd.to_datetime(dftrain.visit_date, utc=True) 
dftrain.hit_date = pd.to_datetime(dftrain.hit_date, utc=True)

## Data Cleaning

In [8]:
#Очистка числовых выбросов
def calculate_outliers(data):
    q25 = data.quantile(0.25)
    q75 = data.quantile(0.75)
    iqr = q75 - q25
    boundaries = (q25 - 1.5 * iqr, q75 + 1.5 * iqr)
    return boundaries

def outliers_num(df, data, boundaries):
    out = (data < boundaries[0]) | (data > boundaries[1])
    print('Количество выбросов:', df[out].shape[0], f'= {(df[out].shape[0]/len(data) * 100):.1f}%')
    return df[out].shape[0]

def remove_outliers(df, data):
    boundaries = calculate_outliers(data)
    if outliers_num(df, data, boundaries)/len(data) > 1:
        return df.loc[(data >= boundaries[0]) & (data <= boundaries[1])]
    data.loc[data < boundaries[0]] = boundaries[0]
    data.loc[data > boundaries[1]] = boundaries[1]
    return df


#Применение
clear = ['hit_number', 'visit_number']

for elem in clear:
 dftrain = remove_outliers(dftrain, dftrain[elem])

Количество выбросов: 734567 = 6.7%


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[data < boundaries[0]] = boundaries[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[data > boundaries[1]] = boundaries[1]


Количество выбросов: 992655 = 9.0%


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[data < boundaries[0]] = boundaries[0]


In [9]:
#Удалили пропуски в device_browser
dftrain = dftrain.loc[dftrain.device_browser != '(not set)']
#Удалили пропуски в geo_city (<4% выборки, кажется, что город может быть важен для модели и не хочется все заполнять Москвой)
dftrain = dftrain.loc[dftrain.geo_city != '(not set)']

In [10]:
#Восстанавливаем utm_medium по utm_campaign
#utm_campaign == LTuZkdKfxRGVceoWkVyg для всех (none) и (not set) из utm_medium 
#Мода значения utm_medium для utm_campaign == 'LTuZkdKfxRGVceoWkVyg' - это referral
#Заменим (none) и (not set) на referral

dftrain.loc[dftrain['utm_medium'] == '(none)', 'utm_medium'] = 'referral'
dftrain.loc[dftrain['utm_medium'] == '(not set)', 'utm_medium'] = 'referral'

#в них больше нет необходимости
dftrain = dftrain.drop(columns=['utm_source', 'utm_campaign', 'utm_adcontent'])

In [11]:
#Работа с категориальными признаками. 
#--------------------------------------------------------------------------------------------------------------------------------------
#utm_medium. Оставляем самые большие категории
dftrain['utm_medium'] = dftrain.apply(lambda x: 'referral' if x.utm_medium == 'referral' else
                                                'banner'   if x.utm_medium == 'banner'   else
                                                'cpc'      if x.utm_medium == 'cpc'      else 'other', axis=1)
#--------------------------------------------------------------------------------------------------------------------------------------
#Разрешение экрана преобразовать в категориальный признак
dftrain['screen_resolution_x'] = dftrain.apply(lambda x: int(x.device_screen_resolution.split('x')[0]), axis=1)
dftrain['screen_resolution_y'] = dftrain.apply(lambda x: int(x.device_screen_resolution.split('x')[1]), axis=1)
dftrain = dftrain.drop(columns='device_screen_resolution')
#!!!!!!!!!!!!!!!!! TOCHANGE попробовать изменить порядок ветвлений !!!!!!!!!!!!!!!!!!!!!!!
dftrain['screen_resolution_cat'] = dftrain.apply(lambda x: '8k'     if x.screen_resolution_x >= 7680 and x.screen_resolution_y >= 4320 else
                                                           '4k'     if x.screen_resolution_x >= 3840 and x.screen_resolution_y >= 2160 else
                                                           '2k'     if x.screen_resolution_x >= 2560 and x.screen_resolution_y >= 1440 else
                                                           'fullhd' if x.screen_resolution_x >= 1920 and x.screen_resolution_y >= 1080 else
                                                           'hd'     if x.screen_resolution_x >= 1280 and x.screen_resolution_y >= 720  else 'mobile', axis=1)
dftrain = dftrain.drop(columns=['screen_resolution_x', 'screen_resolution_y'])
#--------------------------------------------------------------------------------------------------------------------------------------
#device_browser. Оставляем самые большие категории
dftrain['device_browser'] = dftrain.apply(lambda x: 'Chrome'    if x.device_browser == 'Chrome'    else
                                                    'Safari'    if x.device_browser == 'Safari'    else
                                                    'YaBrowser' if x.device_browser == 'YaBrowser' else 'other', axis=1)
#--------------------------------------------------------------------------------------------------------------------------------------
#geo_city. Оставляем самые большие категории
dftrain.loc[:, 'geo_city'] = dftrain.apply(lambda x: 'Moscow'           if x.geo_city == 'Moscow'           else
                                                     'Saint_Petersburg' if x.geo_city == 'Saint Petersburg' else 'other', axis=1)
#--------------------------------------------------------------------------------------------------------------------------------------
#hit_page_path. Оставляем самые большие категории
dftrain['hit_page_path'] = dftrain.apply(lambda x: x.hit_page_path.split('/')[0], axis=1)

dftrain.loc[:, 'hit_page_path'] = dftrain.apply(lambda x: 'sberauto.com'          if x.hit_page_path == 'sberauto.com'          else
                                                          'podpiska.sberauto.com' if x.hit_page_path == 'podpiska.sberauto.com' else 'other', axis=1)

## Feature Engineering

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [12]:
#Заменили признак geo_country на бинарный признак country_is_russia со значениями 1 и 0.
dftrain['country_is_russia'] = dftrain.apply(lambda x: 1 if x.geo_country == 'Russia' else 0, axis=1)
dftrain = dftrain.drop(columns='geo_country')

dftrain['visit_month'] = dftrain.visit_date.dt.month
dftrain['hit_month'] = dftrain.hit_date.dt.month

dftrain.loc[:, 'hit_week'] = dftrain.apply(lambda x: 1 if x.hit_date.day < 7  else
                                                     2 if x.hit_date.day < 15 else
                                                     3 if x.hit_date.day < 22 else 4, axis=1)

dftrain.loc[:, 'visit_week'] = dftrain.apply(lambda x: 1 if x.visit_date.day < 7  else
                                                       2 if x.visit_date.day < 15 else
                                                       3 if x.visit_date.day < 22 else 4, axis=1)

dftrain.loc[:, 'visit_time'] = dftrain.apply(lambda x: 'night'   if x.visit_time.hour > 0  and x.visit_time.hour < 7  else
                                                       'morning' if x.visit_time.hour > 6  and x.visit_time.hour < 13 else
                                                       'day'     if x.visit_time.hour > 12 and x.visit_time.hour < 19 else 'evening', axis=1)

dftrain = dftrain.drop(columns=['visit_date', 'hit_date'])

AttributeError: 'Timestamp' object has no attribute 'dt'

Закодировать признаки \
Обучить модели \
Подобрать параметры

In [3]:
with open('df_checkpoint.pkl', 'rb') as f:
    dftrain = pickle.load(f)

In [4]:
X = dftrain.drop(columns='target')
y = dftrain.target

#Разделим датасет на тренировочную и тестовую выборку
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [5]:
#Кодирование непрерывных признаков
numeric = ['visit_number', 'hit_number']

scaler = StandardScaler()

Xtrain[numeric] = scaler.fit_transform(Xtrain[numeric])
#-----------------------------------------------------------------------------------------------------------------------------------------
#Кодирование категориальных признаков
categorical = Xtrain.columns.to_list()
for elem in numeric:
    categorical.remove(elem)

ohe = OneHotEncoder(sparse_output=False, drop='first') 

Xtrain[ohe.get_feature_names_out()] = ohe.fit_transform(Xtrain[categorical])
Xtrain = Xtrain.drop(columns=categorical)

In [6]:
#Кодирование тестовой выборки
Xtest[numeric] = scaler.transform(Xtest[numeric])
Xtest[ohe.get_feature_names_out()] = ohe.transform(Xtest[categorical])
Xtest = Xtest.drop(columns=categorical)

## Modeling

In [7]:
tmp1, Xsmall, tmp2, ysmall = train_test_split(Xtrain, ytrain, stratify=ytrain, test_size=0.02, random_state=42)
del tmp1
del tmp2

In [8]:
Xsmall_train, Xsmall_test, ysmall_train, ysmall_test = train_test_split(Xsmall, ysmall, stratify=ysmall, test_size=0.2, random_state=42)

In [9]:
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

In [101]:
import math

#расчет диапазона перебора гиперпараметров
def params_calculate(start_finish_step, model, param):
    if model.get_params()[param] < start_finish_step[2]:
        start_finish_step[1] = start_finish_step[2]
    else:
        start_finish_step[0] = model.get_params()[param] - start_finish_step[2]
        start_finish_step[1] = model.get_params()[param] + start_finish_step[2]
    start_finish_step[2] = math.ceil((start_finish_step[1] - start_finish_step[0]) / 10)
    return start_finish_step

#grid template
def grid(param_1, param_2, values_1, values_2, model, Xtrain, ytrain, metric):
    param_grid = {param_1: values_1, 
                  param_2: values_2} 
            
    grid_rf = GridSearchCV(model, 
                            param_grid,
                            cv=3, 
                            scoring=metric, 
                           )
    
    grid_search_model = grid_rf.fit(Xtrain, ytrain)
    return grid_search_model.best_estimator_

In [15]:
import time

In [102]:
%%time

model_rf = RandomForestClassifier(class_weight='balanced', n_jobs=-1)

metric = 'roc_auc'
estimators_start_finish_step = [1, 302, 50]
max_depth_start_finish_step = [2, 103, 30]

for _ in range(3):
    model_rf = grid('n_estimators', 
                    'max_depth', 
                    range(estimators_start_finish_step[0], estimators_start_finish_step[1], estimators_start_finish_step[2]), 
                    range(max_depth_start_finish_step[0], max_depth_start_finish_step[1], max_depth_start_finish_step[2]),
                    model_rf,
                    Xsmall_train,
                    ysmall_train,
                    metric)
    
    estimators_start_finish_step = params_calculate(estimators_start_finish_step, model_rf, 'n_estimators')
    max_depth_start_finish_step = params_calculate(max_depth_start_finish_step, model_rf, 'max_depth')

model_rf = grid('max_features', 
                'criterion', 
                ['sqrt', 'log2', None], 
                ['gini', 'entropy', 'log_loss'],
                model_rf,
                Xsmall_train, 
                ysmall_train,
                metric)

print(roc_auc_score(ysmall_train, model_rf.predict_proba(Xsmall_train)[:,1]))
print(roc_auc_score(ysmall_test, model_rf.predict_proba(Xsmall_test)[:,1]))
model_rf

0.8032351230762748
0.6959809861686649
CPU times: total: 1h 49min 17s
Wall time: 14min 56s


In [None]:
model_rf.fit(Xtrain, ytrain)

Написать функции препроцессинга \
Обучить модель на всей выборке \
Протестировать на валидационной выборке

Подобрать гиперпараметры для нейросети

In [35]:
#mlp нейронка
mlp = MLPClassifier()
mlp.fit(Xsmall_train, ysmall_train)

print(roc_auc_score(ysmall_train, mlp.predict_proba(Xsmall_train)[:,1]))
print(roc_auc_score(ysmall_test, mlp.predict_proba(Xsmall_test)[:,1]))

0.9243700395539993
0.6402971386452013


In [104]:
#mlp нейронка

mlp = MLPClassifier()
mlp.fit(Xtrain, ytrain)

print(roc_auc_score(ytrain, mlp.predict_proba(Xtrain)[:,1]))
print(roc_auc_score(ytest, mlp.predict_proba(Xtest)[:,1]))

TypeError: MLPClassifier.__init__() got an unexpected keyword argument 'n_jobs'

In [None]:
#grid_1
param_grid = {'hidden_layer_sizes': , 
              'solver': ['lbfgs', 'sgd', 'adam'], 
              'activation': ['identity', 'logistic', 'tanh', 'relu']}
            
grid_rf = GridSearchCV(mlp, 
                        param_grid,
                        cv=3, 
                        scoring='roc_auc', 
                       )

grid_search_rf = grid_rf.fit(Xtrain, ytrain)

#grid_2
param_grid = {'alpha': np.arange(0.0001, 0.01, 0.001), 
              'batch_size': range(10, 500, 25)}
            
grid_rf = GridSearchCV(grid_search_rf.best_estimator_, 
                        param_grid,
                        cv=3, 
                        scoring='roc_auc', 
                       )

grid_search_rf = grid_rf.fit(Xtrain, ytrain)
mlp = grid_search_rf.best_estimator_

In [None]:
print(roc_auc_score(ytrain, mlp.predict_proba(Xtrain)[:,1]))
print(roc_auc_score(ytest, mlp.predict_proba(Xtest)[:,1]))