In [109]:
import pandas as pd
import numpy as np
from typing import Dict, Any, Tuple, List, Union
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from pandas.api.types import is_object_dtype


from config import *

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

%load_ext autoreload
%autoreload all

# Prepare Data

In [4]:
GEO_INFO:pd.DataFrame = pd.read_csv(GEO_INFO_FILE_PATH, sep=';')
print(GEO_INFO.shape)
GEO_INFO.head()

(5533, 4)


Unnamed: 0,geo_id,country_id,region_id,timezone_id
0,6447,c31b4e,470e75,f6155e
1,8730,a0a6e9,,d816ca
2,7769,e878d4,,ec4385
3,7330,c31b4e,23f9c2,f6155e
4,600,c31b4e,6dbc37,e56e80


In [5]:
REFERER_VECTORS:pd.DataFrame = pd.read_csv(REFERER_VECTORS_FILE_PATH, sep=';')
print(REFERER_VECTORS.shape)
REFERER_VECTORS.head()

(200000, 11)


Unnamed: 0,component0,component1,component2,component3,component4,component5,component6,component7,component8,component9,referer
0,16708,-3741,11395,-1597,-3212,6269,5610,-15351,13779,14102,https://a6899a4/15652e67
1,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817,https://9b48ee5/
2,10551,2947,12282,-470,16222,4472,-3316,9606,4197,18948,https://7a4c700/161af7e3
3,12816,20498,-10110,7731,-569,12035,3014,6398,11439,-271,https://9653126/159bc361
4,3710,11096,11333,14673,8030,1852,10554,11625,4306,13210,https://72879b4/125c29e6


In [129]:
test:pd.DataFrame = pd.read_csv(TEST_FILE_PATH, sep=';')
print(test.shape)
test.head()

(150000, 5)


Unnamed: 0,request_ts,user_id,referer,geo_id,user_agent
0,1700993094,c2802dadd33d8ae09bb366bdd41212ea,https://9b48ee5/,8816,"{'browser': 'Chrome Mobile', 'browser_version'..."
1,1701005579,e5b1988db74527ec092f28b0bbfdaac9,https://9b48ee5/,3663,"{'browser': 'Chrome', 'browser_version': '116...."
2,1700969752,6ef1eedbdb72554e53e69782066065c5,https://72879b4/12411b9e,2336,"{'browser': 'Chrome', 'browser_version': '114...."
3,1700991608,7e057293ecae62985a327b7af51858ea,https://9b48ee5/,9652,"{'browser': 'Chrome Mobile', 'browser_version'..."
4,1701019815,a27bd7ce8828497823fa8d5d05e7bbf7,https://9b48ee5/,3871,"{'browser': 'Chrome Mobile', 'browser_version'..."


In [132]:
test[USER_ID_FIELD_NAME].value_counts(sort=True, ascending=False).head()

user_id
841b7e1f4482e3457d431f905c482e33    6
3be04672eedc93f052b28b771294def5    6
1817a81c4e35140c9dcf2ca8e79ef045    6
a0e01fb73f95ba1c297548b9d9d0ce8f    6
b3c3349a51bde33e6f246d793165b216    5
Name: count, dtype: int64

In [133]:
def user_agent_string_2_dict(s: Union[str, float])->Dict[str, Any]:
    if s is np.nan:
        return {}
    d:Dict[str, Any] = eval(s)
    key_values:List[Tuple[str, str]] = list(d.items())
    for key, value in key_values:
        if value.isdigit():
            digit_value = d.pop(key)
            if '.' in digit_value:
                d[key] = float(digit_value)
            else:
                d[key] = int(digit_value)
    return d
        

user_agent_string_2_dict("{'browser': 'Chrome Mobile', 'browser_version': '96.0.4664', 'os': 'Android', 'os_version': '12'}")

{'browser': 'Chrome Mobile',
 'browser_version': '96.0.4664',
 'os': 'Android',
 'os_version': 12}

In [134]:
def user_agent_df_2_flat_df(
    user_agent_df:pd.DataFrame, 
    user_agent_field_name:str = USER_AGENT_FIELD_NAME
    )->pd.DataFrame:
    user_agent_df_copy = user_agent_df.copy()
    user_agent_list:List[Dict[str,Any]] = user_agent_df_copy.pop(user_agent_field_name).apply(user_agent_string_2_dict).to_list()
    return pd.concat([user_agent_df_copy,pd.DataFrame(user_agent_list)], axis=1)


In [135]:
def referer_2_domain_path(
        referer:str,
        domain_referer_sep:str = DOMAIN_URL_SEP,
        domain_path_sep:str = DOMAIN_PATH_SEP,
        )->Dict[str,str]:
    domain_start_index:int = referer.find(domain_referer_sep) + len(domain_referer_sep)
    domain_end_index:int = referer.find(domain_path_sep, domain_start_index)
    domain_path:Dict[str,str] = {}
    domain_path['domain'] = referer[
        domain_start_index:
        domain_end_index
        ]
    path_start_index:int = domain_end_index + len(domain_path_sep)
    domain_path['path'] = referer[path_start_index:]
    
    return domain_path

referer_2_domain_path('https://72879b4/12411b9e')

{'domain': '72879b4', 'path': '12411b9e'}

In [136]:
def prepare_X(
    raw:pd.DataFrame,
    referer_vectors:pd.DataFrame = REFERER_VECTORS,
    geo_info:pd.DataFrame = GEO_INFO,
    geo_id_field_name:str = GEO_ID_FIELD_NAME,
    user_agent_field_name:str = USER_AGENT_FIELD_NAME,
    referer_field_name:str = REFERER_FIELD_NAME,
    ):
    raw_parsed_user_agent:pd.DataFrame = user_agent_df_2_flat_df(
        raw, 
        user_agent_field_name=user_agent_field_name,
        )
    domain_path:pd.DataFrame = pd.DataFrame(raw_parsed_user_agent[referer_field_name].apply(referer_2_domain_path).to_list())
    raw_parsed_user_agent_with_geo:pd.DataFrame = raw_parsed_user_agent.merge(geo_info, on=geo_id_field_name)
    raw_parsed_user_agent_with_geo.drop(columns=[geo_id_field_name], inplace=True)
    raw_parsed_user_agent_with_geo_referer_vectors:pd.DataFrame = raw_parsed_user_agent_with_geo.merge(referer_vectors, on=referer_field_name)
    raw_parsed_user_agent_with_geo_referer_vectors.drop(columns=[referer_field_name], inplace=True)
    
    return pd.concat([raw_parsed_user_agent_with_geo_referer_vectors, domain_path], axis=1)
    pass

X_test  = prepare_X(test)

In [12]:
X_test.head()

Unnamed: 0,request_ts,user_id,browser,browser_version,os,os_version,country_id,region_id,timezone_id,component0,component1,component2,component3,component4,component5,component6,component7,component8,component9,domain,path
0,1700993094,c2802dadd33d8ae09bb366bdd41212ea,Chrome Mobile,96.0.4664,Android,12,c31b4e,36e3f3,f6155e,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817,9b48ee5,
1,1701005579,e5b1988db74527ec092f28b0bbfdaac9,Chrome,116.0.5845,Android,10,c31b4e,8ccc01,e56e80,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817,9b48ee5,
2,1700969752,6ef1eedbdb72554e53e69782066065c5,Chrome,114.0.0,Android,10,c31b4e,1fbfa5,e56e80,-7307,11682,9741,13564,13577,1200,10169,16461,-3932,3340,72879b4,12411b9e
3,1700991608,7e057293ecae62985a327b7af51858ea,Chrome Mobile,91.0.4472,Android,11,c31b4e,f66ff,f6155e,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817,9b48ee5,
4,1701019815,a27bd7ce8828497823fa8d5d05e7bbf7,Chrome Mobile,119.0.0,Android,10,c31b4e,245864,e56e80,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817,9b48ee5,


In [13]:
test_users:pd.DataFrame = pd.read_csv(TEST_USERS_FILE_PATH, sep=';')
print(test_users.shape)
test_users.head()

(85000, 1)


Unnamed: 0,user_id
0,c2802dadd33d8ae09bb366bdd41212ea
1,e5b1988db74527ec092f28b0bbfdaac9
2,6ef1eedbdb72554e53e69782066065c5
3,7e057293ecae62985a327b7af51858ea
4,a27bd7ce8828497823fa8d5d05e7bbf7


In [14]:
train_users:pd.DataFrame = pd.read_csv(TRAIN_USERS_FILE_PATH, sep=';')
print(train_users.shape)
train_users.head()

(750000, 5)


Unnamed: 0,request_ts,user_id,referer,geo_id,user_agent
0,1701011363,fb858e8e0a2bec074450eaf94b627fd3,https://9b48ee5/,4799,"{'browser': 'Chrome Mobile', 'browser_version'..."
1,1700986581,46a5f128fd569c764a92c2eaa788095e,https://9b48ee5/,8257,"{'browser': 'Chrome Mobile', 'browser_version'..."
2,1701011071,5a74e9ac53ffb21a20cce117c0ad77ba,https://9634fd0/1409e548,3150,"{'browser': 'Yandex Browser', 'browser_version..."
3,1700992803,af735816ca19115431ae3d89518c8c91,https://9b48ee5/,2740,"{'browser': 'Chrome Mobile', 'browser_version'..."
4,1701021666,364f0ae0a3f29a685c4fb5bae6033b9a,https://9b48ee5/,4863,"{'browser': 'Yandex Browser', 'browser_version..."


In [15]:
train_users['user_agent'].apply(type).unique()


array([<class 'str'>, <class 'float'>], dtype=object)

In [75]:
X_train = prepare_X(train_users)
X_train.head()

Unnamed: 0,request_ts,user_id,browser,browser_version,os,os_version,country_id,region_id,timezone_id,component0,component1,component2,component3,component4,component5,component6,component7,component8,component9,domain,path
0,1701011363,fb858e8e0a2bec074450eaf94b627fd3,Chrome Mobile,119.0.0,Android,10,c31b4e,470e75,f6155e,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817,9b48ee5,
1,1700986581,46a5f128fd569c764a92c2eaa788095e,Chrome Mobile,111.0.0,Android,10,c31b4e,44520b,e56e80,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817,9b48ee5,
2,1701011071,5a74e9ac53ffb21a20cce117c0ad77ba,Yandex Browser,20.12.5,Android,11,c31b4e,616bb9,af47f1,12498,2451,10304,-6380,11608,3106,-2188,10573,3347,21870,9634fd0,1409e548
3,1700992803,af735816ca19115431ae3d89518c8c91,Chrome Mobile,119.0.0,Android,10,c31b4e,3c9dca,e56e80,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817,9b48ee5,
4,1701021666,364f0ae0a3f29a685c4fb5bae6033b9a,Yandex Browser,18.11.1,Android,4.4.4,c31b4e,776e76,10b7947,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817,9b48ee5,


In [76]:
train_labels:pd.DataFrame = pd.read_csv(TRAIN_LABELS_FILE_PATH, sep=';')
print(train_labels.shape)
train_labels.head()

(500000, 2)


Unnamed: 0,user_id,target
0,fb858e8e0a2bec074450eaf94b627fd3,0
1,46a5f128fd569c764a92c2eaa788095e,0
2,5a74e9ac53ffb21a20cce117c0ad77ba,0
3,af735816ca19115431ae3d89518c8c91,0
4,364f0ae0a3f29a685c4fb5bae6033b9a,0


In [77]:
# y_train = pd.DataFrame(train_users[USER_ID_FIELD_NAME]).merge(train_labels, on=USER_ID_FIELD_NAME)[TARGET_FIELD_NAME]
# y_train.head()
X_train_with_target = X_train.merge(train_labels, on=USER_ID_FIELD_NAME)
X_feature_names = list(X_train_with_target.columns)
X_feature_names.remove(TARGET_FIELD_NAME)
X_train = X_train_with_target[X_feature_names]
y_train = X_train_with_target[TARGET_FIELD_NAME]
y_train.head()

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int64

In [78]:
X_train.shape,y_train.shape

((601290, 21), (601290,))

In [81]:
X_train.drop(columns=[USER_ID_FIELD_NAME],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train.drop(columns=[USER_ID_FIELD_NAME],inplace=True)


In [98]:
X_train.head()

Unnamed: 0,request_ts,browser,browser_version,os,os_version,country_id,region_id,timezone_id,component0,component1,component2,component3,component4,component5,component6,component7,component8,component9,domain,path
0,1701011363,Chrome Mobile,119.0.0,Android,10,c31b4e,470e75,f6155e,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817,9b48ee5,
1,1700986581,Chrome Mobile,111.0.0,Android,10,c31b4e,44520b,e56e80,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817,9b48ee5,
2,1701011071,Yandex Browser,20.12.5,Android,11,c31b4e,616bb9,af47f1,12498,2451,10304,-6380,11608,3106,-2188,10573,3347,21870,9634fd0,1409e548
3,1700992803,Chrome Mobile,119.0.0,Android,10,c31b4e,3c9dca,e56e80,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817,9b48ee5,
4,1701021666,Yandex Browser,18.11.1,Android,4.4.4,c31b4e,776e76,10b7947,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817,9b48ee5,


In [108]:
is_object_dtype(X_train['request_ts'].dtype)

False

In [137]:
cat_features_names:List[str] = []
for feature in X_train.columns:
    if is_object_dtype(X_train[feature].dtype):
        print(feature, len(X_train[feature].unique()))
        cat_features_names.append(feature)

cat_features_names

browser 60
browser_version 1438
os 14
os_version 231
country_id 151
region_id 259
timezone_id 218
domain 5015
path 130577


['browser',
 'browser_version',
 'os',
 'os_version',
 'country_id',
 'region_id',
 'timezone_id',
 'domain',
 'path']

# Train models

In [82]:
X_train.head()

Unnamed: 0,request_ts,browser,browser_version,os,os_version,country_id,region_id,timezone_id,component0,component1,component2,component3,component4,component5,component6,component7,component8,component9,domain,path
0,1701011363,Chrome Mobile,119.0.0,Android,10,c31b4e,470e75,f6155e,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817,9b48ee5,
1,1700986581,Chrome Mobile,111.0.0,Android,10,c31b4e,44520b,e56e80,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817,9b48ee5,
2,1701011071,Yandex Browser,20.12.5,Android,11,c31b4e,616bb9,af47f1,12498,2451,10304,-6380,11608,3106,-2188,10573,3347,21870,9634fd0,1409e548
3,1700992803,Chrome Mobile,119.0.0,Android,10,c31b4e,3c9dca,e56e80,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817,9b48ee5,
4,1701021666,Yandex Browser,18.11.1,Android,4.4.4,c31b4e,776e76,10b7947,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817,9b48ee5,


In [25]:
X_test.head()

Unnamed: 0,request_ts,user_id,browser,browser_version,os,os_version,country_id,region_id,timezone_id,component0,component1,component2,component3,component4,component5,component6,component7,component8,component9,domain,path
0,1700993094,c2802dadd33d8ae09bb366bdd41212ea,Chrome Mobile,96.0.4664,Android,12,c31b4e,36e3f3,f6155e,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817,9b48ee5,
1,1701005579,e5b1988db74527ec092f28b0bbfdaac9,Chrome,116.0.5845,Android,10,c31b4e,8ccc01,e56e80,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817,9b48ee5,
2,1700969752,6ef1eedbdb72554e53e69782066065c5,Chrome,114.0.0,Android,10,c31b4e,1fbfa5,e56e80,-7307,11682,9741,13564,13577,1200,10169,16461,-3932,3340,72879b4,12411b9e
3,1700991608,7e057293ecae62985a327b7af51858ea,Chrome Mobile,91.0.4472,Android,11,c31b4e,f66ff,f6155e,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817,9b48ee5,
4,1701019815,a27bd7ce8828497823fa8d5d05e7bbf7,Chrome Mobile,119.0.0,Android,10,c31b4e,245864,e56e80,11731,4045,22213,-1184,-8992,9381,-3496,-3120,-899,16817,9b48ee5,


In [139]:
X_train_train, X_train_test, y_train_train, y_train_test = train_test_split(X_train.select_dtypes(exclude=['object']), y_train, random_state=RANDOM_STATE)

In [140]:
X_train_train.shape, X_train_test.shape

((450967, 11), (150323, 11))

In [141]:
X_train_train.dtypes

request_ts    int64
component0    int64
component1    int64
component2    int64
component3    int64
component4    int64
component5    int64
component6    int64
component7    int64
component8    int64
component9    int64
dtype: object

In [145]:
train_data = lgb.Dataset(X_train_train, label=y_train_train)

params = {
    # 'objective': 'multiclass',
    'num_class': 1,
    'metric': 'logloss',
    'boosting_type': 'gbdt',
    'learning_rate': 0.1,
    'num_leaves': 31,
    'verbose': -1
}
bst = lgb.train(params=params,train_set=train_data)

In [150]:
y_train_test_pred = pd.Series(bst.predict(X_train_test)).apply(round)
y_train_test_pred.head()

0    1
1    0
2    0
3    0
4    1
dtype: int64

In [152]:
y_train_test_pred.shape, y_train_test.shape

((150323,), (150323,))

In [155]:
print(classification_report(y_true=y_train_test, y_pred=y_train_test_pred))

              precision    recall  f1-score   support

           0       0.78      0.77      0.77     78319
           1       0.75      0.76      0.76     72004

    accuracy                           0.77    150323
   macro avg       0.77      0.77      0.77    150323
weighted avg       0.77      0.77      0.77    150323

