In [35]:
import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split

In [2]:
train_labels = pd.read_csv('train_labels.csv', sep=';')
test = pd.read_csv('test.csv', sep=';')
referer_vectors = pd.read_csv('referer_vectors.csv', sep=';')
train = pd.read_csv('train.csv', sep=';')
test_users = pd.read_csv('test_users.csv', sep=';')
geo_info = pd.read_csv('geo_info.csv', sep=';')

In [3]:
train_labels.shape, train.shape, test.shape

((500000, 2), (750000, 5), (150000, 5))

In [4]:
train_labels.user_id.nunique(), train.user_id.nunique(), test.user_id.nunique()

(500000, 655672, 131998)

In [5]:
train.user_agent[0]

"{'browser': 'Chrome Mobile', 'browser_version': '119.0.0', 'os': 'Android', 'os_version': '10'}"

In [6]:
df = train.copy()

In [7]:
df.user_agent = df.user_agent.fillna('{}')

In [8]:
df.user_agent = df.user_agent.apply(ast.literal_eval)

In [9]:
df = df.merge(referer_vectors, how='left', on='referer')

In [10]:
df = df.merge(geo_info, how='left', on='geo_id')

In [11]:
df['browser'] = df.user_agent.apply(lambda x: x.get('browser', 'undefined'))
df['browser_version'] = df.user_agent.apply(lambda x: x.get('browser_version', 'undefined'))
df['os'] = df.user_agent.apply(lambda x: x.get('os', 'undefined'))
df['os_version'] = df.user_agent.apply(lambda x: x.get('os_version', 'undefined'))

In [12]:
df = df.merge(train_labels, how='left', on='user_id')

In [13]:
df['request_datetime'] = pd.to_datetime(df.request_ts, unit='s')

In [14]:
df['year'] = df.request_datetime.dt.year
df['month'] = df.request_datetime.dt.month
df['day'] = df.request_datetime.dt.day
df['hour'] = df.request_datetime.dt.hour
df['minute'] = df.request_datetime.dt.minute

In [15]:
df.columns

Index(['request_ts', 'user_id', 'referer', 'geo_id', 'user_agent',
       'component0', 'component1', 'component2', 'component3', 'component4',
       'component5', 'component6', 'component7', 'component8', 'component9',
       'country_id', 'region_id', 'timezone_id', 'browser', 'browser_version',
       'os', 'os_version', 'target', 'request_datetime', 'year', 'month',
       'day', 'hour', 'minute'],
      dtype='object')

In [16]:
df.dtypes

request_ts                   int64
user_id                     object
referer                     object
geo_id                       int64
user_agent                  object
component0                   int64
component1                   int64
component2                   int64
component3                   int64
component4                   int64
component5                   int64
component6                   int64
component7                   int64
component8                   int64
component9                   int64
country_id                  object
region_id                   object
timezone_id                 object
browser                     object
browser_version             object
os                          object
os_version                  object
target                     float64
request_datetime    datetime64[ns]
year                         int64
month                        int64
day                          int64
hour                         int64
minute              

In [17]:
df.shape, df[df.target.isnull()].shape

((759972, 29), (158682, 29))

In [18]:
df = df.drop(columns='user_agent').drop_duplicates()

In [19]:
df = df[df.target.notnull()]

In [20]:
df.shape

(591836, 28)

In [21]:
df.columns

Index(['request_ts', 'user_id', 'referer', 'geo_id', 'component0',
       'component1', 'component2', 'component3', 'component4', 'component5',
       'component6', 'component7', 'component8', 'component9', 'country_id',
       'region_id', 'timezone_id', 'browser', 'browser_version', 'os',
       'os_version', 'target', 'request_datetime', 'year', 'month', 'day',
       'hour', 'minute'],
      dtype='object')

In [22]:
# df = pd.get_dummies(df, columns=['country_id','region_id','timezone_id','browser','os','os_version'], drop_first=True )

In [23]:
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()
df[['country_id','region_id','timezone_id','browser','browser_version','os','os_version']] = encoder.fit_transform(df[['country_id','region_id','timezone_id','browser','browser_version','os','os_version']])

In [24]:
train_users, val_users = train_test_split( df.user_id, test_size=0.2, random_state=42)


In [25]:
X_train = df[df.user_id.isin(train_users)][['component0', 'component1', 'component2', 'component3', 'component4',
       'component5', 'component6', 'component7', 'component8', 'component9',
       'country_id', 'region_id', 'timezone_id', 'browser', 'browser_version',
       'os', 'os_version','hour']]
y_train = df[df.user_id.isin(train_users)]['target']
X_val = df[df.user_id.isin(val_users)][['component0', 'component1', 'component2', 'component3', 'component4',
       'component5', 'component6', 'component7', 'component8', 'component9',
       'country_id', 'region_id', 'timezone_id', 'browser', 'browser_version',
       'os', 'os_version','hour']]
y_val = df[df.user_id.isin(val_users)]['target']

In [28]:
from sklearn.preprocessing import StandardScaler

In [29]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [26]:
X_train.shape, y_train.shape, X_val.shape, y_val.shape

((501952, 18), (501952,), (151076, 18), (151076,))

In [56]:
import xgboost as xgb
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, roc_auc_score

In [31]:
D_train = xgb.DMatrix(X_train_scaled, label=y_train)
D_val = xgb.DMatrix(X_val_scaled, label=y_val)

In [50]:
param = {
'eta': 0.3,
'max_depth': 20,
'objective': 'multi:softprob',
'num_class': 18}

steps = 20

In [51]:
model = xgb.train(param, D_train, steps)
preds = model.predict(D_val)
best_preds = np.asarray([np.argmax(line) for line in preds])

In [57]:
print("precision_score: {}".format(precision_score(y_val, best_preds, average='macro')))
print("recall_score: {}".format(recall_score(y_val, best_preds, average='macro')))
print("accuracy_score: {}".format(accuracy_score(y_val, best_preds)))
print("f1_score: {}".format(f1_score(y_val, best_preds)))
print("roc_auc_score: {}".format(roc_auc_score(y_val, best_preds)))

precision_score: 0.8685880872721967
recall_score: 0.868119623930363
accuracy_score: 0.868576080912918
f1_score: 0.8623942227058196
roc_auc_score: 0.868119623930363
