In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import classification_report, roc_auc_score, f1_score

In [2]:
df = pd.read_csv('../data/desktop_data.csv')

In [3]:
df['browser_id'].value_counts()

ABDB3866-071A-4B1A-9A2C-2AF9ED43234A.-lckIf_inpU27IAoqXb20Q..    1
efb9435e-b633-483a-aae0-ffd0d93b09b8.b3PRm202c4ANl-7MiNS40A..    1
91A1F1CF-2230-49F9-A7CE-DC700291F753.YUdGQAvkRGIZ75vVfuTnsQ..    1
59c587e4-7f00-45a7-b96e-0b6c73f4fbc8.I7rnkpgqsVOIpGKe1x7kyA..    1
E8AA5FB6-7531-46B7-85E0-37D6870A5A7A.lBBJiJAcIhwNlbC-x_OluA..    1
                                                                ..
552FFFE6-F10A-46B4-A9A9-E870DA8F315F.4JHvZf3oMqyuAOraA6bU9Q..    1
4593699a-de89-4e82-a9c5-ef36f10cf3a2.OE7yXyXxw7EKqoCrVV7Wgw..    1
CD58DD2F-DE77-4455-901A-43031792663C.fO6IolYC2cztoitFlMt2TQ..    1
4593699a-de89-4e82-a9c5-ef36f10cf3a2.D8sgqb0RUsAvghqyMT7NVw..    1
94B380F6-44C9-4B72-B282-A79992A01F4B.IjwtbQ1cJhMKdeG5zN4EdA..    1
Name: browser_id, Length: 7902414, dtype: int64

In [4]:
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [5]:
df['churn_user'].value_counts(dropna=False, normalize=True)

0    0.755348
1    0.244652
Name: churn_user, dtype: float64

In [6]:
train['churn_user'].value_counts(dropna=False, normalize=True)

0    0.755319
1    0.244681
Name: churn_user, dtype: float64

In [7]:
test['churn_user'].value_counts(dropna=False, normalize=True)

0    0.755465
1    0.244535
Name: churn_user, dtype: float64

In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6321931 entries, 7113150 to 6413414
Data columns (total 48 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   browser_id                     object 
 1   churn_user                     int64  
 2   browser_id_hash                uint64 
 3   os_name                        object 
 4   age_group                      object 
 5   gender                         object 
 6   country                        object 
 7   region                         object 
 8   province_type                  object 
 9   province                       object 
 10  active_day                     int64  
 11  first_date                     object 
 12  life_time                      int64  
 13  lastest_active_day             object 
 14  not_active_day                 int64  
 15  total_active_time              float64
 16  clicks                         float64
 17  search_volume                  int64  
 

In [9]:
train.head()

Unnamed: 0,browser_id,churn_user,browser_id_hash,os_name,age_group,gender,country,region,province_type,province,...,pip_count,sidebar_count,incognito_count,signin_count,youtube_count,work_count,social_count,news_count,entertainment_count,ecommerce_count
7113150,D8BF706A-FA05-4502-892C-19CF1E4ABFE0.lj7eIXMsE...,1,8430570851456054515,windows,unknown,unknown,VN,Southern Vietnam,rural,Bình Dương,...,0,0,0,0,,,,,,
5308916,3cb00cff-c2e3-4977-8ebe-9d1b4cf1165f.FmfZZ2JOk...,0,5176414647394971691,windows,25-34,female,VN,Southern Vietnam,rural,Tây Ninh,...,0,27,0,0,279.0,0.0,0.0,0.0,0.0,0.0
4858862,39921383-E431-422C-84AF-A3134CC68EE7.gvah8sxwB...,0,12275853707732323202,windows,25-34,female,VN,Southern Vietnam,rural,Đồng Nai,...,0,0,0,6,,,,,,
2949562,ac164a45-207a-4607-a45f-818f2b62d314.t40QFp_UM...,0,7870210021874436399,windows,45-54,male,VN,Northern Vietnam,rural,Quảng Ninh,...,0,0,0,4,,,,,,
4154052,DA915DF2-A4DA-4FF3-9C64-D85F87D88216.dezWal7eO...,0,4758427890026231294,windows,unknown,unknown,VN,Northern Vietnam,rural,Bắc Giang,...,0,0,0,0,,,,,,


In [10]:
train['first_date']

7113150    2023-12-29
5308916    2021-09-05
4858862    2022-02-23
2949562    2022-05-18
4154052    2019-03-07
              ...    
6550634    2021-02-22
7705870    2020-04-04
6423388    2023-04-26
6962611    2023-09-22
6413414    2020-10-14
Name: first_date, Length: 6321931, dtype: object

In [11]:
c = [x for x in train.columns if train[x].dtypes=='object']
n = [x for x in train.columns if x not in c]
list(n)

['churn_user',
 'browser_id_hash',
 'active_day',
 'life_time',
 'not_active_day',
 'total_active_time',
 'clicks',
 'search_volume',
 'dating_search',
 'videoclip_search',
 'technical_search',
 'housekeeping_family_search',
 'marketing_search',
 'other_search',
 'serp_click',
 'search_volume_gg',
 'search_clicks_gg',
 'other_search_gg',
 'housekeeping_family_search_gg',
 'videoclip_search_gg',
 'dating_search_gg',
 'marketing_search_gg',
 'technical_search_gg',
 'ads_impression',
 'ads_click',
 'ads_revenue',
 'newtab_count',
 'download_count',
 'pip_count',
 'sidebar_count',
 'incognito_count',
 'signin_count',
 'youtube_count',
 'work_count',
 'social_count',
 'news_count',
 'entertainment_count',
 'ecommerce_count']

In [12]:
TARGET ='churn_user'
CATEGORICAL_FEATURES  = ['os_name', 'age_group','gender', 'country', 'region', 'province_type',
                         'province']
DATETIME_FEATURES  = ['first_date', 'lastest_active_day']
NUMERICAL_FEATURES = ['active_day', 'life_time',  'not_active_day', 'total_active_time',
                      'clicks', 'search_volume', 'dating_search', 'videoclip_search', 'technical_search', 'housekeeping_family_search', 'marketing_search', 'other_search',
                      'serp_click', 'search_volume_gg', 'search_clicks_gg', 'other_search_gg','housekeeping_family_search_gg','videoclip_search_gg', 'dating_search_gg', 'marketing_search_gg', 'technical_search_gg',
                      'ads_impression', 'ads_click', 'ads_revenue', 'newtab_count', 'download_count', 'pip_count', 'sidebar_count', 'incognito_count', 'signin_count', 'youtube_count',
                    'work_count', 'social_count', 'news_count', 'entertainment_count', 'ecommerce_count']

In [32]:
for c  in CATEGORICAL_FEATURES:
    train[c] = train[c].astype("category")
    test[c] = test[c].astype("category")

In [13]:
train.reset_index(drop=True, inplace=True)

In [39]:
from lightgbm import LGBMClassifier
import lightgbm as lgb

In [70]:
hyper_parameters = {
    "n_estimators": 500,
    'learning_rate':0.03,
     'max_depth':8,
     'colsample_bytree':0.5,
     'subsample':0.8,
     # 'reg_alpha':8,
     # 'reg_lambda':32,

    "random_state":42,
    'device':'gpu',
    # "class_weight": "balanced"
}
models = []
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_index, valid_index in kfold.split(train, train[TARGET]):
    X_train = train.iloc[train_index][CATEGORICAL_FEATURES + NUMERICAL_FEATURES]
    X_valid = train.iloc[valid_index][CATEGORICAL_FEATURES + NUMERICAL_FEATURES]
    y_train = train.iloc[train_index][TARGET].values
    y_valid = train.iloc[valid_index][TARGET].values
    for c  in CATEGORICAL_FEATURES:
        X_train[c] = X_train[c].astype("category")
        X_valid[c] = X_valid[c].astype("category")
    model =LGBMClassifier(**hyper_parameters)
    callbacks = [lgb.early_stopping(200, verbose=50), lgb.log_evaluation(period=50)]
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric=["logloss", "auc"],
              categorical_feature=CATEGORICAL_FEATURES,
              callbacks=callbacks)

    y_pred_prob = model.predict_proba(test[CATEGORICAL_FEATURES + NUMERICAL_FEATURES])[:,1]
    y_pred = [1 if y_hat >=0.4 else 0 for y_hat in y_pred_prob]
    y_test = test[TARGET].values
    print(roc_auc_score(y_test, y_pred_prob))
    print(classification_report(y_test, y_pred))

New categorical_feature is ['age_group', 'country', 'gender', 'os_name', 'province', 'province_type', 'region']


Training until validation scores don't improve for 200 rounds
[50]	training's binary_logloss: 0.359521	training's auc: 0.901038	valid_1's binary_logloss: 0.359564	valid_1's auc: 0.901102
[100]	training's binary_logloss: 0.331106	training's auc: 0.903509	valid_1's binary_logloss: 0.331178	valid_1's auc: 0.903546
[150]	training's binary_logloss: 0.323726	training's auc: 0.905	valid_1's binary_logloss: 0.323841	valid_1's auc: 0.905017
[200]	training's binary_logloss: 0.320994	training's auc: 0.905886	valid_1's binary_logloss: 0.321151	valid_1's auc: 0.905874
[250]	training's binary_logloss: 0.319858	training's auc: 0.906351	valid_1's binary_logloss: 0.320079	valid_1's auc: 0.906295
[300]	training's binary_logloss: 0.319251	training's auc: 0.906653	valid_1's binary_logloss: 0.319544	valid_1's auc: 0.906547
[350]	training's binary_logloss: 0.318731	training's auc: 0.906915	valid_1's binary_logloss: 0.3191	valid_1's auc: 0.90676
[400]	training's binary_logloss: 0.318353	training's auc: 0.907

New categorical_feature is ['age_group', 'country', 'gender', 'os_name', 'province', 'province_type', 'region']


Training until validation scores don't improve for 200 rounds


KeyboardInterrupt: 

In [69]:

xgboost_hyperparameters = {
    'objective' : 'binary:logistic',
     'eval_metric':['logloss', 'auc'],
     'n_estimators':500,
     'learning_rate':0.01,
     'max_depth':5,
     'colsample_bytree':0.5,
     'subsample':0.8,
     'reg_alpha':8,
     'reg_lambda':32,
     'seed':42,
     # 'scale_pos_weight':3,
     'enable_categorical':True,
     'early_stopping_rounds': 50,
     'tree_method':'gpu_hist'}
from xgboost import XGBClassifier
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_index, valid_index in kfold.split(train, train[TARGET]):
    X_train = train.iloc[train_index][CATEGORICAL_FEATURES + NUMERICAL_FEATURES]
    X_valid = train.iloc[valid_index][CATEGORICAL_FEATURES + NUMERICAL_FEATURES]
    y_train = train.iloc[train_index][TARGET].values
    y_valid = train.iloc[valid_index][TARGET].values
    for c  in CATEGORICAL_FEATURES:
        X_train[c] = X_train[c].astype("category")
        X_valid[c] = X_valid[c].astype("category")
    model = XGBClassifier(**xgboost_hyperparameters)
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              verbose=50)


    y_pred_prob = model.predict_proba(test[CATEGORICAL_FEATURES + NUMERICAL_FEATURES])[:,1]
    y_pred = [1 if y_hat >=0.4 else 0 for y_hat in y_pred_prob]
    y_test = test[TARGET].values
    print(roc_auc_score(y_test, y_pred_prob))
    print(classification_report(y_test, y_pred))

[0]	validation_0-logloss:0.68755	validation_0-auc:0.89446	validation_1-logloss:0.68754	validation_1-auc:0.89469
[50]	validation_0-logloss:0.51111	validation_0-auc:0.89812	validation_1-logloss:0.51098	validation_1-auc:0.89829
[100]	validation_0-logloss:0.42882	validation_0-auc:0.89860	validation_1-logloss:0.42867	validation_1-auc:0.89878
[150]	validation_0-logloss:0.38409	validation_0-auc:0.90019	validation_1-logloss:0.38393	validation_1-auc:0.90038
[200]	validation_0-logloss:0.35962	validation_0-auc:0.90104	validation_1-logloss:0.35946	validation_1-auc:0.90123
[250]	validation_0-logloss:0.34538	validation_0-auc:0.90196	validation_1-logloss:0.34523	validation_1-auc:0.90214
[300]	validation_0-logloss:0.33723	validation_0-auc:0.90248	validation_1-logloss:0.33711	validation_1-auc:0.90265
[350]	validation_0-logloss:0.33235	validation_0-auc:0.90288	validation_1-logloss:0.33227	validation_1-auc:0.90303
[400]	validation_0-logloss:0.32878	validation_0-auc:0.90351	validation_1-logloss:0.32871	va

KeyboardInterrupt: 

In [31]:
X_valid[CATEGORICAL_FEATURES]

Unnamed: 0,os_name,age_group,gender,country,region,province_type,province
3,windows,45-54,male,VN,Northern Vietnam,rural,Quảng Ninh
5,windows,25-34,female,VN,Northern Vietnam,urban,Hà Nội
9,windows,18-24,female,VN,Northern Vietnam,urban,Hải Phòng
13,windows,55+,female,VN,Central Vietnam,rural,Nghệ An
18,macos,45-54,male,VN,Northern Vietnam,urban,Hà Nội
...,...,...,...,...,...,...,...
6321908,windows,25-34,female,VN,Northern Vietnam,urban,Hải Phòng
6321909,windows,35-44,male,VN,Northern Vietnam,urban,Hà Nội
6321912,windows,25-34,female,VN,Northern Vietnam,rural,Lạng Sơn
6321913,windows,18-24,male,UNKNOWN,,unknown,
