In [1]:
import pandas as pd
import pandas_gbq
import numpy as np
import math
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.utils import class_weight
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_squared_error
from xgboost import plot_importance
import matplotlib.pyplot as plt
import operator
import pickle
from catboost import CatBoostClassifier

In [4]:
%%bigquery data
Select * from maximal-furnace-783.madan.sc_live_ranker_data_all

Query complete after 0.00s: 100%|██████████| 2/2 [00:00<00:00, 1221.58query/s]                        
Downloading: 100%|██████████| 566026/566026 [00:05<00:00, 101783.49rows/s]


In [3]:
data.columns.values

array(['time', 'distinct_id', 'entityId', 'userGiftAmount_1_DAY',
       'userGiftAmount_3_DAY', 'userGiftAmount_7_DAY',
       'userGiftAmount_1_HOUR', 'userGiftAmount_6_HOUR',
       'userGiftAmount_15_MINUTE', 'userCommentCount_1_DAY',
       'userCommentCount_3_DAY', 'userCommentCount_7_DAY',
       'userCommentCount_1_HOUR', 'userCommentCount_6_HOUR',
       'userCommentCount_15_MINUTE', 'userRequestAudioSeatCount_1_DAY',
       'userRequestAudioSeatCount_3_DAY',
       'userRequestAudioSeatCount_7_DAY',
       'userRequestAudioSeatCount_1_HOUR',
       'userRequestAudioSeatCount_6_HOUR',
       'userRequestAudioSeatCount_15_MINUTE', 'userRechargeCount_1_DAY',
       'userRechargeCount_3_DAY', 'userRechargeCount_7_DAY',
       'userRechargeCount_1_HOUR', 'userRechargeCount_6_HOUR',
       'userRechargeCount_15_MINUTE', 'userHostChatroomCount_1_DAY',
       'userHostChatroomCount_3_DAY', 'userHostChatroomCount_7_DAY',
       'userHostChatroomCount_1_HOUR', 'userHostChatroomCount_6_

In [7]:
data.label.value_counts()

0    560251
1      5775
Name: label, dtype: int64

In [9]:
neg = data[data.label == 0].sample(n = 100000, replace = False)

In [10]:
pos = data[data.label == 1]

In [11]:
df = pos.append(neg)

In [12]:
df.shape

(105775, 72)

In [13]:
df = df.sample(frac = 1)

In [14]:
from catboost import CatBoostClassifier, Pool, metrics, cv

In [15]:
y = df['label']
X = df.drop(['label', 'time', 'distinct_id', 'entityId'], axis = 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2)

classes_weights = class_weight.compute_sample_weight(
    class_weight='balanced',
    y=y_train
)

clf = CatBoostClassifier(n_estimators=200, max_depth=7, eta=0.015, eval_metric = metrics.AUC(), logging_level = 'Silent',)

clf.fit(X_train, y_train, sample_weight = classes_weights)

<catboost.core.CatBoostClassifier at 0x7f92dd7df210>

In [16]:
pred_test = clf.predict(X_test)
print("****** Confusion Matrix ******")
print(confusion_matrix(y_test, pred_test))

print(precision_recall_fscore_support(y_test, pred_test, average = "weighted"))

****** Confusion Matrix ******
[[12872  7128]
 [  329   826]]
(0.9275110535263582, 0.6475064996454738, 0.7429655581516613, None)


In [20]:
clf.get_feature_importance(prettified = True)

Unnamed: 0,Feature Id,Importances
0,userCommentCount_7_DAY,12.837164
1,chatroomJoinCount_6_HOUR,6.829745
2,userRequestAudioSeatCount_7_DAY,5.323230
3,userCommentCount_3_DAY,4.643473
4,chatroomJoinCount_1_HOUR,4.257922
...,...,...
63,userRechargeCount_15_MINUTE,0.069949
64,chatroomBattleCount_6_HOUR,0.037098
65,userGiftAmount_1_HOUR,0.026594
66,userRechargeCount_1_HOUR,0.008460
