In [159]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report

%matplotlib inline

In [160]:
import lightgbm as lgb

In [161]:
df = pd.read_csv('data/jiebao_all_user_processed.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,user_id,follow_count,fans_count,gender,location,level,regis_year,regis_month,regis_day,properties,mileage,car_like,excellent_post_count,all_post_count,label
0,0,oden123,0,0,2,1,1,2018,3,3,1,4430.0,1,0,0,True
1,1,生活1934626,2,4,2,2,1,2012,5,5,1,4270.0,2,0,0,True
2,2,wangzi1125,0,1,2,3,1,2017,6,6,1,2090.0,3,0,1,True
3,3,房产专家谢广财,3,2,2,4,1,2014,2,2,1,1120.0,4,0,0,True
4,4,南宫晗笑,4,5,2,5,1,2017,7,7,1,740.0,1,0,14,True


In [162]:
df.keys()

Index(['Unnamed: 0', 'user_id', 'follow_count', 'fans_count', 'gender',
       'location', 'level', 'regis_year', 'regis_month', 'regis_day',
       'properties', 'mileage', 'car_like', 'excellent_post_count',
       'all_post_count', 'label'],
      dtype='object')

In [163]:
df.dtypes

Unnamed: 0                int64
user_id                  object
follow_count              int64
fans_count                int64
gender                    int64
location                  int64
level                     int64
regis_year                int64
regis_month               int64
regis_day                 int64
properties                int64
mileage                 float64
car_like                  int64
excellent_post_count      int64
all_post_count            int64
label                      bool
dtype: object

In [164]:
x = df.drop(columns=['Unnamed: 0','user_id', 'label'])
x.head()

Unnamed: 0,follow_count,fans_count,gender,location,level,regis_year,regis_month,regis_day,properties,mileage,car_like,excellent_post_count,all_post_count
0,0,0,2,1,1,2018,3,3,1,4430.0,1,0,0
1,2,4,2,2,1,2012,5,5,1,4270.0,2,0,0
2,0,1,2,3,1,2017,6,6,1,2090.0,3,0,1
3,3,2,2,4,1,2014,2,2,1,1120.0,4,0,0
4,4,5,2,5,1,2017,7,7,1,740.0,1,0,14


In [165]:
y = df['label'].astype('uint8')
y.describe()

count    2961.000000
mean        0.083755
std         0.277068
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: label, dtype: float64

In [166]:
x, y = np.array(x).astype('int'), np.array(y).astype('int')
x.shape, y.shape

((2961, 13), (2961,))

### Split dataset into train/validation/test set

In [167]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,  random_state = 42)

x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=0.5,  random_state = 42)

In [168]:
#print(x_train.shape, y_train.shape)

#y_test.describe()

### Build SVM model then train it

In [169]:
clf = SVC(C=0.8, kernel='rbf', class_weight='balanced')

In [170]:
clf.fit(x_train, y_train)

SVC(C=0.8, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [138]:
pred = clf.predict(x_test)

In [139]:
print(classification_report(y_true=y_test, y_pred=pred))

             precision    recall  f1-score   support

          0       0.91      0.93      0.92       271
          1       0.00      0.00      0.00        26

avg / total       0.83      0.85      0.84       297



### Build lightgbm model and train it 

#### Dataset construction based on numpy data

In [280]:
x_train[0], y_train[0]

(array([   0,    0,    2,   13,    1, 2018,    3,    3,    1,  490,   35,
           0,    0]), 0)

In [281]:
train_data = lgb.Dataset(x_train, label=y_train)
val_data = lgb.Dataset(x_val, label=y_val)
test_data = lgb.Dataset(x_test, label = y_test)

In [282]:
train_data = lgb.Dataset(x_train, 
                         label=y_train,
                         feature_name=['follow_count', 'fans_count', 'gender',
       'location', 'level', 'regis_year', 'regis_month', 'regis_day',
       'properties', 'mileage', 'car_like', 'excellent_post_count',
       'all_post_count',],
                         categorical_feature=['gender',
       'location', 'level', 'regis_year', 'regis_month', 'regis_day',
       'properties', 'car_like', ],
                        )
val_data = lgb.Dataset(x_val, 
                       label=y_val,
                       feature_name=['follow_count', 'fans_count', 'gender',
       'location', 'level', 'regis_year', 'regis_month', 'regis_day',
       'properties', 'mileage', 'car_like', 'excellent_post_count',
       'all_post_count',],
                         categorical_feature=['gender',
       'location', 'level', 'regis_year', 'regis_month', 'regis_day',
       'properties', 'car_like', ],)


#### Setting parameters

In [283]:
# param = {'max_depth': 10,
#         'num_leaves': 20,
#         'num_trees': 5,
#         'objective':'binary',
#         'min_data_in_leaf':100}
# param['metric'] = 'auc'

# para searching
params = {
    'boosting_type': 'gbdt',  # np.random.choice(['dart', 'gbdt']),
    'objective': 'binary',
    'metric': ['binary_logloss', 'auc'], 
    
    'learning_rate': 0.35,
    
    'num_leaves': np.random.randint(30, 128),
    'max_depth': np.random.randint(3, 12),
    'min_data_in_leaf': int(2 ** (np.random.rand()*3.5 + 9)),
    
    'feature_fraction': np.random.rand()*0.35+0.65,
    'bagging_fraction': np.random.rand()*0.35+0.65,
    'bagging_freq': 1,
    
    'lambda_l1': 10 ** (np.random.rand() * 4),
    'lambda_l2': 10 ** (np.random.rand() * 3 + 2),
    'min_gain_to_split': 0.0,
    'min_sum_hessian_in_leaf': 0.1,
    
    'num_threads': 16,
    'verbose': 0,
    'is_training_metric': 'True'
}
print('Hyper-parameters:')
print(params)

Hyper-parameters:
{'boosting_type': 'gbdt', 'objective': 'binary', 'metric': ['binary_logloss', 'auc'], 'learning_rate': 0.35, 'num_leaves': 110, 'max_depth': 5, 'min_data_in_leaf': 574, 'feature_fraction': 0.9622906611160201, 'bagging_fraction': 0.7796884603338681, 'bagging_freq': 1, 'lambda_l1': 2.807851284391429, 'lambda_l2': 573.5019653384691, 'min_gain_to_split': 0.0, 'min_sum_hessian_in_leaf': 0.1, 'num_threads': 16, 'verbose': 0, 'is_training_metric': 'True'}


#### Training 

In [284]:
# num_round = 100
# bst = lgb.train(param, 
#                 train_data,
#                 num_round,
#                 valid_sets=[val_data],
#                 early_stopping_rounds=30
#                )

evals_result = {}  # dict to store evaluation results of all the items in `valid_sets`

gbm = lgb.train(params, train_data, 5000, valid_sets=[train_data, val_data], \
                valid_names = ['train', 'valid'], evals_result=evals_result, \
                early_stopping_rounds=2500, verbose_eval=100, 
                categorical_feature='auto')



Training until validation scores don't improve for 2500 rounds.
[100]	train's auc: 0.760884	train's binary_logloss: 0.247865	valid's auc: 0.724711	valid's binary_logloss: 0.306775
[200]	train's auc: 0.775175	train's binary_logloss: 0.241602	valid's auc: 0.709251	valid's binary_logloss: 0.310528
[300]	train's auc: 0.783075	train's binary_logloss: 0.238469	valid's auc: 0.692818	valid's binary_logloss: 0.317923
[400]	train's auc: 0.789429	train's binary_logloss: 0.236148	valid's auc: 0.68661	valid's binary_logloss: 0.322043
[500]	train's auc: 0.794905	train's binary_logloss: 0.23442	valid's auc: 0.680402	valid's binary_logloss: 0.326324
[600]	train's auc: 0.798235	train's binary_logloss: 0.233142	valid's auc: 0.676141	valid's binary_logloss: 0.328477
[700]	train's auc: 0.80196	train's binary_logloss: 0.231891	valid's auc: 0.671881	valid's binary_logloss: 0.331903
[800]	train's auc: 0.805586	train's binary_logloss: 0.230795	valid's auc: 0.662021	valid's binary_logloss: 0.335426
[900]	train

In [285]:
# save best round results to csv
bst_round = np.argmax(evals_result['valid']['auc'])
trn_auc = evals_result['train']['auc'][bst_round]
trn_loss = evals_result['train']['binary_logloss'][bst_round]
val_auc = evals_result['valid']['auc'][bst_round]
val_loss = evals_result['valid']['binary_logloss'][bst_round]

In [286]:
print('Best Round: %d'%bst_round)
print('Training loss: %.5f, Validation loss: %.5f'%(trn_loss, val_loss))
print('Training AUC : %.5f, Validation AUC : %.5f'%(trn_auc, val_auc))

Best Round: 48
Training loss: 0.25511, Validation loss: 0.30890
Training AUC : 0.74540, Validation AUC : 0.74297


In [287]:
feature_importance = pd.DataFrame({'name':gbm.feature_name(), 'importance':gbm.feature_importance()}).sort_values(by='importance', ascending=False)
feature_importance.to_csv('./feat_importance.csv', index=False)


In [266]:
# save model
json_model = bst.dump_model()

#### Prediction

In [267]:
y_pred = bst.predict(x_test)

In [268]:
y_pred

array([0.15822865, 0.06390479, 0.05668082, 0.06095951, 0.11880461,
       0.06357549, 0.06309877, 0.06480551, 0.0798649 , 0.06487235,
       0.07616751, 0.05748772, 0.13948987, 0.0714921 , 0.05951458,
       0.06390479, 0.0705908 , 0.08486904, 0.06946347, 0.0763037 ,
       0.11636706, 0.0633582 , 0.0673254 , 0.07830939, 0.07166258,
       0.07878452, 0.10805357, 0.06100318, 0.08690044, 0.09629005,
       0.06672327, 0.12361672, 0.06295566, 0.08486904, 0.07218936,
       0.10565559, 0.07218936, 0.10601032, 0.17562203, 0.06211493,
       0.0633582 , 0.16260094, 0.06090006, 0.13351148, 0.07218936,
       0.0763037 , 0.08487169, 0.06159632, 0.06357549, 0.07546008,
       0.06774196, 0.08983227, 0.07218936, 0.10625665, 0.07721165,
       0.05585371, 0.07429053, 0.06295566, 0.05584941, 0.06390479,
       0.16951065, 0.08402641, 0.08086826, 0.05480372, 0.05587033,
       0.05587033, 0.07218936, 0.0633582 , 0.06406544, 0.06278664,
       0.06672327, 0.05587033, 0.13351148, 0.07186149, 0.06480

In [269]:
y_pred = y_pred >= 0.5

In [270]:
from sklearn.metrics import accuracy_score, confusion_matrix
print(accuracy_score(y_test, y_pred))

0.9124579124579124


In [271]:
print(classification_report(y_true=y_test, y_pred=y_pred))

             precision    recall  f1-score   support

          0       0.91      1.00      0.95       271
          1       0.00      0.00      0.00        26

avg / total       0.83      0.91      0.87       297



  'precision', 'predicted', average, warn_for)
