In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report

%matplotlib inline

In [6]:
import lightgbm as lgb

In [7]:
df = pd.read_csv('data/jiebao_all_user_processed.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,user_id,follow_count,fans_count,gender,location,level,regis_year,regis_month,regis_day,properties,mileage,car_like,excellent_post_count,all_post_count,label
0,0,oden123,0,0,2,1,1,2018,3,3,1,4430.0,1,0,0,True
1,1,生活1934626,2,4,2,2,1,2012,5,5,1,4270.0,2,0,0,True
2,2,wangzi1125,0,1,2,3,1,2017,6,6,1,2090.0,3,0,1,True
3,3,房产专家谢广财,3,2,2,4,1,2014,2,2,1,1120.0,4,0,0,True
4,4,南宫晗笑,4,5,2,5,1,2017,7,7,1,740.0,1,0,14,True


In [8]:
df.keys()

Index(['Unnamed: 0', 'user_id', 'follow_count', 'fans_count', 'gender',
       'location', 'level', 'regis_year', 'regis_month', 'regis_day',
       'properties', 'mileage', 'car_like', 'excellent_post_count',
       'all_post_count', 'label'],
      dtype='object')

In [9]:
df.dtypes

Unnamed: 0                int64
user_id                  object
follow_count              int64
fans_count                int64
gender                    int64
location                  int64
level                     int64
regis_year                int64
regis_month               int64
regis_day                 int64
properties                int64
mileage                 float64
car_like                  int64
excellent_post_count      int64
all_post_count            int64
label                      bool
dtype: object

In [10]:
x = df.drop(columns=['Unnamed: 0','user_id', 'label'])
x.head()

Unnamed: 0,follow_count,fans_count,gender,location,level,regis_year,regis_month,regis_day,properties,mileage,car_like,excellent_post_count,all_post_count
0,0,0,2,1,1,2018,3,3,1,4430.0,1,0,0
1,2,4,2,2,1,2012,5,5,1,4270.0,2,0,0
2,0,1,2,3,1,2017,6,6,1,2090.0,3,0,1
3,3,2,2,4,1,2014,2,2,1,1120.0,4,0,0
4,4,5,2,5,1,2017,7,7,1,740.0,1,0,14


In [11]:
y = df['label'].astype('uint8')
y.describe()

count    2961.000000
mean        0.083755
std         0.277068
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
Name: label, dtype: float64

In [12]:
x, y = np.array(x).astype('int'), np.array(y).astype('int')
x.shape, y.shape

((2961, 13), (2961,))

### Split dataset into train/validation/test set

In [13]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,  random_state = 42)

x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=0.5,  random_state = 42)

In [14]:
#print(x_train.shape, y_train.shape)

#y_test.describe()

In [15]:
y_test

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0])

### Build SVM model then train it

In [16]:
clf = SVC(C=0.8, kernel='rbf', class_weight='balanced')

In [17]:
clf.fit(x_train, y_train)

SVC(C=0.8, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [18]:
pred = clf.predict(x_test)

In [19]:
print(classification_report(y_true=y_test, y_pred=pred))

             precision    recall  f1-score   support

          0       0.91      0.93      0.92       271
          1       0.00      0.00      0.00        26

avg / total       0.83      0.85      0.84       297



### Build lightgbm model and train it 

#### Dataset construction based on numpy data

In [20]:
x_train[0], y_train[0]

(array([   0,    0,    2,   13,    1, 2018,    3,    3,    1,  490,   35,
           0,    0]), 0)

In [21]:
train_data = lgb.Dataset(x_train, label=y_train)
val_data = lgb.Dataset(x_val, label=y_val)
test_data = lgb.Dataset(x_test, label = y_test)

In [22]:
train_data = lgb.Dataset(x_train, 
                         label=y_train,
                         feature_name=['follow_count', 'fans_count', 'gender',
       'location', 'level', 'regis_year', 'regis_month', 'regis_day',
       'properties', 'mileage', 'car_like', 'excellent_post_count',
       'all_post_count',],
                         categorical_feature=['gender',
       'location', 'level', 'regis_year', 'regis_month', 'regis_day',
       'properties', 'car_like', ],
                        )
val_data = lgb.Dataset(x_val, 
                       label=y_val,
                       feature_name=['follow_count', 'fans_count', 'gender',
       'location', 'level', 'regis_year', 'regis_month', 'regis_day',
       'properties', 'mileage', 'car_like', 'excellent_post_count',
       'all_post_count',],
                         categorical_feature=['gender',
       'location', 'level', 'regis_year', 'regis_month', 'regis_day',
       'properties', 'car_like', ],
                      )
test_data = lgb.Dataset(x_test, 
                        label = y_test,
                        feature_name=['follow_count', 'fans_count', 'gender',
       'location', 'level', 'regis_year', 'regis_month', 'regis_day',
       'properties', 'mileage', 'car_like', 'excellent_post_count',
       'all_post_count',],
                        categorical_feature=['gender',
       'location', 'level', 'regis_year', 'regis_month', 'regis_day',
       'properties', 'car_like', ],
                       )


#### Setting parameters

In [23]:
# param = {'max_depth': 10,
#         'num_leaves': 20,
#         'num_trees': 5,
#         'objective':'binary',
#         'min_data_in_leaf':100}
# param['metric'] = 'auc'

# para searching
params = {
    'boosting_type': 'gbdt',  # np.random.choice(['dart', 'gbdt']),
    'objective': 'binary',
    'metric': ['binary_logloss', 'auc'], 
    
    'learning_rate': 0.25,
    
    'num_leaves': np.random.randint(20, 128),
    'max_depth': np.random.randint(6, 12),
    'min_data_in_leaf': int(2 ** (np.random.rand()*3.5 + 9)),
    
    'feature_fraction': np.random.rand()*0.35+0.65,
    'bagging_fraction': np.random.rand()*0.35+0.65,
    'bagging_freq': 1,
    
    'lambda_l1': 10 ** (np.random.rand() * 4),
    'lambda_l2': 10 ** (np.random.rand() * 3 + 2),
    'min_gain_to_split': 0.0,
    'min_sum_hessian_in_leaf': 0.1,
    
    'num_threads': 16,
    'verbose': 0,
    'is_training_metric': 'True'
}
print('Hyper-parameters:')
print(params)

Hyper-parameters:
{'boosting_type': 'gbdt', 'objective': 'binary', 'metric': ['binary_logloss', 'auc'], 'learning_rate': 0.25, 'num_leaves': 86, 'max_depth': 10, 'min_data_in_leaf': 5391, 'feature_fraction': 0.6531829065699625, 'bagging_fraction': 0.6883960792775455, 'bagging_freq': 1, 'lambda_l1': 54.1641407805608, 'lambda_l2': 108.23836870107053, 'min_gain_to_split': 0.0, 'min_sum_hessian_in_leaf': 0.1, 'num_threads': 16, 'verbose': 0, 'is_training_metric': 'True'}


#### Training 

In [24]:
# num_round = 100
# bst = lgb.train(param, 
#                 train_data,
#                 num_round,
#                 valid_sets=[val_data],
#                 early_stopping_rounds=30
#                )

evals_result = {}  # dict to store evaluation results of all the items in `valid_sets`

gbm = lgb.train(params, 
                train_data, 
                5000, 
                valid_sets=[train_data, val_data], \
                valid_names = ['train', 'valid'], 
                evals_result=evals_result, \
                early_stopping_rounds=2500, 
                verbose_eval=200)



Training until validation scores don't improve for 2500 rounds.
[200]	train's auc: 0.5	train's binary_logloss: 2.78586	valid's auc: 0.5	valid's binary_logloss: 3.61724
[400]	train's auc: 0.5	train's binary_logloss: 2.78586	valid's auc: 0.5	valid's binary_logloss: 3.61724
[600]	train's auc: 0.5	train's binary_logloss: 2.78586	valid's auc: 0.5	valid's binary_logloss: 3.61724
[800]	train's auc: 0.5	train's binary_logloss: 2.78586	valid's auc: 0.5	valid's binary_logloss: 3.61724
[1000]	train's auc: 0.5	train's binary_logloss: 2.78586	valid's auc: 0.5	valid's binary_logloss: 3.61724
[1200]	train's auc: 0.5	train's binary_logloss: 2.78586	valid's auc: 0.5	valid's binary_logloss: 3.61724
[1400]	train's auc: 0.5	train's binary_logloss: 2.78586	valid's auc: 0.5	valid's binary_logloss: 3.61724
[1600]	train's auc: 0.5	train's binary_logloss: 2.78586	valid's auc: 0.5	valid's binary_logloss: 3.61724
[1800]	train's auc: 0.5	train's binary_logloss: 2.78586	valid's auc: 0.5	valid's binary_logloss: 3.6

In [25]:
# save best round results to csv
bst_round = np.argmax(evals_result['valid']['auc'])
trn_auc = evals_result['train']['auc'][bst_round]
trn_loss = evals_result['train']['binary_logloss'][bst_round]
val_auc = evals_result['valid']['auc'][bst_round]
val_loss = evals_result['valid']['binary_logloss'][bst_round]

In [26]:
print('Best Round: %d'%bst_round)
print('Training loss: %.5f, Validation loss: %.5f'%(trn_loss, val_loss))
print('Training AUC : %.5f, Validation AUC : %.5f'%(trn_auc, val_auc))

Best Round: 0
Training loss: 0.28038, Validation loss: 0.33895
Training AUC : 0.50000, Validation AUC : 0.50000


In [27]:
feature_importance = pd.DataFrame({'name':gbm.feature_name(), 'importance':gbm.feature_importance()}).sort_values(by='importance', ascending=False)
feature_importance.to_csv('./feat_importance.csv', index=False)

In [28]:
#  save model
# json_model = bst.dump_model()

#### Prediction

In [30]:
y_pred = gbm.predict(x_test)
print(y_pred[:10])

y_pred = y_pred >= 0.5
print(y_pred[:10])

from sklearn.metrics import accuracy_score, confusion_matrix
print(accuracy_score(y_test, y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

[0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5]
[ True  True  True  True  True  True  True  True  True  True]
0.08754208754208755
             precision    recall  f1-score   support

          0       0.00      0.00      0.00       271
          1       0.09      1.00      0.16        26

avg / total       0.01      0.09      0.01       297



  'precision', 'predicted', average, warn_for)
