In [1]:
import pandas as pd
from pandas import option_context
import numpy as np
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn import set_config
set_config(transform_output = "pandas")

#Classifiers
from sklearn.naive_bayes import ComplementNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import xgboost as xgb

#MissingIndicator, Imputer and Pipeline
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.metrics import make_scorer, precision_recall_fscore_support
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import confusion_matrix, roc_auc_score

In [2]:
RAND_SEED = 9309
SPLIT = 0.4

## Read the features and select the contributors that have made >50 activities

In [3]:
df_features_read = pd.read_csv('data/accounts_features.csv',index_col=0)
df_active_contributors = pd.read_csv('active_contributors.csv',index_col=0)
df_active_contributors_list = df_active_contributors.contributor.to_list()

important_features = ['NAT_mean','feat_NT','DCAT_median','feat_NOR','DCA_gini','NAR_mean']
df_active_account_features = df_features_read[df_features_read.index.isin(df_active_contributors_list)]
df_features = (
                df_active_account_features
                .query('feat_NA>=5')
                .assign(bot = lambda d: np.where(d.acc_type == "bot", 1, 0))
                .drop('acc_type',axis=1)
                [important_features+['bot']]
              )
with option_context('display.max_column',None):
    display(df_features.query('bot == 1 '))

Unnamed: 0,NAT_mean,feat_NT,DCAT_median,feat_NOR,DCA_gini,NAR_mean,bot
0crat,130.000,2,1.511,3,0.844,12.381,1
24emebot,300.000,1,,1,0.786,300.000,1
47erbot,28.000,8,0.002,3,0.957,17.231,1
5imon-bot,19.667,3,0.004,1,0.716,14.750,1
9cibot,18.500,2,0.000,1,0.778,9.250,1
...,...,...,...,...,...,...,...
zephyrbot,1.750,4,1.515,1,0.707,7.000,1
zeuswpi-bot,25.000,1,,1,0.677,25.000,1
zhaobot,3.000,3,0.032,3,0.748,2.250,1
zulipbot,28.667,3,1.339,1,0.666,21.500,1


In [4]:
df_features.columns[df_features.isna().any()]

Index(['DCAT_median'], dtype='object')

## Split data to train and test

In [5]:
def data_split(df):
    '''
    args: df - DataFrame; contributor behavioural features based on their activities
    
    return: x_train - DataFrame; features for training the model
            y_train - DataFrame; labels that represent bot/human for trianing the model
            x_test - DataFrame; features for testing the model
            y_test - DataFrame; labels that represent bot/human for testing the model
    
    method: invoke train_test_split to split the data and the labels accordingly
    '''
    
    x_train, x_test, y_train, y_test = train_test_split(df.iloc[:,:-1], 
                                                        df.iloc[:,-1:], 
                                                        test_size=SPLIT, 
                                                        random_state=RAND_SEED,
                                                        stratify=df.iloc[:,-1:])
    
    return(x_train, x_test, y_train, y_test)

In [6]:
x_train, x_test, y_train, y_test = data_split(df_features)

In [7]:
total = df_features.shape[0]
bot = df_features.query('bot == 1').shape[0]
hum = df_features.query('bot == 0').shape[0]

print(f'original data: \n   total number of contributors: {total} \n   bot contributors: {bot} \n   human contributors: {hum}')

tr_total = y_train.shape[0]
tr_bot = y_train.query('bot == 1').shape[0]
tr_hum = y_train.query('bot == 0').shape[0]

print(f'\ntrain data: \n   total number of contributors: {tr_total} \n   bot contributors: {tr_bot} \n   human contributors: {tr_hum}')

print(f'\ntraining proportion: \n   total number of contributors: {tr_total/total} \n   bot contributors: {tr_bot/bot} \n   human contributors: {tr_hum/hum}')

te_total = y_test.shape[0]
te_bot = y_test.query('bot == 1').shape[0]
te_hum = y_test.query('bot == 0').shape[0]

print(f'\ntest data: \n   total number of contributors: {te_total} \n   bot contributors: {te_bot} \n   human contributors: {te_hum}')

original data: 
   total number of contributors: 1335 
   bot contributors: 644 
   human contributors: 691

train data: 
   total number of contributors: 801 
   bot contributors: 386 
   human contributors: 415

training proportion: 
   total number of contributors: 0.6 
   bot contributors: 0.5993788819875776 
   human contributors: 0.6005788712011577

test data: 
   total number of contributors: 534 
   bot contributors: 258 
   human contributors: 276


## Performance functions

In [8]:
def botrecall(y_true, y_pred):   
    return precision_recall_fscore_support(y_true,y_pred,zero_division=0.0)[1][1]
bot_recall = make_scorer(botrecall, greater_is_better=True)

def humanrecall(y_true, y_pred):   
    return precision_recall_fscore_support(y_true,y_pred,zero_division=0.0)[1][0]
human_recall = make_scorer(humanrecall, greater_is_better=True)

def botprecision(y_true, y_pred):   
    return precision_recall_fscore_support(y_true,y_pred,zero_division=0.0)[0][1]
bot_precision = make_scorer(botprecision, greater_is_better=True)

def humanprecision(y_true, y_pred):   
    return precision_recall_fscore_support(y_true,y_pred,zero_division=0.0)[0][0]
human_precision = make_scorer(humanprecision, greater_is_better=True)

def botfscore(y_true, y_pred):   
    return precision_recall_fscore_support(y_true,y_pred,zero_division=0.0)[2][1]
bot_fscore = make_scorer(botfscore, greater_is_better=True)

def humanfscore(y_true, y_pred):
    return precision_recall_fscore_support(y_true,y_pred,zero_division=0.0)[2][0]
human_fscore = make_scorer(humanfscore, greater_is_better=True)

def wpscore(y_true, y_pred):   
    return precision_recall_fscore_support(y_true,y_pred,average='weighted',zero_division=0.0)[0]
wprecision_score = make_scorer(wpscore, greater_is_better=True)

def wrscore(y_true, y_pred):   
    return precision_recall_fscore_support(y_true,y_pred,average='weighted',zero_division=0.0)[1]
wrecall_score = make_scorer(wrscore, greater_is_better=True)

## Model search

### Pipeline construction

In [9]:
def make_nested_imputers(df):
    '''
    args: df - DataFrame; contributor behavioural features
    
    returns: preprocessor - ColumnTransformer that specifies all the indicator, imputers
    
    method: Define all the individual indicator and imputers and give it to the column transformer 
            in the form of list of tuples. Each tuple is of the form 
            ('transformer name', 'transformer', 'column names')  
    '''
    
    #indicator and imputer
    imputers = {'nan_indicator': MissingIndicator(missing_values=np.nan, features='all'),
                'nan_imputer': SimpleImputer(missing_values=np.nan, strategy='median', 
                                             keep_empty_features = True)
               }
    
    #names for indicator and imputers
    imputer_names = {'nan_indicator': 'indicate_nan',
                     'nan_imputer': 'impute_nan'
                    }
    
    #group of columns to be used in indicator and imputers

    nan_col_names = df.columns[df.isna().any()].to_list()
    
    nested_transformers = [(imputer_names[imputer], imputers[imputer], nan_col_names) 
                           for imputer in imputers]
    preprocessor = ColumnTransformer(transformers=nested_transformers,
                                     remainder='passthrough',
                                     verbose_feature_names_out=False)
    return(preprocessor)

In [10]:
def make_custom_pipeline(df, clf):
    '''
    args: df - DataFrame of features
          clf - classifier to be used as an estimator
    
    returns: pipe - Pipeline of transformers to be applied on the data
    
    method: invoke make_nested_imputers for getting all the transformations that has to applied on 
            the data and build the pipeline with nested transformations and classifier
    '''
    
    preprocessor = make_nested_imputers(df)
    pipe = Pipeline([
        ('preprocessor', preprocessor),
        ('clf', clf)
    ])
    
    return(pipe)

### Grid search

In [11]:
def perform_grid_search(training_data_features, training_data_labels, clf, param_grid, clf_method):
    '''
    args: training_data_features - DataFrame; to train/cross-validate the model
          training_data_labels - array; array of labels (bot/human) for each row of features
          clf - method; classifier
          param_grid - dict; classifier's parameters for performing grid search
          clf_method - str; classifier name as a string
    
    invoke: make_custom_pipeline(...)
    
    returns: df_model_performance - DataFrame; performance of the model for the set of 
             parameters in grid search
    
    method: construct the pipeline with the given classifier as estimator
            perform grid search and store all the performance metrics for all combinations in a dataframe
    '''
    pipe = make_custom_pipeline(training_data_features, clf)
    grid_search = GridSearchCV(pipe, param_grid=param_grid,
                               cv=StratifiedShuffleSplit(n_splits = 10, random_state=RAND_SEED),
                               scoring={'bot_f1':bot_fscore,
                                        'human_f1':human_fscore,
                                        'f1_micro':'f1_micro',
                                        'f1_macro':'f1_macro',
                                        'f1_weighted':'f1_weighted',
                                        'bot_precision':bot_precision,
                                        'human_precision':human_precision,
                                        'precision':wprecision_score,
                                        'bot_recall':bot_recall,
                                        'human_recall':human_recall,
                                        'recall':wrecall_score,
                                        'roc_auc':'roc_auc'
                                  },refit=False, n_jobs=-1)
    display(grid_search)
    grid_search.fit(training_data_features,training_data_labels)
    # Create a data frame of resulting classifiers with precision scores
    df_model_performance = (
                         pd.DataFrame()
                         .from_dict(grid_search.cv_results_)
                         .assign(method=clf_method)
                        )
    
    return(df_model_performance)

### Classifiers and parameters

In [12]:
# parameters for ComplementNB 9*2 = 18 models
cnb_param_grid = {'clf__alpha': [0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4],
                  'clf__norm': [True, False]
                 }

# parameters for DecisionTreeClassifier 10*2*1 = 20 models
dtc_param_grid = {'clf__max_depth': [4, 6, 8, 10, 12, 14, 16, 18, 20, None],
                  'clf__criterion': ['gini','entropy'],
                  'clf__class_weight': ['balanced']
                 }

# parameters for RandomForestClassifier 10*9*3*1*3 = 810 models
rfc_param_grid = {'clf__max_depth': [4, 6, 8, 10, 12, 14, 16, 18, 20, None],
                  'clf__n_estimators': [25, 50, 75, 100, 125, 150, 175, 200, 225],
                  'clf__criterion': ['gini','entropy','log_loss'],
                  'clf__class_weight': ['balanced'],
                  'clf__max_features': ['sqrt','log2',None]
                 }

# parameters for GradientBoostingClassifier 2*6*9*2*3 = 648 models
gbc_param_grid = {'clf__loss': ['log_loss','exponential'],
                  'clf__learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2, 0.25],
                  'clf__n_estimators': [25, 50, 75, 100, 125, 150, 175, 200, 225],
                  'clf__criterion': ['friedman_mse','squared_error'],
                  'clf__max_features': ['sqrt','log2',None]
                 }

# parameters for SupportVectorMachineClassifier (SVC) 3*5*5*1*1 = 75 models
svc_param_grid = {'clf__kernel': ['poly','rbf','sigmoid'],
                  'clf__degree': [2, 3, 4, 5, 6],
                  'clf__C': [0.1, 0.5, 1, 1.5, 2], #1.5 not converging
                  'clf__gamma': ['scale'],
                  'clf__class_weight': ['balanced']
                 }

# parameters for LinearDiscriminantAnalysis 1*2 = 2 models
lda_param_grid = {'clf__solver': ['svd','lsqr']}

# parameters for xgboost 9*10*6*1*4 = 2,160 models
xgb_param_grid = {'clf__n_estimators': [25, 50, 75, 100, 125, 150, 175, 200, 225],
                  'clf__max_depth': [4, 6, 8, 10, 12, 14, 16, 18, 20, None],
                  'clf__learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2, 0.25],
                  'clf__booster': ['gbtree'],
                  'clf__eval_metric': ['auc', 'error', 'logloss', 'mae']
                 }

# parameters for DummyClassifier = 1 model
dc_param_grid = {'clf__strategy': ['most_frequent']}

In [13]:
num_model = 18+20+810+648+75+2+2160+1
print(f'total number of models = {num_model}')

total number of models = 3734


In [14]:
'''
Classifiers and their parameters
'''
classifiers = {
               'ComplementNB':ComplementNB(),
               'DecisionTreeClassifier':DecisionTreeClassifier(random_state=RAND_SEED),
               'RandomForestClassifier':RandomForestClassifier(random_state=RAND_SEED),
               'GradientBoostingClassifier':GradientBoostingClassifier(random_state=RAND_SEED),
               'SVC':SVC(random_state=RAND_SEED),
               'DummyClassifier':DummyClassifier(random_state=RAND_SEED),
               'LinearDiscriminantAnalysis':LinearDiscriminantAnalysis(),
               'XGBClassifier':xgb.XGBClassifier(random_state=RAND_SEED)
              }
classifiers_params ={
               'ComplementNB':cnb_param_grid,
               'DecisionTreeClassifier':dtc_param_grid,
               'RandomForestClassifier':rfc_param_grid,
               'GradientBoostingClassifier':gbc_param_grid,
               'SVC':svc_param_grid,
               'DummyClassifier':dc_param_grid,
               'LinearDiscriminantAnalysis':lda_param_grid,
               'XGBClassifier':xgb_param_grid
}

In [15]:
'''
Train the model, perform grid-search 10-fold cross-validation
'''
df_models_performance_all = pd.DataFrame()

for clf in tqdm(classifiers.keys()):
    df_model_performance = perform_grid_search(x_train, 
                                               y_train['bot'].values, 
                                               classifiers[clf], 
                                               classifiers_params[clf],
                                               clf)
    df_models_performance_all = pd.concat([df_models_performance_all, df_model_performance])

  0%|                                                     | 0/8 [00:00<?, ?it/s]

 12%|█████▋                                       | 1/8 [00:04<00:33,  4.86s/it]

 25%|███████████▎                                 | 2/8 [00:06<00:17,  2.96s/it]

 38%|████████████████▌                           | 3/8 [05:21<12:07, 145.60s/it]

 50%|██████████████████████                      | 4/8 [07:54<09:53, 148.40s/it]

 62%|███████████████████████████▌                | 5/8 [11:00<08:05, 161.82s/it]

 75%|█████████████████████████████████           | 6/8 [11:00<03:33, 106.85s/it]

 88%|███████████████████████████████████████▍     | 7/8 [11:00<01:11, 72.00s/it]

100%|████████████████████████████████████████████| 8/8 [17:16<00:00, 129.54s/it]


In [16]:
with option_context('display.max_column',None):
    display(df_models_performance_all)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__alpha,param_clf__norm,params,split0_test_bot_f1,split1_test_bot_f1,split2_test_bot_f1,split3_test_bot_f1,split4_test_bot_f1,split5_test_bot_f1,split6_test_bot_f1,split7_test_bot_f1,split8_test_bot_f1,split9_test_bot_f1,mean_test_bot_f1,std_test_bot_f1,rank_test_bot_f1,split0_test_human_f1,split1_test_human_f1,split2_test_human_f1,split3_test_human_f1,split4_test_human_f1,split5_test_human_f1,split6_test_human_f1,split7_test_human_f1,split8_test_human_f1,split9_test_human_f1,mean_test_human_f1,std_test_human_f1,rank_test_human_f1,split0_test_f1_micro,split1_test_f1_micro,split2_test_f1_micro,split3_test_f1_micro,split4_test_f1_micro,split5_test_f1_micro,split6_test_f1_micro,split7_test_f1_micro,split8_test_f1_micro,split9_test_f1_micro,mean_test_f1_micro,std_test_f1_micro,rank_test_f1_micro,split0_test_f1_macro,split1_test_f1_macro,split2_test_f1_macro,split3_test_f1_macro,split4_test_f1_macro,split5_test_f1_macro,split6_test_f1_macro,split7_test_f1_macro,split8_test_f1_macro,split9_test_f1_macro,mean_test_f1_macro,std_test_f1_macro,rank_test_f1_macro,split0_test_f1_weighted,split1_test_f1_weighted,split2_test_f1_weighted,split3_test_f1_weighted,split4_test_f1_weighted,split5_test_f1_weighted,split6_test_f1_weighted,split7_test_f1_weighted,split8_test_f1_weighted,split9_test_f1_weighted,mean_test_f1_weighted,std_test_f1_weighted,rank_test_f1_weighted,split0_test_bot_precision,split1_test_bot_precision,split2_test_bot_precision,split3_test_bot_precision,split4_test_bot_precision,split5_test_bot_precision,split6_test_bot_precision,split7_test_bot_precision,split8_test_bot_precision,split9_test_bot_precision,mean_test_bot_precision,std_test_bot_precision,rank_test_bot_precision,split0_test_human_precision,split1_test_human_precision,split2_test_human_precision,split3_test_human_precision,split4_test_human_precision,split5_test_human_precision,split6_test_human_precision,split7_test_human_precision,split8_test_human_precision,split9_test_human_precision,mean_test_human_precision,std_test_human_precision,rank_test_human_precision,split0_test_precision,split1_test_precision,split2_test_precision,split3_test_precision,split4_test_precision,split5_test_precision,split6_test_precision,split7_test_precision,split8_test_precision,split9_test_precision,mean_test_precision,std_test_precision,rank_test_precision,split0_test_bot_recall,split1_test_bot_recall,split2_test_bot_recall,split3_test_bot_recall,split4_test_bot_recall,split5_test_bot_recall,split6_test_bot_recall,split7_test_bot_recall,split8_test_bot_recall,split9_test_bot_recall,mean_test_bot_recall,std_test_bot_recall,rank_test_bot_recall,split0_test_human_recall,split1_test_human_recall,split2_test_human_recall,split3_test_human_recall,split4_test_human_recall,split5_test_human_recall,split6_test_human_recall,split7_test_human_recall,split8_test_human_recall,split9_test_human_recall,mean_test_human_recall,std_test_human_recall,rank_test_human_recall,split0_test_recall,split1_test_recall,split2_test_recall,split3_test_recall,split4_test_recall,split5_test_recall,split6_test_recall,split7_test_recall,split8_test_recall,split9_test_recall,mean_test_recall,std_test_recall,rank_test_recall,split0_test_roc_auc,split1_test_roc_auc,split2_test_roc_auc,split3_test_roc_auc,split4_test_roc_auc,split5_test_roc_auc,split6_test_roc_auc,split7_test_roc_auc,split8_test_roc_auc,split9_test_roc_auc,mean_test_roc_auc,std_test_roc_auc,rank_test_roc_auc,method,param_clf__class_weight,param_clf__criterion,param_clf__max_depth,param_clf__max_features,param_clf__n_estimators,param_clf__learning_rate,param_clf__loss,param_clf__C,param_clf__degree,param_clf__gamma,param_clf__kernel,param_clf__strategy,param_clf__solver,param_clf__booster,param_clf__eval_metric
0,0.038681,0.004098,0.100580,0.008730,0.05,True,"{'clf__alpha': 0.05, 'clf__norm': True}",0.760870,0.812500,0.742268,0.791667,0.835165,0.782609,0.813187,0.787234,0.762887,0.747475,0.783586,0.028996,10,0.685714,0.727273,0.615385,0.696970,0.788732,0.714286,0.760563,0.705882,0.646154,0.603175,0.694413,0.056444,10,0.728395,0.777778,0.691358,0.753086,0.814815,0.753086,0.790123,0.753086,0.716049,0.691358,0.746914,0.038747,10,0.723292,0.769886,0.678826,0.744318,0.811949,0.748447,0.786875,0.746558,0.704520,0.675325,0.739000,0.042253,10,0.721900,0.768308,0.676477,0.742565,0.811089,0.747182,0.785901,0.745052,0.702359,0.672652,0.737348,0.042767,10,0.660377,0.684211,0.620690,0.666667,0.730769,0.679245,0.711538,0.672727,0.637931,0.616667,0.668082,0.034763,10,0.857143,1.000,0.869565,0.958333,0.965517,0.892857,0.931034,0.923077,0.913043,0.904762,0.921533,0.041906,10,0.762404,0.847953,0.749736,0.817901,0.852490,0.790007,0.825351,0.802538,0.780582,0.766049,0.799501,0.034046,10,0.897436,1.000000,0.923077,0.974359,0.974359,0.923077,0.948718,0.948718,0.948718,0.948718,0.948718,0.028088,1,0.571429,0.571429,0.476190,0.547619,0.666667,0.595238,0.642857,0.571429,0.500000,0.452381,0.559524,0.064987,10,0.728395,0.777778,0.691358,0.753086,0.814815,0.753086,0.790123,0.753086,0.716049,0.691358,0.746914,0.038747,10,0.849206,0.944444,0.895604,0.896825,0.948718,0.864469,0.915751,0.887057,0.879121,0.924908,0.900611,0.031127,3,ComplementNB,,,,,,,,,,,,,,,
1,0.034671,0.006720,0.093701,0.006202,0.05,False,"{'clf__alpha': 0.05, 'clf__norm': False}",0.813953,0.857143,0.837209,0.835165,0.873563,0.837209,0.870588,0.813953,0.804348,0.813187,0.835632,0.023561,1,0.789474,0.816901,0.815789,0.788732,0.853333,0.815789,0.857143,0.789474,0.742857,0.760563,0.803006,0.034602,1,0.802469,0.839506,0.827160,0.814815,0.864198,0.827160,0.864198,0.802469,0.777778,0.790123,0.820988,0.027743,1,0.801714,0.837022,0.826499,0.811949,0.863448,0.826499,0.863866,0.801714,0.773602,0.786875,0.819319,0.028575,1,0.801260,0.836277,0.826103,0.811089,0.863074,0.826103,0.863617,0.801260,0.772464,0.785901,0.818715,0.028783,1,0.744681,0.750000,0.765957,0.730769,0.791667,0.765957,0.804348,0.744681,0.698113,0.711538,0.750771,0.031291,1,0.882353,1.000,0.911765,0.965517,0.969697,0.911765,0.942857,0.882353,0.928571,0.931034,0.932591,0.036117,1,0.816066,0.879630,0.841561,0.852490,0.883979,0.841561,0.876167,0.816066,0.817610,0.825351,0.845048,0.025652,1,0.897436,1.000000,0.923077,0.974359,0.974359,0.923077,0.948718,0.897436,0.948718,0.948718,0.943590,0.032026,10,0.714286,0.690476,0.738095,0.666667,0.761905,0.738095,0.785714,0.714286,0.619048,0.642857,0.707143,0.050000,1,0.802469,0.839506,0.827160,0.814815,0.864198,0.827160,0.864198,0.802469,0.777778,0.790123,0.820988,0.027743,1,0.854090,0.934066,0.888278,0.884005,0.956044,0.862637,0.924908,0.884005,0.849206,0.902930,0.894017,0.033623,11,ComplementNB,,,,,,,,,,,,,,,
2,0.024667,0.001206,0.082364,0.003400,0.1,True,"{'clf__alpha': 0.1, 'clf__norm': True}",0.760870,0.812500,0.742268,0.791667,0.835165,0.782609,0.813187,0.787234,0.762887,0.747475,0.783586,0.028996,10,0.685714,0.727273,0.615385,0.696970,0.788732,0.714286,0.760563,0.705882,0.646154,0.603175,0.694413,0.056444,10,0.728395,0.777778,0.691358,0.753086,0.814815,0.753086,0.790123,0.753086,0.716049,0.691358,0.746914,0.038747,10,0.723292,0.769886,0.678826,0.744318,0.811949,0.748447,0.786875,0.746558,0.704520,0.675325,0.739000,0.042253,10,0.721900,0.768308,0.676477,0.742565,0.811089,0.747182,0.785901,0.745052,0.702359,0.672652,0.737348,0.042767,10,0.660377,0.684211,0.620690,0.666667,0.730769,0.679245,0.711538,0.672727,0.637931,0.616667,0.668082,0.034763,10,0.857143,1.000,0.869565,0.958333,0.965517,0.892857,0.931034,0.923077,0.913043,0.904762,0.921533,0.041906,10,0.762404,0.847953,0.749736,0.817901,0.852490,0.790007,0.825351,0.802538,0.780582,0.766049,0.799501,0.034046,10,0.897436,1.000000,0.923077,0.974359,0.974359,0.923077,0.948718,0.948718,0.948718,0.948718,0.948718,0.028088,1,0.571429,0.571429,0.476190,0.547619,0.666667,0.595238,0.642857,0.571429,0.500000,0.452381,0.559524,0.064987,10,0.728395,0.777778,0.691358,0.753086,0.814815,0.753086,0.790123,0.753086,0.716049,0.691358,0.746914,0.038747,10,0.849206,0.944444,0.895604,0.896825,0.948718,0.864469,0.915751,0.887057,0.879121,0.924908,0.900611,0.031127,3,ComplementNB,,,,,,,,,,,,,,,
3,0.021476,0.002944,0.092872,0.012598,0.1,False,"{'clf__alpha': 0.1, 'clf__norm': False}",0.813953,0.857143,0.837209,0.835165,0.873563,0.837209,0.870588,0.813953,0.804348,0.813187,0.835632,0.023561,1,0.789474,0.816901,0.815789,0.788732,0.853333,0.815789,0.857143,0.789474,0.742857,0.760563,0.803006,0.034602,1,0.802469,0.839506,0.827160,0.814815,0.864198,0.827160,0.864198,0.802469,0.777778,0.790123,0.820988,0.027743,1,0.801714,0.837022,0.826499,0.811949,0.863448,0.826499,0.863866,0.801714,0.773602,0.786875,0.819319,0.028575,1,0.801260,0.836277,0.826103,0.811089,0.863074,0.826103,0.863617,0.801260,0.772464,0.785901,0.818715,0.028783,1,0.744681,0.750000,0.765957,0.730769,0.791667,0.765957,0.804348,0.744681,0.698113,0.711538,0.750771,0.031291,1,0.882353,1.000,0.911765,0.965517,0.969697,0.911765,0.942857,0.882353,0.928571,0.931034,0.932591,0.036117,1,0.816066,0.879630,0.841561,0.852490,0.883979,0.841561,0.876167,0.816066,0.817610,0.825351,0.845048,0.025652,1,0.897436,1.000000,0.923077,0.974359,0.974359,0.923077,0.948718,0.897436,0.948718,0.948718,0.943590,0.032026,10,0.714286,0.690476,0.738095,0.666667,0.761905,0.738095,0.785714,0.714286,0.619048,0.642857,0.707143,0.050000,1,0.802469,0.839506,0.827160,0.814815,0.864198,0.827160,0.864198,0.802469,0.777778,0.790123,0.820988,0.027743,1,0.854090,0.934066,0.888278,0.884005,0.956044,0.862637,0.924908,0.884005,0.849206,0.902930,0.894017,0.033623,11,ComplementNB,,,,,,,,,,,,,,,
4,0.026919,0.002111,0.094068,0.006754,0.2,True,"{'clf__alpha': 0.2, 'clf__norm': True}",0.760870,0.812500,0.742268,0.791667,0.835165,0.782609,0.813187,0.787234,0.762887,0.747475,0.783586,0.028996,10,0.685714,0.727273,0.615385,0.696970,0.788732,0.714286,0.760563,0.705882,0.646154,0.603175,0.694413,0.056444,10,0.728395,0.777778,0.691358,0.753086,0.814815,0.753086,0.790123,0.753086,0.716049,0.691358,0.746914,0.038747,10,0.723292,0.769886,0.678826,0.744318,0.811949,0.748447,0.786875,0.746558,0.704520,0.675325,0.739000,0.042253,10,0.721900,0.768308,0.676477,0.742565,0.811089,0.747182,0.785901,0.745052,0.702359,0.672652,0.737348,0.042767,10,0.660377,0.684211,0.620690,0.666667,0.730769,0.679245,0.711538,0.672727,0.637931,0.616667,0.668082,0.034763,10,0.857143,1.000,0.869565,0.958333,0.965517,0.892857,0.931034,0.923077,0.913043,0.904762,0.921533,0.041906,10,0.762404,0.847953,0.749736,0.817901,0.852490,0.790007,0.825351,0.802538,0.780582,0.766049,0.799501,0.034046,10,0.897436,1.000000,0.923077,0.974359,0.974359,0.923077,0.948718,0.948718,0.948718,0.948718,0.948718,0.028088,1,0.571429,0.571429,0.476190,0.547619,0.666667,0.595238,0.642857,0.571429,0.500000,0.452381,0.559524,0.064987,10,0.728395,0.777778,0.691358,0.753086,0.814815,0.753086,0.790123,0.753086,0.716049,0.691358,0.746914,0.038747,10,0.849206,0.944444,0.895604,0.896825,0.948718,0.864469,0.915751,0.887057,0.879121,0.924908,0.900611,0.031127,3,ComplementNB,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2155,0.176835,0.002731,0.060600,0.001386,,,"{'clf__booster': 'gbtree', 'clf__eval_metric':...",0.960000,0.900000,0.904110,0.950000,0.935065,0.923077,0.918919,0.935065,0.928571,0.938272,0.929308,0.017771,153,0.965517,0.902439,0.921348,0.951220,0.941176,0.928571,0.931818,0.941176,0.923077,0.938272,0.934462,0.016488,149,0.962963,0.901235,0.913580,0.950617,0.938272,0.925926,0.925926,0.938272,0.925926,0.938272,0.932099,0.016792,101,0.962759,0.901220,0.912729,0.950610,0.938121,0.925824,0.925369,0.938121,0.925824,0.938272,0.931885,0.016869,169,0.962861,0.901265,0.913048,0.950632,0.938234,0.925926,0.925607,0.938234,0.925722,0.938272,0.931980,0.016845,169,1.000000,0.878049,0.970588,0.926829,0.947368,0.923077,0.971429,0.947368,0.866667,0.904762,0.933614,0.040176,329,0.933333,0.925,0.872340,0.975000,0.930233,0.928571,0.891304,0.930233,1.000000,0.974359,0.936037,0.036590,469,0.965432,0.902394,0.919645,0.951807,0.938483,0.925926,0.929883,0.938483,0.935802,0.940849,0.934870,0.016355,121,0.923077,0.923077,0.846154,0.974359,0.923077,0.923077,0.871795,0.923077,1.000000,0.974359,0.928205,0.044114,461,1.000000,0.880952,0.976190,0.928571,0.952381,0.928571,0.976190,0.952381,0.857143,0.904762,0.935714,0.042658,293,0.962963,0.901235,0.913580,0.950617,0.938272,0.925926,0.925926,0.938272,0.925926,0.938272,0.932099,0.016792,101,0.985348,0.963980,0.974359,0.970085,0.981685,0.966422,0.970696,0.963980,0.962759,0.991453,0.973077,0.009460,693,XGBClassifier,,,,,125,0.25,,,,,,,,gbtree,mae
2156,0.203861,0.002969,0.061602,0.001974,,,"{'clf__booster': 'gbtree', 'clf__eval_metric':...",0.960000,0.900000,0.904110,0.950000,0.935065,0.923077,0.918919,0.921053,0.926829,0.938272,0.927732,0.017815,325,0.965517,0.902439,0.921348,0.951220,0.941176,0.928571,0.931818,0.930233,0.925000,0.938272,0.933559,0.016258,217,0.962963,0.901235,0.913580,0.950617,0.938272,0.925926,0.925926,0.925926,0.925926,0.938272,0.930864,0.016746,209,0.962759,0.901220,0.912729,0.950610,0.938121,0.925824,0.925369,0.925643,0.925915,0.938272,0.930646,0.016820,313,0.962861,0.901265,0.913048,0.950632,0.938234,0.925926,0.925607,0.925813,0.925881,0.938272,0.930754,0.016791,305,1.000000,0.878049,0.970588,0.926829,0.947368,0.923077,0.971429,0.945946,0.883721,0.904762,0.935177,0.037533,181,0.933333,0.925,0.872340,0.975000,0.930233,0.928571,0.891304,0.909091,0.973684,0.974359,0.931292,0.033439,1169,0.965432,0.902394,0.919645,0.951807,0.938483,0.925926,0.929883,0.926836,0.930369,0.940849,0.933162,0.016482,285,0.923077,0.923077,0.846154,0.974359,0.923077,0.923077,0.871795,0.897436,0.974359,0.974359,0.923077,0.041345,1105,1.000000,0.880952,0.976190,0.928571,0.952381,0.928571,0.976190,0.952381,0.880952,0.904762,0.938095,0.038686,61,0.962963,0.901235,0.913580,0.950617,0.938272,0.925926,0.925926,0.925926,0.925926,0.938272,0.930864,0.016746,209,0.985348,0.963980,0.974359,0.969475,0.980464,0.966422,0.971306,0.965201,0.960928,0.991453,0.972894,0.009470,821,XGBClassifier,,,,,150,0.25,,,,,,,,gbtree,mae
2157,0.223124,0.005406,0.058167,0.003805,,,"{'clf__booster': 'gbtree', 'clf__eval_metric':...",0.960000,0.900000,0.904110,0.950000,0.935065,0.923077,0.918919,0.921053,0.926829,0.938272,0.927732,0.017815,325,0.965517,0.902439,0.921348,0.951220,0.941176,0.928571,0.931818,0.930233,0.925000,0.938272,0.933559,0.016258,217,0.962963,0.901235,0.913580,0.950617,0.938272,0.925926,0.925926,0.925926,0.925926,0.938272,0.930864,0.016746,209,0.962759,0.901220,0.912729,0.950610,0.938121,0.925824,0.925369,0.925643,0.925915,0.938272,0.930646,0.016820,313,0.962861,0.901265,0.913048,0.950632,0.938234,0.925926,0.925607,0.925813,0.925881,0.938272,0.930754,0.016791,305,1.000000,0.878049,0.970588,0.926829,0.947368,0.923077,0.971429,0.945946,0.883721,0.904762,0.935177,0.037533,181,0.933333,0.925,0.872340,0.975000,0.930233,0.928571,0.891304,0.909091,0.973684,0.974359,0.931292,0.033439,1169,0.965432,0.902394,0.919645,0.951807,0.938483,0.925926,0.929883,0.926836,0.930369,0.940849,0.933162,0.016482,285,0.923077,0.923077,0.846154,0.974359,0.923077,0.923077,0.871795,0.897436,0.974359,0.974359,0.923077,0.041345,1105,1.000000,0.880952,0.976190,0.928571,0.952381,0.928571,0.976190,0.952381,0.880952,0.904762,0.938095,0.038686,61,0.962963,0.901235,0.913580,0.950617,0.938272,0.925926,0.925926,0.925926,0.925926,0.938272,0.930864,0.016746,209,0.985958,0.963980,0.974359,0.969475,0.979853,0.962149,0.971917,0.965812,0.962759,0.991453,0.972772,0.009623,925,XGBClassifier,,,,,175,0.25,,,,,,,,gbtree,mae
2158,0.224553,0.028130,0.048057,0.008680,,,"{'clf__booster': 'gbtree', 'clf__eval_metric':...",0.960000,0.900000,0.904110,0.950000,0.935065,0.923077,0.918919,0.921053,0.926829,0.938272,0.927732,0.017815,325,0.965517,0.902439,0.921348,0.951220,0.941176,0.928571,0.931818,0.930233,0.925000,0.938272,0.933559,0.016258,217,0.962963,0.901235,0.913580,0.950617,0.938272,0.925926,0.925926,0.925926,0.925926,0.938272,0.930864,0.016746,209,0.962759,0.901220,0.912729,0.950610,0.938121,0.925824,0.925369,0.925643,0.925915,0.938272,0.930646,0.016820,313,0.962861,0.901265,0.913048,0.950632,0.938234,0.925926,0.925607,0.925813,0.925881,0.938272,0.930754,0.016791,305,1.000000,0.878049,0.970588,0.926829,0.947368,0.923077,0.971429,0.945946,0.883721,0.904762,0.935177,0.037533,181,0.933333,0.925,0.872340,0.975000,0.930233,0.928571,0.891304,0.909091,0.973684,0.974359,0.931292,0.033439,1169,0.965432,0.902394,0.919645,0.951807,0.938483,0.925926,0.929883,0.926836,0.930369,0.940849,0.933162,0.016482,285,0.923077,0.923077,0.846154,0.974359,0.923077,0.923077,0.871795,0.897436,0.974359,0.974359,0.923077,0.041345,1105,1.000000,0.880952,0.976190,0.928571,0.952381,0.928571,0.976190,0.952381,0.880952,0.904762,0.938095,0.038686,61,0.962963,0.901235,0.913580,0.950617,0.938272,0.925926,0.925926,0.925926,0.925926,0.938272,0.930864,0.016746,209,0.984737,0.965201,0.976801,0.968254,0.979853,0.961538,0.971917,0.963980,0.963980,0.990842,0.972711,0.009462,961,XGBClassifier,,,,,200,0.25,,,,,,,,gbtree,mae


### Best models of each classifier

In [17]:
best_models_f1_weighted = (
    df_models_performance_all
    .sort_values('mean_test_f1_weighted',ascending=False)
    .round(3)
    .drop_duplicates('method')
    [['method', 'params', 'mean_test_f1_weighted', 'mean_test_bot_f1', 'mean_test_human_f1', 
      'mean_test_f1_micro', 'mean_test_bot_recall', 'mean_test_human_recall', 'mean_test_recall', 
      'mean_test_bot_precision', 'mean_test_human_precision', 'mean_test_precision', 
      'mean_test_f1_macro','mean_test_roc_auc']]
    .reset_index(drop=True)
)
with option_context('display.max_column',None,'display.max_colwidth',None):
    display(best_models_f1_weighted)

Unnamed: 0,method,params,mean_test_f1_weighted,mean_test_bot_f1,mean_test_human_f1,mean_test_f1_micro,mean_test_bot_recall,mean_test_human_recall,mean_test_recall,mean_test_bot_precision,mean_test_human_precision,mean_test_precision,mean_test_f1_macro,mean_test_roc_auc
0,XGBClassifier,"{'clf__booster': 'gbtree', 'clf__eval_metric': 'error', 'clf__learning_rate': 0.25, 'clf__max_depth': 4, 'clf__n_estimators': 75}",0.937,0.934,0.939,0.937,0.933,0.94,0.937,0.939,0.941,0.94,0.937,0.974
1,GradientBoostingClassifier,"{'clf__criterion': 'squared_error', 'clf__learning_rate': 0.01, 'clf__loss': 'exponential', 'clf__max_features': 'log2', 'clf__n_estimators': 75}",0.936,0.933,0.938,0.936,0.933,0.938,0.936,0.934,0.939,0.937,0.936,0.978
2,RandomForestClassifier,"{'clf__class_weight': 'balanced', 'clf__criterion': 'entropy', 'clf__max_depth': 14, 'clf__max_features': 'sqrt', 'clf__n_estimators': 200}",0.934,0.932,0.936,0.935,0.938,0.931,0.935,0.929,0.944,0.937,0.934,0.976
3,DecisionTreeClassifier,"{'clf__class_weight': 'balanced', 'clf__criterion': 'gini', 'clf__max_depth': 6}",0.912,0.908,0.916,0.912,0.905,0.919,0.912,0.915,0.915,0.915,0.912,0.893
4,SVC,"{'clf__C': 2, 'clf__class_weight': 'balanced', 'clf__degree': 2, 'clf__gamma': 'scale', 'clf__kernel': 'rbf'}",0.882,0.871,0.892,0.883,0.833,0.929,0.883,0.917,0.86,0.888,0.882,0.958
5,LinearDiscriminantAnalysis,{'clf__solver': 'svd'},0.829,0.83,0.829,0.83,0.864,0.798,0.83,0.799,0.865,0.833,0.829,0.937
6,ComplementNB,"{'clf__alpha': 0.1, 'clf__norm': False}",0.819,0.836,0.803,0.821,0.944,0.707,0.821,0.751,0.933,0.845,0.819,0.894
7,DummyClassifier,{'clf__strategy': 'most_frequent'},0.354,0.0,0.683,0.519,0.0,1.0,0.519,0.0,0.519,0.269,0.341,0.5


In [18]:
model = xgb.XGBClassifier(booster='gbtree',eval_metric='error',
                          learning_rate=0.25,max_depth=4,n_estimators=75)
pipe = make_custom_pipeline(x_train, model)
pipe.fit(x_train,y_train['bot'].values)

In [19]:
def testing_model(x_test, y_test, model):
    '''
    args: x_test - DataFrame; contributor behavioural features
          y_test - array; contributor type
          model - method; the trained model

    returns: performance - dict; performance metric vlues
             y_pred - array; prediction values given by the model

    '''
    y_pred = model.predict(x_test)
    y_true = y_test
    performance = {'precision': [precision_score(y_test, y_pred, average='weighted', zero_division=0.0)],
                   'bot_precision': [botprecision(y_test, y_pred)],
                   'human_precision': [humanprecision(y_test, y_pred)],
                   'recall': [recall_score(y_test, y_pred, average='weighted', zero_division=0.0)],
                   'bot_recall': [botrecall(y_test, y_pred)],
                   'human_recall': [humanrecall(y_test, y_pred)],
                   'weighted_f1': [f1_score(y_test, y_pred, average='weighted', zero_division=0.0)],
                   'accuracy': [accuracy_score(y_test, y_pred)],
                   'tn_fp_fn_tp': [confusion_matrix(y_test, y_pred).ravel()],
                   'num_bot': y_test.query('bot == 1').shape[0],
                   'num_hum': y_test.query('bot == 0').shape[0],
                   'num_contrib': x_test.index.nunique(),
                   'roc_auc': [roc_auc_score(y_test, y_pred)]
                  }
    
    return(performance, y_pred)

In [20]:
performance, y_pred = testing_model(x_test,y_test,model=pipe)
performance_df = pd.DataFrame.from_dict(performance)
performance_df

Unnamed: 0,precision,bot_precision,human_precision,recall,bot_recall,human_recall,weighted_f1,accuracy,tn_fp_fn_tp,num_bot,num_hum,num_contrib,roc_auc
0,0.925136,0.919231,0.930657,0.925094,0.926357,0.923913,0.925102,0.925094,"[255, 21, 19, 239]",258,276,534,0.925135


In [21]:
test_preds = pd.DataFrame(y_test).assign(pred = y_pred)

In [22]:
TP = test_preds.query('bot == 1 and pred == 1')
TN = test_preds.query('bot == 0 and pred == 0')
FN = test_preds.query('bot == 1 and pred == 0')
FP = test_preds.query('bot == 0 and pred == 1')

In [23]:
FN[['pred']].merge(df_features, left_index=True, right_index=True)

Unnamed: 0,pred,NAT_mean,feat_NT,DCAT_median,feat_NOR,DCA_gini,NAR_mean,bot
gaugebot,0,5.8,5,0.009,3,0.89,2.9,1
jirarobot,0,11.125,8,0.016,2,0.977,22.25,1
jersey-bot,0,2.667,6,0.424,1,0.814,8.0,1
atyponci,0,16.0,2,6.981,1,0.766,3.556,1
zx2c4-bot,0,6.8,5,0.141,2,0.851,11.333,1
element-bot,0,5.375,8,0.02,1,0.911,43.0,1
opencv-pushbot,0,10.0,5,0.019,3,0.863,12.5,1
nemobot,0,5.0,1,,1,0.399,5.0,1
jetstack-bot,0,18.5,8,0.088,2,0.838,14.8,1
phpmyadmin-bot,0,13.75,4,31.699,12,0.688,3.235,1


## Model confidence

In [24]:
def testing_model_proba(x_test, y_test, model):
    '''
    args: x_test - DataFrame; Contributor behavioural features
          y_test - array; contributor type
          model - method; trained model

    returns: y_pred_proba - array; predictions made by the model
    '''
    y_pred_proba = model.predict_proba(x_test)
    
    return(y_pred_proba)

## Data split and test the model with test set

In [25]:
y_pred_proba = testing_model_proba(x_test, y_test, model=pipe)
y_pred_proba_df = pd.DataFrame(y_pred_proba, columns=['human_proba','bot_proba'])

In [26]:
# computing confidence from proabability
threshold = 0.5
y_bin_proba = (
    y_pred_proba_df
    .merge(y_test.reset_index(),left_index=True, right_index=True, how='inner')
    .assign(pred = lambda d: np.where(d.bot_proba>threshold, 'bot', 'human'))
    .rename(columns={'index':'contributor','bot_proba':'probability','bot':'type'})
    .assign(confidence=lambda d:
        np.where(d.pred == 'human', abs((0.5 - d.probability)*2), abs((d.probability - 0.5) * 2)))
    .assign(type = lambda d: np.where(d.type==1, 'bot', 'human'))
    [['probability','contributor','type','pred','confidence']]
)
y_bin_proba.query('type == "bot"').head(5)

Unnamed: 0,probability,contributor,type,pred,confidence
1,0.97146,guidesbot,bot,bot,0.94292
2,0.998861,azusabot,bot,bot,0.997721
3,0.068695,gaugebot,bot,human,0.86261
5,0.975754,azclibot,bot,bot,0.951508
6,0.965927,actions-bot,bot,bot,0.931853


## Model for the tool

In [27]:
x_train = x_train.rename(columns = {'feat_NT':'NT', 'feat_NOR':'NOR'})

In [28]:
best_model = xgb.XGBClassifier(booster='gbtree',eval_metric='error',
                          learning_rate=0.25,max_depth=4,n_estimators=75)
bot_identification_model = best_model.fit(x_train, y_train['bot'].values)
bot_identification_model.save_model('rabbit_model.json')