<a href="https://colab.research.google.com/github/nguyennhutlam/sockpuppet_final_19_07_2020/blob/master/1_All_model_Ensemble_Feature5_extratree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Library

Train all models with feature selection.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
# Suppress every Python warning for the rest of the session.
import warnings
warnings.filterwarnings('ignore')
# ML library
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
# Single module-level scaler shared by the helper functions and cells below;
# each of them calls scaler.fit_transform(...), which re-fits it every time.
scaler = StandardScaler()
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import  VotingClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import  KFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.utils import shuffle

import statistics

from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
import random
from numpy.random import seed
# Evaluation metrics, as a name -> scorer-string mapping.
# NOTE(review): not referenced by any code visible in this notebook excerpt;
# presumably intended for cross_validate(scoring=...) - confirm before removing.
scoring = {'accuracy': 'accuracy', 'recall': 'recall', 'precision': 'precision', 'f1':'f1', 'roc_auc': 'roc_auc'}

  import pandas.util.testing as tm


# Functions

In [2]:
# Randomly undersample the majority class.
# Parameters:
#   data:  DataFrame with a 'class' column (0 = sockpuppet, 1 = legitimate user)
#   ratio: number of majority-class rows to keep per minority-class row
def undersampling(data, ratio):
  """Return every class-0 row plus a random sample of class-1 rows.

  The sample size is ``int(n_class_0 * ratio)``. Rows are drawn with NumPy's
  global RNG (``np.random.choice``), so call ``np.random.seed`` beforehand
  for reproducible output.

  Sampling is done WITHOUT replacement whenever the majority class has
  enough rows, so no legitimate-user row is duplicated (a duplicated row
  could land in both the train and test fold of a later cross-validation).
  Only when more rows are requested than exist does it fall back to
  sampling with replacement.

  Returns a DataFrame that keeps the original index labels, minority rows
  first, followed by the sampled majority rows.
  """
  B1_class_indices = data[data['class'] == 0].index  # minority class (B1: sockpuppet accounts)
  legitimate_indices = data[data['class'] == 1].index
  # np.random.choice requires an integer size; ratio may be a float.
  n_legitimate = int(len(B1_class_indices) * ratio)
  need_replacement = n_legitimate > len(legitimate_indices)
  random_legitimate_indices = np.random.choice(
      legitimate_indices, n_legitimate, replace=need_replacement)
  under_sample_indices = np.concatenate([B1_class_indices, random_legitimate_indices])
  return data.loc[under_sample_indices]

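Train a model using cross-validation \\
model: ML classifier \\
label: short classifier name, used as the prefix of the output column names \\
fold: number of folds for cross-validation \\
X, y: features and labels of the training set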

In [3]:
def train_cross_val(model,label, fold, X, y):
    """Evaluate ``model`` with shuffled K-fold cross-validation.

    Parameters
    ----------
    model : classifier exposing ``fit``, ``predict`` and ``predict_proba``.
    label : str, prefix for the output column names
        (``<label>_y_pred`` and ``<label>_y_prob_0``).
    fold : int, number of folds passed to ``KFold``.
    X, y : features and target. Both are converted to NumPy arrays so that
        rows are always selected by position, whatever index a pandas input
        carries.

    Returns
    -------
    (acc, pre, rec, f1s, auc, cf_matrix, df_pred, df_pred_prob)
        * acc, pre, rec, f1s, auc: per-fold scores averaged over the folds
          and rounded to 3 decimals. Precision, recall and F1 use the
          scikit-learn defaults (``pos_label=1``); AUC is computed from the
          second ``predict_proba`` column.
        * cf_matrix: 2x2 confusion matrix summed over all folds.
        * df_pred: out-of-fold predictions, indexed by row position.
        * df_pred_prob: out-of-fold first ``predict_proba`` column, indexed
          by row position.
    """
    # Positional indexing below needs arrays: y[train] on a Series is a
    # label lookup, and X[train] on a DataFrame is a column lookup.
    X = np.asarray(X)
    y = np.asarray(y)
    # Fix the label set once so every fold's confusion matrix has the same
    # shape even if a test fold happens to miss one class in y/pred.
    class_labels = np.unique(y)

    cv = KFold(n_splits=fold, shuffle=True, random_state=1)
    decimal = 3
    acc = []
    pre = []
    rec = []
    f1s = []
    auc = []
    pred_frames = []  # one frame of predictions per fold
    prob_frames = []  # one frame of predicted probabilities per fold
    cf_matrix = np.zeros((2,2))
    for train, test in cv.split(X, y):
        model.fit(X[train], y[train])

        pred = model.predict(X[test])
        pred_pro = model.predict_proba(X[test])

        # Keep the test positions as index so folds can be re-assembled and
        # aligned with the original rows by the caller.
        pred_frames.append(pd.DataFrame({label + '_y_pred': pred}, index=test))
        prob_frames.append(pd.DataFrame({label + '_y_prob_0': pred_pro[:, 0]}, index=test))

        acc.append(accuracy_score(y[test], pred))
        pre.append(precision_score(y[test], pred))
        rec.append(recall_score(y[test], pred))
        f1s.append(f1_score(y[test], pred))
        auc.append(roc_auc_score(y[test], pred_pro[:, 1]))
        cf_matrix += confusion_matrix(y[test], pred, labels=class_labels)

    # Concatenate once instead of growing a DataFrame inside the loop.
    df_pred = pd.concat(pred_frames)
    df_pred_prob = pd.concat(prob_frames)

    acc = np.round(np.mean(acc), decimal)
    pre = np.round(np.mean(pre), decimal)
    rec = np.round(np.mean(rec), decimal)
    f1s = np.round(np.mean(f1s), decimal)
    auc = np.round(np.mean(auc), decimal)
    return (acc, pre, rec, f1s, auc, cf_matrix , df_pred, df_pred_prob)

Train individual model using cross validation for all datasets (10)

In [4]:
def train_each_model_all_dataset(clf, lbl):
  """Cross-validate ``clf`` on each of the 10 datasets in the global ``dataset``.

  Relies on the module-level ``dataset`` dict (keys '0'..'9'), ``scaler``
  and ``fold``. Features are all columns but the last; the last column is
  the target. Features are scaled with ``scaler.fit_transform`` before the
  cross-validation.

  Parameters
  ----------
  clf : classifier passed on to ``train_cross_val``.
  lbl : str, classifier name stored in the result table and used as the
      prediction column prefix.

  Returns
  -------
  (result, mean, std, cf_matrix, pred, prob)
      * result: one row of metrics per dataset.
      * mean, std: per-metric mean / standard deviation over the datasets,
        rounded to 3 decimals.
      * cf_matrix, pred, prob: outputs of ``train_cross_val`` for the LAST
        dataset only (earlier ones are overwritten in the loop).
  """
  cols = ['classifier','accuracy','precision','recall','f1_score', 'roc_auc']
  rows = []
  for i in range(10):
    df = dataset[str(i)].reset_index(drop=True)  # positions 0..n-1 as index
    X = scaler.fit_transform(df.iloc[:, :-1])
    y = df.iloc[:, -1]
    (acc, pre, rec, f1s, auc, cf_matrix , pred, prob) = train_cross_val(clf, lbl , fold, X, y)
    rows.append([lbl, acc, pre, rec, f1s, auc])  # evaluation metrics for dataset i
  result = pd.DataFrame(rows, columns=cols)
  # Aggregate the metric columns only: mean()/std() over a frame that still
  # contains the string 'classifier' column raises TypeError on pandas >= 2.0.
  metrics = result[cols[1:]].astype(float)
  return (result, metrics.mean().round(3), metrics.std().round(3), cf_matrix, pred, prob)

Train individual model using cross validation for one dataset

In [5]:
def train_a_model_one_dataset(dataset, clf, lbl=None):
  """Cross-validate ``clf`` on a single dataset.

  Relies on the module-level ``scaler`` and ``fold``. Features are all
  columns but the last; the last column is the target.

  Parameters
  ----------
  dataset : DataFrame holding the features followed by the target column.
  clf : classifier passed on to ``train_cross_val``.
  lbl : str, optional. Prefix for the prediction / probability column names.
      Defaults to the classifier's class name. (Previously the function read
      an undefined name ``lbl`` and failed with NameError.)

  Returns
  -------
  The tuple returned by ``train_cross_val``:
  (acc, pre, rec, f1s, auc, cf_matrix, pred, prob).
  """
  if lbl is None:
    lbl = type(clf).__name__
  # Resetting the index matters: the cross-validation output is indexed by
  # row position, so positions and index labels must coincide.
  df = dataset.reset_index(drop=True)
  X = scaler.fit_transform(df.iloc[:, :-1])
  y = df.iloc[:, -1]
  return train_cross_val(clf, lbl, fold, X, y)

Train ensemble model using soft-voting

In [6]:
def ensemble_model_soft_voting(y_label, result_pred_prob ): # df = result_pred_prob
  """Soft-vote over per-classifier class-0 probabilities and score the result.

  ``result_pred_prob`` holds one column of class-0 probabilities per
  classifier. The columns are averaged row-wise; a row is predicted as
  class 0 when the average is >= 0.5 and as class 1 otherwise. ``y_label``
  (the true classes) is joined on the index, so both inputs must share it.

  Returns (accuracy, precision, recall, f1, roc_auc); the AUC is computed
  from ``1 - mean`` as the score.
  """
  votes = result_pred_prob.copy()
  votes['mean'] = votes.mean(axis = 1)
  leans_class_0 = votes['mean'] >= 0.5
  leans_class_1 = votes['mean'] < 0.5
  votes.loc[leans_class_0, 'predict_class'] = 0
  votes.loc[leans_class_1, 'predict_class'] = 1

  # After the join the last three columns are: mean, predict_class, truth.
  votes = pd.concat([votes, pd.DataFrame(y_label)], axis=1)
  y_true = votes.iloc[:, -1].tolist()
  y_pred = votes.iloc[:, -2].tolist()
  y_score = (1 - votes.iloc[:, -3]).tolist()

  return (
      accuracy_score(y_true, y_pred),
      precision_score(y_true, y_pred),
      recall_score(y_true, y_pred),
      f1_score(y_true, y_pred),
      roc_auc_score(y_true, y_score),
  )

# Preparing Datasets

Read full dataset

In [7]:
# Load the full feature table and encode the target as the last column.
file ="https://raw.githubusercontent.com/nguyennhutlam/sockpuppet_dataset/master/Feature_29_final.csv"
data = pd.read_csv(file)
# Encode the target: Sockpuppet -> 0, Legitimate_user -> 1.
# Assign the result explicitly instead of data.type.replace(..., inplace=True):
# an in-place call on a column accessor is a chained update, which does not
# modify `data` when pandas Copy-on-Write is active. astype(int) fails loudly
# if 'type' contains a value other than the two expected ones.
data['class'] = data['type'].map({'Sockpuppet': 0, 'Legitimate_user': 1}).astype(int)
data = data.drop(columns=['id', 'type']) #remove id and type columns
#sns.countplot(x='class', data=data)
# number of instances in each class
data['class'].value_counts()

1    106230
0      1668
Name: class, dtype: int64

In [8]:
# Columns kept for training: five features followed by the target.
# (Presumably the five highest-ranked features from an ExtraTrees selection,
# judging by the variable name - the ranking itself is not part of this notebook.)
top5_extratrees = [
    'median_push_interval',
    'reponse_time_mean',
    'hubs',
    'received_rating_ratio',
    'received_rating_push',
    'class',
]

# Restrict the table to those columns; 'class' stays last because the
# training helpers treat the last column as the target.
data = data.loc[:, top5_extratrees]
data

Unnamed: 0,median_push_interval,reponse_time_mean,hubs,received_rating_ratio,received_rating_push,class
0,0.0,122391.0,0.001879,1.000000,46,1
1,0.0,122820.0,0.001879,1.000000,46,1
2,0.0,122212.0,0.001879,1.000000,46,1
3,0.0,122654.0,0.001879,1.000000,46,1
4,1231.5,37.5,0.094703,0.615894,122,1
...,...,...,...,...,...,...
107893,3742.0,100.0,0.021609,0.357143,19,1
107894,75574.0,156.0,0.002991,0.600000,4,1
107895,34.0,28.0,0.055788,0.560440,71,1
107896,6207.0,4.0,0.003495,1.000000,6,1


Generate 10 datasets using undersampling for training, and store them in the dictionary
*dataset*

In [9]:
import random
# Build 10 undersampled datasets, keyed '0'..'9'.
dataset = {} # dictionary of dataframe
for i in range(10):
    # undersampling() draws rows with np.random.choice, i.e. from NumPy's
    # global RNG. random.seed() only seeds Python's `random` module, so
    # NumPy must be seeded as well for the datasets to be reproducible.
    random.seed(i)
    np.random.seed(i)
    dataset[str(i)] = undersampling(data,1)

# Training

Define the list of models

In [10]:
# Base classifiers as (label, estimator) pairs. The label becomes the prefix
# of each classifier's prediction / probability column names.
_random_forest_params = dict(criterion='entropy', max_features=3, n_estimators=200)
model = [
    ('NB', GaussianNB(var_smoothing=1e-05)),
    ('kNN', KNeighborsClassifier(n_neighbors=15, weights='distance', algorithm='ball_tree')),
    # probability=True is required so SVC exposes predict_proba.
    ('SVM', SVC(probability=True, C=100, kernel='rbf')),
    ('RF', RandomForestClassifier(**_random_forest_params)),
    # AdaBoost boosting a full random forest as its base estimator.
    ('Ada', AdaBoostClassifier(RandomForestClassifier(**_random_forest_params),
                               learning_rate=0.3, n_estimators=100)),
    ('XGB', XGBClassifier(colsample_bytree=0.7, gamma=1.5, max_depth=10, min_child_weight=5)),
]

# Number of cross-validation folds used by the training helpers.
fold = 10

Result for each model for all datasets

0 Naive Bayes

In [None]:
# Evaluate every base classifier and three ensembles on each of the 10
# undersampled datasets. One row per (dataset, classifier) is appended to
# `result`.
cols = ['i','classifier','accuracy','precision','recall','f1_score', 'roc_auc']
result = pd.DataFrame(columns=cols)

# Probability columns used by the two-classifier soft vote. Defined once
# here because it does not change between datasets.
en_cols = [
           'Ada_y_prob_0',
           'XGB_y_prob_0'
           ]

seed(1)
for i in range(10): 
  print(i) 
  df = dataset[str(i)].copy()
  df = df.reset_index(drop=True) # reset dataframe index *Importance*
  X = df.iloc[:, :-1]
  y = df.iloc[:, -1]
  # NOTE(review): the scaler is fitted on the whole dataset before the
  # cross-validation split, so test-fold statistics influence the scaling.
  X = scaler.fit_transform(X)
  pred_all = pd.DataFrame()
  prob_all = pd.DataFrame()

  # Base classifiers: collect metrics plus out-of-fold predictions and
  # class-0 probabilities, one column per classifier.
  for label, clf in model:
    (acc, pre, rec, f1s, auc, cf_matrix , pred, prob) = train_cross_val(clf, label ,fold, X, y)
    result.loc[len(result)] = [i, label, acc,pre, rec, f1s, auc] #add evaluate metrics
    pred_all = pd.concat([pred_all, pred], axis=1)
    prob_all = pd.concat([prob_all, prob], axis=1)
    print(label)

  # Stacked ensemble: an MLP trained (again with cross-validation) on the
  # base classifiers' out-of-fold class-0 probabilities.
  x1 = prob_all
  data_ensemble = pd.concat([x1, y], axis=1)
  df1 = data_ensemble.copy()
  X1 = df1.iloc[:, :-1]
  y1 = df1.iloc[:, -1]
  X1 = scaler.fit_transform(X1)
  mlp = MLPClassifier(alpha=1e-05, hidden_layer_sizes=(6,6,6), random_state=1,
                )
  (acc1, pre1, rec1, f1s1, auc1, cf_matrix1 , pred1, prob1) = train_cross_val(mlp, 'MLP' , fold, X1, y1)
  result.loc[len(result)] = [i, 'Ensemble', acc1,pre1, rec1, f1s1, auc1]

  # Soft voting over all base classifiers
  (acc, pre, rec, f1s, auc) = ensemble_model_soft_voting(y1, prob_all)
  result.loc[len(result)] = [i, 'Voting', acc,pre, rec, f1s, auc]

  # Soft voting over AdaBoost and XGBoost only
  (acc, pre, rec, f1s, auc) = ensemble_model_soft_voting(y1, prob_all[en_cols])
  result.loc[len(result)] = [i, 'Voting_Adaboost_XGboost', acc,pre, rec, f1s, auc]

print(result.groupby(['classifier']).mean().round(3))

In [12]:
# Per-classifier mean of each metric over the rows collected in `result`, rounded to 3 decimals.
print(result.groupby(['classifier']).mean().round(3))

                         accuracy  precision  recall  f1_score  roc_auc
classifier                                                             
Ada                         0.767      0.762   0.780     0.770    0.848
Ensemble                    0.774      0.763   0.795     0.778    0.852
NB                          0.550      0.765   0.230     0.286    0.680
RF                          0.767      0.761   0.779     0.769    0.848
SVM                         0.732      0.744   0.712     0.726    0.808
Voting                      0.764      0.791   0.719     0.753    0.847
Voting_Adaboost_XGboost     0.771      0.764   0.785     0.774    0.850
XGB                         0.766      0.760   0.779     0.769    0.846
kNN                         0.700      0.706   0.685     0.695    0.772


In [14]:
# Same per-classifier means as a DataFrame, left as the cell's last expression so the notebook renders it as a table.
(result.groupby(['classifier']).mean().round(3))

Unnamed: 0_level_0,accuracy,precision,recall,f1_score,roc_auc
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Ada,0.767,0.762,0.78,0.77,0.848
Ensemble,0.774,0.763,0.795,0.778,0.852
NB,0.55,0.765,0.23,0.286,0.68
RF,0.767,0.761,0.779,0.769,0.848
SVM,0.732,0.744,0.712,0.726,0.808
Voting,0.764,0.791,0.719,0.753,0.847
Voting_Adaboost_XGboost,0.771,0.764,0.785,0.774,0.85
XGB,0.766,0.76,0.779,0.769,0.846
kNN,0.7,0.706,0.685,0.695,0.772


In [13]:
# Per-classifier standard deviation of each metric over the rows collected in `result`, rounded to 3 decimals.
print(result.groupby(['classifier']).std().round(3))

                         accuracy  precision  recall  f1_score  roc_auc
classifier                                                             
Ada                         0.005      0.004   0.009     0.005    0.005
Ensemble                    0.005      0.007   0.016     0.007    0.004
NB                          0.007      0.085   0.230     0.129    0.008
RF                          0.003      0.002   0.007     0.004    0.004
SVM                         0.006      0.015   0.033     0.012    0.005
Voting                      0.005      0.012   0.029     0.011    0.003
Voting_Adaboost_XGboost     0.005      0.005   0.008     0.006    0.004
XGB                         0.004      0.004   0.007     0.005    0.004
kNN                         0.007      0.008   0.012     0.009    0.007


In [15]:
# Same per-classifier standard deviations as a DataFrame, left as the cell's last expression so the notebook renders it as a table.
(result.groupby(['classifier']).std().round(3))

Unnamed: 0_level_0,accuracy,precision,recall,f1_score,roc_auc
classifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Ada,0.005,0.004,0.009,0.005,0.005
Ensemble,0.005,0.007,0.016,0.007,0.004
NB,0.007,0.085,0.23,0.129,0.008
RF,0.003,0.002,0.007,0.004,0.004
SVM,0.006,0.015,0.033,0.012,0.005
Voting,0.005,0.012,0.029,0.011,0.003
Voting_Adaboost_XGboost,0.005,0.005,0.008,0.006,0.004
XGB,0.004,0.004,0.007,0.005,0.004
kNN,0.007,0.008,0.012,0.009,0.007
