<a href="https://colab.research.google.com/github/nguyennhutlam/sockpuppet_final_19_07_2020/blob/master/2020_07_19_Model_Training_Using_Hyperparameter_Turning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Library

In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
# ignore warning
import warnings
warnings.filterwarnings('ignore')
# ML library
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import  VotingClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import  KFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.utils import shuffle

import statistics

from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
import random
from numpy.random import seed
# evaluation metrics
scoring = {'accuracy': 'accuracy', 'recall': 'recall', 'precision': 'precision', 'f1':'f1', 'roc_auc': 'roc_auc'}

# Functions

In [2]:
# This function is used for undersampling
# paramters: 
#   data: dataset
#   ration: ratio between minority class and mojority class       
def undersampling(data, ratio):
  B1_class_len = len(data[data['class'] == 0]) #lenght of the minority class (B1: sockpuppet accounts) #163
  legitimate_indices = data[data['class'] == 1].index
  random_legitimate_indices = np.random.choice(legitimate_indices, B1_class_len * ratio, replace = True)
  B1_class_indices = data[data['class'] == 0].index
  under_sampe_indices = np.concatenate([B1_class_indices, random_legitimate_indices])
  under_sample = data.loc[under_sampe_indices]
  return under_sample #return the undersampled dataset

Training a model using cross validation \\
model: ML classifier \\
fold: #fold for cross validate \\
X, y : training set

In [16]:
def train_cross_val(model,label, fold, X, y):
    cv = KFold(n_splits=fold, shuffle=True, random_state=1)
    decimal = 3
    acc = []
    pre = []
    rec = []
    f1s = []
    auc = []
    df_pred = pd.DataFrame() # dataframe stores the prediction for each sample
    df_pred_prob = pd.DataFrame() # dataframe stores the predict probability for each sample
    cf_matrix = np.zeros((2,2))
    for train, test in (cv.split(X, y)):
      model.fit(X[train], y[train])

      pred = model.predict(X[test])

      #output dataframe of predict values
      df_pred_temp = pd.DataFrame(list(zip(pred)), index=test, columns=[label+'_y_pred'])
      df_pred = pd.concat([df_pred, df_pred_temp])
      
      #output dataframe of predict probability values
      pred_pro = model.predict_proba(X[test])
      df_pred_prob_temp = pd.DataFrame(list(zip(pred_pro[:,0])), index=test, columns=[label+'_y_pred_0'])
      df_pred_prob = pd.concat([df_pred_prob, df_pred_prob_temp])

      acc.append(accuracy_score(y[test], pred))
      pre.append(precision_score(y[test], pred))
      rec.append(recall_score(y[test], pred))
      f1s.append(f1_score(y[test], pred))
      auc.append(roc_auc_score(y[test], pred_pro[:,1]))
      cf_matrix += (confusion_matrix(y[test], pred))

    acc = np.round(np.mean(acc), decimal)
    pre = np.round(np.mean(pre), decimal)
    rec = np.round(np.mean(rec), decimal)
    f1s = np.round(np.mean(f1s), decimal)
    auc = np.round(np.mean(auc), decimal)
    return (acc, pre, rec, f1s, auc, cf_matrix , df_pred, df_pred_prob)

Train individual model using cross validation for all datasets (10)

In [56]:
def train_each_model_all_dataset(clf, lbl):
  cols = ['classifier','accuracy','precision','recall','f1_score', 'roc_auc']
  result = pd.DataFrame(columns=cols)
  for i in range(10):
    #print(i)
    df = dataset[str(i)].copy()
    df = df.reset_index(drop=True) # reset dataframe index
    X = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    X = scaler.fit_transform(X)
    (acc, pre, rec, f1s, auc, cf_matrix , pred, prob) = train_cross_val(clf, lbl , fold, X, y)
    result.loc[len(result)] = [lbl, acc,pre, rec, f1s, auc] #add evaluate metrics
  return (result, (result.mean()).round(3), (result.std()).round(3))

# Preparing Datasets

Read full dataset

In [5]:
file ="https://raw.githubusercontent.com/nguyennhutlam/sockpuppet_dataset/master/Feature_29_final.csv"
data = pd.read_csv(file)
data.type.replace(['Sockpuppet','Legitimate_user'], [0, 1], inplace = True)
data = data.drop(columns=['id']) #remove id column
data['class'] = data['type']
data = data.drop(columns=['type']) #remove type columns
#sns.countplot(x='class', data=data)
# number of instances in each class
data['class'].value_counts()

1    106230
0      1668
Name: class, dtype: int64

Generate 10 Dataset using undersampling for training, store in dictinary
*dataset*

In [6]:
import random
dataset = {} # dictionary of dataframe
for i in range(10):
    random.seed(i)
    dataset[str(i)] = undersampling(data,1)

# Traing

Define the list of model

In [13]:
model = []
model.append(('NB',GaussianNB(var_smoothing=1e-05)))
model.append(('kNN', KNeighborsClassifier(n_neighbors=15, weights='distance', algorithm='ball_tree')))
model.append(('SVM', SVC(probability=True, C=100, kernel='rbf')))
model.append(('RF', RandomForestClassifier(criterion='entropy', max_features=3, n_estimators= 200)))
model.append(('Ada', AdaBoostClassifier(RandomForestClassifier(criterion='entropy', max_features=3, n_estimators= 200), learning_rate=0.3, n_estimators=100)))
model.append(('XGB', XGBClassifier(colsample_bytree=0.7, gamma=1.5, max_depth=10, min_child_weight=5)))

fold = 10

Result for each model for all datasets

0 Naive Bayes

In [62]:
seed(1)
m = 0 #NB
clf = model[m][1]
lbl = model[m][0]
result, pred, prob = train_each_model_all_dataset(clf, lbl)
print(result)
print(pred)
print(prob)

  classifier  accuracy  precision  recall  f1_score  roc_auc
0         NB     0.618      0.575   0.915     0.705    0.748
1         NB     0.607      0.565   0.939     0.705    0.741
2         NB     0.638      0.591   0.902     0.713    0.746
3         NB     0.655      0.603   0.908     0.724    0.754
4         NB     0.596      0.557   0.939     0.699    0.732
5         NB     0.624      0.582   0.890     0.703    0.736
6         NB     0.614      0.571   0.923     0.705    0.750
7         NB     0.605      0.562   0.949     0.706    0.748
8         NB     0.608      0.565   0.947     0.707    0.746
9         NB     0.607      0.566   0.926     0.702    0.744
accuracy     0.617
precision    0.574
recall       0.924
f1_score     0.707
roc_auc      0.744
dtype: float64
accuracy     0.018
precision    0.014
recall       0.020
f1_score     0.007
roc_auc      0.007
dtype: float64


1 KNN

In [63]:
seed(1)
m = 1 # KNN
clf = model[m][1]
lbl = model[m][0]
result, pred, prob = train_each_model_all_dataset(clf, lbl)
print(result)
print(pred)
print(prob)

  classifier  accuracy  precision  recall  f1_score  roc_auc
0        kNN     0.743      0.745   0.740     0.742    0.813
1        kNN     0.740      0.736   0.750     0.742    0.816
2        kNN     0.727      0.724   0.735     0.729    0.800
3        kNN     0.742      0.744   0.740     0.741    0.811
4        kNN     0.740      0.744   0.733     0.738    0.813
5        kNN     0.737      0.745   0.723     0.733    0.808
6        kNN     0.751      0.756   0.743     0.749    0.812
7        kNN     0.751      0.757   0.742     0.748    0.818
8        kNN     0.749      0.744   0.761     0.751    0.821
9        kNN     0.748      0.743   0.760     0.751    0.817
accuracy     0.743
precision    0.744
recall       0.743
f1_score     0.742
roc_auc      0.813
dtype: float64
accuracy     0.007
precision    0.009
recall       0.012
f1_score     0.008
roc_auc      0.006
dtype: float64


2 SVM

In [64]:
seed(1)
m = 2 # SVM
clf = model[m][1]
lbl = model[m][0]
result, pred, prob = train_each_model_all_dataset(clf, lbl)
print(result)
print(pred)
print(prob)

  classifier  accuracy  precision  recall  f1_score  roc_auc
0        SVM     0.761      0.733   0.823     0.774    0.824
1        SVM     0.775      0.750   0.826     0.785    0.837
2        SVM     0.754      0.725   0.818     0.768    0.814
3        SVM     0.769      0.746   0.818     0.779    0.832
4        SVM     0.770      0.755   0.801     0.776    0.838
5        SVM     0.761      0.747   0.790     0.767    0.830
6        SVM     0.763      0.743   0.806     0.772    0.821
7        SVM     0.775      0.758   0.805     0.780    0.837
8        SVM     0.775      0.750   0.826     0.786    0.845
9        SVM     0.768      0.743   0.823     0.780    0.838
accuracy     0.767
precision    0.745
recall       0.814
f1_score     0.777
roc_auc      0.832
dtype: float64
accuracy     0.007
precision    0.010
recall       0.012
f1_score     0.007
roc_auc      0.009
dtype: float64


3 Random Forest

In [65]:
seed(1)
m = 3 #Random Forest
clf = model[m][1]
lbl = model[m][0]
result, pred, prob = train_each_model_all_dataset(clf, lbl)
print(result)
print(pred)
print(prob)

  classifier  accuracy  precision  recall  f1_score  roc_auc
0         RF     0.814      0.812   0.816     0.813    0.884
1         RF     0.819      0.809   0.833     0.820    0.885
2         RF     0.807      0.799   0.821     0.810    0.881
3         RF     0.809      0.807   0.812     0.809    0.885
4         RF     0.812      0.810   0.815     0.812    0.888
5         RF     0.801      0.805   0.794     0.799    0.880
6         RF     0.803      0.797   0.815     0.805    0.884
7         RF     0.825      0.814   0.842     0.827    0.891
8         RF     0.822      0.812   0.838     0.824    0.888
9         RF     0.821      0.816   0.831     0.823    0.885
accuracy     0.813
precision    0.808
recall       0.822
f1_score     0.814
roc_auc      0.885
dtype: float64
accuracy     0.008
precision    0.006
recall       0.014
f1_score     0.009
roc_auc      0.003
dtype: float64


4 AdaBoost

In [66]:
seed(1)
m = 4 #AdaBoost
clf = model[m][1]
lbl = model[m][0]
result, pred, prob = train_each_model_all_dataset(clf, lbl)
print(result)
print(pred)
print(prob)

  classifier  accuracy  precision  recall  f1_score  roc_auc
0        Ada     0.811      0.812   0.809     0.810    0.884
1        Ada     0.819      0.813   0.827     0.819    0.886
2        Ada     0.814      0.805   0.828     0.816    0.882
3        Ada     0.811      0.810   0.812     0.811    0.884
4        Ada     0.807      0.803   0.815     0.808    0.886
5        Ada     0.803      0.807   0.796     0.801    0.878
6        Ada     0.801      0.795   0.811     0.802    0.883
7        Ada     0.823      0.814   0.839     0.826    0.891
8        Ada     0.821      0.813   0.836     0.824    0.887
9        Ada     0.824      0.818   0.833     0.825    0.886
accuracy     0.813
precision    0.809
recall       0.821
f1_score     0.814
roc_auc      0.885
dtype: float64
accuracy     0.008
precision    0.007
recall       0.014
f1_score     0.009
roc_auc      0.003
dtype: float64


5 XGBoost

In [67]:
seed(1)
m = 5 #XGBoost
clf = model[m][1]
lbl = model[m][0]
result, pred, prob = train_each_model_all_dataset(clf, lbl)
print(result)
print(pred)
print(prob)

  classifier  accuracy  precision  recall  f1_score  roc_auc
0        XGB     0.823      0.816   0.835     0.825    0.895
1        XGB     0.814      0.805   0.827     0.816    0.892
2        XGB     0.820      0.805   0.844     0.823    0.889
3        XGB     0.813      0.805   0.824     0.814    0.890
4        XGB     0.822      0.813   0.836     0.824    0.898
5        XGB     0.805      0.803   0.808     0.805    0.885
6        XGB     0.817      0.809   0.831     0.819    0.892
7        XGB     0.831      0.819   0.849     0.833    0.902
8        XGB     0.825      0.812   0.845     0.828    0.898
9        XGB     0.830      0.821   0.845     0.832    0.897
accuracy     0.820
precision    0.811
recall       0.834
f1_score     0.822
roc_auc      0.894
dtype: float64
accuracy     0.008
precision    0.006
recall       0.012
f1_score     0.009
roc_auc      0.005
dtype: float64
