<a href="https://colab.research.google.com/github/nguyennhutlam/sockpuppet_final_19_07_2020/blob/master/2020_07_19_Model_Training_Using_Hyperparameter_Turning_using_top_5Features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Library

In [44]:
import pandas as pd
import numpy as np
import seaborn as sns
# ignore warning
import warnings
warnings.filterwarnings('ignore')
# ML library
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import  VotingClassifier
from sklearn.model_selection import cross_validate
from sklearn.model_selection import  KFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.utils import shuffle

import statistics

from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
import random
from numpy.random import seed
# Evaluation metrics: display name -> scikit-learn scorer string.
# NOTE(review): not referenced by any cell visible in this notebook; presumably
# meant for cross_validate(scoring=...) -- confirm before removing.
scoring = {'accuracy': 'accuracy', 'recall': 'recall', 'precision': 'precision', 'f1':'f1', 'roc_auc': 'roc_auc'}

# Functions

In [45]:
def undersampling(data, ratio, random_state=None):
  """Build a smaller dataset by undersampling the majority class.

  Every row with ``class == 0`` (the minority class, sockpuppet accounts) is
  kept. From the rows with ``class == 1`` (legitimate users),
  ``round(n_minority * ratio)`` rows are drawn at random.

  Parameters
  ----------
  data : pandas.DataFrame
      Dataset with a ``class`` column holding 0 (minority) or 1 (majority).
  ratio : int or float
      Number of majority rows to draw per minority row.
  random_state : int, optional
      Seed for a private generator. When omitted, NumPy's global generator
      is used, so ``np.random.seed(...)`` controls the draw.

  Returns
  -------
  pandas.DataFrame
      The minority rows followed by the sampled majority rows, with their
      original index labels.
  """
  minority_indices = data[data['class'] == 0].index
  majority_indices = data[data['class'] == 1].index

  # np.random.choice needs an integer size; a float ratio would otherwise raise.
  n_majority = int(round(len(minority_indices) * ratio))

  # Draw without replacement so no majority row is duplicated (duplicates could
  # end up in both the train and the test fold during cross-validation). Fall
  # back to replacement only when more rows are requested than exist.
  with_replacement = n_majority > len(majority_indices)

  rng = np.random if random_state is None else np.random.RandomState(random_state)
  sampled_majority = rng.choice(majority_indices, n_majority, replace=with_replacement)

  selected = np.concatenate([minority_indices, sampled_majority])
  return data.loc[selected]

Train a model using cross-validation \\
model: ML classifier \\
label: classifier name, used as the prefix of the prediction column names \\
fold: number of folds for cross-validation \\
X, y: training set

In [46]:
def train_cross_val(model, label, fold, X, y):
    """Evaluate ``model`` with shuffled K-fold cross-validation.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``fit``, ``predict`` and ``predict_proba``. It is
        refit in place on every fold, so on return it holds the last fold's fit.
    label : str
        Classifier name, used as the prefix of the output column names.
    fold : int
        Number of folds passed to ``KFold``.
    X : array-like
        Feature matrix. pandas objects are converted to NumPy arrays so rows
        are always selected by position.
    y : array-like
        Target vector, converted the same way as ``X``.

    Returns
    -------
    tuple
        ``(acc, pre, rec, f1s, auc, cf_matrix, df_pred, df_pred_prob)``: the
        five metrics are fold means rounded to 3 decimals; ``cf_matrix`` is the
        sum of the per-fold confusion matrices; ``df_pred`` holds the predicted
        label of every sample (column ``<label>_y_pred``); ``df_pred_prob``
        holds the first ``predict_proba`` column (column ``<label>_y_pred_0``).
        Both frames are indexed by the sample's row position.
    """
    # Positional indexing: a pandas object with a non-default index would
    # otherwise be looked up by label when indexed with the fold arrays.
    if isinstance(X, (pd.DataFrame, pd.Series)):
        X = X.to_numpy()
    if isinstance(y, pd.Series):
        y = y.to_numpy()

    cv = KFold(n_splits=fold, shuffle=True, random_state=1)
    decimal = 3
    acc = []
    pre = []
    rec = []
    f1s = []
    auc = []
    fold_preds = []  # one prediction frame per fold, concatenated once at the end
    fold_probs = []  # one probability frame per fold, concatenated once at the end

    # Fix the label set so every fold yields a matrix of the same shape; without
    # it a fold showing a single class returns a 1x1 matrix that silently
    # broadcasts into every cell of the accumulator.
    class_labels = np.unique(y)
    cf_matrix = np.zeros((len(class_labels), len(class_labels)))

    for train, test in cv.split(X, y):
        model.fit(X[train], y[train])

        pred = model.predict(X[test])
        pred_pro = model.predict_proba(X[test])

        fold_preds.append(pd.DataFrame({label + '_y_pred': pred}, index=test))
        fold_probs.append(pd.DataFrame({label + '_y_pred_0': pred_pro[:, 0]}, index=test))

        acc.append(accuracy_score(y[test], pred))
        pre.append(precision_score(y[test], pred))
        rec.append(recall_score(y[test], pred))
        f1s.append(f1_score(y[test], pred))
        # AUC is computed from the second predict_proba column.
        auc.append(roc_auc_score(y[test], pred_pro[:, 1]))
        cf_matrix += confusion_matrix(y[test], pred, labels=class_labels)

    df_pred = pd.concat(fold_preds)
    df_pred_prob = pd.concat(fold_probs)

    acc = np.round(np.mean(acc), decimal)
    pre = np.round(np.mean(pre), decimal)
    rec = np.round(np.mean(rec), decimal)
    f1s = np.round(np.mean(f1s), decimal)
    auc = np.round(np.mean(auc), decimal)
    return (acc, pre, rec, f1s, auc, cf_matrix, df_pred, df_pred_prob)

Train an individual model using cross-validation on each of the 10 undersampled datasets

In [47]:
def train_each_model_all_dataset(clf, lbl, n_datasets=10):
  """Cross-validate one classifier on each undersampled dataset.

  Reads the module-level ``dataset`` dict (keys ``'0'``, ``'1'``, ...), the
  module-level ``scaler`` and the module-level ``fold`` count.

  Parameters
  ----------
  clf : estimator
      Classifier passed on to ``train_cross_val``.
  lbl : str
      Classifier name stored in the ``classifier`` column.
  n_datasets : int, optional
      How many entries of ``dataset`` to evaluate (default 10).

  Returns
  -------
  tuple
      ``(result, mean, std)``: ``result`` has one row of metrics per dataset;
      ``mean`` and ``std`` are the per-metric mean and standard deviation over
      those rows, rounded to 3 decimals.
  """
  cols = ['classifier','accuracy','precision','recall','f1_score', 'roc_auc']
  rows = []
  for i in range(n_datasets):
    print(i)
    df = dataset[str(i)].reset_index(drop=True) # new frame with a 0..n-1 index
    X = df.iloc[:, :-1]  # every column except the last
    y = df.iloc[:, -1]   # last column is the label
    # NOTE(review): the scaler is fitted on the full dataset before the folds
    # are split, so the test folds influence the scaling statistics.
    X = scaler.fit_transform(X)
    (acc, pre, rec, f1s, auc, _cf_matrix, _pred, _prob) = train_cross_val(clf, lbl, fold, X, y)
    rows.append([lbl, acc, pre, rec, f1s, auc])

  # Build the frame once so the metric columns get a numeric dtype.
  result = pd.DataFrame(rows, columns=cols)
  # numeric_only=True skips the string 'classifier' column, which would
  # otherwise make mean()/std() raise on pandas >= 2.0.
  metric_mean = result.mean(numeric_only=True).round(3)
  metric_std = result.std(numeric_only=True).round(3)
  return (result, metric_mean, metric_std)

# Preparing Datasets

Read full dataset

In [48]:
file = "https://raw.githubusercontent.com/nguyennhutlam/sockpuppet_dataset/master/Feature_29_final.csv"
data_raw = pd.read_csv(file)

# Encode the label explicitly: 0 = sockpuppet, 1 = legitimate user.
# A chained ``data.type.replace(..., inplace=True)`` may act on a temporary
# copy (pandas Copy-on-Write) and leave the frame unchanged, so assign a new
# column instead.
label_map = {'Sockpuppet': 0, 'Legitimate_user': 1}
unknown_labels = set(data_raw['type'].unique()) - set(label_map)
if unknown_labels:
    raise ValueError(f"Unexpected values in 'type' column: {sorted(map(str, unknown_labels))}")

# Drop the identifier and the text label; the numeric label goes last as 'class'.
data = data_raw.drop(columns=['id', 'type'])
data['class'] = data_raw['type'].map(label_map).astype(int)

# Optional visual check: sns.countplot(x='class', data=data)
# number of instances in each class
data['class'].value_counts()

1    106230
0      1668
Name: class, dtype: int64

In [49]:
# Top-5 features per selection method; each list ends with the label column.

# Chi-squared ranking
top5_chi2 = [
    'number_of_post_per_day',
    'number_of_received_comments_per_article',
    'median_push_interval',
    'authorities',
    'degree_in',
    'class',
]

# ExtraTrees importance ranking
top5_extratrees = [
    'median_push_interval',
    'reponse_time_mean',
    'hubs',
    'received_rating_ratio',
    'received_rating_push',
    'class',
]

# Relief ranking
top5_relief = [
    'number_of_received_comments_per_article',
    'diff_weekday_weekend',
    'clustF',
    'received_rating_ratio',
    'number_of_comments_per_article',
    'class',
]



In [50]:
# Keep only one feature set (plus the label). Swap in top5_chi2 or
# top5_extratrees here to evaluate the other selections; re-run the data
# loading cell first, because this cell overwrites `data`.
data = data.loc[:, top5_relief]

In [51]:
# Preview the first rows of the selected feature columns and the label.
data.head()

Unnamed: 0,number_of_received_comments_per_article,diff_weekday_weekend,clustF,received_rating_ratio,number_of_comments_per_article,class
0,0,0.0,1.0,1.0,1.0,1
1,0,0.0,1.0,1.0,1.06383,1
2,0,0.0,1.0,1.0,1.0,1
3,0,0.0,1.0,1.0,1.0,1
4,24,0.2,0.322581,0.615894,1.239726,1


Generate 10 datasets using undersampling for training and store them in the dictionary
*dataset*

In [52]:
dataset = {} # dictionary of dataframes, keyed '0' .. '9'
for i in range(10):
    # undersampling() draws with np.random.choice, so NumPy's global generator
    # is the one that must be seeded; seeding Python's `random` module does not
    # affect it and left the ten datasets non-reproducible.
    np.random.seed(i)
    dataset[str(i)] = undersampling(data,1)

# Training

Define the list of models

In [54]:
# (name, estimator) pairs; the run cells below pick one by its position.
model = [
    ('NB', GaussianNB(var_smoothing=1e-05)),
    ('kNN', KNeighborsClassifier(n_neighbors=15, weights='distance', algorithm='ball_tree')),
    ('SVM', SVC(probability=True, C=100, kernel='rbf')),
    ('RF', RandomForestClassifier(criterion='entropy', max_features=3, n_estimators= 200)),
    # AdaBoost boosts a random forest configured like the 'RF' entry.
    ('Ada', AdaBoostClassifier(RandomForestClassifier(criterion='entropy', max_features=3, n_estimators= 200), learning_rate=0.3, n_estimators=100)),
    ('XGB', XGBClassifier(colsample_bytree=0.7, gamma=1.5, max_depth=10, min_child_weight=5)),
]

# number of cross-validation folds
fold = 10

Result for each model for all datasets

0 Naive Bayes

In [None]:
seed(1)  # reseed NumPy's global generator before the run
m = 0 #NB
lbl, clf = model[m]
# `result` holds the per-dataset metrics; despite their names, `pred` receives
# the per-metric mean and `prob` the per-metric standard deviation.
result, pred, prob = train_each_model_all_dataset(clf, lbl)
print(result, pred, prob, sep='\n')

0
1
2
3
4
5
6
7
8
9
  classifier  accuracy  precision  recall  f1_score  roc_auc
0         NB     0.606      0.563   0.952     0.707    0.723
1         NB     0.605      0.561   0.965     0.709    0.722
2         NB     0.601      0.560   0.946     0.703    0.713
3         NB     0.580      0.546   0.954     0.694    0.690
4         NB     0.600      0.559   0.959     0.705    0.710
5         NB     0.600      0.559   0.951     0.703    0.703
6         NB     0.594      0.555   0.956     0.701    0.704
7         NB     0.600      0.559   0.945     0.702    0.710
8         NB     0.604      0.562   0.936     0.702    0.664
9         NB     0.588      0.551   0.953     0.698    0.680
accuracy     0.598
precision    0.558
recall       0.952
f1_score     0.702
roc_auc      0.702
dtype: float64
accuracy     0.008
precision    0.005
recall       0.008
f1_score     0.004
roc_auc      0.019
dtype: float64


1 KNN

In [None]:
seed(1)  # reseed NumPy's global generator before the run
m = 1 # KNN
lbl, clf = model[m]
# `result` holds the per-dataset metrics; despite their names, `pred` receives
# the per-metric mean and `prob` the per-metric standard deviation.
result, pred, prob = train_each_model_all_dataset(clf, lbl)
print(result, pred, prob, sep='\n')

0
1
2
3
4
5
6
7
8
9
  classifier  accuracy  precision  recall  f1_score  roc_auc
0        kNN     0.671      0.675   0.656     0.664    0.723
1        kNN     0.649      0.665   0.601     0.630    0.677
2        kNN     0.674      0.672   0.680     0.674    0.728
3        kNN     0.649      0.659   0.621     0.637    0.690
4        kNN     0.651      0.660   0.621     0.638    0.707
5        kNN     0.671      0.670   0.676     0.672    0.721
6        kNN     0.656      0.667   0.623     0.642    0.699
7        kNN     0.652      0.661   0.619     0.636    0.695
8        kNN     0.628      0.637   0.601     0.617    0.666
9        kNN     0.656      0.662   0.633     0.646    0.699
accuracy     0.656
precision    0.663
recall       0.633
f1_score     0.646
roc_auc      0.700
dtype: float64
accuracy     0.014
precision    0.011
recall       0.028
f1_score     0.019
roc_auc      0.020
dtype: float64


2 SVM

In [None]:
seed(1)  # reseed NumPy's global generator before the run
m = 2 # SVM
lbl, clf = model[m]
# `result` holds the per-dataset metrics; despite their names, `pred` receives
# the per-metric mean and `prob` the per-metric standard deviation.
result, pred, prob = train_each_model_all_dataset(clf, lbl)
print(result, pred, prob, sep='\n')

0
1
2
3
4
5
6
7
8
9
  classifier  accuracy  precision  recall  f1_score  roc_auc
0        SVM     0.707      0.682   0.778     0.726    0.762
1        SVM     0.702      0.677   0.775     0.722    0.757
2        SVM     0.709      0.685   0.780     0.729    0.764
3        SVM     0.701      0.673   0.785     0.724    0.752
4        SVM     0.702      0.677   0.779     0.723    0.753
5        SVM     0.699      0.671   0.782     0.722    0.751
6        SVM     0.708      0.681   0.791     0.731    0.759
7        SVM     0.709      0.685   0.774     0.725    0.760
8        SVM     0.693      0.669   0.771     0.715    0.748
9        SVM     0.709      0.687   0.768     0.725    0.764
accuracy     0.704
precision    0.679
recall       0.778
f1_score     0.724
roc_auc      0.757
dtype: float64
accuracy     0.005
precision    0.006
recall       0.007
f1_score     0.004
roc_auc      0.006
dtype: float64


3 Random Forest

In [None]:
seed(1)  # reseed NumPy's global generator before the run
m = 3 #Random Forest
lbl, clf = model[m]
# `result` holds the per-dataset metrics; despite their names, `pred` receives
# the per-metric mean and `prob` the per-metric standard deviation.
result, pred, prob = train_each_model_all_dataset(clf, lbl)
print(result, pred, prob, sep='\n')

0
1
2
3
4
5
6
7
8
9
  classifier  accuracy  precision  recall  f1_score  roc_auc
0         RF     0.709      0.706   0.725     0.714    0.759
1         RF     0.710      0.705   0.722     0.713    0.776
2         RF     0.713      0.709   0.725     0.716    0.772
3         RF     0.712      0.709   0.721     0.714    0.768
4         RF     0.704      0.702   0.716     0.708    0.758
5         RF     0.692      0.691   0.696     0.693    0.758
6         RF     0.713      0.710   0.728     0.717    0.776
7         RF     0.708      0.702   0.725     0.712    0.777
8         RF     0.696      0.694   0.704     0.698    0.755
9         RF     0.706      0.698   0.728     0.712    0.770
accuracy     0.706
precision    0.703
recall       0.719
f1_score     0.710
roc_auc      0.767
dtype: float64
accuracy     0.007
precision    0.007
recall       0.011
f1_score     0.008
roc_auc      0.009
dtype: float64


4 AdaBoost

In [55]:
seed(1)  # reseed NumPy's global generator before the run
m = 4 #AdaBoost
lbl, clf = model[m]
# `result` holds the per-dataset metrics; despite their names, `pred` receives
# the per-metric mean and `prob` the per-metric standard deviation.
result, pred, prob = train_each_model_all_dataset(clf, lbl)
print(result, pred, prob, sep='\n')

0
1
2
3
4
5
6
7
8
9
  classifier  accuracy  precision  recall  f1_score  roc_auc
0        Ada     0.708      0.704   0.724     0.713    0.738
1        Ada     0.710      0.707   0.720     0.712    0.751
2        Ada     0.709      0.705   0.721     0.712    0.746
3        Ada     0.698      0.694   0.708     0.701    0.733
4        Ada     0.695      0.692   0.707     0.698    0.729
5        Ada     0.701      0.696   0.712     0.702    0.735
6        Ada     0.708      0.704   0.720     0.711    0.743
7        Ada     0.723      0.716   0.742     0.727    0.759
8        Ada     0.693      0.687   0.710     0.698    0.720
9        Ada     0.692      0.690   0.701     0.694    0.727
accuracy     0.704
precision    0.699
recall       0.716
f1_score     0.707
roc_auc      0.738
dtype: float64
accuracy     0.010
precision    0.009
recall       0.012
f1_score     0.010
roc_auc      0.012
dtype: float64


5 XGBoost

In [None]:
seed(1)  # reseed NumPy's global generator before the run
m = 5 #XGBoost
lbl, clf = model[m]
# `result` holds the per-dataset metrics; despite their names, `pred` receives
# the per-metric mean and `prob` the per-metric standard deviation.
result, pred, prob = train_each_model_all_dataset(clf, lbl)
print(result, pred, prob, sep='\n')

0
1
2
3
4
5
6
7
8
9
  classifier  accuracy  precision  recall  f1_score  roc_auc
0        XGB     0.724      0.715   0.755     0.733    0.779
1        XGB     0.715      0.700   0.750     0.724    0.775
2        XGB     0.718      0.707   0.751     0.727    0.785
3        XGB     0.716      0.704   0.748     0.724    0.780
4        XGB     0.714      0.701   0.747     0.722    0.770
5        XGB     0.711      0.697   0.744     0.719    0.772
6        XGB     0.714      0.703   0.748     0.723    0.782
7        XGB     0.731      0.718   0.763     0.739    0.791
8        XGB     0.702      0.695   0.725     0.708    0.763
9        XGB     0.719      0.704   0.756     0.729    0.781
accuracy     0.716
precision    0.704
recall       0.749
f1_score     0.725
roc_auc      0.778
dtype: float64
accuracy     0.008
precision    0.007
recall       0.010
f1_score     0.008
roc_auc      0.008
dtype: float64
