# BotDetector

## Imports

In [37]:
import numpy as np

In [18]:
features_file = "features.json"
features_name_file = "features_name.json"
target_file = "target.json"

## Load features

In [20]:
import json
with open(features_file, 'r') as f :
    X = json.loads(f.read())
with open(target_file, 'r') as f :
    y = json.loads(f.read())
with open(features_name_file, 'r') as f :
    features_name = json.loads(f.read())
print("# of features: " + str(len(X)))
print("features name: " + str(features_name))

# of features: 1950
features name: ['has_name', 'has_image', 'has_address', 'has_biography', 'followers_ge_30', 'belongs_to_a_list', 'nb_tweets_ge_50', 'url_in_profile', 'followers_2_times_ge_friends', 'bot_in_biography', 'ratio_friends_followers_around_100', 'duplicate_profile_picture', 'ratio_friends_followers_ge_50', 'default_image_after_2_month', 'friends_ge_100', 'no_bio', 'no_location', 'no_tweets', 'nb_friends', 'nb_tweets', 'ratio_friends_followers_square', 'age', 'following_rate']


In [32]:
# Divise dataset
def divide_dataset(X, y) :
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    return (X_train, X_test, y_train, y_test)

X_train, X_test, y_train, y_test = divide_dataset(X, y)

## Supervised Learning

In [99]:
from sklearn.model_selection import GridSearchCV
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

In [68]:
results = dict()

In [23]:
import time
import datetime
def train(classifier, name, param_grid=None) :
    start_time = time.time()
    if param_grid == None :
        classifier.fit(X_train, y_train)
        results[name] = dict(model=classifier)
    else :
        grid = GridSearchCV(classifier, param_grid, cv=10, scoring='accuracy', n_jobs=2) # Do a 10-fold cross validation
        grid.fit(X, y) # fit the grid with data
        results[name] = dict(grid=grid, model=classifier)
    #total_time = datetime.datetime.fromtimestamp(time.time() - start_time)
    total_time = datetime.timedelta(seconds=time.time() - start_time)
    print("Training time : " + str(total_time))#.strftime('%H:%M:%S'))

## k-NN Classifier

In [69]:
name = "k-NN"
classifier = KNeighborsClassifier(weights='uniform')
k_range = list(range(1, 31)) # list of parameter values to test
param_grid = dict(n_neighbors=k_range)
train(classifier, name, param_grid)

Training time : 0:00:05.334686


## Decision Tree

In [70]:
name = "Decision tree"
classifier = tree.DecisionTreeClassifier(criterion='entropy')
d_range = list(range(1, 31)) # list of parameter values to test
#s_range = list(range(2, 10))
param_grid = dict(max_depth=d_range)#, min_samples_split=s_range)
train(classifier, name, param_grid)

Training time : 0:00:03.146888


## Naive Bayes

In [71]:
name = "NB - Gaussian"
classifier = GaussianNB()
train(classifier, name)

Training time : 0:00:00.013054


In [72]:
name = "NB - Multinomial"
classifier = MultinomialNB()
train(classifier, name)

Training time : 0:00:00.017592


In [73]:
name = "NB - Bernoulli"
classifier = BernoulliNB()
train(classifier, name)

Training time : 0:00:00.006829


## SVM - Support Vector Machine

In [78]:
name = "SVM - SVC"
classifier = svm.SVC()
C_range = np.logspace(-2, 10, 13)
print(C_range)
gamma_range = np.logspace(-9, 3, 13)
print(gamma_range)
param_grid = dict(gamma=gamma_range, C=C_range)
train(classifier, name, param_grid)

[  1.00000000e-02   1.00000000e-01   1.00000000e+00   1.00000000e+01
   1.00000000e+02   1.00000000e+03   1.00000000e+04   1.00000000e+05
   1.00000000e+06   1.00000000e+07   1.00000000e+08   1.00000000e+09
   1.00000000e+10]
[  1.00000000e-09   1.00000000e-08   1.00000000e-07   1.00000000e-06
   1.00000000e-05   1.00000000e-04   1.00000000e-03   1.00000000e-02
   1.00000000e-01   1.00000000e+00   1.00000000e+01   1.00000000e+02
   1.00000000e+03]
Training time : 0:04:00.932656


In [111]:
name = "SVM - Linear"
classifier = svm.LinearSVC()
C_range = range(170,230,5)
C_range = range(1,200,10)
param_grid = dict(C=C_range)
train(classifier, name, param_grid)

Training time : 0:00:17.783268


## Random Forest

In [74]:
name = "Random forest"
classifier = RandomForestClassifier()
d_range = list(range(1, 31)) # list of parameter values to test
#s_range = list(range(2, 10))
param_grid = dict(max_depth=d_range)#, min_samples_split=s_range)
train(classifier, name, param_grid)

Training time : 0:00:06.649900


## AdaBoost

In [95]:
name = "AdaBoost"
classifier = AdaBoostClassifier(n_estimators=100)
#param_grid = dict(max_depth=d_range)#, min_samples_split=s_range)
train(classifier, name)

Training time : 0:00:00.243003


## Logistic Regression

In [100]:
name = "Log. Regression"
classifier = LogisticRegression()
#param_grid = dict(max_depth=d_range)#, min_samples_split=s_range)
train(classifier, name)

Training time : 0:00:00.022902


## Neural networks

In [106]:
from sklearn.neural_network import MLPClassifier
name = "Neural net"
#classifier = MLPClassifier(alpha=1)
classifier = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
train(classifier, name)

Training time : 0:00:00.022906


## Score Summary

In [112]:
from prettytable import PrettyTable
import operator
from sklearn import metrics
import math
t = PrettyTable(['Model', 'Best score', 'accuracy', 'precision', 'recall', 'F-M.', 'MCC', 'AUC'])#'FP', 'TN', 'FN', 'TP'])
for clf_name, result in results.items() :
    model = result['model']
    if 'grid' in result :
        grid = result['grid']
        score = grid.best_score_
        # Compute false positives and false negatives
        model.__init__(**grid.best_params_)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        #print(result.best_estimator_)
    else : # For non grid_search models
        #training_error = clf.score(X_train, y_train)
        score = model.score(X_test, y_test)
        y_pred = model.predict(X_test)
        
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    print(clf_name + " tn=" + str(tn) + " fp=" + str(fp) + " fn=" + str(fn) + " tp=" + str(tp))
    accuracy = float(tp + tn) / (tp + tn + fp + fn)
    precision = float(tp) / (tp + fp)
    recall = float(tp) / (tp + fn) # a.k.a. sensitivity
    f_measure = float(2 * precision * recall) / (precision + recall)
    mcc = -1
    if fp!=0 and tp != 0 and tn != 0 and fn!= 0:
        mcc = float(tp * tn - fp * fn) / math.sqrt(float(tp+fn) * (tp+fp) * (tn+fp) * (tn+fn)) # Matthew Correlation Coefficient
    auc = metrics.auc(fpr, tpr)
    t.add_row([clf_name, round(score, 3), round(accuracy, 3), round(precision,3), round(recall,3), round(f_measure,3), round(mcc,3), round(auc,3)]) #fp, tn, fn, tp])

        
print(t.get_string(sort_key=operator.itemgetter(2, 1), sortby="Best score", reversesort=True))

k-NN tn=207 fp=29 fn=92 tp=316
Decision tree tn=228 fp=8 fn=11 tp=397
NB - Gaussian tn=148 fp=88 fn=23 tp=385
NB - Multinomial tn=150 fp=86 fn=13 tp=395
NB - Bernoulli tn=205 fp=31 fn=16 tp=392
Random forest tn=234 fp=2 fn=8 tp=400
SVM - SVC tn=224 fp=12 fn=105 tp=303
AdaBoost tn=226 fp=10 fn=8 tp=400
Log. Regression tn=140 fp=96 fn=7 tp=401
Neural net tn=0 fp=236 fn=0 tp=408
SVM - Linear tn=0 fp=236 fn=0 tp=408
+------------------+------------+----------+-----------+--------+-------+-------+-------+
|      Model       | Best score | accuracy | precision | recall |  F-M. |  MCC  |  AUC  |
+------------------+------------+----------+-----------+--------+-------+-------+-------+
|  Random forest   |   0.986    |  0.984   |   0.995   |  0.98  | 0.988 | 0.967 | 0.986 |
|  Decision tree   |   0.981    |   0.97   |    0.98   | 0.973  | 0.977 | 0.937 |  0.97 |
|     AdaBoost     |   0.972    |  0.972   |   0.976   |  0.98  | 0.978 |  0.94 | 0.969 |
|  NB - Bernoulli  |   0.927    |  0.927   |