# BotDetector

## Imports

In [3]:
import numpy as np
import config as c
use_only_class_a = True

## Load features

In [4]:
import json
import json

if use_only_class_a:
    folder_features = c.folder_class_a
else:
    folder_features = c.folder_class_a_b_c

with open(folder_features + c.file_features, 'r') as f :
    X = json.loads(f.read())
with open(folder_features + c.file_target, 'r') as f :
    y = json.loads(f.read())
with open(folder_features + c.file_features_name, 'r') as f :
    features_name = json.loads(f.read())

print("# of features: " + str(len(X)))
print("features name: " + str(features_name))

# of features: 5301
features name: ['has_name', 'has_image', 'has_address', 'has_biography', 'followers_ge_30', 'belongs_to_a_list', 'nb_tweets_ge_50', 'url_in_profile', 'followers_2_times_ge_friends', 'bot_in_biography', 'ratio_friends_followers_around_100', 'duplicate_profile_picture', 'ratio_friends_followers_ge_50', 'default_image_after_2_month', 'friends_ge_100', 'no_bio', 'no_location', 'no_tweets', 'nb_friends', 'nb_tweets', 'ratio_friends_followers_square', 'age', 'following_rate']


In [6]:
# Divise dataset
def divide_dataset(X, y) :
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    return (X_train, X_test, y_train, y_test)

X_train, X_test, y_train, y_test = divide_dataset(X, y)

## Supervised Learning

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

In [8]:
results = dict()

In [9]:
import time
import datetime
def train(classifier, name, param_grid=None) :
    start_time = time.time()
    if param_grid == None :
        classifier.fit(X_train, y_train)
        results[name] = dict(model=classifier)
    else :
        grid = GridSearchCV(classifier, param_grid, cv=10, scoring='accuracy', n_jobs=2) # Do a 10-fold cross validation
        grid.fit(X, y) # fit the grid with data
        results[name] = dict(grid=grid, model=classifier)
    #total_time = datetime.datetime.fromtimestamp(time.time() - start_time)
    total_time = datetime.timedelta(seconds=time.time() - start_time)
    print("Training time : " + str(total_time))#.strftime('%H:%M:%S'))

## k-NN Classifier

In [10]:
name = "k-NN"
classifier = KNeighborsClassifier(weights='uniform')
k_range = list(range(1, 31)) # list of parameter values to test
param_grid = dict(n_neighbors=k_range)
train(classifier, name, param_grid)

Training time : 0:00:04.669401


## Decision Tree

In [11]:
name = "Decision tree"
classifier = tree.DecisionTreeClassifier(criterion='entropy')
d_range = list(range(1, 31)) # list of parameter values to test
#s_range = list(range(2, 10))
param_grid = dict(max_depth=d_range)#, min_samples_split=s_range)
train(classifier, name, param_grid)

Training time : 0:00:03.023080


## Naive Bayes

In [12]:
name = "NB - Gaussian"
classifier = GaussianNB()
train(classifier, name)

Training time : 0:00:00.006423


In [13]:
name = "NB - Multinomial"
classifier = MultinomialNB()
train(classifier, name)

Training time : 0:00:00.014110


In [14]:
name = "NB - Bernoulli"
classifier = BernoulliNB()
train(classifier, name)

Training time : 0:00:00.009180


## SVM - Support Vector Machine

In [24]:
name = "SVM - SVC"
classifier = svm.SVC()
C_range = np.logspace(-2, 10, 13)
#print(C_range)
gamma_range = np.logspace(-9, 3, 13)
#print(gamma_range)
param_grid = dict(gamma=gamma_range, C=C_range)
train(classifier, name, param_grid)

Training time : 0:03:39.888917


In [None]:
name = "SVM - Linear"
classifier = svm.LinearSVC()
C_range = range(170,230,5)
C_range = range(1,200,10)
param_grid = dict(C=C_range)
train(classifier, name, param_grid)

## Random Forest

In [20]:
name = "Random forest"
classifier = RandomForestClassifier()
d_range = list(range(1, 31)) # list of parameter values to test
#s_range = list(range(2, 10))
param_grid = dict(max_depth=d_range)#, min_samples_split=s_range)
train(classifier, name, param_grid)

Training time : 0:00:05.930685


## AdaBoost

In [19]:
name = "AdaBoost"
classifier = AdaBoostClassifier(n_estimators=100)
train(classifier, name)

Training time : 0:00:00.249278


## Logistic Regression

In [18]:
name = "Log. Regression"
classifier = LogisticRegression()
train(classifier, name)

Training time : 0:00:00.016620


## Neural networks

In [21]:
from sklearn.neural_network import MLPClassifier
name = "Neural net"
#classifier = MLPClassifier(alpha=1)
classifier = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
train(classifier, name)

Training time : 0:00:00.020805


## Score Summary

In [26]:
from prettytable import PrettyTable
import operator
from sklearn import metrics
import math
t = PrettyTable(['Model', 'Best score', 'accuracy', 'precision', 'recall', 'F-M.', 'MCC', 'AUC'])#'FP', 'TN', 'FN', 'TP'])
for clf_name, result in results.items() :
    model = result['model']
    if 'grid' in result :
        grid = result['grid']
        score = grid.best_score_
        # Compute false positives and false negatives
        model.__init__(**grid.best_params_)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        #print(result.best_estimator_)
    else : # For non grid_search models
        #training_error = clf.score(X_train, y_train)
        score = model.score(X_test, y_test)
        y_pred = model.predict(X_test)
        
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    #print(clf_name + " tn=" + str(tn) + " fp=" + str(fp) + " fn=" + str(fn) + " tp=" + str(tp))
    accuracy = float(tp + tn) / (tp + tn + fp + fn)
    precision = float(tp) / (tp + fp)
    recall = float(tp) / (tp + fn) # a.k.a. sensitivity
    f_measure = float(2 * precision * recall) / (precision + recall)
    mcc = -1
    if fp!=0 and tp != 0 and tn != 0 and fn!= 0:
        mcc = float(tp * tn - fp * fn) / math.sqrt(float(tp+fn) * (tp+fp) * (tn+fp) * (tn+fn)) # Matthew Correlation Coefficient
    auc = metrics.auc(fpr, tpr)
    t.add_row([clf_name, round(score, 3), round(accuracy, 3), round(precision,3), round(recall,3), round(f_measure,3), round(mcc,3), round(auc,3)]) #fp, tn, fn, tp])

        
print(t.get_string(sort_key=operator.itemgetter(2, 1), sortby="Best score", reversesort=True))

+------------------+------------+----------+-----------+--------+-------+-------+-------+
|      Model       | Best score | accuracy | precision | recall |  F-M. |  MCC  |  AUC  |
+------------------+------------+----------+-----------+--------+-------+-------+-------+
|  Random forest   |   0.988    |  0.984   |    1.0    | 0.976  | 0.988 |   -1  | 0.988 |
|  Decision tree   |   0.982    |  0.988   |   0.998   | 0.983  |  0.99 | 0.973 | 0.989 |
|     AdaBoost     |    0.98    |   0.98   |    1.0    | 0.969  | 0.984 |   -1  | 0.984 |
|  NB - Bernoulli  |   0.949    |  0.949   |   0.949   | 0.974  | 0.961 | 0.887 | 0.938 |
|  NB - Gaussian   |    0.88    |   0.88   |    0.86   | 0.974  | 0.914 | 0.737 | 0.841 |
| Log. Regression  |    0.88    |   0.88   |   0.859   | 0.976  | 0.914 | 0.737 |  0.84 |
| NB - Multinomial |   0.879    |  0.879   |   0.857   | 0.976  | 0.913 | 0.734 | 0.838 |
|    SVM - SVC     |   0.842    |  0.821   |   0.975   | 0.744  | 0.844 | 0.676 | 0.854 |
|       k-