# BotDetector

## Imports

In [37]:
import numpy as np

In [18]:
features_file = "features.json"
features_name_file = "features_name.json"
target_file = "target.json"

## Load features

In [20]:
import json
with open(features_file, 'r') as f :
    X = json.loads(f.read())
with open(target_file, 'r') as f :
    y = json.loads(f.read())
with open(features_name_file, 'r') as f :
    features_name = json.loads(f.read())
print("# of features: " + str(len(X)))
print("features name: " + str(features_name))

# of features: 1950
features name: ['has_name', 'has_image', 'has_address', 'has_biography', 'followers_ge_30', 'belongs_to_a_list', 'nb_tweets_ge_50', 'url_in_profile', 'followers_2_times_ge_friends', 'bot_in_biography', 'ratio_friends_followers_around_100', 'duplicate_profile_picture', 'ratio_friends_followers_ge_50', 'default_image_after_2_month', 'friends_ge_100', 'no_bio', 'no_location', 'no_tweets', 'nb_friends', 'nb_tweets', 'ratio_friends_followers_square', 'age', 'following_rate']


In [32]:
# Divise dataset
def divide_dataset(X, y) :
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    return (X_train, X_test, y_train, y_test)

X_train, X_test, y_train, y_test = divide_dataset(X, y)

## Supervised Learning

In [40]:
from sklearn.model_selection import GridSearchCV
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

In [22]:
results = dict()

In [23]:
import time
import datetime
def train(classifier, name, param_grid=None) :
    start_time = time.time()
    if param_grid == None :
        classifier.fit(X_train, y_train)
        results[name] = dict(model=classifier)
    else :
        grid = GridSearchCV(classifier, param_grid, cv=10, scoring='accuracy', n_jobs=2) # Do a 10-fold cross validation
        grid.fit(X, y) # fit the grid with data
        results[name] = dict(grid=grid, model=classifier)
    #total_time = datetime.datetime.fromtimestamp(time.time() - start_time)
    total_time = datetime.timedelta(seconds=time.time() - start_time)
    print("Training time : " + str(total_time))#.strftime('%H:%M:%S'))

## k-NN Classifier

In [25]:
name = "k-NN"
classifier = KNeighborsClassifier(weights='uniform')
k_range = list(range(1, 31)) # list of parameter values to test
param_grid = dict(n_neighbors=k_range)
train(classifier, name, param_grid)

Training time : 0:00:05.815309


## Decision Tree

In [26]:
name = "Decision tree"
classifier = tree.DecisionTreeClassifier(criterion='entropy')
d_range = list(range(1, 31)) # list of parameter values to test
#s_range = list(range(2, 10))
param_grid = dict(max_depth=d_range)#, min_samples_split=s_range)
train(classifier, name, param_grid)

Training time : 0:00:04.462364


## Naive Bayes

In [33]:
name = "Naive Bayes - Gaussian"
classifier = GaussianNB()
train(classifier, name)

Training time : 0:00:00.012821


In [34]:
name = "Naive Bayes - Multinomial"
classifier = MultinomialNB()
train(classifier, name)

Training time : 0:00:00.020107


In [35]:
name = "Naive Bayes - Bernoulli"
classifier = BernoulliNB()
train(classifier, name)

Training time : 0:00:00.008154


## SVM - Support Vector Machine

In [None]:
name = "SVM - SVC"
classifier = svm.SVC()
C_range = np.logspace(-2, 10, 13)
print(C_range)
gamma_range = np.logspace(-9, 3, 13)
print(gamma_range)
param_grid = dict(gamma=gamma_range, C=C_range)
train(classifier, name, param_grid)

In [None]:
name = "SVM - Linear"
classifier = svm.LinearSVC()
C_range = range(170,230,5)
C_range = range(1,200,10)
param_grid = dict(C=C_range)
train(classifier, name, param_grid)

## Score Summary

In [41]:
from prettytable import PrettyTable
import operator
t = PrettyTable(['Model', 'Best score', 'Best params', 'FP', 'TN'])
for clf_name, result in results.items() :
    model = result['model']
    if 'grid' in result :
        grid = result['grid']
        # Compute false positives and false negatives
        model.__init__(**grid.best_params_)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        fp = cm[1,0]
        tn = cm[0,1]
        
        t.add_row([clf_name, round(grid.best_score_, 4), grid.best_params_, fp, tn])
        #print(result.best_estimator_)
    else : # For non grid_search models
        #training_error = clf.score(X_train, y_train)
        test_error = model.score(X_test, y_test)
        
        # Compute false positives and false negatives
        y_pred = model.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        fp = cm[1,0]
        tn = cm[0,1]
        t.add_row([clf_name, round(test_error, 4), None, fp, tn])
        
print(t.get_string(sort_key=operator.itemgetter(2, 1), sortby="Best score", reversesort=True))

+---------------------------+------------+--------------------+----+----+
|           Model           | Best score |    Best params     | FP | TN |
+---------------------------+------------+--------------------+----+----+
|       Decision tree       |    0.98    |  {'max_depth': 4}  | 10 | 8  |
|  Naive Bayes - Bernoulli  |   0.927    |        None        | 16 | 31 |
| Naive Bayes - Multinomial |   0.8463   |        None        | 13 | 86 |
|   Naive Bayes - Gaussian  |   0.8276   |        None        | 23 | 88 |
|            k-NN           |   0.8267   | {'n_neighbors': 8} | 92 | 29 |
+---------------------------+------------+--------------------+----+----+
