In [159]:
import pandas as pd
import numpy as np
from math import isinf
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import cross_val_score
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

# get the cleaned-up account data into a dataframe
# these are the test accounts, not the larger data set
filename = 'cleaned_accounts.csv'
headers = ["id", "id_str", "screen_name", "location", "description", "url", "followers_count", "friends_count", "listed_count", "created_at", "favourites_count", "verified", "statuses_count", "lang", "status", "default_profile", "default_profile_image", "has_extended_profile", "name", "bot"]
accounts = pd.read_csv(filename, names=headers)

# add freinds-to-followers column
accounts['ratio'] = accounts['friends_count'] / accounts['followers_count']

# remove NaN from description column
def remove_NaN(text):
    if (pd.isnull(text)):
        text = ""
    else:
        text = text
    return text

accounts['description'] = accounts['description'].apply(remove_NaN)

# add column for lexical diversity of the description
# lexical diversity function (as written in Shellman [2])
# http://www.erinshellman.com/bot-or-not/

def lexical_diversity(text):
    if (pd.isnull(text)):
        diversity = 0
    elif len(text) == 0:
        diversity = 0
    else:
        diversity = float(len(set(text))) / len(text)
    return diversity

accounts['desc_lex_diversity'] = accounts['description'].apply(lexical_diversity)

# separate the bots and humans for exploratory diagnostics
bots = accounts.loc[accounts['bot'] == 1]
humans = accounts.loc[accounts['bot'] == 0]

In [160]:
# get the avg, std dev, and median for followers, friends, and friend-to-follower ratio of bot accounts and human accounts
b_follower_avg = bots['followers_count'].mean()
b_follower_std = bots['followers_count'].std()
b_follower_med = bots['followers_count'].median()
b_friend_avg = bots['friends_count'].mean()
b_friend_std = bots['friends_count'].std()
b_friend_med = bots['friends_count'].median()
b_ratio_avg = bots['ratio'].mean()
b_ratio_std = bots['ratio'].std()
b_ratio_med = bots['ratio'].median()

h_follower_avg = humans['followers_count'].mean()
h_follower_std = humans['followers_count'].std()
h_follower_med = humans['followers_count'].median()
h_friend_avg = humans['friends_count'].mean()
h_friend_std = humans['friends_count'].std()
h_friend_med = humans['friends_count'].median()
h_ratio_avg = humans['ratio'].mean()
h_ratio_std = humans['ratio'].std()
h_ratio_med = humans['ratio'].median()

In [161]:
# print the diagnostic results
print("Bots:")
print("="*100)
print("Followers:")
print("Avg: {:.4f}\t\tStd: {:.4f}\t\tMed: {:.4f}".format(b_follower_avg, b_follower_std, b_follower_med))
print("\nFriends:")
print("Avg: {:.4f}\t\tStd: {:.4f}\t\tMed: {:.4f}".format(b_friend_avg, b_friend_std, b_friend_med))
print("\nRatio:")
print("Avg: {:.4f}\t\tStd: {:.4f}\t\tMed: {:.4f}".format(b_ratio_avg, b_ratio_std, b_ratio_med))
print("\nHumans:")
print("="*100)
print("Followers:")
print("Avg: {:.4f}\t\tStd: {:.4f}\t\tMed: {:.4f}".format(h_follower_avg, h_follower_std, h_follower_med))
print("\nFriends:")
print("Avg: {:.4f}\t\tStd: {:.4f}\t\tMed: {:.4f}".format(h_friend_avg, h_friend_std, h_friend_med))
print("\nRatio:")
print("Avg: {:.4f}\t\tStd: {:.4f}\t\tMed: {:.4f}".format(h_ratio_avg, h_ratio_std, h_ratio_med))


Bots:
Followers:
Avg: 11887.1000		Std: 25015.9995		Med: 2579.5000

Friends:
Avg: 275.9800		Std: 883.7261		Med: 2.0000

Ratio:
Avg: 0.1482		Std: 0.3573		Med: 0.0015

Humans:
Followers:
Avg: 364099.1400		Std: 920812.4892		Med: 93702.0000

Friends:
Avg: 873.6000		Std: 1263.8037		Med: 397.0000

Ratio:
Avg: 0.0248		Std: 0.0968		Med: 0.0066


In [162]:
# get the ratio of verified accounts
b_verified = bots['verified'].sum() / len(bots.index)
h_verified = humans['verified'].sum() / len(bots.index)

print("Verification ratio for bots: {:.4f}".format(b_verified))
print("Verification ratio for humans: {:.4f}".format(h_verified))

Verification ratio for bots: 0.0200
Verification ratio for humans: 0.2600


In [163]:
# get the lexical diversity avg, std dev, and median
b_lexdev_avg = bots['desc_lex_diversity'].mean()
b_lexdev_std = bots['desc_lex_diversity'].std()
b_lexdev_med = bots['desc_lex_diversity'].median()

h_lexdev_avg = humans['desc_lex_diversity'].mean()
h_lexdev_std = humans['desc_lex_diversity'].std()
h_lexdev_med = humans['desc_lex_diversity'].median()

print("Bots: Avg: {:.4f}\t\tStd: {:.4f}\t\tMed: {:.4f}".format(b_lexdev_avg, b_lexdev_std, b_lexdev_med))
print("Humans: Avg: {:.4f}\t\tStd: {:.4f}\t\tMed: {:.4f}".format(h_lexdev_avg, h_lexdev_std, h_lexdev_med))

Bots: Avg: 0.3353		Std: 0.1872		Med: 0.3152
Humans: Avg: 0.4309		Std: 0.2317		Med: 0.3741


In [164]:
# train the bernoulli and multinomial naive bayes models on the screen names
scrnames = accounts['screen_name'].tolist()
targets = accounts['bot'].tolist()

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(scrnames)

clf_mn = MultinomialNB().fit(X_train_counts, targets)
clf_bn = BernoulliNB().fit(X_train_counts, targets)

cv = 5
mn_accuracy = sum(cross_val_score(clf_mn, X_train_counts, targets, cv=cv, scoring='accuracy'))/cv
mn_recall = sum(cross_val_score(clf_mn, X_train_counts, targets, cv=cv, scoring='recall'))/cv
mn_auc = sum(cross_val_score(clf_mn, X_train_counts, targets, cv=cv, scoring='roc_auc'))/cv

print("Multinomial Naive Bayes Classifier:")
print("\tAccuracy: {:.4f}".format(mn_accuracy))
print("\tRecall: {:.4f}".format(mn_recall))
print("\tAUC Score: {:.4f}".format(mn_auc))
print()

bn_accuracy = sum(cross_val_score(clf_bn, X_train_counts, targets, cv=cv, scoring='accuracy'))/cv
bn_recall = sum(cross_val_score(clf_bn, X_train_counts, targets, cv=cv, scoring='recall'))/cv
bn_auc = sum(cross_val_score(clf_bn, X_train_counts, targets, cv=cv, scoring='roc_auc'))/cv

print("Bernoulli Naive Bayes Classifier:")
print("\tAccuracy: {:.4f}".format(bn_accuracy))
print("\tRecall: {:.4f}".format(bn_recall))
print("\tAUC Score: {:.4f}".format(bn_auc))

Multinomial Naive Bayes Classifier:
	Accuracy: 0.5000
	Recall: 0.0000
	AUC Score: 0.5000

Bernoulli Naive Bayes Classifier:
	Accuracy: 0.5000
	Recall: 0.2000
	AUC Score: 0.5000


In [165]:
# now train the logistic regression model
lr = LogisticRegression()
lr.fit(X_train_counts, targets)

lr_accuracy = sum(cross_val_score(lr, X_train_counts, targets, cv=cv, scoring='accuracy'))/cv
lr_precision = sum(cross_val_score(lr, X_train_counts, targets, cv=cv, scoring='precision'))/cv
lr_recall = sum(cross_val_score(lr, X_train_counts, targets, cv=cv, scoring='recall'))/cv
lr_auc = sum(cross_val_score(lr, X_train_counts, targets, cv=cv, scoring='roc_auc'))/cv

print("Logistic Regression Classifier:")
print("\tAccuracy: {:.4f}".format(lr_accuracy))
print("\tPrecision: {:.4f}".format(lr_precision))
print("\tRecall: {:.4f}".format(lr_recall))
print("\tAUC Score: {:.4f}".format(lr_auc))

Logistic Regression Classifier:
	Accuracy: 0.5000
	Precision: 0.5000
	Recall: 1.0000
	AUC Score: 0.5000


In [166]:
# train the models on the names
scrnames = accounts['name'].tolist()

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(scrnames)

clf_mn = MultinomialNB().fit(X_train_counts, targets)
clf_bn = BernoulliNB().fit(X_train_counts, targets)

cv = 5
mn_accuracy = sum(cross_val_score(clf_mn, X_train_counts, targets, cv=cv, scoring='accuracy'))/cv
mn_precision = sum(cross_val_score(clf_mn, X_train_counts, targets, cv=cv, scoring='precision'))/cv
mn_recall = sum(cross_val_score(clf_mn, X_train_counts, targets, cv=cv, scoring='recall'))/cv
mn_auc = sum(cross_val_score(clf_mn, X_train_counts, targets, cv=cv, scoring='roc_auc'))/cv

print("Multinomial Naive Bayes Classifier:")
print("\tAccuracy: {:.4f}".format(mn_accuracy))
print("\tPrecision: {:.4f}".format(mn_precision))
print("\tRecall: {:.4f}".format(mn_recall))
print("\tAUC Score: {:.4f}".format(mn_auc))
print()

bn_accuracy = sum(cross_val_score(clf_bn, X_train_counts, targets, cv=cv, scoring='accuracy'))/cv
bn_precision = sum(cross_val_score(clf_bn, X_train_counts, targets, cv=cv, scoring='precision'))/cv
bn_recall = sum(cross_val_score(clf_bn, X_train_counts, targets, cv=cv, scoring='recall'))/cv
bn_auc = sum(cross_val_score(clf_bn, X_train_counts, targets, cv=cv, scoring='roc_auc'))/cv

print("Bernoulli Naive Bayes Classifier:")
print("\tAccuracy: {:.4f}".format(bn_accuracy))
print("\tPrecision: {:.4f}".format(bn_precision))
print("\tRecall: {:.4f}".format(bn_recall))
print("\tAUC Score: {:.4f}".format(bn_auc))

lr = LogisticRegression()
lr.fit(X_train_counts, targets)

lr_accuracy = sum(cross_val_score(lr, X_train_counts, targets, cv=cv, scoring='accuracy'))/cv
lr_precision = sum(cross_val_score(lr, X_train_counts, targets, cv=cv, scoring='precision'))/cv
lr_recall = sum(cross_val_score(lr, X_train_counts, targets, cv=cv, scoring='recall'))/cv
lr_auc = sum(cross_val_score(lr, X_train_counts, targets, cv=cv, scoring='roc_auc'))/cv

print()
print("Logistic Regression Classifier:")
print("\tAccuracy: {:.4f}".format(lr_accuracy))
print("\tPrecision: {:.4f}".format(lr_precision))
print("\tRecall: {:.4f}".format(lr_recall))
print("\tAUC Score: {:.4f}".format(lr_auc))

Multinomial Naive Bayes Classifier:
	Accuracy: 0.5400
	Precision: 0.6158
	Recall: 0.6400
	AUC Score: 0.6870

Bernoulli Naive Bayes Classifier:
	Accuracy: 0.5400
	Precision: 0.6158
	Recall: 0.6400
	AUC Score: 0.6130

Logistic Regression Classifier:
	Accuracy: 0.5300
	Precision: 0.5105
	Recall: 0.4400
	AUC Score: 0.5410


  'precision', 'predicted', average, warn_for)


In [167]:
# train the models on the URLS
scrnames = accounts['url'].tolist()

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(scrnames)

clf_mn = MultinomialNB().fit(X_train_counts, targets)
clf_bn = BernoulliNB().fit(X_train_counts, targets)

cv = 5
mn_accuracy = sum(cross_val_score(clf_mn, X_train_counts, targets, cv=cv, scoring='accuracy'))/cv
mn_precision = sum(cross_val_score(clf_mn, X_train_counts, targets, cv=cv, scoring='precision'))/cv
mn_recall = sum(cross_val_score(clf_mn, X_train_counts, targets, cv=cv, scoring='recall'))/cv
mn_auc = sum(cross_val_score(clf_mn, X_train_counts, targets, cv=cv, scoring='roc_auc'))/cv

print("Multinomial Naive Bayes Classifier:")
print("\tAccuracy: {:.4f}".format(mn_accuracy))
print("\tPrecision: {:.4f}".format(mn_precision))
print("\tRecall: {:.4f}".format(mn_recall))
print("\tAUC Score: {:.4f}".format(mn_auc))
print()

bn_accuracy = sum(cross_val_score(clf_bn, X_train_counts, targets, cv=cv, scoring='accuracy'))/cv
bn_precision = sum(cross_val_score(clf_bn, X_train_counts, targets, cv=cv, scoring='precision'))/cv
bn_recall = sum(cross_val_score(clf_bn, X_train_counts, targets, cv=cv, scoring='recall'))/cv
bn_auc = sum(cross_val_score(clf_bn, X_train_counts, targets, cv=cv, scoring='roc_auc'))/cv

print("Bernoulli Naive Bayes Classifier:")
print("\tAccuracy: {:.4f}".format(bn_accuracy))
print("\tPrecision: {:.4f}".format(bn_precision))
print("\tRecall: {:.4f}".format(bn_recall))
print("\tAUC Score: {:.4f}".format(bn_auc))

lr = LogisticRegression()
lr.fit(X_train_counts, targets)

lr_accuracy = sum(cross_val_score(lr, X_train_counts, targets, cv=cv, scoring='accuracy'))/cv
lr_precision = sum(cross_val_score(lr, X_train_counts, targets, cv=cv, scoring='precision'))/cv
lr_recall = sum(cross_val_score(lr, X_train_counts, targets, cv=cv, scoring='recall'))/cv
lr_auc = sum(cross_val_score(lr, X_train_counts, targets, cv=cv, scoring='roc_auc'))/cv

print()
print("Logistic Regression Classifier:")
print("\tAccuracy: {:.4f}".format(lr_accuracy))
print("\tPrecision: {:.4f}".format(lr_precision))
print("\tRecall: {:.4f}".format(lr_recall))
print("\tAUC Score: {:.4f}".format(lr_auc))

Multinomial Naive Bayes Classifier:
	Accuracy: 0.5800
	Precision: 0.5711
	Recall: 0.7200
	AUC Score: 0.6370

Bernoulli Naive Bayes Classifier:
	Accuracy: 0.5800
	Precision: 0.5711
	Recall: 0.7200
	AUC Score: 0.5970

Logistic Regression Classifier:
	Accuracy: 0.5800
	Precision: 0.5711
	Recall: 0.7200
	AUC Score: 0.6370


In [168]:
# train the models on the descriptions
scrnames = accounts['description'].tolist()

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(scrnames)

clf_mn = MultinomialNB().fit(X_train_counts, targets)
clf_bn = BernoulliNB().fit(X_train_counts, targets)

cv = 5
mn_accuracy = sum(cross_val_score(clf_mn, X_train_counts, targets, cv=cv, scoring='accuracy'))/cv
mn_precision = sum(cross_val_score(clf_mn, X_train_counts, targets, cv=cv, scoring='precision'))/cv
mn_recall = sum(cross_val_score(clf_mn, X_train_counts, targets, cv=cv, scoring='recall'))/cv
mn_auc = sum(cross_val_score(clf_mn, X_train_counts, targets, cv=cv, scoring='roc_auc'))/cv

print("Multinomial Naive Bayes Classifier:")
print("\tAccuracy: {:.4f}".format(mn_accuracy))
print("\tPrecision: {:.4f}".format(mn_precision))
print("\tRecall: {:.4f}".format(mn_recall))
print("\tAUC Score: {:.4f}".format(mn_auc))
print()

bn_accuracy = sum(cross_val_score(clf_bn, X_train_counts, targets, cv=cv, scoring='accuracy'))/cv
bn_precision = sum(cross_val_score(clf_bn, X_train_counts, targets, cv=cv, scoring='precision'))/cv
bn_recall = sum(cross_val_score(clf_bn, X_train_counts, targets, cv=cv, scoring='recall'))/cv
bn_auc = sum(cross_val_score(clf_bn, X_train_counts, targets, cv=cv, scoring='roc_auc'))/cv

print("Bernoulli Naive Bayes Classifier:")
print("\tAccuracy: {:.4f}".format(bn_accuracy))
print("\tPrecision: {:.4f}".format(bn_precision))
print("\tRecall: {:.4f}".format(bn_recall))
print("\tAUC Score: {:.4f}".format(bn_auc))

lr = LogisticRegression()
lr.fit(X_train_counts, targets)

lr_accuracy = sum(cross_val_score(lr, X_train_counts, targets, cv=cv, scoring='accuracy'))/cv
lr_precision = sum(cross_val_score(lr, X_train_counts, targets, cv=cv, scoring='precision'))/cv
lr_recall = sum(cross_val_score(lr, X_train_counts, targets, cv=cv, scoring='recall'))/cv
lr_auc = sum(cross_val_score(lr, X_train_counts, targets, cv=cv, scoring='roc_auc'))/cv

print()
print("Logistic Regression Classifier:")
print("\tAccuracy: {:.4f}".format(lr_accuracy))
print("\tPrecision: {:.4f}".format(lr_precision))
print("\tRecall: {:.4f}".format(lr_recall))
print("\tAUC Score: {:.4f}".format(lr_auc))

Multinomial Naive Bayes Classifier:
	Accuracy: 0.7100
	Precision: 0.7507
	Recall: 0.6400
	AUC Score: 0.7580

Bernoulli Naive Bayes Classifier:
	Accuracy: 0.6800
	Precision: 0.9333
	Recall: 0.3800
	AUC Score: 0.8340

Logistic Regression Classifier:
	Accuracy: 0.6900
	Precision: 0.7543
	Recall: 0.5600
	AUC Score: 0.7570
