In [345]:
# import supporting packages
import pandas as pd
import numpy as np
import random
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import cross_val_score
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, ExtraTreesClassifier
from sklearn.metrics import confusion_matrix

In [346]:
# load training and test data
filename1 = 'training.csv'
accounts = pd.read_csv(filename1)
filename2 = 'test.csv'
test = pd.read_csv(filename2)
test = test[:575] # error in reading loads 1000 rows; there's only data for 575 rows

In [347]:
# clean NaNs out of the data
def remove_NaN(text):
    if (pd.isnull(text)):
        text = ""
    else:
        text = text
    return text

accounts['description'] = accounts['description'].apply(remove_NaN)
accounts['location'] = accounts['location'].apply(remove_NaN)
accounts['url'] = accounts['description'].apply(remove_NaN)
accounts['status'] = accounts['status'].apply(remove_NaN)
accounts['has_extended_profile'] = accounts['has_extended_profile'].apply(remove_NaN)

test['description'] = test['description'].apply(remove_NaN)
test['location'] = test['location'].apply(remove_NaN)
test['url'] = test['description'].apply(remove_NaN)
test['status'] = test['status'].apply(remove_NaN)
test['has_extended_profile'] = test['has_extended_profile'].apply(remove_NaN)

In [348]:
# add a column that indicates if the word "bot" appears in the screen name
def bot_in_name(text):
    if "bot" in text.lower():
        return 1
    else:
        return 0
    
def bot_in_desc(text):
    text = text.lower()
    if "bot," in text:
        return 1
    if " bot " in text:
        return 1
    if " bot." in text:
        return 1
    if "#bot" in text:
        return 1
    return 0

accounts['bot_in_screen_name'] = accounts.screen_name.apply(bot_in_name)
test['bot_in_screen_name'] = test.screen_name.apply(bot_in_name)
accounts['bot_in_desc'] = accounts.description.apply(bot_in_desc)
test['bot_in_desc'] = test.description.apply(bot_in_desc)
accounts['truther_in_screen_name'] = accounts.screen_name.apply(truther_in_name)
test['truther_in_screen_name'] = test.screen_name.apply(truther_in_name)

In [349]:
# clean the test data (there's some issues with 0 counts for followers/friends)
# (and in some places verified=0 is coded as None)

def clean_verified(text):
    if text.lower() == 'true':
        return 1
    else:
        return 0
    
def clean_counts(text):
    if text == "None":
        return 0
    else:
        return int(text)
    
test.verified = test.verified.apply(clean_verified)
test.friends_count = test.friends_count.apply(clean_counts)
test.followers_count = test.followers_count.apply(clean_counts)

In [350]:
# get the text of the last status at the time of data collection
def return_text(text):
    try:
        if ("'text':" in text):
            start = text.index("'text': '") + 9
            end = text.index("',", start)
            return text[start:end]
        if ('"text":' in text):
            start = text.index('"text": "') + 9
            end = text.index('",', start)
            return text[start:end]
        if (text != "null" and text != "None"):
            return text
        if not pd.isnull(text):
            return ""
    except:
        start = text.index("'text'") + 10
        end = text.index(",", start + 1) - 1
        return text[start:end]
    
accounts['text'] = accounts.status.apply(return_text)
test['text'] = test.status.apply(return_text)

def bot_in_text(text):
    if ("bot") in text:
        return 1
    else:
        return 0
    
accounts['bot_in_text'] = accounts.text.apply(bot_in_text)
test['bot_in_text'] = test.text.apply(bot_in_text)

In [351]:
# add column for lexical diversity of the description
# lexical diversity function (as written in Shellman [2])
# http://www.erinshellman.com/bot-or-not/

def lexical_diversity(text):
    if (pd.isnull(text)):
        diversity = 0
    elif len(text) == 0:
        diversity = 0
    else:
        diversity = float(len(set(text))) / len(text)
    return diversity

accounts['desc_lex_diversity'] = accounts['description'].apply(lexical_diversity)
accounts['text_lex_diversity'] = accounts['text'].apply(lexical_diversity)

In [352]:
# split the accounts into bots and non-bots, then look at basic statistics
# for friend count, follower count, and lexical diversity of the description and the text
bots = accounts.loc[accounts['bot'] == 1]
humans = accounts.loc[accounts['bot'] == 0]

print("Bots:")
print("="*100)
print("Followers:")
print("Avg: {:.4f}\t\tStd: {:.4f}\tMed: {:.4f}".format(bots.followers_count.mean(), bots.followers_count.std(), bots.followers_count.median()))
print("\nFriends:")
print("Avg: {:.4f}\t\tStd: {:.4f}\t\tMed: {:.4f}".format(bots.friends_count.mean(), bots.friends_count.std(), bots.friends_count.median()))
print("\nDescription Lexical Diversity:")
print("Avg: {:.4f}\t\tStd: {:.4f}\t\tMed: {:.4f}".format(bots.desc_lex_diversity.mean(), bots.desc_lex_diversity.std(), bots.desc_lex_diversity.median()))
print("\nText Lexical Diversity:")
print("Avg: {:.4f}\t\tStd: {:.4f}\t\tMed: {:.4f}".format(bots.text_lex_diversity.mean(), bots.text_lex_diversity.std(), bots.text_lex_diversity.median()))
print("\nStatuses:")
print("Avg: {:.4f}\t\tStd: {:.4f}\t\tMed: {:.4f}".format(bots.statuses_count.mean(), bots.statuses_count.std(), bots.statuses_count.median()))
print("\nHumans:")
print("="*100)
print("Followers:")
print("Avg: {:.4f}\tStd: {:.4f}\tMed: {:.4f}".format(humans.followers_count.mean(), humans.followers_count.std(), humans.followers_count.median()))
print("\nFriends:")
print("Avg: {:.4f}\t\tStd: {:.4f}\t\tMed: {:.4f}".format(humans.friends_count.mean(), humans.friends_count.std(), humans.friends_count.median()))
print("\nDescription Lexical Diversity:")
print("Avg: {:.4f}\t\tStd: {:.4f}\t\tMed: {:.4f}".format(humans.desc_lex_diversity.mean(), humans.desc_lex_diversity.std(), humans.desc_lex_diversity.median()))
print("\nText Lexical Diversity:")
print("Avg: {:.4f}\t\tStd: {:.4f}\t\tMed: {:.4f}".format(humans.text_lex_diversity.mean(), humans.text_lex_diversity.std(), humans.text_lex_diversity.median()))
print("\nStatuses:")
print("Avg: {:.4f}\t\tStd: {:.4f}\t\tMed: {:.4f}".format(humans.statuses_count.mean(), humans.statuses_count.std(), humans.statuses_count.median()))

Bots:
Followers:
Avg: 13458.7737		Std: 215307.6876	Med: 99.0000

Friends:
Avg: 1611.3747		Std: 17841.9368		Med: 14.0000

Description Lexical Diversity:
Avg: 0.3141		Std: 0.1988		Med: 0.2936

Text Lexical Diversity:
Avg: 0.3960		Std: 0.2537		Med: 0.3704

Statuses:
Avg: 29249.1703		Std: 222066.0451		Med: 1560.0000

Humans:
Followers:
Avg: 1876165.4546	Std: 7591517.0999	Med: 4422.5000

Friends:
Avg: 7647.1308		Std: 75387.9973		Med: 382.5000

Description Lexical Diversity:
Avg: 0.3445		Std: 0.2196		Med: 0.3193

Text Lexical Diversity:
Avg: 0.3817		Std: 0.2069		Med: 0.3496

Statuses:
Avg: 10683.1023		Std: 29660.2321		Med: 2931.5000


In [353]:
# get the ratio of verified accounts
b_verified = bots['verified'].sum() / len(bots.index)
h_verified = humans['verified'].sum() / len(bots.index)

print("Verification ratio for bots: {:.4f}".format(b_verified))
print("Verification ratio for humans: {:.4f}".format(h_verified))

Verification ratio for bots: 0.0061
Verification ratio for humans: 0.4883


In [354]:
cv = 5

X = accounts[['bot_in_screen_name', 'bot_in_text', 'verified', 'friends_count', 'followers_count', 'statuses_count', 'bot_in_desc']]
y = accounts.bot

dt = DecisionTreeClassifier()
dt.fit(X, y)

dt_accuracy = sum(cross_val_score(dt, X, y, cv=cv, scoring='accuracy'))/cv
dt_precision = sum(cross_val_score(dt, X, y, cv=cv, scoring='precision'))/cv
dt_recall = sum(cross_val_score(dt, X, y, cv=cv, scoring='recall'))/cv
dt_accuracy, dt_precision, dt_recall

(0.83768953344876729, 0.83874462408370776, 0.84479702687249847)

In [355]:
mnb = MultinomialNB()
mnb.fit(X, y)

mnb_accuracy = sum(cross_val_score(mnb, X, y, cv=cv, scoring='accuracy'))/cv
mnb_precision = sum(cross_val_score(mnb, X, y, cv=cv, scoring='precision'))/cv
mnb_recall = sum(cross_val_score(mnb, X, y, cv=cv, scoring='recall'))/cv
mnb_accuracy, mnb_precision, mnb_recall

(0.67569603219398022, 0.59647166652864192, 0.9735020011435106)

In [356]:
bnb = BernoulliNB(alpha=1)
bnb.fit(X, y)

bnb_accuracy = sum(cross_val_score(bnb, X, y, cv=cv, scoring='accuracy'))/cv
bnb_precision = sum(cross_val_score(bnb, X, y, cv=cv, scoring='precision'))/cv
bnb_recall = sum(cross_val_score(bnb, X, y, cv=cv, scoring='recall'))/cv
bnb_accuracy, bnb_precision, bnb_recall

(0.76332450039700384, 0.94480609869692, 0.52991995425957683)

In [357]:
lr = LogisticRegression(C=.001, penalty="l1")
lr.fit(X, y)

lr_accuracy = sum(cross_val_score(lr, X, y, cv=cv, scoring='accuracy'))/cv
lr_precision = sum(cross_val_score(lr, X, y, cv=cv, scoring='precision'))/cv
lr_recall = sum(cross_val_score(lr, X, y, cv=cv, scoring='recall'))/cv
lr_accuracy, lr_precision, lr_recall

(0.6871386707228021, 0.60795932487185289, 0.95079473985134366)

In [358]:
rf = RandomForestClassifier(n_jobs = 5, n_estimators = 1000, max_features = 4, max_depth = 5)
rf.fit(X, y)

rf_accuracy = sum(cross_val_score(rf, X, y, cv=cv, scoring='accuracy'))/cv
rf_precision = sum(cross_val_score(rf, X, y, cv=cv, scoring='precision'))/cv
rf_recall = sum(cross_val_score(rf, X, y, cv=cv, scoring='recall'))/cv
rf_accuracy, rf_precision, rf_recall

(0.87771644679989413, 0.92093232446366324, 0.80996569468267587)

In [359]:
k = 10
# train the k-nearest neighbors model with k = 10
for weights in ['uniform', 'distance']:
    knn = KNeighborsClassifier(k, weights=weights)
    knn.fit(X, y)
    
knn_accuracy = sum(cross_val_score(knn, X, y, cv=cv, scoring='accuracy'))/cv
knn_precision = sum(cross_val_score(knn, X, y, cv=cv, scoring='precision'))/cv
knn_recall = sum(cross_val_score(knn, X, y, cv=cv, scoring='recall'))/cv
knn_accuracy, knn_precision, knn_recall

(0.82444012895449281, 0.79114604590425919, 0.85311034877072611)

In [360]:
vc = VotingClassifier(estimators=[('dt', dt), ('mnb', mnb), ('bnb', bnb), ('lr', lr), ('rf', rf), ('knn', knn)], voting='hard', weights=[5,1,1,1,8,1])
vc.fit(X, y)

vc_accuracy = sum(cross_val_score(vc, X, y, cv=cv, scoring='accuracy'))/cv
vc_precision = sum(cross_val_score(vc, X, y, cv=cv, scoring='precision'))/cv
vc_recall = sum(cross_val_score(vc, X, y, cv=cv, scoring='recall'))/cv
vc_accuracy, vc_precision, vc_recall

(0.87879234308782872, 0.92041615806196919, 0.81072041166380782)

In [361]:
predictions = vc.predict(X)
tn, fp, fn, tp = confusion_matrix(y, predictions).ravel()
tn, fp, fn, tp

(1408, 68, 215, 1106)

In [362]:
for i in range(len(predictions)):
    if (X.bot_in_screen_name[i] == 1):
        predictions[i] = 1
    if (X.bot_in_desc[i] == 1):
        predictions[i] = 1
    if (X.bot_in_text[i] == 1):
        predictions[i] = 1
        
tn, fp, fn, tp = confusion_matrix(y, predictions).ravel()
tn, fp, fn, tp

(1389, 87, 214, 1107)

In [363]:
# now process the test data
data = test[['bot_in_screen_name', 'bot_in_text', 'verified', 'friends_count', 'followers_count', 'statuses_count', 'bot_in_desc']]
predictions = vc.predict(data)

for i in range(len(predictions)):
    if (data.bot_in_screen_name[i] == 1):
        predictions[i] = 1
    if (data.bot_in_desc[i] == 1):
        predictions[i] = 1
    if (data.bot_in_text[i] == 1):
        predictions[i] = 1

In [364]:
bots = predictions.tolist()
ids = test.id.tolist()

with open('output.txt', 'w') as file:
    file.write("Id,Bot\n")
    for i in range(len(bots)):
        file.write("{:.0f},{:d}\n".format(ids[i], bots[i]))

In [365]:
descriptions = accounts['description'].tolist()

count_vect = CountVectorizer()
X = count_vect.fit_transform(descriptions)
y = accounts.bot

mn = MultinomialNB()
mn.fit(X, y)
mnb_accuracy = sum(cross_val_score(mnb, X, y, cv=cv, scoring='accuracy'))/cv
mnb_precision = sum(cross_val_score(mnb, X, y, cv=cv, scoring='precision'))/cv
mnb_recall = sum(cross_val_score(mnb, X, y, cv=cv, scoring='recall'))/cv
mnb_accuracy, mnb_precision, mnb_recall

(0.74938759371043906, 0.77720485744836443, 0.65784734133790734)

In [366]:
bn = BernoulliNB()
bn.fit(X, y)
bnb_accuracy = sum(cross_val_score(bnb, X, y, cv=cv, scoring='accuracy'))/cv
bnb_precision = sum(cross_val_score(bnb, X, y, cv=cv, scoring='precision'))/cv
bnb_recall = sum(cross_val_score(bnb, X, y, cv=cv, scoring='recall'))/cv
bnb_accuracy, bnb_precision, bnb_recall

(0.75367587269092051, 0.83257776707149167, 0.59880503144654085)

In [367]:
lr = LogisticRegression()
lr.fit(X, y)
lr_accuracy = sum(cross_val_score(lr, X, y, cv=cv, scoring='accuracy'))/cv
lr_precision = sum(cross_val_score(lr, X, y, cv=cv, scoring='precision'))/cv
lr_recall = sum(cross_val_score(lr, X, y, cv=cv, scoring='recall'))/cv
lr_accuracy, lr_precision, lr_recall

(0.74507890650161512, 0.81798185723997352, 0.59197541452258429)

In [368]:
for weights in ['uniform', 'distance']:
    knn = KNeighborsClassifier(k, weights=weights)
    knn.fit(X, y)

knn_accuracy = sum(cross_val_score(knn, X, y, cv=cv, scoring='accuracy'))/cv
knn_precision = sum(cross_val_score(knn, X, y, cv=cv, scoring='precision'))/cv
knn_recall = sum(cross_val_score(knn, X, y, cv=cv, scoring='recall'))/cv
knn_accuracy, knn_precision, knn_recall

(0.60170153603806131, 0.6032948436172374, 0.52992853058890799)

In [369]:
text = accounts['text'].tolist()

count_vect = CountVectorizer()
X = count_vect.fit_transform(text)
y = accounts.bot

mn = MultinomialNB()
mn.fit(X, y)
mnb_accuracy = sum(cross_val_score(mnb, X, y, cv=cv, scoring='accuracy'))/cv
mnb_precision = sum(cross_val_score(mnb, X, y, cv=cv, scoring='precision'))/cv
mnb_recall = sum(cross_val_score(mnb, X, y, cv=cv, scoring='recall'))/cv
mnb_accuracy, mnb_precision, mnb_recall

(0.63030494357443756, 0.69400758800758788, 0.38984276729559747)

In [370]:
bn = BernoulliNB()
bn.fit(X, y)
bnb_accuracy = sum(cross_val_score(bnb, X, y, cv=cv, scoring='accuracy'))/cv
bnb_precision = sum(cross_val_score(bnb, X, y, cv=cv, scoring='precision'))/cv
bnb_recall = sum(cross_val_score(bnb, X, y, cv=cv, scoring='recall'))/cv
bnb_accuracy, bnb_precision, bnb_recall

(0.67641287121451277, 0.63590902405900096, 0.73806174957118353)

In [371]:
lr = LogisticRegression()
lr.fit(X, y)
lr_accuracy = sum(cross_val_score(lr, X, y, cv=cv, scoring='accuracy'))/cv
lr_precision = sum(cross_val_score(lr, X, y, cv=cv, scoring='precision'))/cv
lr_recall = sum(cross_val_score(lr, X, y, cv=cv, scoring='recall'))/cv
lr_accuracy, lr_precision, lr_recall

(0.6639147446260989, 0.62391081160652195, 0.72594911377930238)

In [372]:
for weights in ['uniform', 'distance']:
    knn = KNeighborsClassifier(k, weights=weights)
    knn.fit(X, y)

knn_accuracy = sum(cross_val_score(knn, X, y, cv=cv, scoring='accuracy'))/cv
knn_precision = sum(cross_val_score(knn, X, y, cv=cv, scoring='precision'))/cv
knn_recall = sum(cross_val_score(knn, X, y, cv=cv, scoring='recall'))/cv
knn_accuracy, knn_precision, knn_recall

(0.47871389896013694, 0.46593535509449974, 0.78421097770154369)