In [1]:
import pickle

In [2]:
# Load the spam editor records from the local pickle dump.
# NOTE: pickle.load can execute arbitrary code; only open dumps from a trusted source.
with open("spam_editors_dummy_data.pickle", "rb") as spam_dump:
    spam_editors = pickle.load(spam_dump)

In [3]:
# Load the non-spam editor records from the local pickle dump.
# NOTE: pickle.load can execute arbitrary code; only open dumps from a trusted source.
with open("non_spam_editors_dummy_data.pickle", "rb") as nonspam_dump:
    nonspam_editors = pickle.load(nonspam_dump)

In [4]:
import re
from urllib.parse import urlparse


# TODO: Figure out stopword removal
def gather_bios(editors):
    """Collect the bios of ``editors`` as single-line strings.

    Args:
        editors: Mapping of editor id to an editor record. Each record is
            indexed with ``"bio"``, whose value may be ``None``.

    Returns:
        A list holding one cleaned bio for every editor whose bio is not
        ``None``, in the mapping's iteration order.
    """
    # Only line breaks and tabs are layout characters. The earlier character
    # class ``[\n,\r,\t]`` also contained literal commas, so every comma in
    # the text was deleted along with them.
    layout_chars = re.compile(r"[\n\r\t]")
    # Substitute a space (not the empty string) so that words separated only
    # by a line break or tab are not glued together into a single token.
    return [
        layout_chars.sub(" ", editor["bio"])
        for editor in editors.values()
        if editor["bio"] is not None
    ]


def gather_email_domains(editors):
    """Return the email domain of every editor, in iteration order.

    The domain is the second piece of ``editor["email"]`` after splitting the
    address on ``"@"``.
    """
    domains = []
    for editor in editors.values():
        address_parts = editor["email"].split("@")
        domains.append(address_parts[1])
    return domains


def gather_website_domains(editors):
    """Return the hostname of every editor website that has one.

    Each ``editor["website"]`` is parsed with ``urlparse``; entries whose
    parsed hostname is ``None`` are left out of the result.
    """
    hostnames = (urlparse(editor["website"]).hostname for editor in editors.values())
    return [hostname for hostname in hostnames if hostname is not None]

In [5]:
# Bios of both classes together form the corpus the bio tokenizer is fitted on.
bios = [*gather_bios(spam_editors), *gather_bios(nonspam_editors)]

In [6]:
from keras.preprocessing.text import Tokenizer
# num_words=512 lines up with the 512-wide zero vector that preprocess_editor
# uses for editors without a bio.
bio_tokenizer = Tokenizer(num_words=512)
bio_tokenizer.fit_on_texts(bios)

# Keep the fitted tokenizer on disk next to the datasets.
with open("bio_tokenizer.pickle", "wb") as tokenizer_file:
    pickle.dump(bio_tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

Using TensorFlow backend.


In [7]:
# https://stackoverflow.com/a/32469562/5191080
# from nltk.corpus import stopwords
# from nltk.tokenize import wordpunct_tokenize
# from nltk.stem.porter import PorterStemmer

# stop_words = set(stopwords.words('english'))
# stop_words.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', '&']) # remove it if you need punctuation 
# porter = PorterStemmer()


# for doc in bios:
#     words = []
#     for i in wordpunct_tokenize(doc):
#         l = i.lower()
#         if l not in stop_words:
#             words.append(l)

# for word in words[:400]:
#     tokens = [porter.stem(word)]
    
# with open("nltk_words.pickle", "wb") as f:
#     pickle.dump(tokens, f)

In [8]:
# Every email domain from both editor sets is one fitting sample.
all_email_domains = gather_email_domains(spam_editors) + gather_email_domains(nonspam_editors)

# filters="" turns off the tokenizer's character filtering so a domain is not
# broken up at its dots.
email_tokenizer = Tokenizer(filters="", num_words=1023)
email_tokenizer.fit_on_texts(all_email_domains)

# Keep the fitted tokenizer on disk next to the datasets.
with open("email_tokenizer.pickle", "wb") as tokenizer_file:
    pickle.dump(email_tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

In [9]:
# Every website hostname from both editor sets is one fitting sample.
all_website_domains = gather_website_domains(spam_editors) + gather_website_domains(nonspam_editors)

# filters="" turns off the tokenizer's character filtering so a hostname is
# not broken up at its dots.
website_tokenizer = Tokenizer(filters="", num_words=1022)
website_tokenizer.fit_on_texts(all_website_domains)

# Keep the fitted tokenizer on disk next to the datasets.
with open("website_tokenizer.pickle", "wb") as tokenizer_file:
    pickle.dump(website_tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)

In [10]:
import numpy as np
from urllib.parse import urlparse
from datetime import timedelta
from urlextract import URLExtract

# Divisor used by preprocess_editor to express a timedelta as a number of hours.
one_hour = timedelta(hours=1)
# Shared URL detector used by preprocess_editor on editor bios.
extractor = URLExtract()


def preprocess_editor(editor, spam):
    """Turn one editor record into a flat numeric feature vector.

    Layout of the returned vector:
        0     spam label, as passed in
        1-4   flags: area set, gender set, birth date set, nonzero privs
        5     bio length in characters (0 without a bio)
        6     whether the bio contains a URL (0 without a bio)
        7-9   hours from ``member_since`` to the email confirmation, the last
              update and the last login; -1 when that timestamp is missing,
              -2 for all three when ``member_since`` itself is missing
        10    email domain token (1024 when the tokenizer yields no token)
        11    website domain token (1023 when the tokenizer yields no token,
              1024 when the website has no hostname)
        12-   tf-idf row from ``bio_tokenizer.texts_to_matrix``, or 512 zeros
              without a bio

    Args:
        editor: Editor record indexed by the keys read below
            (``member_since``, ``last_updated``, ``last_login_date``,
            ``email_confirm_date``, ``email``, ``website``, ``bio``, ``area``,
            ``gender``, ``birth_date``, ``privs``).
        spam: Label stored in the first column (callers pass 1 or 0).

    Returns:
        A 1-D numpy array: the twelve scalar features followed by the bio row.
    """
    member_since = editor["member_since"]

    def hours_since_signup(key):
        # -1 marks a timestamp that is missing on its own.
        moment = editor[key]
        if moment is None:
            return -1
        return (moment - member_since) / one_hour

    # Apparently there are users with unset member_since
    if member_since is not None:
        # These shouldn't be none but you can't trust the database
        update_delta = hours_since_signup("last_updated")
        login_delta = hours_since_signup("last_login_date")
        # Confirm date may be None
        conf_delta = hours_since_signup("email_confirm_date")
    else:
        update_delta, login_delta, conf_delta = -2, -2, -2

    # Email domain
    email_sequence = email_tokenizer.texts_to_sequences([editor["email"].split("@")[1]])[0]
    email_token = email_sequence[0] if len(email_sequence) > 0 else 1024

    # Website domain
    domain = urlparse(editor["website"]).hostname
    if domain is None:
        website_token = 1024
    else:
        # texts_to_sequences takes a list of texts. Passing the bare string
        # made it tokenize the hostname one character at a time.
        website_sequence = website_tokenizer.texts_to_sequences([domain])[0]
        # Read the token from the website sequence. This used to index the
        # email sequence, which gave the wrong feature and could raise
        # IndexError for an unknown email domain.
        website_token = website_sequence[0] if len(website_sequence) > 0 else 1023

    # Bio metadata
    bio_text = editor["bio"]
    if bio_text is not None:
        bio_len = len(bio_text)
        bio_urls = extractor.has_urls(bio_text)
        bio = bio_tokenizer.texts_to_matrix([bio_text], mode="tfidf")[0]
    else:
        bio_len, bio_urls = 0, 0
        bio = np.zeros(512)

    data = np.array([
        spam, # spam classification
        editor["area"] is not None, # Area Set
        editor["gender"] is not None, # Gender
        editor["birth_date"] is not None, # Birth date set
        editor["privs"] != 0, # Nonzero privs
        bio_len, # Bio length
        bio_urls, # URLs in bio
        conf_delta, # Confirmation delta
        update_delta, # Last updated delta
        login_delta, # Last login delta
        email_token, # Email domain
        website_token, # Website domain
    ], dtype=np.float32)

    return np.concatenate((data, bio))

In [11]:
import random

# Count of editors to select from each dict
TRAINING_COUNT = 8000
# Width of one preprocess_editor row: 12 scalar features + 512 bio columns
FEATURE_COUNT = 524
# Fixed seed so a fresh kernel run picks the same editors every time
SHUFFLE_SEED = 42

# Shuffle both dicts to prevent only picking the oldest editors
spam_keys = list(spam_editors.keys())
nonspam_keys = list(nonspam_editors.keys())

shuffle_rng = random.Random(SHUFFLE_SEED)
shuffle_rng.shuffle(spam_keys)
shuffle_rng.shuffle(nonspam_keys)

# np.empty leaves its rows uninitialized, so allocate only the rows that will
# really be written. If either dict holds fewer than TRAINING_COUNT editors the
# set shrinks (and stays balanced) instead of being pickled with garbage rows.
train_pairs = min(TRAINING_COUNT, len(spam_keys), len(nonspam_keys))
training_set = np.empty((train_pairs * 2, FEATURE_COUNT))

# Alternate spam/nonspam: spam on even rows, nonspam on odd rows
selected_pairs = zip(spam_keys[:train_pairs], nonspam_keys[:train_pairs])
for pair, (spam_key, nonspam_key) in enumerate(selected_pairs):
    training_set[2 * pair] = preprocess_editor(spam_editors[spam_key], 1)
    training_set[2 * pair + 1] = preprocess_editor(nonspam_editors[nonspam_key], 0)

with open("spambrainz_dataset.pickle", "wb") as f:
    pickle.dump(training_set, f)

In [12]:
# Evaluation set

EVAL_COUNT = 500

# Take the EVAL_COUNT shuffled keys that follow the training slice. The former
# slice [TRAINING_COUNT:EVAL_COUNT] was empty whenever TRAINING_COUNT >=
# EVAL_COUNT, so no row of the evaluation set was ever filled in.
eval_spam_keys = spam_keys[TRAINING_COUNT:TRAINING_COUNT + EVAL_COUNT]
eval_nonspam_keys = nonspam_keys[TRAINING_COUNT:TRAINING_COUNT + EVAL_COUNT]

# np.empty leaves its rows uninitialized, so allocate only the rows that will
# really be written (fewer than EVAL_COUNT pairs if the dicts run out of keys).
eval_pairs = min(len(eval_spam_keys), len(eval_nonspam_keys))
eval_set = np.empty((eval_pairs * 2, 524))

# Alternate spam/nonspam: spam on even rows, nonspam on odd rows
for pair in range(eval_pairs):
    eval_set[2 * pair] = preprocess_editor(spam_editors[eval_spam_keys[pair]], 1)
    eval_set[2 * pair + 1] = preprocess_editor(nonspam_editors[eval_nonspam_keys[pair]], 0)

with open("spambrainz_dataset_eval.pickle", "wb") as f:
    pickle.dump(eval_set, f)