In [None]:
import pandas as pd
from sklearn.externals import joblib
from sklearn.cluster import KMeans
from code import nlp
import numpy as np

In [None]:
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# from a trusted source.
bro_logs = pd.read_pickle("data/bro_logs_filtered.pkl")

# Keep only the SMB file-read events. The explicit copy makes the result an
# independent frame, so adding columns to it later does not write into a
# slice of `bro_logs` (which would raise pandas' SettingWithCopyWarning).
bro_logs_read = bro_logs[bro_logs["action"] == "SMB::FILE_READ"].copy()
print(bro_logs_read.head(10))

In [None]:
# Load the trained classifier and its matching vectorizer once, up front, so
# the rest of the notebook can classify file names without reloading them.
# NOTE(review): joblib files are pickle-based; only load trusted artifacts.
CLF_PATH = "data/joblib/clf_iter_9.pkl"
VECTORIZER_PATH = "data/joblib/vectorizer_iter_9.pkl"

clf = joblib.load(CLF_PATH)
vectorizer = joblib.load(VECTORIZER_PATH)

def classify_name(filename):
    """Return the predicted class label for a single file name.

    The name is turned into features with ``nlp.stringtofeatures``, vectorized
    with the module-level ``vectorizer`` and classified with the module-level
    ``clf``. Both expect a batch, so the name is wrapped in a one-element
    list and the single prediction is unwrapped again.
    """
    features = vectorizer.transform([nlp.stringtofeatures(filename)])
    predictions = clf.predict(features)
    return predictions[0]

# Classify every file name and store the label next to its log entry.
# `assign` builds a new frame instead of writing into the filtered slice,
# which avoids pandas' SettingWithCopyWarning and keeps the cell re-runnable.
bro_logs_read = bro_logs_read.assign(
    classification=bro_logs_read["name"].apply(classify_name)
)

print(bro_logs_read.head(10))


In [None]:
# Show which class labels actually occur in the classified data.
# Written as a call so the cell runs on both Python 2 and Python 3.
print(bro_logs_read["classification"].unique())

def df_to_pretty_avg(df_to_avg):
    """Return the share of each reported class in a Series of labels.

    Parameters
    ----------
    df_to_avg : pandas.Series
        Class labels, one per log entry.

    Returns
    -------
    list
        Relative frequencies of the labels "Public", "Personal info",
        "Customer Info" and "Internal", in that order. A label that does not
        occur contributes 0.0. Any other label still counts towards the
        total, so the four values need not sum to 1.
    """
    shares = df_to_avg.value_counts(normalize=True)
    reported_labels = ("Public", "Personal info", "Customer Info", "Internal")
    return [shares[label] if label in shares else 0. for label in reported_labels]

In [None]:
def create_clusters(data, memory, n_clusters=4, random_state=None):
    """Cluster persons by the class mix of their most recent file reads.

    Parameters
    ----------
    data : pandas.DataFrame
        Log entries with an "id.orig_h" column (the person) and a
        "classification" column (the class label of the file that was read).
    memory : int
        Number of most recent entries per person used to build that person's
        profile. Persons with fewer entries are left out.
    n_clusters : int, optional
        Upper bound on the number of KMeans clusters (default 4). Lowered
        automatically when fewer persons qualify, because KMeans cannot fit
        more clusters than samples.
    random_state : int or None, optional
        Seed forwarded to KMeans. The default (None) keeps the original,
        unseeded behaviour; pass an int for reproducible clusters.

    Returns
    -------
    dict
        Maps each KMeans cluster id to the list of persons assigned to it.
        Empty when no person has at least ``memory`` entries.
    """
    cluster_data = []
    cluster_labels = []

    # One pass over the frame instead of a full boolean scan per person.
    # sort=False keeps persons in order of first appearance, as unique() did.
    for person, person_df in data.groupby("id.orig_h", sort=False):
        if len(person_df) >= memory:
            cluster_data.append(df_to_pretty_avg(person_df["classification"][-memory:]))
            cluster_labels.append(person)

    # Nothing to cluster: return early instead of letting KMeans raise.
    if not cluster_data:
        return {}

    kmeans = KMeans(n_clusters=min(n_clusters, len(cluster_data)),
                    random_state=random_state)
    pred = kmeans.fit_predict(cluster_data)

    clusters = {}
    for cluster, label in zip(pred, cluster_labels):
        clusters.setdefault(cluster, []).append(label)

    return clusters

# Cluster on each person's 25 most recent reads and show the result.
# Written as a call so the cell runs on both Python 2 and Python 3.
print(create_clusters(bro_logs_read, 25))

Find which clusters from two separate clustering runs belong together

In [None]:
import math
from collections import Counter

#http://stackoverflow.com/questions/14720324/compute-the-similarity-between-two-lists
#https://en.wikipedia.org/wiki/Cosine_similarity
def counter_cosine_similarity(c1, c2):
    """Return the cosine similarity of two count mappings.

    Parameters
    ----------
    c1, c2 : collections.Counter (or any mapping with ``get``)
        Item -> count. Items missing from one side count as 0.

    Returns
    -------
    float
        Dot product of the two count vectors divided by the product of their
        magnitudes. Returns 0.0 when either vector has zero magnitude (for
        example an empty counter) instead of dividing by zero.
    """
    terms = set(c1).union(c2)
    dotprod = sum(c1.get(k, 0) * c2.get(k, 0) for k in terms)
    magA = math.sqrt(sum(c1.get(k, 0)**2 for k in terms))
    magB = math.sqrt(sum(c2.get(k, 0)**2 for k in terms))
    denominator = magA * magB
    if denominator == 0:
        # The cosine is undefined for a zero vector; treat it as "no similarity".
        return 0.0
    return dotprod / denominator

def length_similarity(c1, c2):
    """Return the ratio of the smaller total count to the larger one.

    Parameters
    ----------
    c1, c2 : collections.Counter (or any mapping)
        Item -> count.

    Returns
    -------
    float
        ``min(total1, total2) / max(total1, total2)``. When both totals are
        zero the lengths are identical, so 1.0 is returned instead of
        dividing by zero.
    """
    # .values() exists on Python 2 and 3; .itervalues() is Python-2-only.
    lenc1 = sum(c1.values())
    lenc2 = sum(c2.values())
    longest = max(lenc1, lenc2)
    if longest == 0:
        return 1.0
    return min(lenc1, lenc2) / float(longest)

def similarity_score(l1, l2):
    """Score how alike two lists are, by both content and length.

    Both lists are reduced to item counts; the result is the product of
    ``length_similarity`` and ``counter_cosine_similarity`` on those counts.
    """
    counts_a = Counter(l1)
    counts_b = Counter(l2)
    length_factor = length_similarity(counts_a, counts_b)
    cosine_factor = counter_cosine_similarity(counts_a, counts_b)
    return length_factor * cosine_factor

def similar_match(c1, c2):
    """Match every cluster in ``c1`` to its most similar cluster in ``c2``.

    Parameters
    ----------
    c1, c2 : dict
        Cluster id -> list of members (members must be hashable).

    Returns
    -------
    dict
        Maps each key of ``c1`` to ``{'new_key': ..., 'similarity': ...}``,
        where ``new_key`` is the key of the ``c2`` cluster with the highest
        cosine similarity of member counts. On a tie the first candidate in
        ``c2``'s iteration order wins. Empty when ``c1`` or ``c2`` is empty.
    """
    # Count each candidate cluster once instead of once per (old, new) pair.
    # .items() exists on Python 2 and 3; .iteritems() is Python-2-only.
    new_counters = [(new_key, Counter(new_value)) for new_key, new_value in c2.items()]

    similar_match_dict = {}
    for old_key, old_value in c1.items():
        old_counter = Counter(old_value)
        for new_key, new_counter in new_counters:
            similarity = counter_cosine_similarity(old_counter, new_counter)
            best = similar_match_dict.get(old_key)
            # Strict '>' keeps the earliest candidate when similarities tie.
            if best is None or similarity > best["similarity"]:
                similar_match_dict[old_key] = {'new_key': new_key, 'similarity': similarity}

    return similar_match_dict
# Cluster the same data with two different memory sizes, then pair every
# cluster of the first run with its most similar cluster of the second run.
old_cluster = create_clusters(bro_logs_read, 25)
new_cluster = create_clusters(bro_logs_read, 30)
match = similar_match(old_cluster, new_cluster)
# Written as a call so the cell runs on both Python 2 and Python 3.
print(match)

Find which persons (`id.orig_h` values) ended up in a different cluster than before

In [None]:
def notify_change(match, previous_cluster, new_cluster):
    """Print every member that appears in a matched cluster it was not in before.

    Parameters
    ----------
    match : dict
        Old cluster key -> ``{'new_key': ..., 'similarity': ...}``, as
        returned by ``similar_match``.
    previous_cluster : dict
        Old cluster key -> list of members.
    new_cluster : dict
        New cluster key -> list of members.

    For each matched pair, a member of the new cluster that is absent from
    the old cluster is reported as "<member> changed clusters".
    """
    # .items() exists on Python 2 and 3; .iteritems() is Python-2-only.
    for old_key, best in match.items():
        previous_members = previous_cluster[old_key]
        for new_item in new_cluster[best["new_key"]]:
            if new_item not in previous_members:
                # Format first, then print. The old `print("...").format(x)`
                # only worked through Python 2's print-statement parsing.
                print("{} changed clusters".format(new_item))

                
# Report who moved between the memory-25 run and the memory-30 run.
notify_change(match, previous_cluster=old_cluster, new_cluster=new_cluster)

Now that we have seen how the algorithm behaves, let's simulate it running in a live environment.

In [None]:
# Replay the log as if entries arrived one at a time: start with the first
# WARMUP_ROWS entries, grow the window by one entry per step, re-cluster, and
# report who changed cluster compared to the previous step.
WARMUP_ROWS = 5000  # entries available before the first clustering
MEMORY = 25         # most recent reads per person used for the profile

previous_cluster = {}
total_rows = len(bro_logs_read)
# Stop once the window covers the whole log. Iterating further (as
# range(len(...)) did) only re-clusters identical data thousands of times.
first_end = max(1, min(WARMUP_ROWS, total_rows))
for window_end in range(first_end, total_rows + 1):
    new_cluster = create_clusters(bro_logs_read[:window_end], MEMORY)
    match = similar_match(previous_cluster, new_cluster)
    notify_change(match, previous_cluster, new_cluster)
    previous_cluster = new_cluster
    