# Suspicious Data Filter
This is a simple prototype showing how to filter the data of interest out of a data dump.

This sample algorithm is built to reduce the time it takes to browse the entire data dump provided by the forensic department. The model uses synthetic data built from drug dealers' lingo found on the internet.

# Model 1
- This model is based on a learning algorithm
- Once the model is trained, it works very fast, even on large data sets
- Will filter the data very accurately


- Needs a good amount of data for training
- Needs a technology expert to make changes

In [1]:
#Imports
import math
import re

import matplotlib.pyplot as plt
import pandas as pd

#Settings
#IPython magic: draw matplotlib figures inline in the notebook output
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
#echo the value of every expression statement in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

#show up to 200 rows when a DataFrame/Series is displayed
#the fully qualified key is required: the short pattern 'max_rows' matches more than
#one option in recent pandas (e.g. 'styler.render.max_rows') and raises OptionError
pd.set_option('display.max_rows', 200)

In [2]:
#load the workbook with the labelled messages into a DataFrame
data = pd.read_excel("Data.xlsx")

#preview rows 100-119 of the first column
data.iloc[:, 0].iloc[100:120]

100                                       crack and coke
101                                        Weed moneyyyy
102                         Loan for strippers and blow!
103                                 DEFINITELY not drugs
104                  healing cuts with hydrochloric acid
105                                              deposit
106                                    drink drank drunk
107                               lots and lots of drugs
108                                  Purple lean sizzurp
109    I miss you man hopefully youll stop doing drug...
110                              Shrooms (the drug kind)
111                                             Not weed
112                                         Opium hahaha
113                                                Drank
114                                             drank up
115                          Summer lax and so not drugs
116                  Cause I love you and weed/groceries
117                         For

In [3]:
#train and build model
#Multinomial Naive Bayes with Laplace smoothing, estimated from the 'Message' and
#'Suspicious' ('Yes'/'No') columns of `data`.
#Results used by classify(): p_susp, p_non_susp, p_words_susp, p_words_non_susp.

#randomize data set (one shuffled copy per model; same seed, so same row order)
data_randomized1 = data.sample(frac=1, random_state=6)
data_randomized2 = data.sample(frac=1, random_state=6)

#split data to train and test
#split_row = int(data.shape[0] * 0.8)
#train = data_randomized[0:split_row].reset_index(drop=True)
#test = data_randomized[split_row:].reset_index(drop=True)

#no hold-out split for now: every row is used for training
train = data_randomized1.copy()

#clean data: empty cells become '', non-word characters become spaces,
#repeated spaces collapse to one, everything is lower-cased
train['Message'] = (
    train['Message']
    .fillna('')
    .astype(str)
    .str.replace(r'\W', ' ', regex=True)
    .str.strip()
    .str.replace(' +', ' ', regex=True)
    .str.lower()
)

#split messages on space
train['Message'] = train['Message'].str.split()

#collect words from all messages and filter unique
#sorted so the column order of the count table is the same on every run
vocabulary = sorted({word for sms in train['Message'] for word in sms})

#Create empty dictionary with all 0's as count for each word in each message
word_counts_per_msg = {unique_word: [0] * len(train['Message']) for unique_word in vocabulary}

#get the actual counts
for index, msg in enumerate(train['Message']):
    for word in msg:
        word_counts_per_msg[word][index] += 1

#create new dataframe
#the counts were filled in positionally, so they must carry train's (shuffled) row
#labels; with a fresh 0..n-1 index concat would align on labels and attach each
#'Suspicious' value to the counts of a different message
word_counts = pd.DataFrame(word_counts_per_msg, index=train.index)
train_clean = pd.concat([train['Suspicious'], word_counts], axis=1)

#calculate probabilities (class priors)
class_shares = train_clean['Suspicious'].value_counts(normalize=True)
p_susp = class_shares['Yes']
p_non_susp = class_shares['No']

#split data into susp and non-susp
train_susp = train_clean[train_clean['Suspicious'] == 'Yes']
train_non_susp = train_clean[train_clean['Suspicious'] == 'No']

#calculate counts
#only the word columns are summed: the text column 'Suspicious' cannot be added up
word_totals_susp = train_susp[vocabulary].sum(axis=0)
word_totals_non_susp = train_non_susp[vocabulary].sum(axis=0)
n_susp = word_totals_susp.sum()
n_non_susp = word_totals_non_susp.sum()
n_vocabulary = len(vocabulary)

# Laplace smoothing
alpha = 1

#calculate probabilites of each word given the class:
#(times the word appears in the class + alpha) / (words in the class + alpha * vocabulary size)
p_words_susp = (
    (word_totals_susp + alpha) / (n_susp + (alpha * n_vocabulary))
).to_dict()
p_words_non_susp = (
    (word_totals_non_susp + alpha) / (n_non_susp + (alpha * n_vocabulary))
).to_dict()

In [4]:
#Create a function that takes in a input string and classify the message
def classify(message):
    """Label one message with the trained Naive Bayes model.

    Reads the module-level results of the training cell: the class priors
    ``p_susp`` / ``p_non_susp`` and the per-word probabilities
    ``p_words_susp`` / ``p_words_non_susp``.

    Args:
        message: Raw message text. ``None`` and NaN (empty cells) are treated
            as an empty message; any other non-string value is converted
            with ``str()``.

    Returns:
        ``'suspicious'`` when the suspicious class scores higher,
        ``'not suspicious'`` when the non-suspicious class scores higher,
        ``'needs human classification'`` when both scores are equal.
    """
    #empty cells carry no words
    if message is None or (isinstance(message, float) and math.isnan(message)):
        message = ''

    #clean and split the message (same normalisation as the training data)
    words = re.sub(r'\W', ' ', str(message)).lower().split()

    #initiate values with the class priors
    #scores are summed in log space: multiplying many small probabilities underflows
    #to 0.0 for long messages, which would turn every long message into a tie
    log_p_susp_given_message = math.log(p_susp)
    log_p_non_susp_given_message = math.log(p_non_susp)

    #calculate suspicious and non suspicious scores
    for word in words:
        #words that were not seen during training are skipped
        if word in p_words_susp:
            log_p_susp_given_message += math.log(p_words_susp[word])
        if word in p_words_non_susp:
            log_p_non_susp_given_message += math.log(p_words_non_susp[word])

    #return the label of the class with the higher score
    if log_p_susp_given_message > log_p_non_susp_given_message:
        return 'suspicious'
    if log_p_susp_given_message < log_p_non_susp_given_message:
        return 'not suspicious'
    return 'needs human classification'

In [13]:
#Uncomment below code to output the result

#data_randomized1['predicted'] = data_randomized1['Message'].apply(classify)
#data_randomized1.loc[:,["Message","predicted"]].sort_values("predicted", ascending=False).reset_index()

# Model 2
This model is not based on any learning algorithm.

- Easy to implement
- Easy to make changes
- Works well with small data


- Will be slow if we have a huge amount of data and a large number of keywords to search for
- Requires manual work to enter every keyword into the list of suspicious words

In [10]:
#lower-case keywords that mark a message as suspicious (each keyword listed once)
bag_of_words = [
    "alc", "alcohol", "bubbly", "champagne", "drinks", "beer", "bud", "drank",
    "weed", "pills", "ecstasy", "broccoli", "plug", "codeine", "high", "buzzed",
    "stoned", "420", "smoke", "popper", "pods", "pod", "juul", "suorin",
    "vape", "vaping", "steroids", "steroid", "heroin", "drink", "drunk", "grass",
    "pill", "stuff", "drug", "drugs", "dealer", "crack", "substance", "illegal",
    "money", "purple", "meth", "opium",
]


#Create a function that takes in a input string and classify the message
def predict(message):
    """Flag a message that contains at least one keyword from ``bag_of_words``.

    Matching is done on whole lower-cased tokens; tokens are the runs of word
    characters left after every non-word character is replaced by a space.

    Args:
        message: Raw message text. ``None`` and NaN (empty cells) contain no
            words; any other non-string value is converted with ``str()``.

    Returns:
        ``"suspicious"`` if any token is in ``bag_of_words``,
        otherwise ``"not suspicious"``.
    """
    #empty cells cannot contain a keyword
    if message is None or (isinstance(message, float) and math.isnan(message)):
        return "not suspicious"

    #clean and split the message; a set gives constant-time keyword checks
    words = set(re.sub(r'\W', ' ', str(message)).lower().split())

    #calculate suspicious or not
    if words.isdisjoint(bag_of_words):
        return "not suspicious"
    return "suspicious"

In [11]:
#Try out new messages
#each call is a bare expression statement, so the notebook echoes every returned label

#keyword followed by punctuation
predict("weed?? hellya!!")
#ordinary greeting
predict("Hello, how are you?")
#the same keyword repeated
predict("pills, pills, pills!!")
#URL: '.', '/' and ':' are replaced by spaces, so the host name is split into tokens
predict("http://www.stoned.com/where_to_get_weed")
#URL without any keyword
predict("http://www.wikipedia.com/datascience")

'suspicious'

'not suspicious'

'suspicious'

'suspicious'

'not suspicious'

In [22]:
#Uncomment below to output the result

#data_randomized2['predicted'] = data_randomized2['Message'].apply(predict)
#data_randomized2['predicted'].value_counts()
#data_randomized2.loc[:, ["Message", "predicted"]].sort_values("predicted", ascending=False).reset_index()