### create_clean_df.py

Cleans the raw data so it can be read into the model: adds a boolean `fraud` label, drops `spammer` accounts, and pickles the result.

In [None]:
# %load src/create_clean_df.py
import pandas as pd

if __name__ == "__main__":
    df = pd.read_json('data/data.zip')

    # Label a row as fraud when its account type mentions 'fraud'
    is_fraud = df['acct_type'].apply(lambda acct_type: 'fraud' in acct_type)
    df = df.assign(fraud=is_fraud)

    # Keep every account type except plain 'spammer'
    not_spammer = df['acct_type'].ne('spammer')
    df = df[not_spammer]

    # Persist the labelled frame
    df.to_pickle('data/labelled_dataframe.p')

### model.py

Creates a model from the training data provided by the company and pickles it to `data/model.pkl`.

In [None]:
# %load src/model.py
import numpy as np
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier

class MyModel():
    """Random-forest classifier bundled with its own feature preparation.

    The raw event frame is reduced to the columns listed in ``self.features``;
    ``payout_type`` is encoded as an integer column, missing ``has_header``
    values are treated as 0, and any remaining gaps are filled with the
    per-column medians learned in ``fit``.
    """

    def __init__(self):
        self.model = RandomForestClassifier(n_estimators=1000, max_features=9)
        self.features = ["body_length","channels","delivery_method","fb_published","gts","org_facebook","org_twitter","user_age","has_header","venue_longitude","payout_type_num","user_type"]

    def _prepare(self, X):
        """Return a new frame holding only the model features.

        The caller's frame is never modified: the derived column is attached
        to a copy, and no chained in-place ``fillna`` is used (it is unreliable
        on column selections and is a no-op under pandas Copy-on-Write).
        """
        # Map payout type to integers; anything unmapped or missing becomes 3
        payout_type_num = X["payout_type"].map({"ACH": 1, "CHECK": 2}).fillna(3)
        # Get only the predictive features
        prepared = X.assign(payout_type_num=payout_type_num)[self.features].copy()
        # Fill missing value headers to False
        prepared["has_header"] = prepared["has_header"].fillna(0)
        return prepared

    def fit(self, X, y):
        """Fit the forest on ``X``/``y`` and remember the feature medians.

        Parameters
        ----------
        X : pandas.DataFrame
            Raw events; must contain ``payout_type`` and every column in
            ``self.features`` except ``payout_type_num``.
        y : array-like
            Target labels, one per row of ``X``.

        Returns
        -------
        MyModel
            ``self``, so calls can be chained.
        """
        prepared = self._prepare(X)
        # Fill remaining few missing with median (reused at prediction time)
        self.median = prepared.median()
        self.model.fit(prepared.fillna(self.median), y)
        return self

    def predict_proba(self, X):
        """Return the underlying model's ``predict_proba`` output for ``X``.

        ``X`` is prepared exactly as in ``fit``; missing values are filled
        with the medians stored by ``fit``, so ``fit`` must be called first.
        """
        prepared = self._prepare(X).fillna(self.median)
        return self.model.predict_proba(prepared)


def get_data(datafile):
    """Load the JSON events and split them into features and a fraud target.

    A boolean ``fraud`` column is derived from ``acct_type`` (True when the
    account type contains the text 'fraud') and then removed from the frame
    again so that it is returned separately.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the frame without the ``fraud`` column and
        ``y`` is the boolean ``fraud`` Series.
    """
    def mentions_fraud(acct_type):
        return 'fraud' in acct_type

    events = pd.read_json(datafile)
    events['fraud'] = events['acct_type'].apply(mentions_fraud)
    target = events.pop('fraud')
    return events, target

if __name__ == '__main__':
    # Load the labelled training data
    X, y = get_data('data/data.json')

    # fit() returns the model itself, so build and train in one expression
    model = MyModel().fit(X, y)

    # Write the model to a file.
    with open('data/model.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)


### predict.py

Loads the pickled model and prints predicted probabilities for the example rows.

In [None]:
# %load src/predict.py
from model import MyModel
import pandas as pd
import pickle

# Load the saved example events that will be scored
X = pd.read_json('data/test_script_examples.json')

# Restore the model object pickled at data/model.pkl
with open('data/model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Print column 1 of the probability matrix
# (presumably the fraud class -- confirm against the fitted model's classes_)
fraud_probabilities = model.predict_proba(X)[:, 1]
print(fraud_probabilities)


### test_script_examples

In [None]:
# Draw 10 random rows from df and save them as the JSON file that predict.py reads
test = df.sample(10)
test.to_json('data/test_script_examples.json')

In [None]:
# Reload the saved examples and display up to the first 10 rows
pd.read_json('data/test_script_examples.json').head(10)

### Live Data

Calls the API to fetch real-time data.<br>
Each event is scored by the model and stored in a MongoDB database.

In [None]:
# %load src/client.py
from model import MyModel
import pandas as pd
import pickle
import requests
import time
from pymongo import MongoClient
import boto3


class EventAPIClient:
    """Realtime Events API client that scores events and stores the results.

    Each event fetched from the API is scored with the pickled model, saved
    to the ``predictions`` Mongo collection, and the whole collection is then
    exported as a CSV file to S3.
    """

    def __init__(self, first_sequence_number=0,
                 api_url = 'https://hxobin8em5.execute-api.us-west-2.amazonaws.com/api/',
                 api_key = 'vYm9mTUuspeyAWH1v-acfoTlck-tCxwTw9YfCynC',
                 db = None):
        """Initialize the API client.

        Parameters
        ----------
        first_sequence_number : int
            Sequence number sent with the first API request.
        api_url : str
            Endpoint that is polled for events.
        api_key : str
            Key sent in every request payload.
        db : database-like, optional
            Object whose ``['predictions']`` item is the target collection.
            When None, the ``fraud`` database on localhost:27017 is used.
        """
        # NOTE(review): the default api_key is a credential committed to
        # source control -- consider rotating it and reading it from the
        # environment instead.
        self.next_sequence_number = first_sequence_number
        self.api_url = api_url
        self.api_key = api_key

        # Use the caller's database when one is supplied; only fall back to
        # the local mongo instance otherwise.
        if db is None:
            db = MongoClient('localhost', 27017)['fraud']
        self.predictions = db['predictions']

        # Create an S3 client
        self.s3 = boto3.client('s3')

        # Load model
        # NOTE(review): unpickling runs arbitrary code; only load a model.pkl
        # produced by a trusted source.
        with open('data/model.pkl', 'rb') as f:
            self.model = pickle.load(f)

    def save_to_database(self, row):
        """Score one event and upsert it into the predictions collection.

        ``row`` (a dict) is modified in place: a ``probability`` key holding
        the rounded column-1 model output is added before it is stored.
        """
        # Set row to pandas
        X = pd.DataFrame([row])
        # Predict
        y = self.model.predict_proba(X)
        # Append prediction as a plain float rather than a numpy scalar
        row['probability'] = float(y[0, 1].round(4))
        # replace_one/count_documents replace Collection.update and
        # Cursor.count, which were removed in PyMongo 4.
        self.predictions.replace_one(row, row, upsert=True)
        print('You have {} entries in your Database'.format(self.predictions.count_documents({})))

    def get_data(self, timeout=30):
        """Fetch data from the API.

        Parameters
        ----------
        timeout : float
            Seconds to wait for the API before giving up, so a stalled
            connection cannot block the collector forever.

        Returns
        -------
        The ``data`` entry of the JSON response. The client's next sequence
        number is advanced from the response as a side effect.

        Raises
        ------
        requests.RequestException
            On connection problems, timeouts, or an HTTP error status.
        """
        payload = {'api_key': self.api_key,
                   'sequence_number': self.next_sequence_number}
        response = requests.post(self.api_url, json=payload, timeout=timeout)
        # Fail loudly on HTTP errors instead of trying to parse an error body
        response.raise_for_status()
        data = response.json()
        self.next_sequence_number = data['_next_sequence_number']
        return data['data']

    def _publish_snapshot(self):
        """Dump the predictions collection to CSV and upload it to S3."""
        ## Create csv image of database
        df = pd.DataFrame(list(self.predictions.find()))
        df.to_csv('data/temp.csv', index=False)

        # Uploads the given file using a managed uploader, which will split up large
        # files automatically and upload parts in parallel.
        self.s3.upload_file('data/temp.csv', 'dsi-fraud-casestudy', 'live.csv', ExtraArgs={'ACL': 'public-read'})

    def collect(self, interval=30):
        """Check for new data from the API periodically.

        Runs forever: every ``interval`` seconds the API is polled, each new
        row is scored and saved, and a fresh CSV snapshot is published when
        anything was received.
        """
        while True:
            print("Requesting data...")
            data = self.get_data()
            if data:
                print("Saving...")
                for row in data:
                    self.save_to_database(row)
                self._publish_snapshot()
            else:
                print("No new data received.")
            print(f"Waiting {interval} seconds...")
            time.sleep(interval)


In [None]:
# %load src/collect.py
from client import EventAPIClient
from model import MyModel

# Continuously collects data: build a client with its default settings and
# start its polling loop (collect() loops forever, so this cell does not return)
client = EventAPIClient()
client.collect()


### Accessing database

Reads the latest data from MongoDB and uploads it to S3 as a CSV file so the Tableau dashboard can read it (Tableau Public does not support MongoDB connections).

In [None]:
from pymongo import MongoClient
import pprint
import pandas as pd
import boto3

# Create a mongo client for the local instance and select the 'predictions'
# collection of the 'fraud' database (the same collection EventAPIClient writes to)
client = MongoClient('localhost', 27017)
db = client['fraud']
predictions = db['predictions']

# Create an S3 client
s3 = boto3.client('s3')

In [None]:
# Snapshot every document in the predictions collection as a local CSV file
stored_predictions = list(predictions.find())
df = pd.DataFrame(stored_predictions)
df.to_csv('data/temp.csv', index=False)

# Push the snapshot to S3 as a publicly readable object. upload_file uses a
# managed uploader, which splits large files and uploads the parts in parallel.
s3.upload_file(
    Filename='data/temp.csv',
    Bucket='dsi-fraud-casestudy',
    Key='live.csv',
    ExtraArgs={'ACL': 'public-read'},
)