# Introduction to Catching Fraud

In [None]:
# import python libraries
import folium
import pandas as pd
import numpy as np

### Load data from file into dataframe

In [None]:
# Read the risk dataset CSV into a DataFrame and report its (rows, columns) shape.
# NOTE(review): later cells compare fraud_type against the string 'None'; some
# pandas versions parse a literal "None" field as missing by default — confirm
# against the installed version.
dataset_path = 'gwc-risk-dataset.csv'
data_pd = pd.read_csv(dataset_path)
print(data_pd.shape)

In [None]:
# Preview the first five rows to see which columns are available.
data_pd.head(n=5)

In [None]:
# Distinct values in the payment_type column.
# print() is called as a function so the cell runs on Python 3.
print(data_pd['payment_type'].unique())

In [None]:
# Distinct values in the disbursement_type column.
# print() is called as a function so the cell runs on Python 3.
print(data_pd['disbursement_type'].unique())

In [None]:
# Distinct values in the application column.
# print() is called as a function so the cell runs on Python 3.
print(data_pd['application'].unique())

In [None]:
# Distinct values in the recipient_country column.
# print() is called as a function so the cell runs on Python 3.
print(data_pd['recipient_country'].unique())

In [None]:
# Distinct values in the fraud_type column.
# print() is called as a function so the cell runs on Python 3.
print(data_pd['fraud_type'].unique())

In [None]:
# Distinct sender-device latitude/longitude pairs present in the dataset.
location_columns = ['sender_device_latitude', 'sender_device_longitude']
locations = data_pd.loc[:, location_columns].drop_duplicates()
locations.head()

In [None]:
# Number of distinct locations (one row per latitude/longitude pair).
locations.shape[0]

## Calculate fraud rate for every location and plot on a map

In [None]:
# Build location_stats: one [latitude, longitude, fraud_rate] entry per distinct
# sender location, where fraud_rate is the fraction (0-1) of that location's
# transactions whose fraud_type is anything other than 'None'.
#
# A single grouped pass replaces filtering the whole frame once per location.
# sort=False keeps locations in order of first appearance, and groupby skips
# rows with a missing coordinate, which previously produced an empty subset and
# a division by zero.
location_columns = ['sender_device_latitude', 'sender_device_longitude']
fraud_rate_by_location = (
    data_pd
    .assign(_is_fraud=data_pd['fraud_type'] != 'None')
    .groupby(location_columns, sort=False)['_is_fraud']
    .mean()
)
location_stats = [
    [latitude, longitude, float(fraud_rate)]
    for (latitude, longitude), fraud_rate in fraud_rate_by_location.items()
]

In [None]:
# Tile-style names to choose from; only the first entry is used below.
# NOTE(review): confirm the other names are still supported by the installed folium.
style = [
    'OpenStreetMap',
    'Stamen Terrain',
    'Stamen Toner',
    'Mapbox Bright',
]
# Map centre as [latitude, longitude], passed to folium.Map below.
Lat_Long_US = [39.6150518, -101.1868806]
m = folium.Map(
    location=Lat_Long_US,
    zoom_start=4,
    tiles=style[0],
)
m

In [None]:
# For every location, plot a fixed-size circle on the map, coloured by fraud rate.
# (exclusive upper bound, colour) pairs checked in ascending order; any rate at
# or above the last bound is drawn in red.
color_thresholds = [(0.1, 'green'), (0.15, 'yellow'), (0.25, 'orange')]

for latitude, longitude, fraud_rate in location_stats:
    color = next(
        (name for upper_bound, name in color_thresholds if fraud_rate < upper_bound),
        'red',
    )
    folium.CircleMarker(
        location=[latitude, longitude],
        # fraud_rate is a 0-1 fraction; scale by 100 so the "percent" label is accurate
        popup='%.2f percent fraud rate, %s, %s' % (fraud_rate * 100, latitude, longitude),
        color=color,
        radius=10,
        fill=True,
        fill_color=color
    ).add_to(m)

m

Look closely at the dark red circle in Minnesota: we caught one fraud ring!

### How can we see the fraud more clearly?

##### Create radius scale

In [None]:
# Smallest and largest circle radius to draw.
radius_scale = [5, 20]
# Smallest and largest possible fraud rate for a location.
data_scale = [0, 1]
# Linear interpolation maps a fraud rate onto the radius range;
# for example, what radius does a fraud rate of 0.5 get?
np.interp(x=0.5, xp=data_scale, fp=radius_scale)

##### Create color scale

In [None]:
# Hex colours running from green to red, one per bin.
color_scale = [
    '#1a9641',
    '#a6d96a',
    '#fee08b',
    '#fdae61',
    '#d7191c',
]
# Bin edges for the fraud rate; consecutive edges delimit one colour's bin.
color_bin = [0, 0.11, 0.12, 0.14, 0.90, 1]
# For example, which colour does a fraud rate of 0.45 get?
pd.cut([0.45], bins=color_bin, labels=color_scale)[0]

In [None]:
# Start from a fresh base map so the markers drawn here are the only ones on it.
m = folium.Map(
    location=Lat_Long_US,
    zoom_start=4,
    tiles=style[0]
    )

# Draw one circle per location, with colour and radius both driven by fraud rate.
for latitude, longitude, fraud_rate in location_stats:
    # include_lowest=True keeps a fraud rate of exactly 0 in the first bin;
    # without it pd.cut yields NaN for the lowest edge and the marker has no colour.
    color = pd.cut(
        [fraud_rate], color_bin, labels=color_scale, include_lowest=True
    )[0]
    folium.CircleMarker(
        location=[latitude, longitude],
        # fraud_rate is a 0-1 fraction; scale by 100 so the "percent" label is accurate
        popup='%.2f percent fraud rate' % (fraud_rate * 100),
        color=color,
        radius=np.interp(fraud_rate, data_scale, radius_scale),
        fill=True,
        fill_color=color
    ).add_to(m)

m

## How much fraud is there?

In [None]:
# Number of transactions for each fraud_type value, in a single 'Count' column.
pd.crosstab(data_pd['fraud_type'], 'Count')

In [None]:
# Transaction counts by application (rows) and fraud_type (columns).
pd.crosstab(data_pd['application'], data_pd['fraud_type'])

In [None]:
# Transaction counts by payment_type (rows) and fraud_type (columns).
pd.crosstab(data_pd['payment_type'], data_pd['fraud_type'])

In [None]:
# Transaction counts by disbursement_type (rows) and fraud_type (columns).
pd.crosstab(data_pd['disbursement_type'], data_pd['fraud_type'])

In [None]:
# Binary flag: 0 when fraud_type is the string 'None', 1 for any other value.
not_clean = data_pd['fraud_type'] != 'None'
data_pd['is_fraud'] = not_clean.astype('int64')
# Transaction counts by disbursement_type (rows) and the is_fraud flag (columns).
pd.crosstab(data_pd['disbursement_type'], data_pd['is_fraud'])

In [None]:
# Transaction counts by recipient_country (rows) and the is_fraud flag (columns).
country_data = pd.crosstab(data_pd['recipient_country'], data_pd['is_fraud'])
country_data.head()

### What country has the most fraudulent transactions?

In [None]:
# Find the country with the largest count in the is_fraud == 1 column,
# then show that country's row of counts.
top_fraud_country = country_data[1].idxmax()
country_data.loc[top_fraud_country]

In [None]:
# Rank countries by their count of is_fraud == 1 transactions, highest first.
country_data.sort_values(by=1, ascending=False)

### Can we narrow down the fraud trend for MX?

In [None]:
# Hint: try running lines similar to the ones above (such as crosstab) to see
# fraud trends on the subset data, mx_data.

# Keep only the transactions whose recipient country is 'MX'.
is_mx = data_pd['recipient_country'] == 'MX'
mx_data = data_pd.loc[is_mx]
mx_data.head()

In [None]:
# Within the MX subset: transaction counts by application and the is_fraud flag.
pd.crosstab(mx_data['application'], mx_data['is_fraud'])

In [None]:
# Within the MX subset: transaction counts by disbursement_type and the is_fraud flag.
pd.crosstab(mx_data['disbursement_type'], mx_data['is_fraud'])

In [None]:
# Narrow the MX subset further to transactions disbursed as 'cash pickup'.
is_cash_pickup = mx_data['disbursement_type'] == 'cash pickup'
mx_cp_data = mx_data.loc[is_cash_pickup]
mx_cp_data.head()

In [None]:
# Fraud rate for MX cash-pickup transactions: flagged rows divided by all rows.
count_fraud_data = mx_cp_data[mx_cp_data['is_fraud'] == 1].shape[0]
count_data = mx_cp_data.shape[0]
# print() is called as a function so the cell runs on Python 3.
print(float(count_fraud_data) / count_data)

Wow, a fraud rate of over 97%! We caught another fraud ring!

## Further Exercise

### Can you catch the last fraud ring?

Here's some example code to get you started:

In [None]:
# Remove the Mexico cash-pickup transactions, since we already established them
# as a fraud ring. Only rows matching BOTH conditions are dropped, so other MX
# transactions and other countries' cash pickups stay in the data. The value
# 'cash pickup' (with a space) matches the spelling used when that subset was
# built earlier in the notebook.
is_mx_cash_pickup = (
    (data_pd['recipient_country'] == 'MX')
    & (data_pd['disbursement_type'] == 'cash pickup')
)
data_pd = data_pd[~is_mx_cash_pickup]

In [None]:
# Cross-tabulate application against the is_fraud flag to look for another trend.
pd.crosstab(data_pd['application'], data_pd['is_fraud'])

In [None]:
# Keep only the 'web' application transactions, then break them down by
# disbursement_type and the is_fraud flag.
is_web = data_pd['application'] == 'web'
data_pd = data_pd.loc[is_web]
pd.crosstab(data_pd['disbursement_type'], data_pd['is_fraud'])