# Insurance Claims Fraud Graph
The purpose of this notebook is to learn how to build your first graph from scratch.

## Install packages

In [1]:
!%pip install graphdatascience pandas

zsh:fg:1: no job control in this shell.


In [3]:
import os

import pandas as pd
from graphdatascience import GraphDataScience

In [4]:
# Connection settings for the Neo4j instance.
# Every value can be overridden through an environment variable so that real
# credentials never have to be committed with the notebook; the fallbacks are the
# original local-development values.
DB_URL = os.environ.get("NEO4J_URI", "neo4j://localhost:7687")
DB_ULR = DB_URL  # backward-compatible alias for the original (misspelled) name
DB_USER = os.environ.get("NEO4J_USER", "neo4j")
DB_PASS = os.environ.get("NEO4J_PASSWORD", "test1234")
DB_NAME = os.environ.get("NEO4J_DATABASE", "neo4j")

# Pass the database name explicitly: DB_NAME was previously defined but never used,
# so the client silently ran against whatever the server's default database was.
gds = GraphDataScience(DB_URL, auth=(DB_USER, DB_PASS), database=DB_NAME)

## Data set
Insurance Claims Fraud Data: https://www.kaggle.com/datasets/mastmustu/insurance-claims-fraud-data



In [9]:
# This file contains data on the Employees/Agents who are working on the Insurance Claim.
# Postal codes and bank routing numbers are identifiers, not quantities. Read them as
# text so that any leading zeros present in the file are not dropped by integer
# parsing (with the default parsing a Vermont postal code is displayed as 5677).
employee_csv = pd.read_csv(
    "./datasets/employee_data.csv",
    dtype={"POSTAL_CODE": str, "EMP_ROUTING_NUMBER": str},
)
employee_csv.head()

Unnamed: 0,AGENT_ID,AGENT_NAME,DATE_OF_JOINING,ADDRESS_LINE1,ADDRESS_LINE2,CITY,STATE,POSTAL_CODE,EMP_ROUTING_NUMBER,EMP_ACCT_NUMBER
0,AGENT00001,Ray Johns,1993-06-05,1402 Maggies Way,,Waterbury Center,VT,5677,34584958,HKUN51252328472585
1,AGENT00002,Angelo Borjon,2005-12-27,414 Tanya Pass,,Panama City,FL,32404,107363763,OPIS19290040088204
2,AGENT00003,Candy Spellman,2003-09-02,606 National Street,#306,Fayetteville,AR,72701,81744097,YSCJ67489688482590
3,AGENT00004,Mary Smith,2004-09-23,235 Hugh Thomas Drive,,Panama City,FL,32404,67563771,ZANG21285355574581
4,AGENT00005,Mildred Diaz,2011-06-21,3426 Broadview Street,,Montgomery,AL,36110,114951317,DZFS82244494451134


In [93]:
# Load the insurance claim transactions (one row per TRANSACTION_ID in the preview)
# and show the first five rows.
insurance_csv = pd.read_csv(filepath_or_buffer="./datasets/insurance_data.csv")
insurance_csv.head(n=5)

Unnamed: 0,TXN_DATE_TIME,TRANSACTION_ID,CUSTOMER_ID,POLICY_NUMBER,POLICY_EFF_DT,LOSS_DT,REPORT_DT,INSURANCE_TYPE,PREMIUM_AMOUNT,CLAIM_AMOUNT,...,CLAIM_STATUS,INCIDENT_SEVERITY,AUTHORITY_CONTACTED,ANY_INJURY,POLICE_REPORT_AVAILABLE,INCIDENT_STATE,INCIDENT_CITY,INCIDENT_HOUR_OF_THE_DAY,AGENT_ID,VENDOR_ID
0,2020-06-01 00:00:00,TXN00000001,A00003822,PLC00008468,2015-06-23,2020-05-16,2020-05-21,Health,157.13,9000,...,A,Major Loss,Police,0,1,GA,Savannah,4,AGENT00413,VNDR00556
1,2020-06-01 00:00:00,TXN00000002,A00008149,PLC00009594,2018-04-21,2020-05-13,2020-05-18,Property,141.71,26000,...,A,Total Loss,Ambulance,1,0,AL,Montgomery,0,AGENT00769,VNDR00592
2,2020-06-01 00:00:00,TXN00000003,A00003172,PLC00007969,2019-10-03,2020-05-21,2020-05-26,Property,157.24,13000,...,A,Total Loss,Police,0,1,CO,Grand Junction,19,AGENT00883,VNDR00031
3,2020-06-01 00:00:00,TXN00000004,A00007572,PLC00009292,2016-11-29,2020-05-14,2020-05-19,Health,172.87,16000,...,A,Minor Loss,Ambulance,0,0,GA,Savannah,12,AGENT00278,VNDR00075
4,2020-06-01 00:00:00,TXN00000005,A00008173,PLC00000204,2011-12-26,2020-05-17,2020-05-22,Travel,88.53,3000,...,A,Major Loss,Police,0,1,TN,Nashville,18,AGENT00636,VNDR00472


In [8]:
# Load the vendor file: vendors assist the insurance agents while they review and
# investigate claims. Preview the first five rows.
vendor_csv = pd.read_csv(filepath_or_buffer="./datasets/vendor_data.csv")
vendor_csv.head(n=5)

Unnamed: 0,VENDOR_ID,VENDOR_NAME,ADDRESS_LINE1,ADDRESS_LINE2,CITY,STATE,POSTAL_CODE
0,VNDR00001,"King, Proctor and Jones",2027 North Shannon Drive,#5,Fayetteville,AR,72703
1,VNDR00002,Garcia Ltd,5701 East Shirley Lane,,Montgomery,AL,36117
2,VNDR00003,Cherry LLC,1217 Cottondale Road,,Montgomery,AL,36109
3,VNDR00004,Mays-Benson,227 West Montgomery Cross Road,#736,Savannah,GA,31406
4,VNDR00005,Wilson PLC,23 North Hill Street,,Nashville,TN,37210


## Iteration 1

Let's build the following graph:
```cypher
(:Agent) -[:assigned_to]-> (:Claim) <-[:involved_in]-(:Vendor)
```

In [29]:
# --- Vendor nodes ---

# Node key on `id`: every Vendor node must have an id and it must be unique.
gds.run_cypher('create constraint if not exists for (v:Vendor) require (v.id) is node key')

# One row per distinct (VENDOR_ID, VENDOR_NAME) pair taken from the vendor file.
vendor_columns = ['VENDOR_ID', 'VENDOR_NAME']
vendor_node_data = vendor_csv[vendor_columns].drop_duplicates()

# Merge one Vendor node per row (keyed by id) and store the vendor name on it.
vendor_merge_result = gds.run_cypher('''
    unwind $rows as row
    merge (v:Vendor{id: row['VENDOR_ID']})
        set v.name = row['VENDOR_NAME']
    return count(*) as nodes_created
''', params={'rows': vendor_node_data.to_dict(orient='records')})
number_of_vendor_nodes = vendor_merge_result["nodes_created"].iloc[0]

# The query returns the number of rows it processed; it must equal the rows we sent.
assert number_of_vendor_nodes == len(vendor_node_data), "Output does not match input"


In [30]:
# --- Agent nodes ---

# Node key on `id`: every Agent node must have an id and it must be unique.
agent_constraint = 'create constraint if not exists for (v:Agent) require (v.id) is node key'
gds.run_cypher(agent_constraint)

# Keep only the id/name columns of the employee file, one row per distinct pair.
agent_node_data = employee_csv.loc[:, ['AGENT_ID', 'AGENT_NAME']].drop_duplicates()

# Merge one Agent node per row (keyed by id) and set its name property.
agent_rows = agent_node_data.to_dict('records')
agent_merge_result = gds.run_cypher('''
    unwind $rows as row
    merge (v:Agent{id: row['AGENT_ID']})
        set v.name = row['AGENT_NAME']
    return count(*) as nodes_created
''', params={'rows': agent_rows})
number_of_agent_nodes = agent_merge_result["nodes_created"][0]

# The row count reported by the query must equal the number of rows submitted.
expected_agent_nodes = agent_node_data.shape[0]
assert number_of_agent_nodes == expected_agent_nodes, "Output does not match input"


In [82]:
# --- Claim nodes ---

# Node key on `id`: every Claim node must have an id and it must be unique.
gds.run_cypher('create constraint if not exists for (v:Claim) require (v.id) is node key')

# Columns of the insurance file that become Claim properties, de-duplicated.
claim_columns = ['TRANSACTION_ID', 'REPORT_DT', 'CLAIM_AMOUNT', 'INSURANCE_TYPE', 'CLAIM_STATUS']
claim_node_data = insurance_csv[claim_columns].drop_duplicates()

# Merge one Claim node per row (keyed by the transaction id) and set its
# report_date, claim_amount, insurance_type and status properties.
claim_merge_query = '''
    unwind $rows as row
    merge (v:Claim{id: row['TRANSACTION_ID']})
        set v.report_date=date(row['REPORT_DT']),
            v.claim_amount=toInteger(row['CLAIM_AMOUNT']),
            v.insurance_type=row['INSURANCE_TYPE'],
            v.status=row['CLAIM_STATUS']
    return count(*) as nodes_created
'''
claim_merge_result = gds.run_cypher(
    claim_merge_query,
    params={'rows': claim_node_data.to_dict('records')},
)
number_of_claim_nodes = claim_merge_result.at[0, "nodes_created"]

# The row count reported by the query must equal the number of rows submitted.
assert number_of_claim_nodes == len(claim_node_data), "Output does not match input"

In [72]:
# Sanity check: count the nodes currently in the database, grouped by their labels.
label_counts = gds.run_cypher('match (n) return labels(n) as labels, count(*) as count')
label_counts.head()

Unnamed: 0,labels,count
0,[Vendor],600
1,[Agent],1200
2,[Customer],10000
3,[Insurance],10000
4,[Claim],10000


In [39]:
# --- (:Agent)-[:assigned_to]->(:Claim) relationships ---

# Distinct (claim, agent) pairs from the insurance transactions.
assigned_to_columns = ['TRANSACTION_ID', 'AGENT_ID']
assigned_to_rel_data = insurance_csv[assigned_to_columns].drop_duplicates()

# For each pair, look up both endpoint nodes and merge the relationship between them.
# Rows whose Claim or Agent node cannot be matched produce no output row.
assigned_to_result = gds.run_cypher('''
    unwind $rows as row
    match (c:Claim{id: row['TRANSACTION_ID']}),
          (a:Agent{id: row['AGENT_ID']})
    with c,a
    merge (c)<-[:assigned_to]-(a)
    return count(*) as rels_created
''', params={'rows': assigned_to_rel_data.to_dict(orient='records')})
number_of_assigned_to_rels = assigned_to_result["rels_created"].iloc[0]

# Every input pair must have matched both of its endpoints.
assert number_of_assigned_to_rels == len(assigned_to_rel_data), "Output does not match input"

In [61]:
# --- (:Vendor)-[:involved_in]->(:Claim) relationships ---

# Distinct (claim, vendor) pairs; rows with a missing value in either column are
# dropped before de-duplicating.
involved_in_pairs = insurance_csv.loc[:, ['TRANSACTION_ID', 'VENDOR_ID']]
involved_in_rel_data = involved_in_pairs.dropna().drop_duplicates()

# For each pair, look up both endpoint nodes and merge the relationship between them.
involved_in_rows = involved_in_rel_data.to_dict('records')
involved_in_result = gds.run_cypher('''
    unwind $rows as row
    match (v:Vendor{id: row['VENDOR_ID']}),
          (c:Claim{id: row['TRANSACTION_ID']})
    with c, v, row
    merge (c)<-[:involved_in]-(v)
    return count(*) as rels_created
''', params={'rows': involved_in_rows})
number_of_involved_in_rels = involved_in_result.at[0, "rels_created"]

# Every input pair must have matched both of its endpoints.
expected_involved_in_rels = involved_in_rel_data.shape[0]
assert number_of_involved_in_rels == expected_involved_in_rels, "Output does not match input"

In [73]:
# Sanity check: count the relationships currently in the database, grouped by type.
relationship_counts = gds.run_cypher('match ()-[r]->() return type(r) as relationship, count(*) as count')
relationship_counts.head()

Unnamed: 0,relationship,count
0,assigned_to,10000
1,involved_in,6755
2,_Bloom_HAS_SCENE_,1


In [74]:
# Distribution of Customer nodes across the values of their `risk_segment` property.
# NOTE(review): no visible cell creates :Customer nodes or sets `risk_segment` —
# presumably they were loaded outside this notebook; confirm before relying on this.
risk_segment_counts = gds.run_cypher('match (c:Customer) return c.risk_segment as segmentation, count(*) as count')
risk_segment_counts.head()


Unnamed: 0,segmentation,count
0,L,4395
1,M,4150
2,H,1455


In [77]:
# Per insurance type: number of claims and summed claim amount, largest total first.
insurance_type_query = 'match (c:Claim) return c.insurance_type as insurance_type, count(*) as count, sum(c.claim_amount) as total_claim_amount order by total_claim_amount desc'
insurance_type_summary = gds.run_cypher(insurance_type_query)
insurance_type_summary.head(10)

Unnamed: 0,insurance_type,count,total_claim_amount
0,Life,1682,91478000
1,Property,1692,41579000
2,Health,1690,18254000
3,Motor,1574,8663000
4,Travel,1670,4976000
5,Mobile,1692,688300


# Basic queries

In [69]:
# Possible collusion: agent/vendor pairs that appear together on the same claims,
# ranked by how many claims they share (the query keeps the ten largest pairs).
colluding_pairs_query = '''
    match (a:Agent)-[:assigned_to]->(:Claim)<-[:involved_in]-(v:Vendor)
    return a.id as agent, v.id as vendor, count(*) as numberOfClaims order by numberOfClaims desc limit 10
'''
colluding_pairs = gds.run_cypher(colluding_pairs_query)
colluding_pairs.head()

Unnamed: 0,agent,vendor,numberOfClaims
0,AGENT01199,VNDR00115,2
1,AGENT00598,VNDR00123,2
2,AGENT01025,VNDR00025,2
3,AGENT00888,VNDR00112,2
4,AGENT00034,VNDR00118,2


Example

![](./images/colluding.png)

In [78]:
# Agents ranked by the number of claims assigned to them (the query keeps the top ten).
top_agents_query = '''
    match (a:Agent)-[:assigned_to]->(:Claim)
    return a.id as agent, count(*) as numberOfClaims order by numberOfClaims desc limit 10
'''
top_agents = gds.run_cypher(top_agents_query)
top_agents.head()

Unnamed: 0,agent,numberOfClaims
0,AGENT00992,19
1,AGENT00319,18
2,AGENT00849,17
3,AGENT00388,17
4,AGENT01097,16


In [80]:
# Vendors ranked by the number of claims they are involved in (the query keeps the top ten).
top_vendors_query = '''
    match (a:Vendor)-[:involved_in]->(:Claim)
    return a.id as vendor, count(*) as numberOfClaims order by numberOfClaims desc limit 10
'''
top_vendors = gds.run_cypher(top_vendors_query)
top_vendors.head()

Unnamed: 0,vendor,numberOfClaims
0,VNDR00535,28
1,VNDR00451,28
2,VNDR00083,27
3,VNDR00167,26
4,VNDR00590,26


In [92]:
# Suspicious agents?
# Pass 1 sums the claim amount of status-'A' claims per agent and averages those
# per-agent totals. Pass 2 recomputes each agent's claim count and total, then
# expresses the total as a rounded percentage of that average. The ten agents with
# the largest totals are returned.
suspicious_agents_query = '''
    match (a:Agent)-[:assigned_to]->(c:Claim{status:'A'})
    with a, sum(c.claim_amount) as total_claim_amount
    with avg(total_claim_amount) as avg_total_claim_amount
    match (a:Agent)-[:assigned_to]->(c:Claim{status:'A'})
    with 
        a.id as agent,
        count(*) as numberOfClaims, 
        sum(c.claim_amount) as total_claim_amount, 
        avg_total_claim_amount
    return 
        agent, 
        numberOfClaims, 
        total_claim_amount, 
        round(100.0*total_claim_amount/avg_total_claim_amount) as percent_of_average 
    order by total_claim_amount desc limit 10
'''
suspicious_agents = gds.run_cypher(suspicious_agents_query)
suspicious_agents.head(10)

Unnamed: 0,agent,numberOfClaims,total_claim_amount,percent_of_average
0,AGENT00679,13,489000,373.0
1,AGENT00771,15,422100,322.0
2,AGENT00807,11,396800,303.0
3,AGENT00789,9,392900,300.0
4,AGENT00525,15,385900,295.0
5,AGENT00319,18,375600,287.0
6,AGENT00388,17,373800,285.0
7,AGENT00482,11,368000,281.0
8,AGENT00881,14,366400,280.0
9,AGENT00763,11,361000,276.0
