# Insurance Claims Fraud Graph
The purpose of this module is to learn how to build your first graph from scratch.

## Install packages

In [None]:
!%pip install graphdatascience pandas

In [None]:
import os

import pandas as pd
from graphdatascience import GraphDataScience

In [None]:
# Connection settings. Each value can be overridden through an environment
# variable so real credentials do not have to be stored in the notebook; the
# fallbacks are the local defaults this notebook has always used.
DB_URL = os.environ.get("NEO4J_URI", "neo4j://localhost:7687")
DB_ULR = DB_URL  # backward-compatible alias for the original, misspelled name
DB_USER = os.environ.get("NEO4J_USERNAME", "neo4j")
DB_PASS = os.environ.get("NEO4J_PASSWORD", "test1234")
DB_NAME = os.environ.get("NEO4J_DATABASE", "neo4j")

# Pass DB_NAME explicitly so the queries below run against the configured
# database (previously DB_NAME was defined but never used).
gds = GraphDataScience(DB_URL, auth=(DB_USER, DB_PASS), database=DB_NAME)

## Data set
Insurance Claims Fraud Data: https://www.kaggle.com/datasets/mastmustu/insurance-claims-fraud-data



In [None]:
# Employee/agent records: the people who work on the insurance claims.
employee_csv_path = "./datasets/employee_data.csv"
employee_csv = pd.read_csv(employee_csv_path)

# Preview the first rows
employee_csv.head()

In [None]:
# Insurance claim transactions.
insurance_csv_path = "./datasets/insurance_data.csv"
insurance_csv = pd.read_csv(insurance_csv_path)

# List the available columns
insurance_csv.columns

In [None]:
# Vendor records: the vendors who assist insurance agents while they review
# and investigate claims.
vendor_csv_path = "./datasets/vendor_data.csv"
vendor_csv = pd.read_csv(vendor_csv_path)

# Preview the first rows
vendor_csv.head()

## Iteration 1

Let's build the following graph:
```cypher
(:Agent) -[:assigned_to]-> (:Claim) <-[:involved_in]-(:Vendor)
```

In [None]:
# Create Vendor nodes

# Create the node key constraint on Vendor.id (skipped if it already exists)
gds.run_cypher('create constraint if not exists for (v:Vendor) require (v.id) is node key')

# Select the columns needed for Vendor nodes
vendor_node_data = vendor_csv[['VENDOR_ID', 'VENDOR_NAME']].drop_duplicates()

# Merge one Vendor node per id and set its name property.
# count(distinct v) counts the merged nodes themselves; count(*) would only
# echo the number of input rows, making the assertion below impossible to fail.
number_of_vendor_nodes = gds.run_cypher('''
    unwind $rows as row
    merge (v:Vendor{id: row['VENDOR_ID']})
        set v.name = row['VENDOR_NAME']
    return count(distinct v) as nodes_created
''', params = { 'rows' : vendor_node_data.to_dict('records') })["nodes_created"][0]

# There must be exactly one node per distinct vendor id in the input
assert number_of_vendor_nodes == vendor_node_data['VENDOR_ID'].nunique(), "Output does not match input"


In [None]:
# Create Agent nodes

# Create the node key constraint on Agent.id (skipped if it already exists)
gds.run_cypher('create constraint if not exists for (a:Agent) require (a.id) is node key')

# Select the columns needed for Agent nodes
agent_node_data = employee_csv[['AGENT_ID', 'AGENT_NAME']].drop_duplicates()

# Merge one Agent node per id and set its name property.
# count(distinct a) counts the merged nodes themselves; count(*) would only
# echo the number of input rows, making the assertion below impossible to fail.
number_of_agent_nodes = gds.run_cypher('''
    unwind $rows as row
    merge (a:Agent{id: row['AGENT_ID']})
        set a.name = row['AGENT_NAME']
    return count(distinct a) as nodes_created
''', params = { 'rows' : agent_node_data.to_dict('records') })["nodes_created"][0]

# There must be exactly one node per distinct agent id in the input
assert number_of_agent_nodes == agent_node_data['AGENT_ID'].nunique(), "Output does not match input"


In [None]:
# Create Claim nodes

# Create the node key constraint on Claim.id (skipped if it already exists)
gds.run_cypher('create constraint if not exists for (c:Claim) require (c.id) is node key')

# Select the columns needed for Claim nodes
claim_node_data = insurance_csv[['TRANSACTION_ID', 'REPORT_DT', 'CLAIM_AMOUNT', 'INSURANCE_TYPE', 'CLAIM_STATUS', 'RISK_SEGMENTATION']].drop_duplicates()

# Merge one Claim node per transaction id and set its report date, claim
# amount, insurance type, status and risk segment.
# count(distinct c) counts the merged nodes themselves; count(*) would only
# echo the number of input rows, making the assertion below impossible to fail.
number_of_claim_nodes = gds.run_cypher('''
    unwind $rows as row
    merge (c:Claim{id: row['TRANSACTION_ID']})
        set c.report_date=date(row['REPORT_DT']),
            c.claim_amount=toInteger(row['CLAIM_AMOUNT']),
            c.insurance_type=row['INSURANCE_TYPE'],
            c.status=row['CLAIM_STATUS'],
            c.risk_segment=row['RISK_SEGMENTATION']
    return count(distinct c) as nodes_created
''', params = { 'rows' : claim_node_data.to_dict('records') })["nodes_created"][0]

# There must be exactly one node per distinct transaction id in the input
assert number_of_claim_nodes == claim_node_data['TRANSACTION_ID'].nunique(), "Output does not match input"

In [None]:
# Sanity check: how many nodes exist so far, grouped by their labels
node_counts = gds.run_cypher('match (n) return labels(n) as labels, count(*) as count')
node_counts.head()

In [None]:
# Connect agents to their claims: (:Claim) <-[:assigned_to]-(:Agent)

# One row per distinct (claim, agent) pair from the claims data
assigned_to_rel_data = insurance_csv[['TRANSACTION_ID', 'AGENT_ID']].drop_duplicates()

# Look up both endpoints of every pair and merge the relationship between them
assigned_to_query = '''
    unwind $rows as row
    match (c:Claim{id: row['TRANSACTION_ID']}),
          (a:Agent{id: row['AGENT_ID']})
    with c,a
    merge (c)<-[:assigned_to]-(a)
    return count(*) as rels_created
'''
assigned_to_result = gds.run_cypher(
    assigned_to_query,
    params={'rows': assigned_to_rel_data.to_dict('records')},
)
number_of_assigned_to_rels = assigned_to_result["rels_created"][0]

# Every input pair must have matched an existing Claim and Agent
assert number_of_assigned_to_rels == assigned_to_rel_data.shape[0], "Output does not match input"

In [None]:
# Connect vendors to their claims: (:Claim) <-[:involved_in]-(:Vendor)

# One row per distinct (claim, vendor) pair; rows with a missing value are dropped
claim_vendor_pairs = insurance_csv[['TRANSACTION_ID', 'VENDOR_ID']].dropna()
involved_in_rel_data = claim_vendor_pairs.drop_duplicates()

# Look up both endpoints of every pair and merge the relationship between them
involved_in_query = '''
    unwind $rows as row
    match (v:Vendor{id: row['VENDOR_ID']}),
          (c:Claim{id: row['TRANSACTION_ID']})
    with c, v, row
    merge (c)<-[:involved_in]-(v)
    return count(*) as rels_created
'''
involved_in_result = gds.run_cypher(
    involved_in_query,
    params={'rows': involved_in_rel_data.to_dict('records')},
)
number_of_involved_in_rels = involved_in_result["rels_created"][0]

# Every input pair must have matched an existing Vendor and Claim
assert number_of_involved_in_rels == involved_in_rel_data.shape[0], "Output does not match input"

In [None]:
# Sanity check: how many relationships exist so far, grouped by type
relationship_counts = gds.run_cypher('match ()-[r]->() return type(r) as relationship, count(*) as count')
relationship_counts.head()

In [None]:
# Number of claims in each risk segment
risk_segment_counts = gds.run_cypher('match (c:Claim) return c.risk_segment as segmentation, count(*) as count')
risk_segment_counts.head()


In [None]:
# Claims per insurance type: claim count and summed claim amount,
# largest total first
insurance_type_query = ''' 
    match (c:Claim) 
    return  c.insurance_type as insurance_type, 
            count(*) as count, 
            sum(c.claim_amount) as total_claim_amount 
    order by total_claim_amount desc
'''
insurance_type_totals = gds.run_cypher(insurance_type_query)
insurance_type_totals.head(10)

# Basic queries

In [None]:
# Do we have any colluding vendors and agents?
# Counts the claims each (agent, vendor) pair shares and keeps the 10 most
# frequent pairs. head(10) shows all of them; the default head() would display
# only 5 of the 10 rows the query returns.
gds.run_cypher('''
    match (a:Agent)-[:assigned_to]->(:Claim)<-[:involved_in]-(v:Vendor)
    return a.id as agent, v.id as vendor, count(*) as numberOfClaims order by numberOfClaims desc limit 10
''').head(10)

Example

![](./images/colluding.png)

Note: This looks fine for now. In a larger sample, we would probably find some parties that are involved together too frequently.

In [None]:
# Top agents
# The 10 agents with the most assigned claims. head(10) shows all of them; the
# default head() would display only 5 of the 10 rows the query returns.
gds.run_cypher('''
    match (a:Agent)-[:assigned_to]->(:Claim)
    return a.id as agent, count(*) as numberOfClaims order by numberOfClaims desc limit 10
''').head(10)

In [None]:
# Top vendors
# The 10 vendors involved in the most claims. head(10) shows all of them; the
# default head() would display only 5 of the 10 rows the query returns.
gds.run_cypher('''
    match (v:Vendor)-[:involved_in]->(:Claim)
    return v.id as vendor, count(*) as numberOfClaims order by numberOfClaims desc limit 10
''').head(10)

In [None]:
# Suspicious agents?
# Step 1 of the query sums claim_amount per agent over claims with status 'A'
# and averages those per-agent totals. Step 2 matches the same claims again and
# reports, per agent, the claim count, the total amount and that total as a
# percentage of the average, keeping the 10 largest totals.
sus_agents_query = '''
    match (a:Agent)-[:assigned_to]->(c:Claim{status:'A'})
    with a, sum(c.claim_amount) as total_claim_amount
    with avg(total_claim_amount) as avg_total_claim_amount
    match (a:Agent)-[:assigned_to]->(c:Claim{status:'A'})
    with 
        a.id as agent,
        count(*) as numberOfClaims, 
        sum(c.claim_amount) as total_claim_amount, 
        avg_total_claim_amount
    return 
        agent, 
        numberOfClaims, 
        total_claim_amount, 
        round(100.0*total_claim_amount/avg_total_claim_amount) as percent_of_average 
    order by total_claim_amount desc limit 10
'''
sus_agents = gds.run_cypher(sus_agents_query).head(10)

In [None]:
# Exercise: Make a histogram (for suspicious agents)