# Healthcare provider fraud
Dataset from Kaggle: https://www.kaggle.com/datasets/rohitrox/healthcare-provider-fraud-detection-analysis

In [None]:
%%capture
%pip install graphdatascience pandas ipython numpy

In [None]:
import pandas as pd
from neo4j import GraphDatabase
from graphdatascience import GraphDataScience
import numpy as np

In [None]:
# Neo4j Sandbox connection details
DB_URL = 'neo4j://localhost:7687'
DB_ULR = DB_URL  # backward-compatible alias for the original, misspelled name
DB_USER = 'neo4j'
DB_PASS = 'test1234'
# The GraphDataScience constructor takes a connection URI plus credentials;
# from_neo4j_driver() is intended for an already constructed neo4j Driver object,
# not for a URI string.
gds = GraphDataScience(DB_URL, auth=(DB_USER, DB_PASS))
# Ask the server for its GDS version; this doubles as a connectivity check.
gds.version()

In [None]:
# Provider
# Load the provider training CSV into a DataFrame and preview its first rows.
train_provider_csv = pd.read_csv("./datasets/Train-1542865627584.csv")
train_provider_csv.head()

In [None]:
# Create Provider nodes
# A node-key constraint on Provider.id is created first (no-op when it already exists).
gds.run_cypher('create constraint if not exists for (n:Provider) require (n.id) is node key')
# Every CSV row is merged into a Provider node keyed by the `Provider` column.
# `PotentialFraud = 'Yes'` is stored as fraud = true; any other value becomes false.
# The query returns the number of rows per fraud class.
label_dist = gds.run_cypher('''
    unwind $data as row
    merge (n:Provider{id: row.Provider})
        set n.fraud = case row.PotentialFraud when 'Yes' then true else false end
    return n.fraud as is_fraud, count(*) as count
''', params = {'data': train_provider_csv.to_dict('records')})
label_dist.head()

In [None]:
# Beneficiary data
# Remove the cap on displayed columns so wide frames are rendered in full.
pd.options.display.max_columns = None
train_beneficiary_csv = pd.read_csv(
    filepath_or_buffer="./datasets/Train_Beneficiarydata-1542865627584.csv",
)
train_beneficiary_csv.head()

In [None]:
# Show the column names of the beneficiary frame.
train_beneficiary_csv.columns


In [None]:
# Create one Condition node per condition-indicator column name.
# The node id is the column name itself, which lets the beneficiary import look up
# the matching value on each row via row[condition.id].
gds.run_cypher('create constraint if not exists for (n:Condition) require (n.id) is node key')
gds.run_cypher('''
    unwind [
        'RenalDiseaseIndicator',
        'ChronicCond_Alzheimer', 'ChronicCond_Heartfailure',
        'ChronicCond_KidneyDisease', 'ChronicCond_Cancer',
        'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression',
        'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart',
        'ChronicCond_Osteoporasis', 'ChronicCond_rheumatoidarthritis',
        'ChronicCond_stroke'
    ] as conditionId
    merge (n:Condition{id: conditionId})
''')

In [None]:
# Create Beneficiary nodes and also has_condition relationships
gds.run_cypher('create constraint if not exists for (n:Beneficiary) require (n.id) is node key')
# All Condition nodes are collected once, then every CSV row is merged into a
# Beneficiary node keyed by BeneID with dob (parsed as a date), gender and race.
# A has_condition relationship is merged for each condition whose column on the row
# equals 1 or 'Y'; any other value creates no relationship.
gds.run_cypher('''
    match (c:Condition)
    with collect(c) as conditions
    unwind $data as row
    merge (n:Beneficiary{id: row.BeneID})
        set n.dob = date(row.DOB),
            n.gender = row.Gender,
            n.race = row.Race
    with conditions, n, row
    call {
        with row, conditions, n
        foreach(
            c in [x in conditions where row[x.id] = 1 or row[x.id] = 'Y' | x] |
            merge (n)-[:has_condition]->(c)
        )
    }
''', params = {'data': train_beneficiary_csv.to_dict('records')})

In [None]:
# Set date of death
# Keep only rows where both BeneID and DOD are present, so date() never receives a missing value.
dead = train_beneficiary_csv[['BeneID','DOD']].dropna()
# Match the existing Beneficiary by id and store DOD as a date in the `dod` property.
gds.run_cypher(''' 
    unwind $data as row
    match (n:Beneficiary{id: row.BeneID})
        set n.dod = date(row.DOD)
''', params = {'data': dead.to_dict('records')})

In [None]:
# Compute age of Beneficiaries
# The age is the number of whole years between the date of birth and today's date.
# It is written to each node, and the query returns one row per distinct age together
# with the number of beneficiaries of that age.
# NOTE(review): the age is measured up to today even when a date of death is set.
agedist = gds.run_cypher(''' 
    with date() as today
    match (n:Beneficiary)
    set n.age = duration.between(n.dob, today).years
    return n.age as age, count(*) as beneficiaries order by age
''')
# The frame is already aggregated (one row per age), so plot the counts directly.
# A histogram over the `age` column alone would only count distinct age values and
# ignore how many beneficiaries share each age.
agedist.plot.bar(x="age", y="beneficiaries")

In [None]:
# Inpatient data
# Remove the cap on displayed columns so wide frames are rendered in full.
pd.options.display.max_columns = None
train_inpatient_csv = pd.read_csv(
    filepath_or_buffer="./datasets/Train_Inpatientdata-1542865627584.csv",
)
# Missing cells are replaced by 0 in place; the claims import treats 0 as "no value".
train_inpatient_csv.fillna(0, inplace=True)
train_inpatient_csv.head()

In [None]:
# Show the column names of the inpatient frame.
train_inpatient_csv.columns

In [None]:
# Create claims data
# Every node label touched by the claims import gets a node-key constraint on `id`
# (each statement is a no-op when the constraint already exists).
for node_label in ('Claim', 'Provider', 'Physician', 'Diagnosis', 'Procedure'):
    gds.run_cypher(f'create constraint if not exists for (n:{node_label}) require (n.id) is node key')

# Cypher statement shared by the inpatient and outpatient claim imports.
# Parameters: $data (list of row dicts) and $inpatient (boolean stored on every Claim).
# For each row it merges the Claim with its dates and amounts, links it to its Provider
# and Beneficiary, and then conditionally attaches the admit diagnosis, the diagnosis
# group, up to ten diagnosis codes, up to six procedure codes and up to three physicians.
# The CSV frames are filled with 0 for missing values before import, so each optional
# branch is guarded with `<> 0`; a key that is absent from the row yields null and is
# filtered out by the same guard.
# NOTE(review): the relationship types are spelled `diagonisis`; the diagnosis graph
# projection uses the same spelling, so they have to be renamed together if at all.
claims_cypher = '''
    unwind $data as row
    merge (c:Claim{id: row.ClaimID})
        set c.inpatient = $inpatient,
            c.startDate = date(row.ClaimStartDt),
            c.endDate = date(row.ClaimEndDt),
            c.admissionDate = date(row.AdmissionDt),
            c.dischargeDate = date(row.DischargeDt),
            c.deductible_amt = row.DeductibleAmtPaid,
            c.reimbursed_amt = row.InscClaimAmtReimbursed
    merge (p:Provider{id: row.Provider})
    merge (c)-[:provider]->(p)
    merge (b:Beneficiary{id: row.BeneID})
    merge (c)-[:beneficiary]->(b)
    with c,p, row
    call {
        with c,p,row
        with c,p,row where row.ClmAdmitDiagnosisCode <> 0
        merge (d:Diagnosis{id: row.ClmAdmitDiagnosisCode})
        merge (c)-[:admit_diagonisis]->(d)
    }
    call {
        with c,p,row
        with c,p,row where row.DiagnosisGroupCode <> 0
        merge (d:Diagnosis{id: row.DiagnosisGroupCode})
        merge (c)-[:diagonisis_group]->(d)
    }
    call {
        with c,p,row
        with c,p, [ x in  [ 'ClmDiagnosisCode_1', 'ClmDiagnosisCode_2', 'ClmDiagnosisCode_3',
                            'ClmDiagnosisCode_4', 'ClmDiagnosisCode_5', 'ClmDiagnosisCode_6',
                            'ClmDiagnosisCode_7', 'ClmDiagnosisCode_8', 'ClmDiagnosisCode_9',
                            'ClmDiagnosisCode_10' ] where row[x] <> 0 |
                   row[x]] as codes
        foreach( code in codes | 
            merge (d:Diagnosis{id: code})
            merge (c)-[:diagonisis_code]->(d)
        )
    }
    call {
        with c,p,row
        with c,p, [ x in  [ 'ClmProcedureCode_1', 'ClmProcedureCode_2',
                            'ClmProcedureCode_3', 'ClmProcedureCode_4', 'ClmProcedureCode_5',
                            'ClmProcedureCode_6' ] where row[x] <> 0 |
                   row[x]] as codes
        foreach( code in codes | 
            merge (d:Procedure{id: code})
            merge (c)-[:procedure_code]->(d)
        )
    }
    call {
        with c,p,row
        with c,p,row where row.AttendingPhysician <> 0 
        merge (ap:Physician{id: row.AttendingPhysician})
        merge (c)-[:attending]->(ap)
    }
    call {
        with c,p,row
        with c,p,row where row.OperatingPhysician <> 0 
        merge (op:Physician{id: row.OperatingPhysician})
        merge (c)-[:operating]->(op)
    }
    call {
        with c,p,row
        with c,p,row where row.OtherPhysician <> 0
        merge (ot:Physician{id: row.OtherPhysician})
        merge (c)-[:other]->(ot)
    }
'''

# Import all inpatient rows in a single call, flagging the claims as inpatient.
gds.run_cypher(claims_cypher, params = {'data': train_inpatient_csv.to_dict('records'), 'inpatient': True})

In [None]:
# Outpatient data
# Remove the cap on displayed columns so wide frames are rendered in full.
pd.options.display.max_columns = None
train_outpatient_csv = pd.read_csv(
    filepath_or_buffer="./datasets/Train_Outpatientdata-1542865627584.csv",
)
# Missing cells are replaced by 0 in place; the claims import treats 0 as "no value".
train_outpatient_csv.fillna(0, inplace=True)
train_outpatient_csv.head()

In [None]:
# Import the outpatient claims in (at most) ten batches so that no single request
# carries the whole frame.
# The frame is sliced with .iloc rather than np.array_split, because array_split on a
# DataFrame goes through the deprecated DataFrame.swapaxes.
n_batches = 10
# Ceiling division; max(1, ...) keeps the range step valid for an empty frame.
batch_size = max(1, -(-len(train_outpatient_csv) // n_batches))
for start in range(0, len(train_outpatient_csv), batch_size):
    chunk = train_outpatient_csv.iloc[start:start + batch_size]
    gds.run_cypher(claims_cypher, params = {'data': chunk.to_dict('records'), 'inpatient': False})

## Graph model so far
![](./images/model.png)

## Some questions that we can investigate with cypher queries

In [None]:
# 1. Billing for services that were not provided (Claim with no Procedure or Physician)
# Counts the claims that have neither an outgoing relationship to a Procedure nor to a
# Physician, and sums their reimbursed amounts.
fake_invoice = gds.run_cypher(''' 
    match (c:Claim)
    where not exists { (c)-->(:Procedure) }
      and not exists { (c)-->(:Physician) }
    return count(*) as numberOfClaims, sum(c.reimbursed_amt) as total_reimbursed_amt
''')
fake_invoice.head()

In [None]:
# 2. Duplicate submission of a claim for the same service
# Finds pairs of claims that point at the same Procedure node and share a beneficiary.
# id(c1) < id(c2) keeps each pair once instead of in both orders.
# For each pair the claim ids, reimbursed amounts, and the id and fraud flag of the
# first provider linked to each claim are returned; only 10 rows are fetched.
duplicate_submission =  gds.run_cypher(''' 
    match (c1:Claim)-->(:Procedure)<--(c2:Claim)
    where id(c1)<id(c2)
      and (c1)-[:beneficiary]->()<-[:beneficiary]-(c2)
    return 
        c1.id as claim1, c2.id as claim2,
        c1.reimbursed_amt as amt1, c2.reimbursed_amt as amt2,
        [ (c1)-[:provider]->(p) | p.id][0] as provider1,
        [ (c2)-[:provider]->(p) | p.id][0] as provider2,
        [ (c1)-[:provider]->(p) | p.fraud][0] as provider1_fraud,
        [ (c2)-[:provider]->(p) | p.fraud][0] as provider2_fraud

    limit 10
''')
duplicate_submission.head()

In [None]:
# 3. Misrepresenting the service provided (beneficiaries with similar diagnoses should have similar procedure costs)
# Here, we assume that claims with similar diagnosis codes end up in the same community,
# so let's run Louvain.
# The projection contains Claim and Diagnosis nodes connected by the diagnosis-code and
# diagnosis-group relationships (spelled `diagonisis_*`, as they were created on import).
g_diagnosis, project_stats = gds.graph.project(
    'g_diagnosis', 
    ['Claim', 'Diagnosis'], 
    ['diagonisis_code', 'diagonisis_group'])
project_stats

In [None]:
# Run Louvain in stats mode on the diagnosis projection and display the result.
gds.louvain.stats(g_diagnosis)

In [None]:
# Run Louvain in write mode, storing the result in the `community_id` property.
gds.louvain.write(g_diagnosis, writeProperty='community_id')

In [None]:
# Drop the diagnosis projection now that the communities have been written.
g_diagnosis.drop()

In [None]:
# Check our communities: what is the average claim amount, and which providers are above average
# A range index on Claim.community_id backs the per-community claim lookup below.
gds.run_cypher('create range index if not exists for (n:Claim) on (n.community_id)')
# 1. Take the 50 communities with the most claims and their average reimbursed amount.
# 2. For each of them, average the reimbursed amount per provider and keep only the
#    providers whose average exceeds the community average.
# 3. Report how far above the community average each provider is, highest first.
# percent_over_average is the excess over the community average, in percent
# (a provider at 1.5x the community average reports 50, not 150).
community_dist = gds.run_cypher(''' 
    match (n:Claim)
    with n.community_id as community_id, 
            count(*) as number_of_claims,
            avg(n.reimbursed_amt) as avg_community_amt
        order by number_of_claims desc limit 50
    match (c:Claim{community_id:community_id})-[:provider]->(p)
    with p, community_id, avg_community_amt, 
            avg(c.reimbursed_amt) as avg_provider_amt
        order by avg_provider_amt desc
        where avg_provider_amt > avg_community_amt
    with p, community_id, avg_community_amt, avg_provider_amt,
        (avg_provider_amt - avg_community_amt)/avg_community_amt*100 as percent_over_average
        order by  percent_over_average desc
    return 
        community_id, 
        p.id as provider_id, 
        avg_community_amt, 
        avg_provider_amt,
        percent_over_average,
        p.fraud as is_fraud

''')
community_dist.head()

In [None]:
# 4. Charging for a more complex or expensive service than was actually provided

In [None]:
# 5. Billing for a covered service when the service actually provided was not covered

# ML

In [None]:
# Prep (need numeric label for our classes)
# fraud = true becomes fraud_label 1; everything else (false or missing) becomes 0.
gds.run_cypher("match (p:Provider) set p.fraud_label = case p.fraud when true then 1 else 0 end")

## How can we make a monopartite graph for embeddings?
Exercise: Explore provider–provider relationships
```cypher
match p=(p1:Provider)<-[:provider]-()-->(py:Physician|Beneficiary)<--()-[:provider]->(p2:Provider)
where id(p1)<id(p2)
return p limit 50
```


In [None]:
# Drop a 'g_train' projection left over from a previous run, if there is one.
# Looking the graph up unconditionally raises on a fresh database where it was never
# projected, so check for its existence first.
if gds.graph.exists('g_train')['exists']:
    g_train = gds.graph.get('g_train')
    g_train.drop()

In [None]:
# Graph projection
# Cypher projection of a provider-to-provider graph:
# - nodes: every Provider with its numeric fraud_label and its labels;
# - relationships: two distinct providers are connected when claims of both reach the
#   same Physician or Beneficiary; `weight` is the number of such paths.
# Because the filter is p1 <> p2 (not an ordering), each pair is returned in both directions.
g_train, project_stats = gds.graph.project.cypher(
    'g_train',
    'match (n:Provider) return id(n) as id, n.fraud_label as fraud_label, labels(n) AS labels',
    ''' 
    match (p1:Provider)<-[:provider]-()-->(py:Physician|Beneficiary)<--()-[:provider]->(p2:Provider)
    where p1<>p2
    return id(p1) as source, id(p2) as target, count(*) as weight
    '''
)

In [None]:
# # Graph projection
# g_train, project_stats = gds.graph.project('g_train', 
# [
#     { 
#         "Provider": {"properties": ["fraud_label"]},
#         "Condition": {},
#         "Beneficiary": {},
#         "Claim": {},
#         "Physician": {},
#         "Diagnosis": {},
#         "Procedure": {}
#     } 
# ],
# [
#     {
#         'has_condition': {'orientation': 'UNDIRECTED'},
#         'provider': {'orientation': 'UNDIRECTED'},
#         'attending': {'orientation': 'UNDIRECTED'},
#         'beneficiary': {'orientation': 'UNDIRECTED'},
#         'operating': {'orientation': 'UNDIRECTED'},
#         'other': {'orientation': 'UNDIRECTED'},
#         'admit_diagonisis': {'orientation': 'UNDIRECTED'},
#         'diagonisis_group': {'orientation': 'UNDIRECTED'},
#         'diagonisis_code': {'orientation': 'UNDIRECTED'},
#         'procedure_code': {'orientation': 'UNDIRECTED'},
#     }
# ])
# project_stats

In [None]:
# gds.fastRP.write(g_train, embeddingDimension=2, writeProperty='embedding',relationshipWeightProperty='weight', iterationWeights=[0.0, 1.0, 1.0, 0.7, 0.7, 0.6, 0.6, 0.4, 0.4])

In [None]:
# Node-classification pipeline named "provider-fraud-pipe".
pipeline, _ = gds.beta.pipeline.nodeClassification.create("provider-fraud-pipe")
# Feature step: a 64-dimensional FastRP embedding, weighted by the projected `weight` property.
pipeline.addNodeProperty('fastRP', embeddingDimension=64, mutateProperty='embedding', relationshipWeightProperty='weight', iterationWeights=[0.0, 1.0, 1.0, 0.7, 0.7, 0.6, 0.6, 0.4, 0.4])
# Split configuration: 30% test fraction, 5 validation folds.
pipeline.configureSplit(testFraction=0.3, validationFolds=5)
# The embedding is the only feature fed to the models.
pipeline.selectFeatures(['embedding'])
# Candidate models: logistic regression, an MLP with default settings, and a random forest.
pipeline.addLogisticRegression(tolerance=0.00001, maxEpochs=500, penalty=0.0, batchSize=32)
pipeline.addMLP()
pipeline.addRandomForest(maxDepth=20)
# Cap auto-tuning at 100 trials.
pipeline.configureAutoTuning(maxTrials=100)

In [None]:
# Train on the Provider nodes of g_train with fraud_label as the target; the model is
# stored as "fraud-model" and evaluated with ACCURACY and F1_WEIGHTED.
trained_pipe_model, res = pipeline.train(g_train, modelName="fraud-model", targetNodeLabels=['Provider'], targetProperty="fraud_label", metrics=["ACCURACY", "F1_WEIGHTED"])

In [None]:
# Display the metrics recorded for the trained model.
trained_pipe_model.metrics()

In [None]:
# Display the trained model object.
trained_pipe_model

In [None]:
# Write the predictions back to the database: the class goes to `predicted_label`
# and the probabilities to `predicted_probablity` (spelled as in the call below).
# NOTE(review): the prediction runs on g_train, the same graph the model was trained on,
# so the written labels include the nodes used for training.
result = trained_pipe_model.predict_write(g_train, concurrency=8, writeProperty="predicted_label", predictedProbabilityProperty="predicted_probablity")

In [None]:
# Drop the pipeline once it is no longer needed.
pipeline.drop()

In [None]:
# Drop the trained model once it is no longer needed.
trained_pipe_model.drop()

In [None]:
#g_train.drop()

In [None]:
# Confusion matrix
# Fetch the actual (fraud_label) and predicted (predicted_label) class of every Provider.
c_data = gds.run_cypher('''
    match (n:Provider)
    return n.fraud_label as actual, n.predicted_label as predicted
''')
c_data.head()

In [None]:
# Cross-tabulate actual against predicted labels: rows are the actual class,
# columns the predicted class.
actual_labels = c_data['actual']
predicted_labels = c_data['predicted']
confusion_matrix = pd.crosstab(
    index=actual_labels,
    columns=predicted_labels,
    rownames=['Actual'],
    colnames=['Predicted'],
)
confusion_matrix.head()