# Create Synthetic Dataset for Risk Adjustment

The purpose of this notebook is to create a synthetic dataset for testing the risk adjustment scoring model. At its core, each record needs a member ID and an ICD-10 code.

Because the goal is to score categories, ICD-10 codes are drawn from the CMS mapping files rather than from the full ICD-10 code set.

## How it works
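It uses Faker to generate member IDs. It then loads the appropriate ICD-10 list and builds a dictionary that maps each member_id to a list of dx codes, along with a second dictionary of demographic attributes per member. Finally, it writes both to pipe-delimited text files; a JSON export helper is also available.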

In [3]:
from faker import Faker
import random
import pandas as pd
import numpy as np
import json

# Single Faker instance created once at import time and used for ID generation.
fake = Faker()

def generate_ids(num_records):
    """Return a list of ``num_records`` synthetic member IDs.

    Each ID comes from a separate call to the module-level Faker
    instance's ``uuid4`` provider.
    """
    return [fake.uuid4() for _ in range(num_records)]

def generate_gender():
    """Return a randomly chosen gender code, either ``'M'`` or ``'F'``."""
    # See: https://stackoverflow.com/questions/43991548/when-defining-a-python-class-how-to-set-a-random-variable-in-it
    gender_codes = ('M', 'F')
    return random.choice(gender_codes)
    
def generate_age():
    """Return a random integer age between 1 and 100, inclusive.

    TODO: refactor for test data so that every age bucket is guaranteed
    to contain at least one member.
    """
    youngest, oldest = 1, 100
    return random.randint(youngest, oldest)

def generate_orec():
    """Return a random OREC value as a one-character string.

    The value is one of ``'0'``, ``'1'``, ``'2'`` or ``'3'``.
    Reference: https://resdac.org/cms-data/variables/current-reason-entitlement-code
    """
    orec_codes = ('0', '1', '2', '3')
    return random.choice(orec_codes)

def generate_medicaid():
    """Return a randomly chosen Medicaid flag, either ``True`` or ``False``."""
    flag_values = (True, False)
    return random.choice(flag_values)


# Function to generate synthetic ICD-10 codes

# Default location of the tab-delimited, header-less code mapping file.
_ICD10_MAPPING_PATH = r"...\F2423P2M.TXT"

# Seeded exactly once so that a full run is reproducible while every call
# still draws a different sample. Re-seeding inside the function would make
# each call return the same sequence of codes.
_ICD10_RNG = np.random.default_rng(42)

# Parsed code arrays keyed by file path, so the mapping file is read once
# rather than once per generated member.
_ICD10_CODE_CACHE = {}


def generate_icd10_codes(path=_ICD10_MAPPING_PATH, max_codes=25):
    """Return a random list of ICD-10 codes sampled from the mapping file.

    Args:
        path: Tab-delimited file without a header row whose first column
            holds the ICD-10 codes. Parsed once per distinct path and cached.
        max_codes: Inclusive upper bound on how many codes are drawn. Must be
            at least 1; at least one code is always returned.

    Returns:
        A list of between 1 and ``max_codes`` codes, sampled with replacement.
    """
    codes = _ICD10_CODE_CACHE.get(path)
    if codes is None:
        df = pd.read_csv(path, sep='\t', header=None)
        df.rename({0: 'icd10', 1: 'cat_nbr', 2: 'unknown'}, axis=1, inplace=True)
        codes = df['icd10'].to_numpy()
        _ICD10_CODE_CACHE[path] = codes

    # Draw the sample size from the same seeded generator so the whole
    # result, not just the code selection, is reproducible.
    sample_size = int(_ICD10_RNG.integers(1, max_codes + 1))
    return _ICD10_RNG.choice(codes, size=sample_size, replace=True).tolist()

def generate_json(input_dict, type):
    """Write ``input_dict`` as JSON to ``test_dataset_<type>.json``.

    ``type`` is used only as the filename suffix; the file is created in
    the current working directory and overwritten if it already exists.
    """
    destination = f'test_dataset_{type}.json'
    with open(destination, 'w') as handle:
        json.dump(input_dict, handle)

def generate_pipe_delimited_dx_codes(input_dict):
    """Write one ``member_id|dx_code`` row per diagnosis to ``test_dataset_10k.txt``.

    Expects a dictionary that maps each member_id to an iterable of dx codes.
    The output has a header row and no index column.
    """
    rows = [
        (member_id, dx_code)
        for member_id, dx_codes in input_dict.items()
        for dx_code in dx_codes
    ]
    frame = pd.DataFrame(rows, columns=['member_id', 'dx_code'])
    frame.to_csv('test_dataset_10k.txt', sep='|', header=True, index=False)

def generate_pipe_delimited_demo(input_dict, output_path='test_demo_dataset_10k.txt'):
    """Write per-member demographic attributes to a pipe-delimited file.

    Args:
        input_dict: Dictionary mapping each member_id to a dictionary of
            attributes (for example gender, orec, age). Each attribute
            becomes a column.
        output_path: Destination file; defaults to the original
            ``test_demo_dataset_10k.txt`` in the working directory.

    The output has a header row, a leading ``member_id`` column, and no
    index column.
    """
    # Build the frame from the argument itself, not from a notebook global,
    # so the function writes whatever the caller passes in.
    df = (
        pd.DataFrame.from_dict(input_dict, orient='index')
        .reset_index()
        .rename({'index': 'member_id'}, axis=1)
    )
    df.to_csv(output_path, sep='|', header=True, index=False)


# Set the number of records you want
num_records = 10000

# Generate synthetic IDs and ICD-10 codes
synthetic_ids = generate_ids(num_records)

# test_dx_dict maps member_id -> list of dx codes;
# test_demo_dict maps member_id -> demographic attributes.
test_dx_dict = {}
test_demo_dict = {}
for member_id in synthetic_ids:
    test_dx_dict[member_id] = generate_icd10_codes()
    test_demo_dict[member_id] = {
        'gender': generate_gender(),
        'orec': generate_orec(),
        'age': generate_age(),
        # The generator must be called here: storing the bare function
        # object would write its repr to the output instead of True/False.
        'medicaid': generate_medicaid(),
    }

# Optional JSON export of the same data:
# generate_json(test_dx_dict, 'dx_10k')
# generate_json(test_demo_dict, 'demo_10k')
generate_pipe_delimited_dx_codes(test_dx_dict)
generate_pipe_delimited_demo(test_demo_dict)


In [27]:
test_demo_dict

{'587790b1-ee73-470f-8b48-91c23e403484': {'gender': 'F',
  'orec': '0',
  'age': 77},
 'bfd6477a-b7a1-463f-bec6-5a89be66c71b': {'gender': 'F',
  'orec': '0',
  'age': 90},
 'f4514062-c1f4-4f55-af86-8afa10b20d7d': {'gender': 'M',
  'orec': '1',
  'age': 70},
 '4482263e-57c5-4d15-a56f-ce2453494e1f': {'gender': 'M',
  'orec': '1',
  'age': 67},
 '371c11ab-c648-40f8-8df5-350389603d68': {'gender': 'F',
  'orec': '1',
  'age': 66},
 '15e01924-6e34-4ff7-b2e2-dc0b78934591': {'gender': 'F',
  'orec': '1',
  'age': 62},
 'fc4231cd-b4cf-45fe-9956-b21944e66b66': {'gender': 'F',
  'orec': '1',
  'age': 69},
 '0f323acf-6cf2-4928-aec5-639cb2052142': {'gender': 'F',
  'orec': '1',
  'age': 68},
 '080eb6e2-bb39-4a95-9367-9ec668ee62cf': {'gender': 'F',
  'orec': '1',
  'age': 32},
 '9e418033-29fe-4caf-af21-fc4431bdfe89': {'gender': 'M',
  'orec': '1',
  'age': 18},
 '99b1bd63-df33-49e2-ace6-dc3f5cd9ea52': {'gender': 'F',
  'orec': '0',
  'age': 88},
 '30f99117-e47f-441f-a72f-9c5bdd0f11d6': {'gender': 'F

In [38]:
# One row per member: the dictionary keys become the member_id column and
# each attribute becomes its own column.
df_demo = (
    pd.DataFrame.from_dict(test_demo_dict, orient='index')
    .reset_index()
    .rename(columns={'index': 'member_id'})
)

In [39]:
df_demo

Unnamed: 0,member_id,gender,orec,age
0,587790b1-ee73-470f-8b48-91c23e403484,F,0,77
1,bfd6477a-b7a1-463f-bec6-5a89be66c71b,F,0,90
2,f4514062-c1f4-4f55-af86-8afa10b20d7d,M,1,70
3,4482263e-57c5-4d15-a56f-ce2453494e1f,M,1,67
4,371c11ab-c648-40f8-8df5-350389603d68,F,1,66
...,...,...,...,...
95,d5137a49-a2f1-4e13-a10a-df9016343547,F,0,4
96,9c658b9e-62f1-4b86-8122-e6d82e499bbb,M,1,52
97,99a16292-92f9-4d83-99c4-a965e86b8d44,F,1,68
98,459399c4-e393-463f-8abb-4a13188578ed,F,1,70


In [23]:
# NOTE(review): `test_dict` is not defined in any cell shown here; presumably
# it is left over from an earlier session. Confirm before re-running.
# This binds a second name to the same object; it does not make a copy.
test_dict2 = test_dict

In [None]:
test_dict2.

In [16]:
import numpy as np
import random
# Scratch check: random.randint includes both endpoints, so this yields 0 or 1.
random.randint(0, 1)

1

### Future Enhancements

1. Add Gender
2. Add Age or DOB
3. OREC
4. Name
5. Address
6. DX Code Dates
7. Provider
8. Specialty