In [16]:
import pandas as pd

# Load the raw diagnosis table and take a first look at its ICD-9 rows.
full_diagnosis_table = pd.read_csv('../data/disease_diagnosis_code.csv')
full_diagnosis_table_df = pd.DataFrame(full_diagnosis_table)

# Per-column flag: True when the column holds at least one missing value.
null_values = full_diagnosis_table_df.isna().any()
print(null_values)

# Keep only the rows whose icd_version equals 9.
is_icd9_row = full_diagnosis_table_df['icd_version'].eq(9)
icd9_diagnosis_table = full_diagnosis_table_df.loc[is_icd9_row]
print(icd9_diagnosis_table)

# Tally how many ICD-9 codes there are of each string length.
code_lengths = icd9_diagnosis_table['icd_code'].astype(str).str.len()
icd9 = code_lengths.value_counts().reset_index()

Unnamed: 0     False
subject_id     False
hadm_id        False
seq_num        False
icd_code       False
icd_version    False
dtype: bool
        Unnamed: 0  subject_id   hadm_id  seq_num icd_code  icd_version
0              100    10000690  23280645        1     4280            9
1              101    10000690  23280645        2      486            9
2              102    10000690  23280645        3     4271            9
3              103    10000690  23280645        4     2764            9
4              104    10000690  23280645        5    42833            9
...            ...         ...       ...      ...      ...          ...
137631     6364029    19999297  21439025       12    30301            9
137632     6364030    19999297  21439025       13     2768            9
137633     6364031    19999297  21439025       14     3051            9
137634     6364032    19999297  21439025       15     2874            9
137635     6364033    19999297  21439025       16    04111            

In [17]:
import pandas as pd

# --- Harmonise diagnoses: convert ICD-9 rows to ICD-10 via the GEM crosswalk ---
# Every column is read as text so that codes keep their leading zeros
# (e.g. "04111") and the version / flag columns compare as strings below.
diag = pd.read_csv("../data/disease_diagnosis_code.csv", dtype=str)
gem = pd.read_csv("../data/icd9toicd10cmgem.csv", dtype=str)

# Separate the ICD-9 and ICD-10 rows.
icd9_df = diag.query("icd_version == '9'").copy()
icd10_df = diag.query("icd_version == '10'").copy()

# The merge compares keys as exact strings, so remove stray padding on both
# sides; otherwise a padded code would silently fail to match.
icd9_df['icd_code'] = icd9_df['icd_code'].str.strip()
gem['icd9cm'] = gem['icd9cm'].str.strip()

# Left-join ICD-9 diagnoses onto the GEM table. ICD-9 rows with no GEM entry
# come through with NaN in every GEM column.
icd9_mapped = icd9_df.merge(
    gem,
    left_on="icd_code",
    right_on="icd9cm",
    how="left"
)

# Keep only GEM rows flagged no_map == '0' and combination == '0'.
# Rows without a GEM entry have NaN flags and are dropped here as well.
icd9_mapped_clean = icd9_mapped.query(
    "no_map == '0' and combination == '0'"
)

# Make the effect of the join and filter visible instead of losing rows silently.
# NOTE(review): an ICD-9 code listed more than once in the GEM file yields one
# output row per listed ICD-10 code, so (subject_id, hadm_id, seq_num) can
# repeat in the result - confirm this is intended.
print(
    f"ICD-9 rows: {len(icd9_df)} | "
    f"without a GEM entry: {int(icd9_mapped['icd9cm'].isna().sum())} | "
    f"rows after GEM join: {len(icd9_mapped)} | "
    f"rows kept after filtering: {len(icd9_mapped_clean)}"
)

# Re-express the mapped ICD-9 rows as ICD-10 rows.
icd9_as_icd10 = (
    icd9_mapped_clean[['subject_id', 'hadm_id', 'seq_num', 'icd10cm']]
    .rename(columns={'icd10cm': 'icd_code'})
    .assign(icd_version='10')
)

# Combine the newly converted ICD-9 rows with the native ICD-10 rows.
harmonised_diag = pd.concat(
    [
        icd10_df[['subject_id', 'hadm_id', 'seq_num', 'icd_code', 'icd_version']],
        icd9_as_icd10
    ],
    ignore_index=True
)

harmonised_diag.to_csv('ICD9to10diagnosis.csv', index=False)

In [19]:
# --- Attach CCSR categories to the harmonised ICD-10 diagnoses ---------------
# Relies on `pd` and `harmonised_diag` already being defined by an earlier cell.


def _normalise_icd_code(codes: pd.Series) -> pd.Series:
    """Return ICD codes with dots and single quotes removed and outer whitespace stripped."""
    return (
        codes
        .str.replace('.', '', regex=False)
        .str.replace("'", "", regex=False)
        .str.strip()
    )


# Source header -> short working name. The source headers include literal
# single quotes as part of the column name.
CCSR_COLUMN_MAP = {
    "'ICD-10-CM CODE'": 'icd_code',
    "'CCSR CATEGORY 1'": 'ccs_code',
    "'CCSR CATEGORY 1 DESCRIPTION'": 'ccs_description',
}

origccs = pd.read_csv("../data/DXCCSR_v2026-1.csv", dtype=str)

# `rename` silently ignores missing columns, so check up front and fail with
# a message that names what is missing.
missing_ccsr_columns = [name for name in CCSR_COLUMN_MAP if name not in origccs.columns]
if missing_ccsr_columns:
    raise KeyError(f"CCSR file is missing expected columns: {missing_ccsr_columns}")

ccs = origccs.rename(columns=CCSR_COLUMN_MAP)

# Strip surrounding single quotes from the columns that are written out, so
# the output does not carry them. Only leading/trailing quotes are removed;
# apostrophes inside a description are left alone.
ccs['ccs_code'] = ccs['ccs_code'].str.strip().str.strip("'").str.strip()
ccs['ccs_description'] = ccs['ccs_description'].str.strip().str.strip("'").str.strip()

# Build the same normalised join key on both sides.
harmonised_diag['icd_code_clean'] = _normalise_icd_code(harmonised_diag['icd_code'])
ccs['icd_code_clean'] = _normalise_icd_code(ccs['icd_code'])

# validate='many_to_one' makes pandas raise MergeError if a normalised code
# appears more than once in the CCSR lookup, which would otherwise silently
# duplicate diagnosis rows.
harmonised_ccs = harmonised_diag.merge(
    ccs[['icd_code_clean', 'ccs_code', 'ccs_description']],
    on='icd_code_clean',
    how='left',
    validate='many_to_one'
)

# Keep the original icd_code (not the normalised key) in the exported table.
harmonised_ccs = harmonised_ccs[['subject_id', 'hadm_id', 'seq_num', 'icd_code', 'ccs_code', 'ccs_description']]
harmonised_ccs.to_csv('grouped_diagnosis_byCSS.csv', index=False)