In [19]:
import numpy as np
import pandas as pd
import seaborn as sns
from fitter import Fitter, get_common_distributions, get_distributions

In [20]:
def generate_nested_stratified_audit(df, target_n, random_state=None):
    """
    Draw a nested stratified audit sample: MCC -> CMG -> (binned RIW & ELOS).

    Each row is assigned to a stratum defined by ``MCC_CODE``, ``CMG_CODE``
    and a two-character resource code built from tercile bins of ``IP_RIW``
    and ``ELS_DAYS``. Every stratum is then sampled without replacement at
    the common rate ``target_n / population``, with a floor of one row per
    stratum.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``MCC_CODE``, ``CMG_CODE``, ``IP_RIW`` and
        ``ELS_DAYS``. The frame is NOT modified; the helper columns are
        added to a copy.
    target_n : int
        Desired number of rows in the audit sample (upper bound).
    random_state : int, numpy.random.Generator or None, default None
        Seed for the draw. ``None`` gives a different sample on every call.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with three extra columns: ``riw_bin``, ``elos_bin``
        and ``res_strata``. The result has at most ``target_n`` rows. It can
        have fewer, because each stratum's allocation is rounded down.

    Raises
    ------
    ValueError
        If ``target_n`` is negative or ``df`` has no rows.
    """
    if target_n < 0:
        raise ValueError("target_n must be non-negative")
    if len(df) == 0:
        raise ValueError("df is empty; there is nothing to sample")

    # One generator drives every random draw so a seed reproduces the sample.
    rng = np.random.default_rng(random_state)

    # Step 1: Create Resource Bins (the nested 'economic' layer).
    # qcut targets 3 bins: 0 (Low), 1 (Med), 2 (High). duplicates='drop' is
    # applied to BOTH columns so heavily tied values (repeated quantile
    # edges) reduce the number of bins instead of raising.
    # assign() returns a new frame, so the caller's df keeps its columns.
    binned = df.assign(
        riw_bin=pd.qcut(df['IP_RIW'], 3, labels=False, duplicates='drop'),
        elos_bin=pd.qcut(df['ELS_DAYS'], 3, labels=False, duplicates='drop'),
    )
    binned['res_strata'] = (
        binned['riw_bin'].astype(str) + binned['elos_bin'].astype(str)
    )

    # Step 2: Define the nested grouping (clinical + resource profile).
    nested_cols = ['MCC_CODE', 'CMG_CODE', 'res_strata']

    # groupby discards rows whose key is missing; drop them explicitly so
    # the population size used for the sampling fraction matches the rows
    # that can actually be drawn.
    population = binned.dropna(subset=['MCC_CODE', 'CMG_CODE'])
    if population.empty:
        return population

    # Step 3: Sampling fraction = target_n / total_population, capped at 1.
    fraction = min(1.0, target_n / len(population))

    # Step 4: Grouped sampling without groupby.apply.
    # Shuffle once, then keep the first `quota` rows of every stratum in
    # shuffled order. That is a uniform draw without replacement within
    # each stratum.
    shuffled = population.sample(frac=1, random_state=rng)
    by_stratum = shuffled.groupby(nested_cols, sort=False)
    position = by_stratum.cumcount()
    stratum_size = by_stratum['IP_RIW'].transform('size')
    # Floor of 1 so even the smallest stratum contributes a chart.
    quota = (stratum_size * fraction).astype(int).clip(lower=1)
    # to_numpy() makes the mask positional, so a non-unique index is safe.
    audit_list = shuffled[(position < quota).to_numpy()]

    # Step 5: Final adjustment.
    # The one-row floor can push the total above target_n when there are
    # many small strata; trim back with a simple random draw.
    if len(audit_list) > target_n:
        audit_list = audit_list.sample(n=target_n, random_state=rng)

    return audit_list

In [21]:
# Load the input dataset used by the sampling cells below.
# The Windows paths are raw strings (r'...') so the backslashes are taken
# literally; in a normal string, sequences such as '\D' or '\R' are invalid
# escapes that newer Python versions warn about.
#
# Alternative inputs used previously (uncomment one to switch dataset):
#df = pd.read_csv('SampleData_1024_label_header.csv')
#df = pd.read_csv(r'D:\Manas PhD Den\Winter_Q775\Term-Paper\SampleData_Systemetic_40960_1_Training_2.csv')
#df = pd.read_csv(r'D:\Manas PhD Den\Winter_Q775\Term-Paper\SampleData_Simple_2017_2019_10240_3_ELOS.csv')
#df = pd.read_csv(r'D:\Manas PhD Den\CoderProducvity\Research_Papers\Data\Input_Data_Updated_Except_CMG_900_Series_2.csv')
#df = pd.read_csv('SampleData_1024.csv')
df = pd.read_csv(r'D:\Manas PhD Den\CoderProducvity\Research_Papers\Data\Data_2017_19\Input_Data_Updated_Except_CMG_900_Series_2017_2019_2.csv')

# Summary statistics of the numeric columns (rendered as the cell output).
df.describe()

Unnamed: 0,AGRP_F_D,GENDER,X_FR_I_T,ADM_CAT,ENT_CODE,X_TO_I_T,DIS_DISP,TLOS_CAT,ACT_LCAT,ALC_LCAT,MCC_CODE,CMG_CODE,IP_RIW,ELS_DAYS
count,508187.0,508187.0,508187.0,508187.0,508187.0,508187.0,508187.0,508187.0,508187.0,508187.0,508187.0,508187.0,508187.0,508187.0
mean,10.249402,0.558169,1.192844,2.335166,2.703023,1.50604,7.111729,2.045682,2.032886,0.119631,10.360903,424.770821,1.369405,5.107477
std,5.653354,0.496605,3.468138,0.883357,0.78868,3.113838,10.772777,0.998957,1.003101,0.572563,5.370669,225.933978,3.063272,6.242612
min,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.0,0.0,0.0,1.0,2.0,0.0,5.0,1.0,1.0,0.0,5.0,211.0,0.4161,1.9
50%,12.0,1.0,0.0,3.0,3.0,0.0,5.0,3.0,3.0,0.0,11.0,458.0,0.7125,3.2
75%,15.0,1.0,0.0,3.0,3.0,0.0,5.0,3.0,3.0,0.0,14.0,577.0,1.3161,5.7
max,17.0,1.0,14.0,3.0,5.0,14.0,90.0,3.0,3.0,3.0,20.0,816.0,326.0779,100.0


In [22]:
# --- MOCK DATA SETUP ---
# Disabled synthetic data kept for reference. NOTE(review): its column names
# (mcc, cmg, riw, elos) differ from the ones the sampler reads
# (MCC_CODE, CMG_CODE, IP_RIW, ELS_DAYS), so it needs renaming before reuse.
#np.random.seed(42)
#data_size = 50000
#df_dad = pd.DataFrame({
#    'chart_id': range(data_size),
#    'mcc': np.random.choice(['01', '05', '08', '11', '13'], data_size), # Clinical Categories
#    'cmg': np.random.randint(100, 500, data_size),
#    'riw': np.random.gamma(2, 1, data_size),
#    'elos': np.random.poisson(5, data_size)
#})
# Alias of the loaded frame (same object, not a copy).
df_dad =df

# Number of replicate samples to write and the size of each one.
N_REPLICATES = 25
AUDIT_SAMPLE_SIZE = 1828

# --- GENERATE THE REPLICATE AUDIT SAMPLES ---
# i runs from 0 to N_REPLICATES - 1; output files are numbered from 1.
for i in range(N_REPLICATES):
    sample = generate_nested_stratified_audit(df, AUDIT_SAMPLE_SIZE)
    # Raw f-string: backslashes in the Windows path are literal, and the
    # resulting file name is identical to the previous concatenated form.
    sample.to_csv(rf'D:\Manas PhD Den\CoderProducvity\Research_Papers\Data\Data_2017_19\Data_1828\SampleData_Mutlistage_Stratified_2017_2019_1828_{i + 1}.csv', index=False)
    #result_groupBy = sample_660.groupby(['CMG_CODE', 'MCC_CODE']).size().reset_index(name='count').sort_values(by='count', ascending=False)
    #result_groupBy.to_csv('D:\Manas PhD Den\CoderProducvity\Research_Papers\Data\Data_2017_19\Data_660\SampleData_GroupBy_2017_2019_600_'+str(i+1)+'.csv', index=False)


# NOTE(review): the disabled lines below refer to generate_multistage_audit
# and sample_660, neither of which is defined in the cells shown here.
#sample_1828 = generate_multistage_audit(df_dad, 1828)
#sample_15984 = generate_multistage_audit(df_dad, 15984)

#print(f"Audit Samples Generated: {len(sample_660)}, {len(sample_1828)}, {len(sample_15984)}")