In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from scipy.stats import ttest_ind

# Read the input CSV
df = pd.read_csv("E:/PXA252_BH/OlderFiles20250512/class_all_with_chronic_names.csv")

# Identifier columns are removed; any that are absent from the file are
# skipped instead of raising
cols_to_drop = ['HASHED_PERSONID', 'ENCNTR_ID_SI', 'DIAG_DT_TM', 'ICD', 'DIAGNOSIS_DISPLAY']
df = df.drop(columns=cols_to_drop, errors='ignore')

# Restrict the data to rows labelled class 1 or class 2
in_scope = df['class'].isin([1, 2])
df = df.loc[in_scope]

# Feature matrix and target vector
X = df.drop(columns=['class'])
y = df['class']

# Keep a human-readable copy of AGE_GROUP for the summary table, taken before
# the column is encoded. Stays None when the column is not present.
if 'AGE_GROUP' not in X.columns:
    age_group_original = None
else:
    # Raw label -> display label; both spellings of the 90+ bucket are folded
    # into "75+"
    age_group_mapping = {
        "18": "18",
        "18-24": "18-24",
        "25-34": "25-34",
        "35-44": "35-44",
        "45-54": "45-54",
        "55-64": "55-64",
        "65-74": "65-74",
        "75+": "75+",
        ">= 90": "75+",
        ">=90": "75+",
        "Unknown": "Unknown",
        "Missing": "Missing",
    }

    raw_age_labels = X['AGE_GROUP'].fillna('Missing').astype(str)
    # Labels absent from the mapping come back as NaN from map() and are
    # reported as 'Missing'
    age_group_original = raw_age_labels.map(age_group_mapping).fillna('Missing')

# Object/category features other than AGE_GROUP are label-encoded, with NaN
# treated as its own 'Missing' level
categorical_cols = [
    name
    for name in X.select_dtypes(include=['object', 'category']).columns
    if name != 'AGE_GROUP'
]

# Fitted encoders are kept per column so integer codes can be mapped back to
# the original labels later
label_encoders = {}
for name in categorical_cols:
    encoder = LabelEncoder()
    X[name] = encoder.fit_transform(X[name].fillna('Missing').astype(str))
    label_encoders[name] = encoder

# AGE_GROUP is encoded for model training as well, but its encoder is
# deliberately left out of label_encoders: the summary uses the readable
# labels captured earlier instead
if 'AGE_GROUP' in X.columns:
    le_age = LabelEncoder()
    X['AGE_GROUP'] = le_age.fit_transform(X['AGE_GROUP'].fillna('Missing').astype(str))

# Impute numerical columns only.
# With strategy='mean', SimpleImputer discards columns that have no observed
# values, so the array it returns would be narrower than numerical_cols and
# the assignment below would raise. Such columns carry no information, so
# they are dropped before imputing.
all_nan_cols = [
    name
    for name in X.select_dtypes(include=[np.number]).columns
    if X[name].isna().all()
]
if all_nan_cols:
    print(f"Dropping numerical columns with no observed values: {all_nan_cols}")
    X = X.drop(columns=all_nan_cols)

numerical_cols = X.select_dtypes(include=[np.number]).columns
imputer = SimpleImputer(strategy='mean')
X[numerical_cols] = imputer.fit_transform(X[numerical_cols])

# Train Random Forest (fixed seed so the importance ranking is reproducible)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)

# Rank features by importance and keep the names of the 20 highest
importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
top_20_features = importances.head(20).index.tolist()

# Recombine the processed features with the target in a new frame
# ('class' is a Python keyword, hence the dict unpacking)
df_imputed = X.assign(**{'class': y.values})

# Attach the readable AGE_GROUP labels next to the encoded column so the
# summary table can report them
if 'AGE_GROUP' in df_imputed.columns and age_group_original is not None:
    df_imputed['AGE_GROUP_READABLE'] = age_group_original.values

# One frame per class
df_class1 = df_imputed[df_imputed['class'].eq(1)]
df_class2 = df_imputed[df_imputed['class'].eq(2)]

# Build summary table: one row per top-20 feature, describing its
# distribution within class 1 and within class 2.


def _format_percentages(series, label_for=None):
    """Return "label: pct%" entries for each distinct value of *series*.

    Entries follow the order produced by ``value_counts`` and are joined
    with "; ". When *label_for* (a dict) is given, each value is shown as
    ``label_for.get(value)``; otherwise the value itself is shown.
    """
    shares = series.value_counts(normalize=True) * 100
    if label_for is None:
        return "; ".join(f"{value}: {pct:.1f}%" for value, pct in shares.items())
    return "; ".join(f"{label_for.get(value)}: {pct:.1f}%" for value, pct in shares.items())


summary = []

for col in top_20_features:
    if col == 'AGE_GROUP' and age_group_original is not None:
        # AGE_GROUP is reported with its readable labels, not its codes
        stats1 = _format_percentages(df_class1['AGE_GROUP_READABLE'])
        stats2 = _format_percentages(df_class2['AGE_GROUP_READABLE'])

    elif col in label_encoders:  # Other categorical variables
        # Integer code -> original label. The lookup also matches codes that
        # have become floats (e.g. 1.0), since they compare and hash equal
        # to the corresponding ints.
        inv_map = dict(enumerate(label_encoders[col].classes_))
        stats1 = _format_percentages(df_class1[col], inv_map)
        stats2 = _format_percentages(df_class2[col], inv_map)

    else:  # Numerical: mean ± standard deviation
        stats1 = f"{df_class1[col].mean():.2f} ± {df_class1[col].std():.2f}"
        stats2 = f"{df_class2[col].mean():.2f} ± {df_class2[col].std():.2f}"

    summary.append({
        'Feature': col,
        'Class 1': stats1,
        'Class 2': stats2
    })

# Turn the collected rows into a table
df_summary = pd.DataFrame(data=summary)

# Write the table to CSV without the index column, then echo it to stdout
summary_csv_path = "E:/PXA252_BH/OlderFiles20250512/characteristics_table_top20_with_age_ranges.csv"
df_summary.to_csv(summary_csv_path, index=False)
print(df_summary)