In [None]:
import pandas as pd

# Load the diagnosis extract. Person IDs are kept as strings; timestamps that
# cannot be parsed become NaT (errors='coerce') and therefore never satisfy
# the date-window comparisons below.
df_diagnosis = pd.read_csv("Extracted_Diagnosis.csv", dtype={'HASHED_PERSONID': str})
df_diagnosis['DIAG_DT_TM'] = pd.to_datetime(df_diagnosis['DIAG_DT_TM'], errors='coerce')

# ICD definitions
initial_icd = "F32.0"
target_icds = ["F32.0", "F33.0"]

# Every F32.0 diagnosis is used as a possible starting point (not only the
# first one per patient).
df_initial = df_diagnosis[df_diagnosis['ICD'] == initial_icd].copy()

# Only F32.0 / F33.0 rows can qualify as follow-ups, so restrict the right-hand
# side BEFORE merging. The surviving pairs are the same as when the ICD filter
# is applied after the merge, but the per-patient cross product is far smaller.
df_followup_candidates = df_diagnosis[df_diagnosis['ICD'].isin(target_icds)]
df_followup = pd.merge(
    df_initial,
    df_followup_candidates,
    on='HASHED_PERSONID',
    suffixes=('_initial', '_followup'),
)

# Keep follow-ups dated more than 12 months and at most 24 months after the
# initial diagnosis, i.e. the half-open window (12, 24] months.
# NOTE(review): the previous comment said "1.5 years" while the code used
# 24 months; the code's 24-month bound is kept here - confirm it is intended.
window_start = df_followup['DIAG_DT_TM_initial'] + pd.DateOffset(years=1)
window_end = df_followup['DIAG_DT_TM_initial'] + pd.DateOffset(months=24)
in_window = (
    (df_followup['DIAG_DT_TM_followup'] > window_start) &
    (df_followup['DIAG_DT_TM_followup'] <= window_end)
)
df_followup = df_followup[in_window]

# Get unique patient IDs
final_patient_ids = df_followup['HASHED_PERSONID'].unique()

# Keep every F32.0 / F33.0 record of the selected patients
df_final = df_diagnosis[
    (df_diagnosis['HASHED_PERSONID'].isin(final_patient_ids)) &
    (df_diagnosis['ICD'].isin(target_icds))
].copy()

# Label class = 1
df_final['class'] = 1

# Save results
df_final.to_csv("class1.csv", index=False)

# Print summary
print(f"✅ Saved class1.csv with {df_final['HASHED_PERSONID'].nunique()} unique patients")
print("📋 Columns:", list(df_final.columns))
print("🔎 Preview:")
print(df_final.head())


In [None]:
import pandas as pd

# Read the diagnosis extract; person IDs are forced to stay strings
df_diagnosis = pd.read_csv(
    "Extracted_Diagnosis.csv",
    dtype={'HASHED_PERSONID': str},
)
# Parse the diagnosis timestamps; unparseable values become NaT instead of raising
df_diagnosis = df_diagnosis.assign(
    DIAG_DT_TM=pd.to_datetime(df_diagnosis['DIAG_DT_TM'], errors='coerce')
)

# ICD code lists used by this cell:
# - initial_icds: codes a patient starts from
# - worse_icds:   codes counted as a progression from one of the initial codes
initial_icds = ["F32.0", "F33.0"]
worse_icds = [
    "F32.1", "F32.2", "F32.3",
    "F33.1", "F33.2", "F33.3",
]

def get_progressed_patients(df, initial_icd, min_months, max_months, class_label,
                            progressed_icds=None):
    """Select diagnosis rows of patients who progressed within a time window.

    For each patient the earliest ``initial_icd`` diagnosis is the reference
    point. A patient qualifies when at least one diagnosis whose code is in
    ``progressed_icds`` is dated strictly more than ``min_months`` and at most
    ``max_months`` after that reference, i.e. inside the half-open window
    ``(min_months, max_months]``. With ``min_months=0`` a progression carrying
    exactly the same timestamp as the reference is therefore not counted.

    Parameters
    ----------
    df : pandas.DataFrame
        Diagnosis table with columns ``HASHED_PERSONID``, ``ICD`` and
        ``DIAG_DT_TM`` (datetime).
    initial_icd : str
        ICD code of the starting diagnosis.
    min_months, max_months : int
        Window bounds in calendar months (lower bound exclusive, upper bound
        inclusive).
    class_label
        Value written to the ``class`` column of the result.
    progressed_icds : iterable of str, optional
        ICD codes that count as progression. Defaults to the notebook-level
        ``worse_icds`` list, which keeps existing five-argument calls working.

    Returns
    -------
    pandas.DataFrame
        A copy of the rows of ``df`` belonging to qualifying patients whose
        ICD is ``initial_icd`` or one of ``progressed_icds``, with an added
        ``class`` column. Rows keep their index labels from ``df``.

    Notes
    -----
    Only the requested window is checked. A patient who also progressed in a
    different window is still returned, so results for different windows can
    overlap; callers must resolve such overlaps themselves.
    """
    if progressed_icds is None:
        # Backward-compatible default: fall back to the notebook global.
        progressed_icds = worse_icds
    progressed_icds = list(progressed_icds)

    # Earliest initial diagnosis per patient (NaT timestamps sort last).
    df_initial = df[df['ICD'] == initial_icd].sort_values(by=['HASHED_PERSONID', 'DIAG_DT_TM'])
    df_initial = df_initial.drop_duplicates(subset='HASHED_PERSONID', keep='first')

    df_progressed = df[df['ICD'].isin(progressed_icds)]

    # Pair each patient's reference date with all of their progression rows.
    # DIAG_DT_TM exists on both sides, so it is suffixed into
    # DIAG_DT_TM_initial / DIAG_DT_TM_progressed.
    df_merged = pd.merge(
        df_initial[['HASHED_PERSONID', 'DIAG_DT_TM']],
        df_progressed,
        on='HASHED_PERSONID',
        suffixes=('_initial', '_progressed')
    )

    window_start = df_merged['DIAG_DT_TM_initial'] + pd.DateOffset(months=min_months)
    window_end = df_merged['DIAG_DT_TM_initial'] + pd.DateOffset(months=max_months)

    df_merged = df_merged[
        (df_merged['DIAG_DT_TM_progressed'] > window_start) &
        (df_merged['DIAG_DT_TM_progressed'] <= window_end)
    ]

    final_ids = df_merged['HASHED_PERSONID'].unique()

    # Return all initial + progression rows of the qualifying patients.
    df_result = df[
        (df['HASHED_PERSONID'].isin(final_ids)) &
        (df['ICD'].isin([initial_icd] + progressed_icds))
    ].copy()

    df_result['class'] = class_label
    return df_result

# Get Class 2 (progression within 12 months of the first initial diagnosis)
df_class2_f32 = get_progressed_patients(df_diagnosis, "F32.0", 0, 12, 2)
df_class2_f33 = get_progressed_patients(df_diagnosis, "F33.0", 0, 12, 2)

# Get Class 3 (progression between 12 and 24 months)
df_class3_f32 = get_progressed_patients(df_diagnosis, "F32.0", 12, 24, 3)
df_class3_f33 = get_progressed_patients(df_diagnosis, "F33.0", 12, 24, 3)

df_class2 = pd.concat([df_class2_f32, df_class2_f33])
df_class3 = pd.concat([df_class3_f32, df_class3_f33])

# get_progressed_patients checks each window independently, so a patient with
# progression diagnoses in both windows would otherwise be written twice with
# conflicting labels. The earlier progression wins: anyone in class 2 is
# removed from class 3.
class2_patient_ids = df_class2['HASHED_PERSONID'].unique()
df_class3 = df_class3[~df_class3['HASHED_PERSONID'].isin(class2_patient_ids)]

# Combine classes. The helper returns filtered copies of df_diagnosis, so rows
# still carry their original index labels; a patient with both F32.0 and F33.0
# contributes the same progression rows via both calls. Dropping repeated index
# labels removes exactly those repeats without touching genuinely duplicated
# source records.
df_combined = pd.concat([df_class2, df_class3])
df_combined = df_combined[~df_combined.index.duplicated(keep='first')].reset_index(drop=True)

# Save to file
df_combined.to_csv("class2.csv", index=False)

# ✅ Use df_combined for final reporting
print("✅ Saved: class2.csv")
print("👥 Unique patients:", df_combined['HASHED_PERSONID'].nunique())
print("📊 Class distribution:\n", df_combined['class'].value_counts())
print("🔍 ICD codes included:", df_combined['ICD'].unique())



In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

# Load data
df = pd.read_csv("class_all_with_chronic_names.csv")

# Drop identifier / raw-diagnosis columns (silently skipped when absent)
cols_to_drop = ['HASHED_PERSONID', 'ENCNTR_ID_SI', 'DIAG_DT_TM', 'ICD', 'DIAGNOSIS_DISPLAY']
df = df.drop(columns=cols_to_drop, errors='ignore')

# Rows without a label cannot be used for training, and a NaN label makes the
# stratified split below fail.
df = df.dropna(subset=['class'])

# Separate features and target
y = df['class']
X = df.drop(columns=['class'])

# Label encode categorical features.
# Values are cast to str first, so missing entries become their own "nan"
# category rather than being imputed later.
for col in X.select_dtypes(include=['object', 'category']).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# SimpleImputer(strategy='mean') discards columns that contain no observed
# value, which would leave its output narrower than X.columns and make the
# DataFrame reconstruction below raise. Drop such columns explicitly first.
X = X.dropna(axis=1, how='all')

# Handle missing values; keep the row index so X stays aligned with y
imputer = SimpleImputer(strategy='mean')
X_array = imputer.fit_transform(X)
X = pd.DataFrame(X_array, columns=X.columns, index=X.index)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Train Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Get feature importances, most important first
importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)

# Plot all features; figure height scales with the number of features
plt.figure(figsize=(10, len(importances) * 0.25))
importances.plot(kind='barh')
plt.gca().invert_yaxis()  # put the most important feature at the top
plt.title("Feature Importances (Random Forest)")
plt.xlabel("Importance")
plt.tight_layout()
plt.show()

# Print all feature importances
for feature, importance in importances.items():
    print(f"{feature}: {importance:.6f}")


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt

# Step 1: Load dataset
df = pd.read_csv("class_all_with_chronic_names.csv")

# Step 2: Drop identifier / raw-diagnosis columns (silently skipped when absent)
drop_cols = ['HASHED_PERSONID', 'ENCNTR_ID_SI', 'DIAG_DT_TM', 'ICD', 'DIAGNOSIS_DISPLAY']
df = df.drop(columns=drop_cols, errors='ignore')

# Step 3: Keep only rows labelled class 1 or 2 (this also removes missing labels)
df = df[df['class'].isin([1, 2])]

# Step 4: Separate features and target; binary conversion: class 1 -> 0, class 2 -> 1
y = df['class'].map({1: 0, 2: 1})
X = df.drop(columns=['class'])

# Optional: check class balance
print("✅ Class distribution:\n", y.value_counts())

# Step 5: Label encode categorical columns.
# Values are cast to str first, so missing entries become their own "nan" category.
for col in X.select_dtypes(include=['object', 'category']).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# Step 6: Convert infinite values to NaN so they are imputed like other gaps
X = X.replace([np.inf, -np.inf], np.nan)
missing_before = X.isnull().sum().sum()
print(f"🧼 Missing values before imputation: {missing_before}")

# Step 7: Train/test split BEFORE any fitted preprocessing or feature selection,
# so nothing learned from the test rows can leak into the model evaluation.
# The split depends only on y, the row count and random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Step 8: Impute missing values using training-set means only.
# SimpleImputer discards columns with no observed value at fit time, which
# would break the DataFrame reconstruction, so such columns are dropped
# explicitly from both partitions first.
usable_cols = X_train.columns[X_train.notna().any()]
imputer = SimpleImputer(strategy="mean")
X_train = pd.DataFrame(
    imputer.fit_transform(X_train[usable_cols]), columns=usable_cols, index=X_train.index
)
X_test = pd.DataFrame(
    imputer.transform(X_test[usable_cols]), columns=usable_cols, index=X_test.index
)

# Step 9: Final check on both partitions
for part in (X_train, X_test):
    if part.isnull().sum().sum() != 0 or not np.isfinite(part.values).all():
        raise ValueError("❌ Still NaNs or Infs in the dataset after cleaning!")
print("✅ Data is clean and ready.")

# Step 10: Feature selection with Random Forest, fitted on training rows only
N_TOP_FEATURES = 40
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
importances = pd.Series(rf.feature_importances_, index=X_train.columns)
top_features = importances.sort_values(ascending=False).head(N_TOP_FEATURES).index.tolist()

# Step 11: Subset both partitions to the selected features
X_train = X_train[top_features]
X_test = X_test[top_features]

# Step 12: Train XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)

# Step 13: Evaluation on the held-out rows
y_pred = xgb.predict(X_test)
print("📊 Classification Report:\n", classification_report(y_test, y_pred))
print("✅ Accuracy:", accuracy_score(y_test, y_pred))

# Step 14: Plot (ascending sort puts the most important feature at the top of a barh chart)
importances_xgb = pd.Series(xgb.feature_importances_, index=X_train.columns).sort_values()
plt.figure(figsize=(10, 8))
importances_xgb.plot(kind='barh')
plt.title(f"XGBoost Feature Importances (Top {N_TOP_FEATURES})")
plt.xlabel("Importance")
plt.tight_layout()
plt.show()
