In [4]:
import pandas as pd
import numpy as np
import random

# 1. SETTINGS
# Cohort size and a fixed seed: every later np.random call in this script
# draws from this one global stream, so the generated data is reproducible.
n_patients = 1500
np.random.seed(42)

# Zero-padded sequential identifiers: PT-000000, PT-000001, ...
ids = ["PT-{:06d}".format(idx) for idx in range(n_patients)]

# 2. BASIC DEMOGRAPHICS
# Gender: one draw per patient, both labels equally likely.
gender = np.random.choice(['Male', 'Female'], size=n_patients, p=[0.5, 0.5])
# Age: normal with mean 60 and standard deviation 12, truncated to integers.
age = np.random.normal(loc=60, scale=12, size=n_patients).astype(int)

# 3. CLINICAL VARIABLES (logic chain)
# For every patient the draws happen in a fixed order -- cancer type, stage,
# tumour size, recurrence, grade -- and each one consumes the seeded global
# NumPy RNG, so the order of the np.random calls below must not change.
cancer_types = ['Lung', 'Breast', 'Colorectal', 'Prostate', 'Pancreatic']
stages = ['I', 'II', 'III', 'IV']

# A. Cancer-type distribution, conditional on gender: (labels, probabilities).
male_cancer_dist = (['Lung', 'Colorectal', 'Prostate', 'Pancreatic'],
                    [0.3, 0.2, 0.3, 0.2])
female_cancer_dist = (['Lung', 'Breast', 'Colorectal', 'Pancreatic'],
                      [0.2, 0.5, 0.15, 0.15])

# B. Stage distribution, conditional on cancer type (aligned with `stages`).
stage_probs_by_type = {
    'Pancreatic': [0.05, 0.1, 0.35, 0.5],  # weighted towards late stages
    'Breast': [0.4, 0.3, 0.2, 0.1],        # weighted towards early stages
}
default_stage_probs = [0.25, 0.25, 0.25, 0.25]

# C. Tumour-size normal parameters (mean, sd) per stage; stage IV is the
# fallback entry.
size_params_by_stage = {'I': (1.5, 0.5), 'II': (3.0, 1.0), 'III': (5.0, 1.5)}
default_size_params = (7.0, 2.0)

# D. Recurrence probability per stage; any stage not listed uses 0.1.
recurrence_prob_by_stage = {'III': 0.4, 'IV': 0.8}

patient_cancer, patient_stage = [], []
tumor_sizes, recurrence_flag, tumor_grades = [], [], []

for sex in gender:
    # A. Cancer type
    type_names, type_probs = male_cancer_dist if sex == 'Male' else female_cancer_dist
    ctype = np.random.choice(type_names, p=type_probs)
    patient_cancer.append(ctype)

    # B. Stage
    stg = np.random.choice(stages, p=stage_probs_by_type.get(ctype, default_stage_probs))
    patient_stage.append(stg)

    # C. Tumour size: rounded to two decimals; abs() mirrors the occasional
    # negative draw back to a positive value.
    mean_size, sd_size = size_params_by_stage.get(stg, default_size_params)
    tumor_sizes.append(abs(round(np.random.normal(mean_size, sd_size), 2)))

    # D. Recurrence flag (1 = recurred, 0 = not); depends on stage only.
    rec_prob = recurrence_prob_by_stage.get(stg, 0.1)
    recurrence_flag.append(1 if np.random.random() < rec_prob else 0)

    # Grade: drawn independently of the other variables.
    tumor_grades.append(np.random.choice(['G1', 'G2', 'G3', 'Gx'], p=[0.2, 0.3, 0.3, 0.2]))

# 4. COMORBIDITIES AND TREATMENT HISTORY (text data)
comorb_options = ['Hypertension', 'Diabetes Type 2', 'COPD', 'CAD', 'Obesity', 'None']
# Candidate therapies for each baseline stage.
therapy_map = {
    'I': ['Surgery'],
    'II': ['Surgery', 'Radiotherapy'],
    'III': ['Surgery', 'Chemotherapy', 'Radiotherapy'],
    'IV': ['Chemotherapy', 'Immunotherapy', 'Targeted Therapy']
}

patient_comorbs = []
therapy_history = []

# Everything except the trailing 'None' placeholder can be sampled.
real_comorbs = comorb_options[:-1]

for stage_label, relapsed in zip(patient_stage, recurrence_flag):
    # Comorbidities: first draw how many (0, 1 or 2), then sample that many
    # distinct conditions and join them into one comma-separated string.
    n_conditions = np.random.choice([0, 1, 2], p=[0.3, 0.4, 0.3])
    if n_conditions == 0:
        patient_comorbs.append("None")
    else:
        picks = np.random.choice(real_comorbs, n_conditions, replace=False)
        patient_comorbs.append(", ".join(picks))

    # Therapy: one uniformly random option among those listed for the stage.
    regimen = np.random.choice(therapy_map[stage_label], 1)[0]

    # A recurrence adds a second-line chemotherapy entry.
    if relapsed == 1:
        regimen += " + 2nd Line Chemo"

    therapy_history.append(regimen)

# 5. SURVIVAL TIME (outcome logic)
survival_months = []
events = []

# Months subtracted from the 60-month baseline for advanced stages.
stage_penalty = {'III': 20, 'IV': 40}

for stage_label, ctype, grade, relapsed in zip(
        patient_stage, patient_cancer, tumor_grades, recurrence_flag):
    # Start from the 5-year (60-month) baseline and apply the penalties.
    expected = 60 - stage_penalty.get(stage_label, 0)
    if 'Pancreatic' in ctype:
        expected -= 15
    if 'G3' in grade:
        expected -= 10
    if relapsed == 1:
        expected -= 10

    # Gaussian noise (sd 12), with the result floored at 1.
    final = max(1, expected + np.random.normal(0, 12))

    # Censoring: anything beyond 60 is recorded as 60 with event 0;
    # otherwise the value is rounded to one decimal and the event is 1.
    censored = final > 60
    survival_months.append(60 if censored else round(final, 1))
    events.append(0 if censored else 1)

# 6. BUILD THE DATAFRAME
# The 0/1 recurrence flags are exported as 'Yes'/'No' labels.
recurrence_labels = ['Yes' if flag == 1 else 'No' for flag in recurrence_flag]

# Column name -> per-patient values, in the order the columns should appear.
record_columns = {
    'Patient_ID': ids,
    'Diagnosis_Age': age,
    'Gender': gender,
    'Cancer_Type': patient_cancer,
    'Baseline_Stage': patient_stage,
    'Tumor_Size_cm': tumor_sizes,
    'Tumor_Grade': tumor_grades,
    'Recurrence_Status': recurrence_labels,
    'Comorbidities': patient_comorbs,
    'Prior_Treatments': therapy_history,
    'Survival_Months': survival_months,
    'Event_Status': events,
}
df_final = pd.DataFrame(record_columns)

# --- DATA DIRTYING ---

# 1. Scramble the stage notation
# Alternative spelling used for each clean stage label.
stage_map_dirty = {'I':'1', 'II':'Stage 2', 'III':'3', 'IV':'Metastatic'}


def _recode_stage_sometimes(stage):
    """Return the alternative spelling of *stage* about 30% of the time.

    One random number is drawn per call; otherwise *stage* is returned
    unchanged.
    """
    return stage_map_dirty[stage] if np.random.random() < 0.3 else stage


df_final['Baseline_Stage'] = df_final['Baseline_Stage'].apply(_recode_stage_sometimes)

# 2. Turn some tumour sizes into strings ("5.4 cm" etc.)
def dirtify_tumor(val):
    """Randomly corrupt the formatting of a tumour-size value.

    With probability 0.2 the value is returned as the string "<val> cm".
    Otherwise a second random draw decides (probability 0.1) whether it is
    returned as a string with the decimal point replaced by a comma.
    In all remaining cases *val* is returned untouched.
    """
    if np.random.random() < 0.2:
        return f"{val} cm"
    # The second draw only happens when the first check did not fire.
    use_decimal_comma = np.random.random() < 0.1
    return str(val).replace('.', ',') if use_decimal_comma else val
# Run the corruption over every tumour-size entry, one call per row.
tumor_size_column = df_final['Tumor_Size_cm']
df_final['Tumor_Size_cm'] = tumor_size_column.apply(dirtify_tumor)

# 3. Gender spelling inconsistencies
def dirtify_gender(val):
    """Randomly introduce a spelling variant of a gender label.

    With probability 0.1 the label is lower-cased (e.g. 'male'). Otherwise a
    second random draw decides (probability 0.05) whether it is abbreviated:
    'M' for 'Male', 'F' for anything else. In all remaining cases *val* is
    returned unchanged.
    """
    if np.random.random() < 0.1:
        return val.lower()
    # The second draw only happens when the label was not lower-cased.
    if np.random.random() >= 0.05:
        return val
    return 'M' if val == 'Male' else 'F'
# Run the corruption over every gender entry, one call per row.
gender_column = df_final['Gender']
df_final['Gender'] = gender_column.apply(dirtify_gender)

# 4. Cancer-type naming variations
# Alternative label for the cancer types that have one; types missing from
# this mapping are never altered.
type_map = {'Lung':'Lung CA', 'Breast':'Invasive Ductal', 'Colorectal':'Colon Cancer'}
def dirtify_cancer(val):
    """Replace a cancer type with its alternative label 20% of the time.

    A random number is drawn only when *val* has an entry in ``type_map``;
    values without an entry are returned unchanged and use no randomness.
    """
    alias = type_map.get(val)
    if alias is None:
        return val
    return alias if np.random.random() < 0.2 else val
# Run the relabelling over every cancer-type entry, one call per row.
cancer_type_column = df_final['Cancer_Type']
df_final['Cancer_Type'] = cancer_type_column.apply(dirtify_cancer)

# Inject missing values. Each column gets its own independently sampled set
# of rows, blanked in the order listed below (the order matters because every
# sample() call consumes the seeded global NumPy RNG).

# Diagnosis_Age is an integer column. Cast it to float up front so NaN can be
# stored explicitly instead of relying on pandas' silent int -> float upcast
# during assignment, which is deprecated (PDEP-6). The resulting column is
# the same float column the implicit upcast produced.
df_final['Diagnosis_Age'] = df_final['Diagnosis_Age'].astype(float)

missing_plan = [
    # A. Tumour size (15% missing): measurement could not be taken.
    ('Tumor_Size_cm', 0.15),
    # B. Tumour grade (10% missing): pathology report never entered.
    ('Tumor_Grade', 0.10),
    # C. Treatment history (8% missing): patient does not recall / no record.
    ('Prior_Treatments', 0.08),
    # D. Stage (5% missing): "unknown stage".
    ('Baseline_Stage', 0.05),
    # E. Gender and age (2% missing each): rare system errors.
    ('Gender', 0.02),
    ('Diagnosis_Age', 0.02),
]

for column_name, missing_fraction in missing_plan:
    rows_to_blank = df_final.sample(frac=missing_fraction).index
    df_final.loc[rows_to_blank, column_name] = np.nan

# --- FINAL CHECK AND SAVE ---
# Count the missing entries per column so the injected gaps can be verified.
missing_per_column = df_final.isnull().sum()
print("\nVeri Delik Deşik Edildi (Missing Values Added):")
print(missing_per_column)

# SAVE: write the dataset to CSV without the index column.
output_path = "oncology_data.csv"
df_final.to_csv(output_path, index=False)
print("✅ Ultimate Oncology Data Created (12 Columns, Dirty & Complex)")
print(df_final.head())


Veri Delik Deşik Edildi (Missing Values Added):
Patient_ID             0
Diagnosis_Age         30
Gender                30
Cancer_Type            0
Baseline_Stage        75
Tumor_Size_cm        225
Tumor_Grade          150
Recurrence_Status      0
Comorbidities          0
Prior_Treatments     120
Survival_Months        0
Event_Status           0
dtype: int64
✅ Ultimate Oncology Data Created (12 Columns, Dirty & Complex)
  Patient_ID  Diagnosis_Age  Gender Cancer_Type Baseline_Stage Tumor_Size_cm  \
0  PT-000000           30.0    Male  Colorectal        Stage 2           3.6   
1  PT-000001           87.0  Female      Breast             II          2.05   
2  PT-000002           43.0  Female  Colorectal              I          1,63   
3  PT-000003           40.0  Female        Lung             II           NaN   
4  PT-000004           72.0    Male        Lung            III          5.14   

  Tumor_Grade Recurrence_Status    Comorbidities  \
0          G3                No  Diabetes 