a) Import bibliotek

In [21]:
import hashlib
import io
import uuid
from datetime import datetime, timezone

import numpy as np
import pandas as pd
from cryptography.fernet import Fernet

b) Import danych

In [22]:
# Load the survival dataset from the working directory and preview the first rows.
source_csv = 'CancerProstateSurvival.csv'
df = pd.read_csv(source_csv)
df.head(n=5)

Unnamed: 0,times,patient.vital_status,patient.gender,patient.race,patient.ethnicity,patient.days_to_birth,patient.drugs.drug.therapy_types.therapy_type,patient.stage_event.pathologic_stage,patient.stage_event.tnm_categories.pathologic_categories.pathologic_t,patient.stage_event.tnm_categories.pathologic_categories.pathologic_m
0,621,0,male,,,-18658.0,,,t2b,
1,1332,0,male,,,-20958.0,,,t3a,
2,995,0,male,,,-17365.0,hormone therapy,,t4,
3,671,0,male,,,-19065.0,,,t2b,
4,1033,0,male,,,-25904.0,,,t3b,


c) Identyfikacja danych wrażliwych

In [23]:
# Columns treated as sensitive in the rest of the notebook.
sensitive_cols = [
    'patient.gender',
    'patient.race',
    'patient.ethnicity',
    'patient.days_to_birth',
]
# Every remaining column, in the frame's original order, counts as clinical.
clinical_cols = list(filter(lambda name: name not in sensitive_cols, df.columns))
sensitive_cols, clinical_cols

(['patient.gender',
  'patient.race',
  'patient.ethnicity',
  'patient.days_to_birth'],
 ['times',
  'patient.vital_status',
  'patient.drugs.drug.therapy_types.therapy_type',
  'patient.stage_event.pathologic_stage',
  'patient.stage_event.tnm_categories.pathologic_categories.pathologic_t',
  'patient.stage_event.tnm_categories.pathologic_categories.pathologic_m'])

d) Szyfrowanie danych (Fernet)

In [24]:
# A fresh symmetric key is generated on every run and only lives in `key`;
# nothing in this cell persists it.
key = Fernet.generate_key()
fernet = Fernet(key)


def _encrypt_text(text):
    """Encrypt one string with the notebook's Fernet instance and return the token as str."""
    token = fernet.encrypt(text.encode())
    return token.decode()


df_encrypted = df.copy()
for col in sensitive_cols:
    # Values are stringified first, so every cell is encrypted as text; the
    # recorded output shows tokens even for the empty race/ethnicity cells.
    as_text = df_encrypted[col].astype(str)
    df_encrypted[col] = as_text.map(_encrypt_text)
df_encrypted.head()

Unnamed: 0,times,patient.vital_status,patient.gender,patient.race,patient.ethnicity,patient.days_to_birth,patient.drugs.drug.therapy_types.therapy_type,patient.stage_event.pathologic_stage,patient.stage_event.tnm_categories.pathologic_categories.pathologic_t,patient.stage_event.tnm_categories.pathologic_categories.pathologic_m
0,621,0,gAAAAABpho6kPXqfyGinLyM7TF72yMPlcTJdu4bF0k2Kjo...,gAAAAABpho6kXV6KBTMj7WllnTwjdkgUuWKbccGxsLAC3j...,gAAAAABpho6kTTQ13QqSG9Nkd_DmUa2zMOevLZKqF0Qyy6...,gAAAAABpho6kv_N3OVCFX1i7xNuKmBoP5hUwHGoQ83tUAB...,,,t2b,
1,1332,0,gAAAAABpho6kJw1iSvYAia_rRGgeXFOEUNuyJeXh8L0m7R...,gAAAAABpho6kmdV-IZmkITyyqXAHDOyjbqzbWw6gy_ZomU...,gAAAAABpho6kLDg6VowB0OVuZXjNFCiFXoZUNoNFrOWE6C...,gAAAAABpho6kNv1G-LJ8n5G14HBuidvEJtHE7qvqTiUi4V...,,,t3a,
2,995,0,gAAAAABpho6kXkJrHpdz0y1AL1VNMttNuzN42Wcjn8theh...,gAAAAABpho6kCxCwLaaGngOibiFLNnXZj7SZVfunwyJsJ0...,gAAAAABpho6kkWT2-DFt4qEMqd_GtlZrK_cSW85zYOR7iU...,gAAAAABpho6kU_U00MsIqqivFtHFym98gSB6egMG5d10S7...,hormone therapy,,t4,
3,671,0,gAAAAABpho6kldbkTadTEbcmHaZfD5GXoHZBgptPHntmQL...,gAAAAABpho6kCANXTR0FPv5tzFkbDH1mkj5g7UT_jyTscy...,gAAAAABpho6kD6wKT5y0PbQqTHGNBswdV-T9S5thFEHRlk...,gAAAAABpho6kWTcLAq0Uw3B2de6N5BGA3dbhc_U3AmhiDf...,,,t2b,
4,1033,0,gAAAAABpho6kHUO1_X6Fl97fp01oI_ioHmd3zKnKd_Dro4...,gAAAAABpho6k2SVhGWnM1kvNzXBg-yAHJmB7fcVvEhP2y2...,gAAAAABpho6k3qFcYQJn5Q9ep1rLDKzdvGZutdVYPdu_At...,gAAAAABpho6kSfTzPV0e0fDp_UvRebozC2VsCCym1KX2nR...,,,t3b,


e) Integralność danych (SHA-256)

In [25]:
# Fingerprint the source file exactly as stored on disk and persist the digest;
# hash_original is the reference value for later integrity checks.
with open('CancerProstateSurvival.csv', 'rb') as f:
    data_bytes = f.read()
hash_original = hashlib.sha256(data_bytes).hexdigest()
with open('data_hash.sha256', 'w') as f:
    f.write(hash_original)

# Tamper demonstration.
# hash_original covers the raw file bytes, while the modified frame below is
# re-serialised by pandas. Those two digests can differ even without any
# modification, so the unmodified frame is serialised the same way to obtain a
# like-for-like baseline.
baseline_bytes = df.to_csv(index=False).encode()
hash_baseline = hashlib.sha256(baseline_bytes).hexdigest()

df_modified = df.copy()
# Change a single cell: first row, first column.
df_modified.iloc[0, 0] = df_modified.iloc[0, 0] + 1
modified_bytes = df_modified.to_csv(index=False).encode()
hash_modified = hashlib.sha256(modified_bytes).hexdigest()

# The one-cell change must be visible in the digest of the identically
# serialised data.
assert hash_modified != hash_baseline, 'single-cell modification did not change the SHA-256 digest'
hash_original, hash_modified

('81fe87a8831a1309bd1fa2d243d10cf791be93a06d8e23cb3c5596c6220fe5bb',
 '360f9b859e2b6fd5536c0f21ed11886470b3cd462437cfbd6f01aabc144af621')

f) Pseudonimizacja danych

In [26]:
# One random UUID4 token per row label.
row_tokens = {}
for row_label in df.index:
    row_tokens[row_label] = str(uuid.uuid4())

# Look the token up once per row; the same token is written into every
# sensitive column of that row.
token_per_row = df.index.map(row_tokens)

df_pseudo = df.copy()
for col in sensitive_cols:
    df_pseudo[col] = token_per_row
df_pseudo.head()

Unnamed: 0,times,patient.vital_status,patient.gender,patient.race,patient.ethnicity,patient.days_to_birth,patient.drugs.drug.therapy_types.therapy_type,patient.stage_event.pathologic_stage,patient.stage_event.tnm_categories.pathologic_categories.pathologic_t,patient.stage_event.tnm_categories.pathologic_categories.pathologic_m
0,621,0,00be0848-52da-4d25-9838-9208996fe619,00be0848-52da-4d25-9838-9208996fe619,00be0848-52da-4d25-9838-9208996fe619,00be0848-52da-4d25-9838-9208996fe619,,,t2b,
1,1332,0,dd00fffb-7b49-4058-a761-db5c015196ef,dd00fffb-7b49-4058-a761-db5c015196ef,dd00fffb-7b49-4058-a761-db5c015196ef,dd00fffb-7b49-4058-a761-db5c015196ef,,,t3a,
2,995,0,2b44c7b1-77c7-4a08-bc68-52d7129c3680,2b44c7b1-77c7-4a08-bc68-52d7129c3680,2b44c7b1-77c7-4a08-bc68-52d7129c3680,2b44c7b1-77c7-4a08-bc68-52d7129c3680,hormone therapy,,t4,
3,671,0,75aabf28-0513-4635-bb0c-c142c817a203,75aabf28-0513-4635-bb0c-c142c817a203,75aabf28-0513-4635-bb0c-c142c817a203,75aabf28-0513-4635-bb0c-c142c817a203,,,t2b,
4,1033,0,25189851-9ea0-4e8a-b631-cd8a3cb5a15e,25189851-9ea0-4e8a-b631-cd8a3cb5a15e,25189851-9ea0-4e8a-b631-cd8a3cb5a15e,25189851-9ea0-4e8a-b631-cd8a3cb5a15e,,,t3b,


g) Anonimizacja danych

In [27]:
# Age-bucket edges in whole years and the label for each bucket.
age_bin_edges = [0, 50, 60, 70, 80, 200]
age_bin_labels = ['0-50', '51-60', '61-70', '71-80', '81+']

# days_to_birth is stored as a negative day count in the sample rows; negate it
# and convert to whole years.
days_since_birth = -df['patient.days_to_birth']
age_years = (days_since_birth / 365.25).round(0)

# pd.cut uses right-closed intervals by default, e.g. (50, 60] -> '51-60'.
# Missing ages, or ages outside (0, 200], come out as NaN.
age_group = pd.cut(age_years, bins=age_bin_edges, labels=age_bin_labels)

# Drop the sensitive columns and keep only the coarse age bucket.
df_anon = df.drop(columns=sensitive_cols).assign(age_group=age_group)
df_anon.head()

Unnamed: 0,times,patient.vital_status,patient.drugs.drug.therapy_types.therapy_type,patient.stage_event.pathologic_stage,patient.stage_event.tnm_categories.pathologic_categories.pathologic_t,patient.stage_event.tnm_categories.pathologic_categories.pathologic_m,age_group
0,621,0,,,t2b,,51-60
1,1332,0,,,t3a,,51-60
2,995,0,hormone therapy,,t4,,0-50
3,671,0,,,t2b,,51-60
4,1033,0,,,t3b,,71-80


h) Kontrola dostępu (RBAC)

In [28]:
# Role -> list of columns that role is allowed to read.
roles = dict(
    administrator=list(df.columns),
    lekarz=[*clinical_cols, 'patient.gender'],
    analityk=clinical_cols,
)


def access_data(role):
    """Return the columns of `df` that `role` may see.

    Raises KeyError when `role` is not a key of `roles`.
    """
    return df.loc[:, roles[role]]


# Preview what each role gets back.
access_admin, access_doctor, access_analyst = (
    access_data(role_name).head()
    for role_name in ('administrator', 'lekarz', 'analityk')
)
access_admin, access_doctor, access_analyst

(   times  patient.vital_status patient.gender patient.race patient.ethnicity  \
 0    621                     0           male          NaN               NaN   
 1   1332                     0           male          NaN               NaN   
 2    995                     0           male          NaN               NaN   
 3    671                     0           male          NaN               NaN   
 4   1033                     0           male          NaN               NaN   
 
    patient.days_to_birth patient.drugs.drug.therapy_types.therapy_type  \
 0               -18658.0                                           NaN   
 1               -20958.0                                           NaN   
 2               -17365.0                               hormone therapy   
 3               -19065.0                                           NaN   
 4               -25904.0                                           NaN   
 
    patient.stage_event.pathologic_stage  \
 0              

i) Audyt dostępu do danych

In [29]:
# In-memory audit trail; reset every time this cell runs.
audit_log = []


def log_access(role, cols):
    """Append one audit record for a data access.

    Parameters:
        role: name of the role that accessed the data.
        cols: iterable of column names that were accessed.

    The record holds a timezone-aware UTC timestamp in ISO 8601 form (it
    carries an explicit +00:00 offset), the role, and the column names joined
    with commas.
    """
    audit_log.append({
        # datetime.utcnow() is deprecated and returns a naive value;
        # now(timezone.utc) makes the offset explicit in the log.
        'time': datetime.now(timezone.utc).isoformat(),
        'role': role,
        'columns': ','.join(cols)
    })


for role in roles:
    cols = roles[role]
    # Go through the RBAC helper so the logged access uses the same code path
    # as a real one. The result is discarded without rebinding `_`, which
    # IPython uses for the last cell output.
    access_data(role).head()
    log_access(role, cols)
audit_df = pd.DataFrame(audit_log)
audit_df

Unnamed: 0,time,role,columns
0,2026-02-07T01:00:20.580726,administrator,"times,patient.vital_status,patient.gender,pati..."
1,2026-02-07T01:00:20.580726,lekarz,"times,patient.vital_status,patient.drugs.drug...."
2,2026-02-07T01:00:20.580726,analityk,"times,patient.vital_status,patient.drugs.drug...."


j) Walidacja i ochrona danych wejściowych

In [30]:
# Read the file once so that the bytes that are hashed are the same bytes that
# get parsed; re-opening the path after the check would leave a window in which
# the file could change.
with open('CancerProstateSurvival.csv', 'rb') as f:
    verified_bytes = f.read()
current_hash = hashlib.sha256(verified_bytes).hexdigest()

# Refuse to load data whose digest differs from the reference computed earlier.
if current_hash != hash_original:
    raise ValueError('Wykryto manipulacje danych')

# Parse from the verified in-memory bytes instead of the path.
df_verified = pd.read_csv(io.BytesIO(verified_bytes))
df_verified.head()

Unnamed: 0,times,patient.vital_status,patient.gender,patient.race,patient.ethnicity,patient.days_to_birth,patient.drugs.drug.therapy_types.therapy_type,patient.stage_event.pathologic_stage,patient.stage_event.tnm_categories.pathologic_categories.pathologic_t,patient.stage_event.tnm_categories.pathologic_categories.pathologic_m
0,621,0,male,,,-18658.0,,,t2b,
1,1332,0,male,,,-20958.0,,,t3a,
2,995,0,male,,,-17365.0,hormone therapy,,t4,
3,671,0,male,,,-19065.0,,,t2b,
4,1033,0,male,,,-25904.0,,,t3b,
