In [1]:
import pandas as pd

In [2]:
# Forward slashes are accepted by Windows as well as POSIX systems, so this
# relative path resolves regardless of which OS runs the notebook (the
# backslash form only worked on Windows).
df = pd.read_csv("../data/preprocessed.csv")

In [3]:
# List the column names of the frame that was just loaded.
df.columns

Index(['raw_id', 'clinic_application_date', 'intake_date', 'group',
       'signing_date', 'sheet', 'clean_id', 'therapy_start_date',
       'therapy_end_date'],
      dtype='object')

In [4]:
# Display the loaded frame for a visual check of its contents.
df

Unnamed: 0,raw_id,clinic_application_date,intake_date,group,signing_date,sheet,clean_id,therapy_start_date,therapy_end_date
0,H4481s,2024-10-08 00:00:00,,CAU,2024-11-05 00:00:00,משתתפים פעילים,h4481,,
1,A2785s,2024-09-03 00:00:00,2024-09-22 00:00:00,Stepped Care,2024-11-14 00:00:00,משתתפים פעילים,a2785,,
2,y4316s,2024-10-29 00:00:00,2024-11-18 00:00:00,Stepped Care,2024-11-18 00:00:00,משתתפים פעילים,y4316,,
3,O1620S,2024-04-03 00:00:00,2025-01-14 00:00:00,CAU,2025-01-14 00:00:00,משתתפים פעילים,o1620,,
4,G3981,2024-10-07 00:00:00,2024-11-14 00:00:00,Stepped Care,2025-01-16 00:00:00,משתתפים פעילים,g3981,,
...,...,...,...,...,...,...,...,...,...
335,H4481s,2024-10-14 00:00:00,2024-11-05 00:00:00,Stepped Care,2024-11-05 00:00:00,נשירה קלינית- לאחר ת. טיפול,h4481,,
336,y4290s,2025-02-18 00:00:00,2025-02-18 00:00:00,Stepped Care,2025-02-18 00:00:00,נשירה קלינית- לאחר ת. טיפול,y4290,,
337,y9411s,2024-12-16 00:00:00,2025-03-20 00:00:00,Stepped Care,טרם חתמו,נשירה קלינית- לאחר ת. טיפול,y9411,,2025-09-09 00:00:00
338,S1996s,2024-11-14 00:00:00,2024-12-15 00:00:00,Stepped Care,2024-12-15 00:00:00,עלייה לרמה 2,s1996,2025-01-15 00:00:00,2025-04-08 00:00:00


In [5]:
# Rank of each source sheet, derived from its position in the list below.
# Lower numbers sort first when rows are later ordered by this rank.
priority = {
    sheet_name: rank
    for rank, sheet_name in enumerate([
        'סיימו טיפול',
        'CAU',
        'IPC-SSC',
        'משתתפים פעילים',
        'פספוסי גיוסים',
        'אין שת"פ טיפולי',
        'אי התאמה למחקר',
        'אי הסכמה למחקר',
        'נשירה מחקרית',
        'נשירה קלינית- לאחר ת. טיפול',
        'עלייה לרמה 2',
    ])
}

In [6]:
# Candidate strptime layouts for the date columns. parse_date walks this
# list in order and keeps the first layout that parses, so order matters
# for ambiguous values (day-first is tried before month-first).
formats = [
    "%Y-%m-%d",            # ISO date, e.g. 2025-09-28
    "%d/%m/%Y",            # day first, e.g. 28/09/2025
    "%m/%d/%Y",            # month first, e.g. 09/28/2025
    "%d-%b-%Y",            # abbreviated month name, e.g. 28-Sep-2025
    "%d.%m.%Y",            # dotted, e.g. 28.09.2025
    "%Y-%m-%d %H:%M:%S",   # ISO date with time, e.g. 2025-09-28 00:00:00
]

# Every column whose name contains the substring 'date' is treated as a date column.
date_columns = [column for column in df.columns if 'date' in column]


In [7]:
def parse_date(x, fmts=None):
    """Parse one cell value into a pandas Timestamp, or ``pd.NaT`` on failure.

    Each layout is tried in order with ``pd.to_datetime`` and the first one
    that parses wins, so an ambiguous value such as ``03/04/2025`` is read
    by whichever matching layout appears first in the list.

    Parameters
    ----------
    x : scalar
        Raw cell value. Strings are stripped of surrounding whitespace
        before parsing so padded spreadsheet cells are not lost.
    fmts : iterable of str, optional
        strptime layouts to try, in order. Defaults to the notebook-level
        ``formats`` list.

    Returns
    -------
    pandas.Timestamp or NaT
        ``pd.NaT`` for ``None`` and for any value that none of the layouts
        can parse (e.g. free-text placeholders).
    """
    if fmts is None:
        # Fall back to the shared list defined earlier in the notebook.
        fmts = formats
    if x is None:
        return pd.NaT
    if isinstance(x, str):
        x = x.strip()
    for fmt in fmts:
        try:
            return pd.to_datetime(x, format=fmt)
        except (ValueError, TypeError):
            # This layout does not fit the value; try the next one.
            continue
    return pd.NaT

# Convert each date-like column in place, one cell value at a time;
# whatever parse_date cannot read comes back as NaT.
for column_name in date_columns:
    parsed_values = df[column_name].apply(parse_date)
    df[column_name] = parsed_values

#### Filling missing data by priority

In [8]:
# Attach each row's source-sheet rank, then order rows so that, within a
# participant, the lowest-ranked (preferred) sheet comes first.
df['prio'] = df['sheet'].map(priority)
df_sorted = df.sort_values(by=['clean_id', 'prio'])

# Collapse to one row per participant. groupby(...).first() skips NA, so
# every column takes its first non-null value in priority order; the
# helper rank column is dropped once it has done its job.
result = (
    df_sorted
    .groupby('clean_id', as_index=False)
    .first()
    .drop(columns=['prio'])
)

In [9]:
# One indicator column per source sheet, keyed by participant id.
dummies = pd.get_dummies(df['sheet'], prefix='טבלת_').assign(clean_id=df['clean_id'])

# A participant's indicator is set if any of their rows came from that sheet.
one_hot_g = (
    dummies
    .groupby('clean_id', as_index=True)
    .max()
    .reset_index()
)

# Attach the indicators to the one-row-per-participant table.
# NOTE(review): fillna(0) is applied to the whole frame, so missing dates
# become the integer 0 as well; later cells test for `== 0` to find them.
final2 = (
    result
    .merge(one_hot_g, how='left', on='clean_id')
    .fillna(0)
)

In [10]:
# Unify the two spellings of the stepped-care arm.
final2['group'] = final2['group'].str.replace('Stepped care', 'Stepped Care')
# final2["first_contact_date"] = final2["clinic_application_date"].fillna(final2["intake_date"]).fillna(final2["signing_date"])

# Fall back to the therapy end date when the start date is missing.
# The merge cell already replaced every missing value with 0 (fillna(0)),
# so a plain .fillna() here never found anything to fill. Treat both real
# NA values and the 0 placeholder as "missing". Rows lacking both dates keep
# the 0 placeholder, which the later `== 0` export filter relies on.
start_dates = final2["therapy_start_date"]
start_missing = start_dates.isna() | (start_dates == 0)
final2['therapy_starting_date'] = start_dates.mask(start_missing, final2["therapy_end_date"])

In [15]:
# Inspect the derived start-date column.
final2['therapy_starting_date']

0      2025-09-02 00:00:00
1                        0
2                        0
3                        0
4                        0
              ...         
296    2025-04-22 00:00:00
297                      0
298                      0
299                      0
300                      0
Name: therapy_starting_date, Length: 301, dtype: object

In [19]:
# List the sheet-indicator columns (plus clean_id) held in `dummies`.
dummies.columns

Index(['טבלת__CAU', 'טבלת__IPC-SSC', 'טבלת__אי הסכמה למחקר',
       'טבלת__אי התאמה למחקר', 'טבלת__אין שת"פ טיפולי', 'טבלת__משתתפים פעילים',
       'טבלת__נשירה מחקרית', 'טבלת__נשירה קלינית- לאחר ת. טיפול',
       'טבלת__סיימו טיפול', 'טבלת__עלייה לרמה 2', 'טבלת__פספוסי גיוסים',
       'clean_id'],
      dtype='object')

In [22]:
# Export the participants whose start date is still the 0 placeholder,
# with their ids, group and sheet indicators, for manual follow-up.
(
    final2
    .loc[final2['therapy_starting_date'] == 0, ['raw_id', 'group', *dummies.columns]]
    .to_excel("missing_therapy_starting_date.xlsx", index=False)
)