In [2]:
import pandas as pd
import numpy as np
from lifelines import CoxTimeVaryingFitter

from sksurv.ensemble import RandomSurvivalForest
from sksurv.preprocessing import OneHotEncoder

import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter

from sklearn.model_selection import train_test_split

In [3]:
df = pd.read_csv('BDHSC_SCC_2025_synth_data.csv')
regimen_columns = ["Base_Drug_Combo", "Comp_INI", "Comp_NNRTI", "ExtraPI", "ExtraPk_En"]
df["Regimen"] = df[regimen_columns].astype(str).agg("_".join, axis=1)

unique_ids = df["ID"].unique()
train_ids, test_ids = train_test_split(unique_ids, test_size=0.1, random_state=42)
train_df = df[df["ID"].isin(train_ids)]
test_df = df[df["ID"].isin(test_ids)]

In [4]:
df_sorted = train_df.sort_values(['ID', 'Month'])
df_sorted['Regimen_Shift'] = (df_sorted.groupby('ID')['Regimen'].shift() != df_sorted['Regimen']).astype(int)
df_sorted['Regimen_Start'] = df_sorted.groupby('ID')['Regimen_Shift'].cumsum()

# Split into regimen intervals
intervals = df_sorted.groupby(['ID', 'Regimen_Start']).agg(
    Start=('Month', 'min'),
    End=('Month', 'max'),
    Regimen=('Regimen', 'first'),
    VL=('VL', lambda x: list(x)),
    CD4=('CD4', lambda x: list(x))
).reset_index()

In [5]:
def track_outcomes(group):
    """Track first suppression/CD4 recovery and retain covariates."""
    vl = group['VL']
    cd4 = group['CD4']
    time_points = group['Month']
    
    # Viral load outcomes
    vl_250_time = next((t for t, val in zip(time_points, vl) if val <= 250), None)
    vl_50_time = next((t for t, val in zip(time_points, vl) if val <= 50), None)
    
    # CD4 recovery
    cd4_500_time = next((t for t, val in zip(time_points, cd4) if val > 500), None)
    
    return pd.Series({
        'VL_250_time': vl_250_time,
        'VL_50_time': vl_50_time,
        'CD4_500_time': cd4_500_time,
        'VL_250_Censored': 1 if vl_250_time is None else 0,
        'VL_50_Censored': 1 if vl_50_time is None else 0,
        'CD4_500_Censored': 1 if cd4_500_time is None else 0,
        'Gender': group['Gender'].iloc[0],  # Add baseline covariates
        'Ethnicity': group['Ethnic'].iloc[0],
        'Baseline_VL': group['VL'].iloc[0],
        'Baseline_CD4': group['CD4'].iloc[0],
        'Baseline_CD4_percent': group['RelCD4']
    })

# Apply outcome tracking
outcomes = df_sorted.groupby(['ID', 'Regimen_Start']).apply(track_outcomes).reset_index()
intervals_outcomes = pd.merge(intervals, outcomes, on=['ID', 'Regimen_Start'])

  outcomes = df_sorted.groupby(['ID', 'Regimen_Start']).apply(track_outcomes).reset_index()


In [158]:
def filter_intervals(df, mode):
    """Filter intervals where start time < target time (or target not achieved)."""

    if mode == 'vl50':
        mask = (df['Start'] < df['VL_50_time']) | (df['VL_50_Censored'] == 1)
    elif mode == 'vl250':
        mask = (df['Start'] < df['VL_250_time']) | (df['VL_250_Censored'] == 1)
    else:
        mask = (df['Start'] < df['CD4_500_time']) | (df['CD4_500_Censored'] == 1)
    df = df[mask]
    return df[df['End'] - df['Start'] >= 6]

def cox(mode, verbose=False):
    filtered_intervals_outcomes = filter_intervals(intervals_outcomes, mode)

    tv_data = []
    for _, row in filtered_intervals_outcomes.iterrows():
        start = row['Start']
        if mode == 'vl50':
            end = row['End'] if row['VL_50_Censored'] == 1 else row['VL_50_time']
        elif mode == 'vl250':
            end = row['End'] if row['VL_250_Censored'] == 1 else row['VL_250_time']
        else:
            end = row['End'] if row['CD4_500_Censored'] == 1 else row['CD4_500_time']
        
        tv_data.append({
            'Patient ID': row['ID'],
            'start': start,
            'stop': end,
            'gender': row['Gender'],
            'ethnicity': row['Ethnicity'],
            'censor': row['VL_50_Censored'] if mode == 'vl50' else row['VL_250_Censored'] if mode == 'vl250' else row['CD4_500_Censored'],
            'Regimen': row['Regimen'],
            'Baseline_VL': row['Baseline_VL'],
            'Baseline_CD4': row['Baseline_CD4']
        })

    tv_df = pd.DataFrame(tv_data)
    counts = tv_df["Regimen"].value_counts()
    common_regimens = counts[counts > 1000].index

    tv_df["Regimen_lumped"] = tv_df["Regimen"].apply(
        lambda x: x if x in common_regimens else "Other"
    )

    tv_df["gender"] = tv_df["gender"].astype("category")
    tv_df["ethnicity"] = tv_df["ethnicity"].astype("category")
    tv_df = pd.get_dummies(tv_df, columns=["Regimen_lumped", "gender", "ethnicity"], drop_first=True)

    tv_df.drop(['Regimen'], axis=1, inplace=True)

    ctv = CoxTimeVaryingFitter()
    ctv.fit(
        tv_df,
        id_col="Patient ID",
        event_col="censor",
        start_col="start",
        stop_col="stop",
        show_progress=True
    )
    if verbose:
        ctv.print_summary()
    return ctv, common_regimens

In [159]:
ctv_vl50, common_regimens_vl50 = cox('vl50')
ctv_vl250,  common_regimens_vl250= cox('vl250')
ctv_cd4_500 , common_regimens_cd4_500= cox('cd4_500')

Iteration 1: norm_delta = 9.71e-01, step_size = 0.9500, log_lik = -494348.87229, newton_decrement = 1.20e+04, seconds_since_start = 2.2
Iteration 2: norm_delta = 7.88e-01, step_size = 0.9500, log_lik = -484879.57803, newton_decrement = 1.22e+03, seconds_since_start = 4.0
Iteration 3: norm_delta = 5.97e-01, step_size = 0.9500, log_lik = -483628.32475, newton_decrement = 2.65e+01, seconds_since_start = 6.1
Iteration 4: norm_delta = 6.32e-01, step_size = 1.0000, log_lik = -483601.05426, newton_decrement = 4.43e-01, seconds_since_start = 7.4
Iteration 5: norm_delta = 6.19e-01, step_size = 0.9800, log_lik = -483600.57112, newton_decrement = 5.25e-02, seconds_since_start = 11.7
Iteration 6: norm_delta = 6.07e-01, step_size = 0.9604, log_lik = -483600.50554, newton_decrement = 1.97e-02, seconds_since_start = 16.3
Iteration 7: norm_delta = 6.32e-01, step_size = 1.0000, log_lik = -483600.48123, newton_decrement = 7.54e-03, seconds_since_start = 18.1
Iteration 8: norm_delta = 6.19e-01, step_size



Iteration 1: norm_delta = 1.43e+00, step_size = 0.9500, log_lik = -274437.11482, newton_decrement = 1.19e+04, seconds_since_start = 0.6
Iteration 2: norm_delta = 1.02e+00, step_size = 0.9500, log_lik = -266155.86993, newton_decrement = 2.42e+03, seconds_since_start = 1.2
Iteration 3: norm_delta = 5.98e-01, step_size = 0.9500, log_lik = -263778.55963, newton_decrement = 3.13e+01, seconds_since_start = 1.7
Iteration 4: norm_delta = 6.32e-01, step_size = 1.0000, log_lik = -263747.27854, newton_decrement = 1.28e-01, seconds_since_start = 2.4
Iteration 5: norm_delta = 6.18e-01, step_size = 0.9800, log_lik = -263747.14759, newton_decrement = 2.73e-03, seconds_since_start = 7.0
Iteration 6: norm_delta = 6.06e-01, step_size = 0.9604, log_lik = -263747.14417, newton_decrement = 1.02e-03, seconds_since_start = 7.9
Iteration 7: norm_delta = 6.31e-01, step_size = 1.0000, log_lik = -263747.14291, newton_decrement = 3.91e-04, seconds_since_start = 8.6
Iteration 8: norm_delta = 6.18e-01, step_size = 



Iteration 1: norm_delta = 5.58e-01, step_size = 0.9500, log_lik = -539223.17448, newton_decrement = 4.27e+03, seconds_since_start = 1.5
Iteration 2: norm_delta = 6.70e-02, step_size = 0.9500, log_lik = -535690.17573, newton_decrement = 2.60e+02, seconds_since_start = 2.7
Iteration 3: norm_delta = 1.17e-02, step_size = 0.9500, log_lik = -535416.18175, newton_decrement = 7.24e+00, seconds_since_start = 3.8
Iteration 4: norm_delta = 1.03e-03, step_size = 1.0000, log_lik = -535408.83033, newton_decrement = 4.83e-02, seconds_since_start = 7.3
Iteration 5: norm_delta = 3.59e-06, step_size = 1.0000, log_lik = -535408.78189, newton_decrement = 5.92e-07, seconds_since_start = 9.0
Iteration 6: norm_delta = 6.02e-11, step_size = 1.0000, log_lik = -535408.78189, newton_decrement = 1.68e-16, seconds_since_start = 10.1
Convergence completed after 6 iterations.


In [9]:
test_df_sorted = test_df.sort_values(['ID', 'Month'])
test_df_sorted['Regimen_Shift'] = (test_df_sorted.groupby('ID')['Regimen'].shift() != test_df_sorted['Regimen']).astype(int)
test_df_sorted['Regimen_Start'] = test_df_sorted.groupby('ID')['Regimen_Shift'].cumsum()

# Split into regimen intervals
test_intervals = test_df_sorted.groupby(['ID', 'Regimen_Start']).agg(
    Start=('Month', 'min'),
    End=('Month', 'max'),
    Regimen=('Regimen', 'first'),
    VL=('VL', lambda x: list(x)),
    CD4=('CD4', lambda x: list(x))
).reset_index()

test_outcomes = test_df_sorted.groupby(['ID', 'Regimen_Start']).apply(track_outcomes).reset_index()
test_intervals_outcomes = pd.merge(test_intervals, test_outcomes, on=['ID', 'Regimen_Start'])

  test_outcomes = test_df_sorted.groupby(['ID', 'Regimen_Start']).apply(track_outcomes).reset_index()


In [115]:
def predict_probability_vl50(new_sample, ctv, baseline_survival, time_point):
    """
    Predicts the probability of reaching VL ≤ 50 for a new patient sample at a given time.

    Parameters:
    - new_sample (pd.Series): A single row of patient data with same features as ctv model.
    - ctv (CoxTimeVaryingFitter): Trained Cox PH model.
    - baseline_survival (pd.Series): The baseline survival function from ctv.
    - time_point (float): The time at which to predict the probability.

    Returns:
    - Probability of reaching VL ≤ 50 by time_point.
    """
    # Compute the linear predictor (risk score)
    X_beta = np.dot(new_sample, ctv.params_)
    X_beta = np.clip(X_beta, -10, 10)  

    # Extract baseline survival at time_point
    S0_t = baseline_survival.loc[time_point]

    # Compute adjusted survival probability for new sample
    S_new_t = np.exp(np.exp(X_beta) * np.log(S0_t))

    # Probability of VL ≤ 50 by time_point
    probability_vl50 = 1 - S_new_t
    return probability_vl50


In [106]:
def compute_combined_probability(test_df, ctv, baseline_survival):
    """
    Computes the combined probability of reaching VL ≤ 50 for each patient in test_df,
    by multiplying predicted probabilities for each regimen interval.

    Parameters:
    - test_df (pd.DataFrame): Test dataset with patient intervals.
    - ctv (CoxTimeVaryingFitter): Trained Cox PH model.
    - baseline_survival (pd.Series): The baseline survival function from ctv.
    - time_point (float): The time at which to predict the probability.

    Returns:
    - A dictionary mapping Patient ID to combined probability of VL ≤ 50.
    """
    patient_probs = {}

    # Group by Patient ID
    for patient_id, patient_data in test_df.groupby("Patient ID"):
        combined_prob = 1.0  # Start with 1, since we multiply probabilities

        for _, interval in patient_data.iterrows():
            # Drop non-feature columns (assuming the same feature structure as Cox model)
            duration = interval['stop'] - interval['start']
            if duration < 6: continue
            features = interval.drop(["Patient ID", "stop", "censor", "start"])

            # Compute probability for this interval
            try:
                prob_vl50 = predict_probability_vl50(features, ctv, baseline_survival, duration)
            except:
                prob_vl50 = 0.5*predict_probability_vl50(features, ctv, baseline_survival, duration+1) +0.5*predict_probability_vl50(features, ctv, baseline_survival, duration-1)

            
            # Multiply probabilities (to reflect all regimen intervals)
            combined_prob *= prob_vl50

        # Store the final probability per patient
        patient_probs[patient_id] = combined_prob

    return patient_probs



In [164]:
mode = 'vl50'
filtered_test_intervals_outcomes = filter_intervals(test_intervals_outcomes, mode)

tv_data = []
for _, row in filtered_test_intervals_outcomes.iterrows():
    start = row['Start']
    if mode == 'vl50':
        end = row['End'] if row['VL_50_Censored'] == 1 else row['VL_50_time']
    elif mode == 'vl250':
        end = row['End'] if row['VL_250_Censored'] == 1 else row['VL_250_time']
    else:
        end = row['End'] if row['CD4_500_Censored'] == 1 else row['CD4_500_time']
    
    tv_data.append({
        'Patient ID': row['ID'],
        'start': start,
        'stop': end,
        'gender': row['Gender'],
        'ethnicity': row['Ethnicity'],
        'censor': row['VL_50_Censored'] if mode == 'vl50' else row['VL_250_Censored'] if mode == 'vl250' else row['CD4_500_Censored'],
        'Regimen': row['Regimen'],
        'Baseline_VL': row['Baseline_VL'],
        'Baseline_CD4': row['Baseline_CD4']
    })

tv_df = pd.DataFrame(tv_data)

tv_df["Regimen_lumped"] = tv_df["Regimen"].apply(
    lambda x: x if x in sorted(common_regimens_vl50)[1:] else "Other"
)

tv_df["gender"] = tv_df["gender"].astype("category")
tv_df["ethnicity"] = tv_df["ethnicity"].astype("category")
tv_df = pd.get_dummies(tv_df, columns=["Regimen_lumped", "gender", "ethnicity"], drop_first=True)
tv_df['Regimen_lumped_0_1_2_5_0'] = False

tv_df.drop(['Regimen'], axis=1, inplace=True)
tv_df = tv_df[['Patient ID', 'start', 'stop', 'censor', 'Baseline_VL', 'Baseline_CD4',
       'Regimen_lumped_0_1_2_5_0', 'Regimen_lumped_0_1_3_5_0', 'Regimen_lumped_0_2_3_5_1',
       'Regimen_lumped_0_3_1_5_0', 'Regimen_lumped_0_3_2_5_0',
       'Regimen_lumped_0_3_3_5_0', 'Regimen_lumped_1_0_3_5_0',
       'Regimen_lumped_1_1_3_5_0', 'Regimen_lumped_1_3_0_5_0',
       'Regimen_lumped_1_3_3_0_0', 'Regimen_lumped_3_3_3_1_0',
       'Regimen_lumped_3_3_3_3_0', 'Regimen_lumped_3_3_3_5_0',
       'Regimen_lumped_4_3_3_4_0', 'Regimen_lumped_Other', 'gender_2', 'ethnicity_2', 'ethnicity_3', 'ethnicity_4']]

In [165]:
patient_combined_probs = compute_combined_probability(tv_df, ctv_vl50, ctv_vl50.baseline_survival_)

# Convert results to a DataFrame and display
patient_prob_df = pd.DataFrame([(list(x)[0], float(list(x)[1])) for x in patient_combined_probs.items()], columns=["Patient ID", "Combined Probability VL ≤ 50"])

  patient_prob_df = pd.DataFrame([(list(x)[0], float(list(x)[1])) for x in patient_combined_probs.items()], columns=["Patient ID", "Combined Probability VL ≤ 50"])


In [166]:
patient_prob_df

Unnamed: 0,Patient ID,Combined Probability VL ≤ 50
0,480089982195338,1.0
1,860720023361230,1.0
2,1464022152555177,1.0
3,1603386688763148,1.0
4,6159908844499024,1.0
...,...,...
14214,18442811382418572999,1.0
14215,18443220529254420590,1.0
14216,18444319623942329983,1.0
14217,18445549969321618109,1.0


In [167]:
test_df['vl50'] = test_df['VL'] < 50
tmp = test_df.groupby('ID')['vl50'].any().reset_index()
tmp

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['vl50'] = test_df['VL'] < 50


Unnamed: 0,ID,vl50
0,480089982195338,True
1,860720023361230,True
2,1464022152555177,True
3,1603386688763148,True
4,3696111044372867,True
...,...,...
18719,18442811382418572999,True
18720,18443220529254420590,True
18721,18444319623942329983,True
18722,18445549969321618109,True


In [168]:
patient_prob_df_joined = pd.merge(patient_prob_df, tmp, left_on='Patient ID', right_on = 'ID', how ='inner')
patient_prob_df_joined["vl50"] = patient_prob_df_joined["vl50"].astype(int)
patient_prob_df_joined

Unnamed: 0,Patient ID,Combined Probability VL ≤ 50,ID,vl50
0,480089982195338,1.0,480089982195338,1
1,860720023361230,1.0,860720023361230,1
2,1464022152555177,1.0,1464022152555177,1
3,1603386688763148,1.0,1603386688763148,1
4,6159908844499024,1.0,6159908844499024,1
...,...,...,...,...
14214,18442811382418572999,1.0,18442811382418572999,1
14215,18443220529254420590,1.0,18443220529254420590,1
14216,18444319623942329983,1.0,18444319623942329983,1
14217,18445549969321618109,1.0,18445549969321618109,1


In [169]:
from sklearn.metrics import roc_curve, auc, accuracy_score

fpr, tpr, thresholds = roc_curve(patient_prob_df_joined["vl50"], patient_prob_df_joined["Combined Probability VL ≤ 50"])

# Compute AUC
roc_auc = auc(fpr, tpr)

# Find optimal threshold using Youden’s J statistic (sensitivity + specificity - 1)
j_scores = tpr - fpr
optimal_idx = np.argmax(j_scores)
optimal_threshold = thresholds[optimal_idx]
# optimal_threshold = 0.5

patient_prob_df_joined["predicted_labels"] = (patient_prob_df_joined["Combined Probability VL ≤ 50"] >= optimal_threshold).astype(int)
accuracy = accuracy_score(patient_prob_df_joined["vl50"], patient_prob_df_joined["predicted_labels"])

print(f"Optimal Threshold: {optimal_threshold:.4f}")
print(f"AUC Score: {roc_auc:.4f}")
print(f"Accuracy: {accuracy}")


Optimal Threshold: inf
AUC Score: 0.4997
Accuracy: 0.059779168717912654


In [170]:
# Find the threshold that maximizes accuracy
accuracy_scores = []
thresholds_list = []

for threshold in thresholds:
    predicted_labels = (patient_prob_df_joined["Combined Probability VL ≤ 50"] >= threshold).astype(int)
    acc = accuracy_score(patient_prob_df_joined["vl50"], predicted_labels)
    accuracy_scores.append(acc)
    thresholds_list.append(threshold)

# Get the optimal threshold that maximizes accuracy
optimal_acc_idx = np.argmax(accuracy_scores)
optimal_acc_threshold = thresholds_list[optimal_acc_idx]
max_accuracy = accuracy_scores[optimal_acc_idx]

# Output results
optimal_acc_threshold, max_accuracy

(0.004773284543680756, 0.9402208312820873)

In [172]:
df[df['Regimen'] == '4_3_2_5_0']

Unnamed: 0,ID,Month,Gender,Ethnic,Base_Drug_Combo,Comp_INI,Comp_NNRTI,ExtraPI,ExtraPk_En,VL_M,CD4_M,Drug_M,VL,CD4,RelCD4,Regimen
4321,1606850074281952178,1,1,4,4,3,2,5,0,0,0,0,10935.818000,347.909360,22.034391,4_3_2_5_0
4322,1606850074281952178,2,1,4,4,3,2,5,0,0,1,0,18739.482000,360.469540,15.962029,4_3_2_5_0
4323,1606850074281952178,3,1,4,4,3,2,5,0,0,0,0,14724.895500,367.058620,16.352610,4_3_2_5_0
4324,1606850074281952178,4,1,4,4,3,2,5,0,0,0,0,11673.653000,354.201420,22.001566,4_3_2_5_0
4325,1606850074281952178,5,1,4,4,3,2,5,0,0,0,0,15124.843000,366.674440,19.234829,4_3_2_5_0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11233703,5505741940955840955,23,1,3,4,3,2,5,0,0,0,0,10207.768085,662.298846,63.079257,4_3_2_5_0
11233704,5505741940955840955,24,1,3,4,3,2,5,0,0,0,0,3446.113780,751.198993,70.097839,4_3_2_5_0
11233706,5505741940955840955,26,1,3,4,3,2,5,0,0,0,0,6838.961981,661.318779,73.366594,4_3_2_5_0
11233708,5505741940955840955,28,1,3,4,3,2,5,0,0,0,0,1063.054229,547.081272,86.578841,4_3_2_5_0


In [182]:
tmp = intervals_outcomes[intervals_outcomes['Regimen'] == '4_3_2_5_0']
tmp = tmp[tmp['End'] - tmp['Start'] >= 6]
tmp

Unnamed: 0,ID,Regimen_Start,Start,End,Regimen,VL,CD4,VL_250_time,VL_50_time,CD4_500_time,VL_250_Censored,VL_50_Censored,CD4_500_Censored,Gender,Ethnicity,Baseline_VL,Baseline_CD4,Baseline_CD4_percent
4731,55637088653164410,8,31,59,4_3_2_5_0,"[19.9795429258329, 18.8406077161368, 18.566046...","[713.455419564125, 918.874397042475, 828.55939...",31.0,31.0,31.0,0,0,0,1,4,19.979543,713.455420,2210191 22.883184 2210192 25.909040 2210...
4983,59184572372388764,1,0,15,4_3_2_5_0,"[63938.454662909, 29992.3074736124, 39229.3606...","[623.174365002545, 409.135324042191, 356.47953...",,,0.0,1,1,0,1,4,63938.454663,623.174365,11147760 16.184535 11147761 19.644096 11...
10199,115834427629663970,17,37,55,4_3_2_5_0,"[11.7450744122787, 6.66063652952883, 13.722506...","[2837.01714710031, 617.653631441048, 2888.3693...",37.0,37.0,37.0,0,0,0,2,4,11.745074,2837.017147,720877 172.031655 720878 137.317561 7208...
17290,202627894193868326,8,16,24,4_3_2_5_0,"[4.82417845626777, 8.97523958845362, 9.3677223...","[384.046418094006, 635.966871000706, 304.69321...",16.0,16.0,17.0,0,0,0,2,2,4.824178,384.046418,708136 74.062419 708137 71.903582 708138...
18019,211485711750439609,2,2,8,4_3_2_5_0,"[78737.9797944603, 71268.5319638531, 68196.828...","[279.811229597925, 178.933934850182, 144.11172...",,,,1,1,1,2,4,78737.979794,279.811230,10056482 20.420045 10056483 26.321873 10...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1558608,18083206079148101345,1,0,35,4_3_2_5_0,"[71189.4415694741, 52371.0350666313, 49868.341...","[556.001306481549, 318.191059057215, 364.72541...",16.0,16.0,0.0,0,0,0,2,4,71189.441569,556.001306,10099320 17.080729 10099321 25.075226 10...
1564654,18145948716975756379,5,6,15,4_3_2_5_0,"[27598.6746402201, 33361.0573976751, 23296.457...","[497.618239060025, 502.564284878892, 569.06949...",,,7.0,1,1,0,1,4,27598.674640,497.618239,6742746 28.981524 6742747 24.910351 6742...
1565828,18158010162326428136,2,1,9,4_3_2_5_0,"[48745.7341446213, 59562.3782852315, 44383.216...","[514.324151560391, 591.746796548947, 470.99905...",,,1.0,1,1,0,1,4,48745.734145,514.324152,7342681 30.288881 7342682 23.557101 7342...
1570380,18208653253402239311,1,0,35,4_3_2_5_0,"[72028.1405030012, 63741.7914543191, 66247.239...","[507.164169323858, 345.405840342773, 425.11737...",16.0,25.0,0.0,0,0,0,2,2,72028.140503,507.164169,8692800 19.331665 8692801 27.392868 8692...


In [184]:
df[df['ID'] == 18442603752681511701]

Unnamed: 0,ID,Month,Gender,Ethnic,Base_Drug_Combo,Comp_INI,Comp_NNRTI,ExtraPI,ExtraPk_En,VL_M,CD4_M,Drug_M,VL,CD4,RelCD4,Regimen
185880,18442603752681511701,0,2,4,4,3,3,4,0,1,1,0,47761.863,300.40134,16.619902,4_3_3_4_0
185881,18442603752681511701,1,2,4,4,3,3,4,0,0,0,0,32097.826,168.56769,25.6888,4_3_3_4_0
185882,18442603752681511701,2,2,4,4,3,3,4,0,0,0,0,35739.867,237.41399,19.501965,4_3_3_4_0
185883,18442603752681511701,3,2,4,4,3,3,4,0,0,0,0,27271.408,163.99104,26.467417,4_3_3_4_0
185884,18442603752681511701,4,2,4,4,3,3,4,0,1,1,0,40004.305,270.8509,17.745737,4_3_3_4_0
185885,18442603752681511701,5,2,4,4,3,3,4,0,1,1,0,38698.48,267.02628,17.687048,4_3_3_4_0
185886,18442603752681511701,6,2,4,4,3,3,4,0,0,0,0,37749.52,239.09296,19.710062,4_3_3_4_0
185887,18442603752681511701,7,2,4,4,3,3,4,0,0,0,0,28249.021,169.89125,25.585333,4_3_3_4_0
185888,18442603752681511701,8,2,4,4,3,3,4,0,1,1,0,38418.445,260.29974,17.835638,4_3_3_4_0
185889,18442603752681511701,9,2,4,4,3,3,4,0,1,1,0,35979.62,255.19427,18.28167,4_3_3_4_0
