In [1]:
# importing all the necessary libraries
import numpy as np
import pandas as pd

# Importing dataset

In [2]:
# Load the visit-level records (patient ID, visit code, CDR score) into a DataFrame.
# NOTE(review): "/content/" looks like the Google Colab working directory, so this
# absolute path presumably only resolves there — adjust it when running elsewhere.
patients_visit_data = pd.read_csv("/content/CDR_score_task.csv")

In [3]:
# Preview the raw frame (the rendered table is truncated to the first and last rows)
patients_visit_data

Unnamed: 0,PTID,Visit Code,CDR Score
0,2,sc,0.0
1,2,m06,0.0
2,2,m36,0.0
3,2,m60,0.0
4,2,m72,0.0
...,...,...,...
13044,7120,sc,0.5
13045,7121,sc,0.5
13046,7122,sc,0.0
13047,7123,sc,0.0


In [4]:
# Summarize column dtypes and non-null counts to spot columns with missing data
patients_visit_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13049 entries, 0 to 13048
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   PTID        13049 non-null  int64  
 1   Visit Code  13037 non-null  object 
 2   CDR Score   12983 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 306.0+ KB


In [5]:
# Count the missing values in each column of the raw data
patients_visit_data.isnull().sum()

PTID           0
Visit Code    12
CDR Score     66
dtype: int64

**There are 12 null values in `Visit Code` and 66 null values in `CDR Score`.**

In [6]:
# Impute missing CDR scores with the column mean.
# The filled Series is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `patients_visit_data['CDR Score']`: that form is
# chained assignment through an inplace method, which emits a warning on pandas 2.x
# and no longer updates the parent frame once Copy-on-Write is active.
# NOTE(review): the mean is generally not one of the discrete score values seen in
# the data (0.0, 0.5, 1.0, ...) — confirm mean imputation is the intended strategy.
cdr_mean = patients_visit_data['CDR Score'].mean()
patients_visit_data['CDR Score'] = patients_visit_data['CDR Score'].fillna(cdr_mean)

In [7]:
# Keep only the rows that have a visit code; rows where 'Visit Code' is null are dropped
patients_visit_data = patients_visit_data.dropna(subset=['Visit Code'])

In [8]:
# Re-check missing values per column after the imputation and row removal above
patients_visit_data.isnull().sum()

PTID          0
Visit Code    0
CDR Score     0
dtype: int64

In [9]:
# (rows, columns) remaining after cleaning
patients_visit_data.shape

(13037, 3)

# *Patients with continuous 6-month visit pattern*



In [10]:
# Display the cleaned frame before grouping it by patient
patients_visit_data

Unnamed: 0,PTID,Visit Code,CDR Score
0,2,sc,0.0
1,2,m06,0.0
2,2,m36,0.0
3,2,m60,0.0
4,2,m72,0.0
...,...,...,...
13044,7120,sc,0.5
13045,7121,sc,0.5
13046,7122,sc,0.0
13047,7123,sc,0.0


In [11]:
# Collect the distinct visit codes present in the data (in order of first appearance)
unique_visit_codes = patients_visit_data['Visit Code'].unique()
print(unique_visit_codes)

['sc' 'm06' 'm36' 'm60' 'm72' 'm84' 'm96' 'm108' 'm120' 'm132' 'm144'
 'm12' 'm24' 'm18' 'm48' 'f' 'm174' 'm150' 'm162' 'm180' 'm156' 'm168'
 'm192' 'm186' 'm138' 'm204' 'uns1' 'm198' 'm126' 'm102' 'm78' 'm90'
 'm114' 'm66' 'm54' 'm42' 'm30']


In [12]:
# Number of distinct visit codes
len(unique_visit_codes)

37

In [13]:
# One row per patient: the CDR score taken from the patient's first row in the frame.
# This relies on the row order of the input file.
# NOTE(review): presumably each patient's first row is the 'sc' visit — confirm.
cdr_data = (
    patients_visit_data
    .groupby('PTID')['CDR Score']
    .first()
    .reset_index()
)


In [14]:
# Inspect the per-patient first CDR score
cdr_data

Unnamed: 0,PTID,CDR Score
0,2,0.0
1,3,1.0
2,4,0.5
3,5,0.0
4,6,0.5
...,...,...
3544,7120,0.5
3545,7121,0.5
3546,7122,0.0
3547,7123,0.0


In [15]:
# One row per patient: all of the patient's visit codes, in row order, gathered
# into a single Python list
patient_visits = (
    patients_visit_data
    .groupby('PTID')['Visit Code']
    .apply(list)
    .reset_index()
)


In [16]:
# Inspect the per-patient visit lists
patient_visits

Unnamed: 0,PTID,Visit Code
0,2,"[sc, m06, m36, m60, m72, m84, m96, m108, m120,..."
1,3,"[sc, m06, m12, m24]"
2,4,"[sc, m06, m12, m18, m36]"
3,5,"[sc, m06, m12, m24, m36]"
4,6,"[sc, m06, m12, m18, m24, m36]"
...,...,...
3544,7120,[sc]
3545,7121,[sc]
3546,7122,[sc]
3547,7123,[sc]


In [17]:
# Join the first CDR score and the visit list on the shared 'PTID' key so that
# each patient ends up on a single row
df = cdr_data.merge(patient_visits, on='PTID')

# Show the merged frame
df

Unnamed: 0,PTID,CDR Score,Visit Code
0,2,0.0,"[sc, m06, m36, m60, m72, m84, m96, m108, m120,..."
1,3,1.0,"[sc, m06, m12, m24]"
2,4,0.5,"[sc, m06, m12, m18, m36]"
3,5,0.0,"[sc, m06, m12, m24, m36]"
4,6,0.5,"[sc, m06, m12, m18, m24, m36]"
...,...,...,...
3544,7120,0.5,[sc]
3545,7121,0.5,[sc]
3546,7122,0.0,[sc]
3547,7123,0.0,[sc]


In [18]:
# Expected visit schedule, in order: 'sc' followed by m06 ... m204 in 6-month steps
a = ['sc', 'm06', 'm12', 'm18', 'm24', 'm30', 'm36', 'm42', 'm48', 'm54', 'm60', 'm66', 'm72', 'm78', 'm84', 'm90', 'm96', 'm102', 'm108',
     'm114', 'm120', 'm126', 'm132', 'm138', 'm144', 'm150', 'm156', 'm162', 'm168', 'm174', 'm180', 'm186', 'm192', 'm198', 'm204']


def find_matching_values(visit_codes, schedule=None):
    """Return the leading run of ``visit_codes`` that follows the expected schedule.

    The codes are compared position by position with the schedule and the scan
    stops at the first mismatch (or when either sequence runs out), so only an
    unbroken prefix is returned; a later code that is on the schedule again is
    not picked up.

    Args:
        visit_codes: A patient's visit codes, in the order they should be checked.
        schedule: Expected visit codes, in order. Defaults to the module-level
            list ``a``.

    Returns:
        list: The matching prefix of ``visit_codes``; empty when the very first
        code already differs from the schedule (or when either input is empty).
    """
    # Resolve the default at call time so a re-run cell that redefines `a` is honoured
    expected_codes = a if schedule is None else schedule

    result = []
    # zip() pairs the i-th actual visit with the i-th expected visit
    for m_value, a_value in zip(visit_codes, expected_codes):
        if m_value != a_value:
            # First gap in the schedule: everything after it is ignored
            break
        result.append(m_value)
    return result

# For every patient, keep the leading run of visits that follows the schedule
# (element-wise call of find_matching_values on each visit list)
df['consequent_visits'] = df['Visit Code'].map(find_matching_values)


In [21]:
# A patient whose first visit code is off-schedule ends up with an empty list;
# replace those empty lists with the placeholder ['f']
df['consequent_visits'] = df['consequent_visits'].apply(
    lambda visits: visits if visits else ['f']
)
print(df)


      PTID  CDR Score                                         Visit Code  \
0        2        0.0  [sc, m06, m36, m60, m72, m84, m96, m108, m120,...   
1        3        1.0                                [sc, m06, m12, m24]   
2        4        0.5                           [sc, m06, m12, m18, m36]   
3        5        0.0                           [sc, m06, m12, m24, m36]   
4        6        0.5                      [sc, m06, m12, m18, m24, m36]   
...    ...        ...                                                ...   
3544  7120        0.5                                               [sc]   
3545  7121        0.5                                               [sc]   
3546  7122        0.0                                               [sc]   
3547  7123        0.0                                               [sc]   
3548  7125        0.5                                               [sc]   

             consequent_visits  
0                    [sc, m06]  
1               [sc, 

In [22]:
# Save the per-patient frame (first CDR score, all visits, consecutive visits) to CSV.
# The file is written before the Colab import so that it is produced even when the
# notebook runs outside Google Colab.
# NOTE(review): the list-valued columns are written as their string representation,
# so they need to be parsed again when the CSV is read back.
output_path = 'filtered_data.csv'
df.to_csv(output_path, index=False)

# Trigger a browser download only when the Colab helper is available; elsewhere the
# file simply stays in the working directory.
try:
    from google.colab import files
except ImportError:
    print(f"Saved {output_path}; google.colab is not available, skipping the browser download.")
else:
    files.download(output_path)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>