In [1]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
from collections import defaultdict
import traceback
import os
import json

In [3]:
linux = False
master_path = "s3://pheno-master-data-collection-v1-prod-us-east-1"

In [3]:
# the source is here:https://docs.google.com/spreadsheets/d/1l8gf56ifdBwAptohozsdVidx9dUS-yNx2qxamYsooTw/edit?usp=sharing
if linux: # should move to s3
    file_path = '/home/ec2-user/data/datasets_feature_sets_v1-1.csv'
    df_feature_set = pd.read_csv(file_path)
else: 
    df_feature_set = pd.read_csv('/home/jovyan/projects/data/datasets_feature_sets_v1-1.csv')
    
df_feature_set[['dataset_id', 'dataset']] = df_feature_set[['dataset_id', 'dataset']].ffill()
df_feature_set_ready = df_feature_set.query('status == "READY on v1.1 Master bucket"')
df_feature_set_ready.head()

In [8]:
print(df_feature_set_ready.shape)
df = df_feature_set_ready.query('@pd.notna(count_parquet_relative_path) and @pd.notna(count_field_name)') # in @none
print(df.shape)
df_no_dup = df.drop_duplicates(['dataset', 'feature_set', 'count_parquet_relative_path', 'count_field_name'])
print(df_no_dup.shape)

(60, 19)


(52, 19)

In [13]:
count_dict = defaultdict(dict)
for index, row in df_no_dup.iterrows():
    try:
        # if row['dataset'] == 'diet_logging':
        #     continue
        dataset = row['dataset']
        feature_set = row['feature_set']
        
        #read parquet file
        full_path = f"{master_path}{row['count_parquet_relative_path']}"
        df_parquet = pd.read_parquet(full_path).reset_index()
        
        if dataset=='medications':
            row['count_field_name'] = 'medication'
        res_dict =  {
            "df_parquet_shape": df_parquet.shape[0],
            "count_field_name": row['count_field_name'],
            "count_parquet_relative_path": row['count_parquet_relative_path'],
            "particioant_id_nunique": df_parquet['participant_id'].nunique()
            }
        
        max_col = ''
        max_count = 0
        
        for col in row['count_field_name'].split(','):
            # remove white space
            col = col.strip()
            if col not in df_parquet.columns:
                print (f"ERROR: {col} not in {full_path} : {df_parquet.columns}")
                
            no_na_df = df_parquet.dropna(subset=[col]).copy()
            
            if dataset != 'events': 
                date_col = 'collection_date'
            else: 
                date_col = 'research_stage_date'
                
            if date_col not in no_na_df.columns:
                date_col = 'collection_timestamp'
            if date_col not in no_na_df.columns:
                print (f"ERROR: {date_col} not in {full_path}")
                first_date = None
                last_date = None
            else:   
                first_date = no_na_df[date_col].min() # first date
                last_date = no_na_df[date_col].max() # last date
            
            nunique_participants_not_na = no_na_df.participant_id.nunique() # number of unique participants
            count_col_notna = no_na_df.shape[0] # number of not na entries
            
            if no_na_df.shape[0] > max_count:
                max_count = no_na_df.shape[0]
                max_col = col
                
                
            res_dict[col] = {
                "count_col_notna": count_col_notna,  
                "nunique_participants_not_na" : nunique_participants_not_na,
                "first_date": first_date,
                "last_date": last_date,
            }
        del df_parquet
        res_dict['max_col'] = max_col
        count_dict[f"{int(row['dataset_id'])}-{dataset}"][feature_set] = res_dict
        del no_na_df
        
        
    except Exception as e: 
        print(row['dataset'], full_path)
        print(traceback.format_exc())
    

# with open('count_dict.json', 'w') as f:
#             json.dump(count_dict, f)

ERROR: collection_timestamp not in s3://pheno-master-data-collection-v1-prod-us-east-1/population/population.parquet
ERROR: collection_timestamp not in s3://pheno-master-data-collection-v1-prod-us-east-1/fundus/microvasculature.parquet
ERROR: collection_timestamp not in s3://pheno-master-data-collection-v1-prod-us-east-1/fundus/microvasculature.parquet
ERROR: collection_timestamp not in s3://pheno-master-data-collection-v1-prod-us-east-1/sleep/hrv.parquet
ERROR: collection_timestamp not in s3://pheno-master-data-collection-v1-prod-us-east-1/sleep/hrv.parquet
ERROR: collection_timestamp not in s3://pheno-master-data-collection-v1-prod-us-east-1/sleep/hrv.parquet
ERROR: collection_timestamp not in s3://pheno-master-data-collection-v1-prod-us-east-1/sleep/hrv.parquet
ERROR: collection_timestamp not in s3://pheno-master-data-collection-v1-prod-us-east-1/sleep/raw/hrv.parquet
ERROR: collection_timestamp not in s3://pheno-master-data-collection-v1-prod-us-east-1/sleep/raw/hrv.parquet
ERROR: 

In [15]:

full_data = list()
for k, v in count_dict.items():
    for kk, vv in v.items():
        data = list()
        data.append(k)
        data.append(kk)
        col = vv['max_col']
        data.append(vv[col]["count_col_notna"])
        data.append(vv[col]["nunique_participants_not_na"])
        data.append(vv[col]["first_date"])
        data.append(vv[col]["last_date"])
        full_data.append(data)
        


In [16]:
df_res = pd.DataFrame(full_data)
df_res.columns = ['dataset', 'feature_set', 'count_col_notna', 'nunique_participants_not_na', 'first_date', 'last_date']
df_res.head()


Unnamed: 0,dataset,feature_set,count_col_notna,nunique_participants_not_na,first_date,last_date
0,0-population,population,11179,11179,,
1,1-events,events,23300,11179,2018-12-05 00:00:00,2023-03-19 00:00:00
2,2-anthropometrics,anthropometrics,13568,10781,2018-11-22 00:00:00,2023-02-07 00:00:00
3,3-fundus,fundus,7079,7049,2021-02-17 00:00:00,2022-12-04 00:00:00
4,3-fundus,microvasculature,7079,7049,,


In [10]:
# df_res.to_csv('s3://ds-users/mariag/pheno_master_data_collection_v1_1_count.csv', index=False)
df_res = pd.read_csv('s3://ds-users/mariag/pheno_master_data_collection_v1_1_count.csv')

In [11]:
df_res.query('dataset == "17-cgm"')

Unnamed: 0,dataset,feature_set,count_col_notna,nunique_participants_not_na,first_date,last_date
46,17-cgm,cgm,10087,10011,2019-01-07 00:14:00+02:00,2023-03-13 00:08:00+02:00
47,17-cgm,timeseries,10087,10011,2019-01-07 00:14:00+02:00,2023-03-13 00:08:00+02:00
48,17-cgm,iglu,10087,10011,,
49,17-cgm,iglu_daily,112023,10011,2019-01-07 00:00:00,2023-03-24 00:00:00


In [12]:
dict_dataset = {
    'population': 'events',
    'microvasculature': 'fundus',
    'hrv': 'sleep',
    'hrv_raw': 'sleep',
    'pca': 'human_genetics',
    'iglu': 'iglu_daily'
}
for k, v in dict_dataset.items():
    df_res.loc[df_res['feature_set']== k, 'first_date'] =  df_res.loc[df_res['feature_set']== v, 'first_date'].values
    df_res.loc[df_res['feature_set']== k, 'last_date'] = df_res.loc[df_res['feature_set']== v, 'last_date'].values

df_res.head()

Unnamed: 0,dataset,feature_set,count_col_notna,nunique_participants_not_na,first_date,last_date
0,0-population,population,11179,11179,2018-12-05 00:00:00,2023-03-19 00:00:00
1,1-events,events,23300,11179,2018-12-05 00:00:00,2023-03-19 00:00:00
2,2-anthropometrics,anthropometrics,13568,10781,2018-11-22 00:00:00,2023-02-07 00:00:00
3,3-fundus,fundus,7079,7049,2021-02-17 00:00:00,2022-12-04 00:00:00
4,3-fundus,microvasculature,7079,7049,2021-02-17 00:00:00,2022-12-04 00:00:00


In [13]:
df_res.query('dataset == "17-cgm"')

Unnamed: 0,dataset,feature_set,count_col_notna,nunique_participants_not_na,first_date,last_date
46,17-cgm,cgm,10087,10011,2019-01-07 00:14:00+02:00,2023-03-13 00:08:00+02:00
47,17-cgm,timeseries,10087,10011,2019-01-07 00:14:00+02:00,2023-03-13 00:08:00+02:00
48,17-cgm,iglu,10087,10011,2019-01-07 00:00:00,2023-03-24 00:00:00
49,17-cgm,iglu_daily,112023,10011,2019-01-07 00:00:00,2023-03-24 00:00:00


In [4]:
import datetime

# Create a function to convert the date string to a datetime object
def convert_date_string(date_string):
    if date_string is None or date_string == 'None' or date_string == 'NaT' or date_string == 'nan':
        return None
    else:
        date_string_without_timezone = date_string[:19]
        datetime_object = datetime.datetime.strptime(date_string_without_timezone, '%Y-%m-%d %H:%M:%S')
        date = datetime_object.date()
        return date
    
    


In [15]:
df_convert = df_res.copy()


df_convert["first_date"] = df_convert["first_date"].apply(str)
df_convert["last_date"] = df_convert["last_date"].apply(str)

# Convert all the dates in the DataFrame to datetime objects
df_convert["first_date"] = df_convert["first_date"].apply(convert_date_string)
df_convert["last_date"] = df_convert["last_date"].apply(convert_date_string)




In [16]:
df_convert.query('dataset == "17-cgm"')

Unnamed: 0,dataset,feature_set,count_col_notna,nunique_participants_not_na,first_date,last_date
46,17-cgm,cgm,10087,10011,2019-01-07,2023-03-13
47,17-cgm,timeseries,10087,10011,2019-01-07,2023-03-13
48,17-cgm,iglu,10087,10011,2019-01-07,2023-03-24
49,17-cgm,iglu_daily,112023,10011,2019-01-07,2023-03-24


In [26]:
# df_convert.to_csv('s3://ds-users/mariag/pheno_master_data_collection_v1_1_count_converted.csv', index=False)
# df_res = pd.read_csv('s3://ds-users/mariag/pheno_master_data_collection_v1_1_count_converted.csv')

In [5]:
# Reload the data

if linux:
    df = pd.read_csv('s3://ds-users/mariag/pheno_master_data_collection_v1_1_count_converted.csv')
else: 
    profile_name = 'ds'
    df = pd.read_csv('s3://ds-users/mariag/pheno_master_data_collection_v1_1_count_converted.csv',  storage_options=dict(profile=profile_name))


In [6]:
df.head()

Unnamed: 0,dataset,feature_set,count_col_notna,nunique_participants_not_na,first_date,last_date
0,0-population,population,11179,11179,2018-12-05,2023-03-19
1,1-events,events,23300,11179,2018-12-05,2023-03-19
2,2-anthropometrics,anthropometrics,13568,10781,2018-11-22,2023-02-07
3,3-fundus,fundus,7079,7049,2021-02-17,2022-12-04
4,3-fundus,microvasculature,7079,7049,2021-02-17,2022-12-04


In [7]:

# Function to split 'dataset' column
def split_dataset(value):
    try:
        id_, dataset = value.split('-', 1)
    except ValueError:
        id_ = None
        dataset = value
    return pd.Series([id_, dataset])


In [8]:

# Apply the function
df[['id', 'dataset']] = df['dataset'].apply(split_dataset)


In [9]:
df.head()

Unnamed: 0,dataset,feature_set,count_col_notna,nunique_participants_not_na,first_date,last_date,id
0,population,population,11179,11179,2018-12-05,2023-03-19,0
1,events,events,23300,11179,2018-12-05,2023-03-19,1
2,anthropometrics,anthropometrics,13568,10781,2018-11-22,2023-02-07,2
3,fundus,fundus,7079,7049,2021-02-17,2022-12-04,3
4,fundus,microvasculature,7079,7049,2021-02-17,2022-12-04,3


In [10]:
df.tail()

Unnamed: 0,dataset,feature_set,count_col_notna,nunique_participants_not_na,first_date,last_date,id
47,cgm,timeseries,10087,10011,2019-01-07,2023-03-13,17
48,cgm,iglu,10087,10011,2019-01-07,2023-03-24,17
49,cgm,iglu_daily,112023,10011,2019-01-07,2023-03-24,17
50,medications,medications,14527,10352,2018-10-18,2023-03-15,18
51,bone_density,bone_density,10130,9188,2020-06-09,2023-03-16,22


## Handle Questionnaires

In [11]:
df_res_q = dict()

In [12]:
missing_datasets = {
    '21-medical_conditions': 'medical_conditions/medical_conditions.parquet',
    '40-ukbb_questionnaire': 'ukbb_questionnaire/ukbb_questionnaire.parquet', 
    '41-ukbb_followup_questionnaire': 'ukbb_followup_questionnaire/ukbb_followup_questionnaire.parquet',
    '42-ibs_questionnaire': 'ibs_questionnaire/ibs_questionnaire.parquet',
    '43-covid_questionnaire': 'covid_questionnaire/covid_questionnaire.parquet'
}

base_path = 's3://pheno-master-data-collection-v1-prod-us-east-1'


In [13]:
def handle_survey_data(key):
    
    if linux:
        df = pd.read_parquet(os.path.join(base_path, missing_datasets[key]))
    else: 
        profile_name = 'prod'
        df = pd.read_parquet(os.path.join(base_path, missing_datasets[key]), storage_options=dict(profile=profile_name))
    
    df.reset_index(inplace=True)
    # display(df.head())
    
    print(df.shape)
    df_drop = df.drop_duplicates(['participant_id','cohort', 'research_stage',  'collection_date'])
    print(df_drop.shape)

    data_points = df_drop.shape[0]
    unique_participants = df_drop.participant_id.nunique()

    first_date = df_drop.sort_values('collection_timestamp')['collection_timestamp'].values[0]
    last_date = df_drop.sort_values('collection_timestamp')['collection_timestamp'].values[-1]

    df_res_q[key] = {
        'count_col_notna': data_points, 
        'nunique_participants_not_na': unique_participants, 
        'first_date': first_date, 
        'last_date': last_date
    }
    print(df_res_q[key])


In [14]:
for key in missing_datasets.keys():
    handle_survey_data(key)    


(10418, 255)
(10418, 255)
{'count_col_notna': 10418, 'nunique_participants_not_na': 10418, 'first_date': numpy.datetime64('2018-11-21T13:40:32.000000000'), 'last_date': numpy.datetime64('2023-04-23T05:40:57.000000000')}
(31311, 346)
(10156, 346)
{'count_col_notna': 10156, 'nunique_participants_not_na': 9139, 'first_date': numpy.datetime64('2019-01-09T15:58:09.000000000'), 'last_date': numpy.datetime64('2023-05-17T09:24:18.000000000')}
(20926, 199)
(6872, 199)
{'count_col_notna': 6872, 'nunique_participants_not_na': 5410, 'first_date': numpy.datetime64('2020-02-17T16:38:08.000000000'), 'last_date': numpy.datetime64('2023-05-17T12:06:40.000000000')}
(9600, 61)
(9423, 61)
{'count_col_notna': 9423, 'nunique_participants_not_na': 8989, 'first_date': numpy.datetime64('2019-01-09T13:48:33.000000000'), 'last_date': numpy.datetime64('2023-05-08T12:10:40.000000000')}
(8966, 21)
(8966, 21)
{'count_col_notna': 8966, 'nunique_participants_not_na': 8744, 'first_date': numpy.datetime64('2021-04-26T06

In [15]:
res_q = pd.DataFrame.from_dict(df_res_q).T
res_q = res_q.reset_index()

res_q["first_date"] = res_q["first_date"].apply(str)
res_q["last_date"] = res_q["last_date"].apply(str)

# Convert all the dates in the DataFrame to datetime objects
res_q["first_date"] = res_q["first_date"].apply(convert_date_string)
res_q["last_date"] = res_q["last_date"].apply(convert_date_string)


res_q[['id', 'dataset']] = res_q['index'].apply(split_dataset)
res_q['feature_set'] = res_q['dataset']
res_q.drop('index', axis=1, inplace=True)
res_q = res_q[['dataset', 'feature_set', 'count_col_notna', 'nunique_participants_not_na', 'first_date', 'last_date', 'id']]


In [16]:
res_q.head()


Unnamed: 0,dataset,feature_set,count_col_notna,nunique_participants_not_na,first_date,last_date,id
0,medical_conditions,medical_conditions,10418,10418,2018-11-21,2023-04-23,21
1,ukbb_questionnaire,ukbb_questionnaire,10156,9139,2019-01-09,2023-05-17,40
2,ukbb_followup_questionnaire,ukbb_followup_questionnaire,6872,5410,2020-02-17,2023-05-17,41
3,ibs_questionnaire,ibs_questionnaire,9423,8989,2019-01-09,2023-05-08,42
4,covid_questionnaire,covid_questionnaire,8966,8744,2021-04-26,2023-05-17,43


## Back To Parsing

In [17]:
df = pd.concat([df, res_q])
df.head()

Unnamed: 0,dataset,feature_set,count_col_notna,nunique_participants_not_na,first_date,last_date,id
0,population,population,11179,11179,2018-12-05,2023-03-19,0
1,events,events,23300,11179,2018-12-05,2023-03-19,1
2,anthropometrics,anthropometrics,13568,10781,2018-11-22,2023-02-07,2
3,fundus,fundus,7079,7049,2021-02-17,2022-12-04,3
4,fundus,microvasculature,7079,7049,2021-02-17,2022-12-04,3


In [18]:
df.tail()

Unnamed: 0,dataset,feature_set,count_col_notna,nunique_participants_not_na,first_date,last_date,id
0,medical_conditions,medical_conditions,10418,10418,2018-11-21,2023-04-23,21
1,ukbb_questionnaire,ukbb_questionnaire,10156,9139,2019-01-09,2023-05-17,40
2,ukbb_followup_questionnaire,ukbb_followup_questionnaire,6872,5410,2020-02-17,2023-05-17,41
3,ibs_questionnaire,ibs_questionnaire,9423,8989,2019-01-09,2023-05-08,42
4,covid_questionnaire,covid_questionnaire,8966,8744,2021-04-26,2023-05-17,43


In [19]:
# Function to format the numbers
def format_number(x):
    return f'{int(x):03d}'

# Apply the function to the desired column
df['id'] = df['id'].apply(format_number)
 
df.head()


Unnamed: 0,dataset,feature_set,count_col_notna,nunique_participants_not_na,first_date,last_date,id
0,population,population,11179,11179,2018-12-05,2023-03-19,0
1,events,events,23300,11179,2018-12-05,2023-03-19,1
2,anthropometrics,anthropometrics,13568,10781,2018-11-22,2023-02-07,2
3,fundus,fundus,7079,7049,2021-02-17,2022-12-04,3
4,fundus,microvasculature,7079,7049,2021-02-17,2022-12-04,3


In [20]:
df.tail()


Unnamed: 0,dataset,feature_set,count_col_notna,nunique_participants_not_na,first_date,last_date,id
0,medical_conditions,medical_conditions,10418,10418,2018-11-21,2023-04-23,21
1,ukbb_questionnaire,ukbb_questionnaire,10156,9139,2019-01-09,2023-05-17,40
2,ukbb_followup_questionnaire,ukbb_followup_questionnaire,6872,5410,2020-02-17,2023-05-17,41
3,ibs_questionnaire,ibs_questionnaire,9423,8989,2019-01-09,2023-05-08,42
4,covid_questionnaire,covid_questionnaire,8966,8744,2021-04-26,2023-05-17,43


In [21]:
# df.sort_values('id', inplace=True)
# df.to_csv('s3://ds-users/mariag/pheno_master_data_collection_v1_1_count_converted_with_q_featyre_sets.csv', index=False)

In [22]:
# df.drop_duplicates(['dataset'], keep='first', inplace=True)
df = df.query('dataset==feature_set')
df.sort_values('id', inplace=True)
df.head()

Unnamed: 0,dataset,feature_set,count_col_notna,nunique_participants_not_na,first_date,last_date,id
0,population,population,11179,11179,2018-12-05,2023-03-19,0
1,events,events,23300,11179,2018-12-05,2023-03-19,1
2,anthropometrics,anthropometrics,13568,10781,2018-11-22,2023-02-07,2
3,fundus,fundus,7079,7049,2021-02-17,2022-12-04,3
6,liver_ultrasound,liver_ultrasound,30997,9000,2020-02-20,2023-05-14,4


In [23]:
df.tail()

Unnamed: 0,dataset,feature_set,count_col_notna,nunique_participants_not_na,first_date,last_date,id
51,bone_density,bone_density,10130,9188,2020-06-09,2023-03-16,22
1,ukbb_questionnaire,ukbb_questionnaire,10156,9139,2019-01-09,2023-05-17,40
2,ukbb_followup_questionnaire,ukbb_followup_questionnaire,6872,5410,2020-02-17,2023-05-17,41
3,ibs_questionnaire,ibs_questionnaire,9423,8989,2019-01-09,2023-05-08,42
4,covid_questionnaire,covid_questionnaire,8966,8744,2021-04-26,2023-05-17,43


In [38]:
if linux:
    # df.to_csv('s3://ds-users/mariag/pheno_master_data_collection_v1_1_count_converted_with_q.csv', index=False)
    df = pd.read_csv('s3://ds-users/mariag/pheno_master_data_collection_v1_1_count_converted_with_q.csv')
else: 
    profile_name = 'ds'
    # df.to_csv('s3://ds-users/mariag/pheno_master_data_collection_v1_1_count_converted_with_q.csv', index=False, storage_options=dict(profile=profile_name))
    df = pd.read_csv('s3://ds-users/mariag/pheno_master_data_collection_v1_1_count_converted_with_q.csv', storage_options=dict(profile=profile_name))
    
df['id'] = df['id'].apply(format_number)

In [39]:

# Reorder the columns
df = df[['id', 'dataset', 'feature_set', 'count_col_notna', 'nunique_participants_not_na', 'first_date', 'last_date']]

# Combine 'count_col_notna' and 'nunique_participants_not_na' columns
df['data points (participants)'] = df['count_col_notna'].astype(str) + " (" + df['nunique_participants_not_na'].astype(str) + ")"

# Drop the original columns
df = df.drop(columns=['count_col_notna', 'nunique_participants_not_na'])
df.head()

Unnamed: 0,id,dataset,feature_set,first_date,last_date,data points (participants)
0,0,population,population,2018-12-05,2023-03-19,11179 (11179)
1,1,events,events,2018-12-05,2023-03-19,23300 (11179)
2,2,anthropometrics,anthropometrics,2018-11-22,2023-02-07,13568 (10781)
3,3,fundus,fundus,2021-02-17,2022-12-04,7079 (7049)
4,4,liver_ultrasound,liver_ultrasound,2020-02-20,2023-05-14,30997 (9000)


In [40]:
df.tail()

Unnamed: 0,id,dataset,feature_set,first_date,last_date,data points (participants)
20,22,bone_density,bone_density,2020-06-09,2023-03-16,10130 (9188)
21,40,ukbb_questionnaire,ukbb_questionnaire,2019-01-09,2023-05-17,10156 (9139)
22,41,ukbb_followup_questionnaire,ukbb_followup_questionnaire,2020-02-17,2023-05-17,6872 (5410)
23,42,ibs_questionnaire,ibs_questionnaire,2019-01-09,2023-05-08,9423 (8989)
24,43,covid_questionnaire,covid_questionnaire,2021-04-26,2023-05-17,8966 (8744)


In [41]:

# Reorder the columns
df = df[['id', 'dataset', 'data points (participants)', 'first_date', 'last_date']]#'feature_set',

# Remove '_' from column names
df.columns = df.columns.str.replace('_', ' ')

# Add 'tabular data' column with "✓" for all rows
df['tabular data'] = '✓'

# Add 'time series data' column with "✓" for specified rows
df['time series data'] = ''
df.loc[df['dataset'] == 'diet logging', 'time series data'] = '✓'
df.loc[(df['dataset'].isin(['sleep', 'ecg', 'cgm'])), 'time series data'] = '✓'

# Add 'image data' column with "✓" for specified rows
df['image data'] = ''
df.loc[(df['dataset'] == 'fundus') , 'image data'] = '✓'



In [42]:
# Convert each value in the 'dataset' column to a link
df['dataset'] = df.apply(lambda row: f'[{row["dataset"]}](datasets/{row["id"]}-{row["dataset"]}.html)', axis=1)

# Convert the DataFrame to a markdown table
markdown_table = df.to_markdown(index=False)

In [44]:

# Save the markdown table to a .md file
with open('../datasets.md', 'w') as f:
    f.write(markdown_table)
    f.write('\n\n')
    f.write(': Available Datasets {.striped .hover tbl-colwidths="[5, 35, 20 ,20, 20, 5, 5, 5]"}')
    
# Check the first few lines of the markdown table
print('\n'.join(markdown_table.split('\n')[:10]))


|   id | dataset                                                                      | data points (participants)   | first date   | last date   | tabular data   | time series data   | image data   |
|-----:|:-----------------------------------------------------------------------------|:-----------------------------|:-------------|:------------|:---------------|:-------------------|:-------------|
|  000 | [population](datasets/000-population.html)                                   | 11179 (11179)                | 2018-12-05   | 2023-03-19  | ✓              |                    |              |
|  001 | [events](datasets/001-events.html)                                           | 23300 (11179)                | 2018-12-05   | 2023-03-19  | ✓              |                    |              |
|  002 | [anthropometrics](datasets/002-anthropometrics.html)                         | 13568 (10781)                | 2018-11-22   | 2023-02-07  | ✓              |                    |           