In [1]:
# Switch to the project root so that the relative "data/..." paths used in later cells resolve.
# NOTE(review): absolute path, presumably a Colab Google Drive mount — adjust when running elsewhere.
%cd /content/drive/MyDrive/analysis_learning_PERG/

/content/drive/MyDrive/analysis_learning_PERG


# Import Libraries

In [8]:
import os
import pandas as pd
import numpy as np
import math
import shutil

# Data Cleaning

In [67]:
# Participant metadata table; the path is relative to the project root.
participants_info_path = "data/raw_data/csv/participants_info.csv"

# index_col=False stops pandas from using the first column as the index, so the
# file's unnamed leading column is kept as a regular "Unnamed: 0" column.
participants_info = pd.read_csv(participants_info_path, index_col=False)

# Preview the first rows as a sanity check.
participants_info.head()

Unnamed: 0,id_record,date,age_years,sex,diagnosis1,diagnosis2,diagnosis3,va_re_logMar,va_le_logMar,unilateral,rep_record,comments
0,1,2016-09-15,13,Male,Normal,,,-0.08,0.06,,,
1,2,2005-09-15,13,Female,Congenital stationary night blindness,,,0.18,0.16,,,
2,3,2019-08-08,49,Female,Orbital ischemia,Systemic disorder with ocular manifestations,,0.26,0.0,,Id:0329 - Id:0154 - Id:0049 - Id:0271,
3,4,2004-12-16,43,Female,Retinitis pigmentosa,,,,,,,
4,5,2016-07-13,47,Female,Normal,,,0.1,0.1,,,


Some participants were recorded on more than one visit, so the first step is to group all records that belong to the same participant.

To do this, I build a dictionary (hash map) that starts empty and is filled with one entry per repeated record, mapping the repeated record's id (parsed from `rep_record`) to the `id_record` of the participant's primary record.

In [4]:
# Map every repeated-visit record id to the id_record of the first row that
# references it (that row becomes the participant's "primary" record).
rep_record_map = {}
primary_ids = set()  # mirrors rep_record_map.values() for O(1) membership tests

# Iterate over the two columns directly instead of using index labels as
# positions with .iloc, which is only correct for a default RangeIndex.
for id_record, rep_records in zip(participants_info["id_record"], participants_info["rep_record"]):
  # Only string cells carry repeat ids, e.g. "Id:0329 - Id:0154"; anything else
  # (a missing value) means the row lists no repeated records.
  if not isinstance(rep_records, str):
    continue

  for rep_record in rep_records.replace(" ", "").split("-"):
    repeated_id = int(rep_record.split(":")[1])

    # Skip ids that are already registered as a repeat (key) or that already
    # act as a primary record (value), so each group is recorded only once.
    if repeated_id not in rep_record_map and repeated_id not in primary_ids:
      rep_record_map[repeated_id] = id_record
      primary_ids.add(id_record)

To validate the repeated-record map, we can cross-check it against the information provided by [TODO: cite source of info], which states: "*During this extended timeframe, 23 individuals had multiple visits: 19 individuals had two visits each, 1 individual had three, another had four visits and two subjects had five visits each. Out of the total number of participants, 155 were female, and the age range was from 4 to 86 with a mean of 37.1 ± 18.3 years. As a part of the routine clinical evaluation, all subjects underwent diagnosis by ophthalmology specialists. Out of the total number of participants, 100 subjects showed no eye-related medical conditions and were classified as normal.*" Only the visit counts in the first sentence can be checked from the map, so the cell below tallies the number of visits per participant.

In [65]:
# Tally the number of visits per participant, keyed by primary id_record.
validate_info = {}

for primary_id in rep_record_map.values():
  # A participant not yet tallied has just the primary visit (1); every
  # repeated record mapped to that participant adds one more visit.
  validate_info[primary_id] = validate_info.get(primary_id, 1) + 1

# Report how many participants had exactly two, three, four and five visits.
visit_tally = list(validate_info.values())
for n_visits, label in ((2, "two"), (3, "three"), (4, "four"), (5, "five")):
  print(f"{visit_tally.count(n_visits)} individuals had {label} visits each")

19 individuals had two visits each
1 individuals had three visits each
1 individuals had four visits each
2 individuals had five visits each


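These counts match the visit breakdown in the statement quoted above (19 + 1 + 1 + 2 = 23 individuals with multiple visits), which confirms that the repeated-record map is consistent with it.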

# Data Preparation

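Create a separate folder for each participant that holds all of that participant's record files (including repeated visits), together with a `participant_info.csv` file containing the matching rows of the participant information table.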

In [9]:
# Output root for the per-participant folders and input folder with the raw CSV files
# (both relative to the project root).
processed_data = "data/processed_data"
raw_data = "data/raw_data/csv"

# Create the output root from Python instead of a shell `mkdir`: exist_ok=True
# makes the cell safe to re-run, and it does not depend on a POSIX shell.
os.makedirs(processed_data, exist_ok=True)

In [10]:
# Copy every raw record file into the folder of the participant it belongs to.
# Folders are named after the participant's primary id_record (zero-padded to 4 digits).
for id_record in participants_info["id_record"]:
  file_name = "{:04n}".format(id_record) + ".csv"
  csv_file = os.path.join(raw_data, file_name)

  # Repeated visits are filed under the primary record they map to;
  # every other record is filed under its own id.
  owner_id = rep_record_map.get(id_record, id_record)
  folder_name = os.path.join(processed_data, "{:04n}".format(owner_id))

  # exist_ok=True keeps the cell re-runnable and removes the requirement that
  # the primary record's row is processed before any of its repeated visits.
  os.makedirs(folder_name, exist_ok=True)

  shutil.copyfile(csv_file, os.path.join(folder_name, file_name))

In [12]:
# Invert rep_record_map: primary id_record -> list of its repeated record ids,
# kept in the order in which they were added to rep_record_map.
rev_rep_record_map = {}

for repeat_id, primary_id in rep_record_map.items():
  rev_rep_record_map.setdefault(primary_id, []).append(repeat_id)

In [53]:
# Write one participant_info.csv per participant folder containing the primary
# record's row followed by the rows of all repeated visits of that participant.

# Explicit id -> row position lookup. This replaces the `rep_record - 1`
# arithmetic, which is only correct when id_record equals row position + 1
# and otherwise silently selects the wrong (or an empty) row.
position_by_id = {rec_id: pos for pos, rec_id in enumerate(participants_info["id_record"])}

for id_record in participants_info["id_record"]:
  repeat_ids = rev_rep_record_map.get(id_record)

  # Records that are only a repeated visit live in their primary record's
  # folder and are written as part of that participant's file, so skip them.
  if repeat_ids is None and id_record in rep_record_map:
    continue

  # Primary row first, then the repeated visits in the order they were registered.
  # A repeat id that has no row in the table raises KeyError instead of being
  # mapped to an unrelated row.
  visit_ids = [id_record] + (repeat_ids or [])
  temp_df = participants_info.iloc[[position_by_id[visit_id] for visit_id in visit_ids]]

  folder_name = os.path.join(processed_data, "{:04n}".format(id_record))
  temp_df.to_csv(os.path.join(folder_name, "participant_info.csv"))