# Deep learning echocardiograms: cleaning tabular data

In [1]:
from tensorflow import keras
import tensorflow.keras.utils as utils

import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2

np.set_printoptions(suppress=True) # Suppress scientific notation where possible
pd.set_option('display.float_format', lambda x: '%.5f' % x)

## Import & clean video_df
>- Each row represents a video file
>- Videos are all 112 x 112 pixels

In [2]:
video_df = pd.read_csv("EchoNet-Dynamic/FileList.csv")
video_df.head()

Unnamed: 0,FileName,EF,ESV,EDV,FrameHeight,FrameWidth,FPS,NumberOfFrames,Split
0,0X100009310A3BD7FC,78.49841,14.88137,69.21053,112,112,50,174,VAL
1,0X1002E8FBACD08477,59.10199,40.38388,98.74288,112,112,50,215,TRAIN
2,0X1005D03EED19C65B,62.3638,14.26778,37.90973,112,112,50,104,TRAIN
3,0X10075961BC11C88E,54.5451,33.14308,72.91421,112,112,55,122,TRAIN
4,0X10094BA0A028EAC3,24.88774,127.58194,169.85502,112,112,52,207,VAL


In [3]:
# 10,030 video files

video_df.shape

(10030, 9)

In [4]:
video_df = video_df.set_index('FileName')
video_df.head()

Unnamed: 0_level_0,EF,ESV,EDV,FrameHeight,FrameWidth,FPS,NumberOfFrames,Split
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0X100009310A3BD7FC,78.49841,14.88137,69.21053,112,112,50,174,VAL
0X1002E8FBACD08477,59.10199,40.38388,98.74288,112,112,50,215,TRAIN
0X1005D03EED19C65B,62.3638,14.26778,37.90973,112,112,50,104,TRAIN
0X10075961BC11C88E,54.5451,33.14308,72.91421,112,112,55,122,TRAIN
0X10094BA0A028EAC3,24.88774,127.58194,169.85502,112,112,52,207,VAL


## Importing & cleaning tracings_df
>- Each video has 2 sets of tracings, one at end diastole (used to calculate end-diastolic volume, EDV) and one at end systole (used to calculate end-systolic volume, ESV)
>- 10030 videos x 2 volumes/video x 21 measurements each = 421,260 measurements (each a row of this df)

In [5]:
tracings_df = pd.read_csv("EchoNet-Dynamic/VolumeTracings.csv")
tracings_df.head()

Unnamed: 0,FileName,X1,Y1,X2,Y2,Frame
0,0X100009310A3BD7FC.avi,51.26042,15.34896,64.93229,69.125,46
1,0X100009310A3BD7FC.avi,50.03761,17.16784,53.36722,16.32133,46
2,0X100009310A3BD7FC.avi,49.15738,20.40763,57.09055,18.39072,46
3,0X100009310A3BD7FC.avi,48.53817,23.58105,59.99734,20.66771,46
4,0X100009310A3BD7FC.avi,47.91897,26.75448,62.90413,22.94469,46


### Initial exploration/cleaning 

In [6]:
# actual df has slightly more rows than expected based on above calculation

tracings_df.shape

(425010, 6)

In [7]:
# remove ".avi" from file name to match video_df

tracings_df['FileName'] = tracings_df['FileName'].apply(lambda x: x[:-4])

In [8]:
# tracing_df contains 10,025 videos, 5 fewer than video_df
# each video has exactly 2 frames that were used for tracing

grouped_tracings_df = tracings_df.groupby('FileName').nunique().sort_values(by='Frame')
grouped_tracings_df

Unnamed: 0_level_0,X1,Y1,X2,Y2,Frame
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0X100009310A3BD7FC,41,42,41,42,2
0X601A0FE2BA2F9F68,40,42,40,42,2
0X601B706CC84025BD,42,42,40,42,2
0X602005216EB0FAB3,42,42,40,42,2
0X6020D0C5C256684D,42,42,42,42,2
...,...,...,...,...,...
0X3869C8E146FBD123,42,42,42,42,2
0X386DAB52217592A2,42,42,42,42,2
0X386E38E6214B10A1,42,42,42,42,2
0X3860318B66CC8557,42,42,39,42,2


In [9]:
# some frames have way over 21 measurements

tracings_df.groupby(['FileName', 'Frame']).count().reset_index() \
    .sort_values(by=['X1'], ascending=False).head(10)

Unnamed: 0,FileName,Frame,X1,Y1,X2,Y2
11951,0X57AF4D24B154C573,15,168,168,168,168
6254,0X35A5E9C9075E56EE,44,147,147,147,147
6184,0X354B37A25C64276F,31,105,105,105,105
15781,0X6E02E0F24F63EFD7,121,105,105,105,105
19090,0XA20EE6C5B1F48CB,95,84,84,84,84
8772,0X44C18287CA978438,51,84,84,84,84
9000,0X46024CC33D00D4C6,33,84,84,84,84
9001,0X46024CC33D00D4C6,48,84,84,84,84
10505,0X4EA078CC4E65B6A3,61,84,84,84,84
14393,0X65E605F203321860,53,84,84,84,84


### Identifying and handling nulls 
>- Going into this section, `video_df` had 10,030 videos and `tracing_df` had 10,025 videos  
>- By the end, both dfs will have the same 10,024 videos

In [10]:
# try joining grouped version of tracings_df with video_df to identify discrepancies

merged_data = pd.merge(grouped_tracings_df, video_df, on='FileName', how='outer')
merged_data.head()

Unnamed: 0_level_0,X1,Y1,X2,Y2,Frame,EF,ESV,EDV,FrameHeight,FrameWidth,FPS,NumberOfFrames,Split
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0X100009310A3BD7FC,41.0,42.0,41.0,42.0,2.0,78.49841,14.88137,69.21053,112.0,112.0,50.0,174.0,VAL
0X601A0FE2BA2F9F68,40.0,42.0,40.0,42.0,2.0,34.52831,86.42871,132.00928,112.0,112.0,50.0,167.0,TRAIN
0X601B706CC84025BD,42.0,42.0,40.0,42.0,2.0,50.03082,71.0173,142.12219,112.0,112.0,50.0,137.0,TRAIN
0X602005216EB0FAB3,42.0,42.0,40.0,42.0,2.0,59.95281,37.31474,93.17692,112.0,112.0,50.0,153.0,TRAIN
0X6020D0C5C256684D,42.0,42.0,42.0,42.0,2.0,59.43198,29.48287,72.67515,112.0,112.0,62.0,150.0,TRAIN


In [11]:
# compare to grouped_tracings_df (10,025 rows) and video_df (10,030 rows)

merged_data.shape

(10031, 13)

In [12]:
# looks like 6 nulls in the columns from tracings_df 
# and 1 null in the columns from video_df

merged_data.isnull().sum()

X1                6
Y1                6
X2                6
Y2                6
Frame             6
EF                1
ESV               1
EDV               1
FrameHeight       1
FrameWidth        1
FPS               1
NumberOfFrames    1
Split             1
dtype: int64

In [13]:
# the 6 entries missing from tracings_df are all at the end of the merged df

merged_data.tail(6)

Unnamed: 0_level_0,X1,Y1,X2,Y2,Frame,EF,ESV,EDV,FrameHeight,FrameWidth,FPS,NumberOfFrames,Split
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0X5DD5283AC43CCDD1,,,,,,62.69188,36.28439,97.256,768.0,1024.0,50.0,188.0,TEST
0X234005774F4CB5CD,,,,,,51.72474,47.06533,97.49369,768.0,1040.0,50.0,127.0,TRAIN
0X2DC68261CBCC04AE,,,,,,62.18778,26.33348,69.64277,768.0,1024.0,50.0,66.0,TRAIN
0X35291BE9AB90FB89,,,,,,62.07076,49.06434,129.35756,768.0,1024.0,50.0,208.0,TRAIN
0X6C435C1B417FDE8A,,,,,,59.63526,57.72117,142.99898,768.0,1024.0,50.0,166.0,TRAIN
0X5515B0BD077BE68A,,,,,,46.01999,27.26039,50.50091,768.0,1024.0,50.0,126.0,TRAIN


In [14]:
# dropping the 6 rows from video_df

video_df = video_df.drop(['0X5DD5283AC43CCDD1', '0X234005774F4CB5CD', '0X2DC68261CBCC04AE',
'0X35291BE9AB90FB89', '0X6C435C1B417FDE8A', '0X5515B0BD077BE68A'])
video_df.shape

(10024, 8)

In [15]:
# here is the one entry missing from video_df (but present in tracings_df)
# this entry is also missing a video file

merged_data[merged_data['EF'].isnull()]

Unnamed: 0_level_0,X1,Y1,X2,Y2,Frame,EF,ESV,EDV,FrameHeight,FrameWidth,FPS,NumberOfFrames,Split
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0X4F8859C8AB4DA9CB,42.0,42.0,42.0,42.0,2.0,,,,,,,,


In [16]:
# remove the one filename from tracings_df (42 rows)
# original length of tracings_df: 425,010

tracings_df = tracings_df[tracings_df['FileName'] != '0X4F8859C8AB4DA9CB']
tracings_df.shape

(424968, 6)

In [17]:
# 10,024 unique videos

tracings_df.groupby('FileName').nunique().shape

(10024, 5)

### Converting tracings to volumes 
Background on tracings:  
>- Each video has 2 frames in which the LV was traced by a human
>- For each frame, the first row is the long-axis and the rest are short axes

In [18]:
# add a new columnn that combines filename and frame

tracings_df['File_Frame'] = tracings_df['FileName'] + '_' + tracings_df['Frame'].astype(str)

In [19]:
tracings_df.head()

Unnamed: 0,FileName,X1,Y1,X2,Y2,Frame,File_Frame
0,0X100009310A3BD7FC,51.26042,15.34896,64.93229,69.125,46,0X100009310A3BD7FC_46
1,0X100009310A3BD7FC,50.03761,17.16784,53.36722,16.32133,46,0X100009310A3BD7FC_46
2,0X100009310A3BD7FC,49.15738,20.40763,57.09055,18.39072,46,0X100009310A3BD7FC_46
3,0X100009310A3BD7FC,48.53817,23.58105,59.99734,20.66771,46,0X100009310A3BD7FC_46
4,0X100009310A3BD7FC,47.91897,26.75448,62.90413,22.94469,46,0X100009310A3BD7FC_46


In [20]:
# calculating the euclidean distance between points 1 and 2 for each row

tracings_df['Distance'] = tracings_df.apply(lambda x: np.linalg.norm(np.array((x['X1'], x['Y1'])) - 
                                           np.array((x['X2'], x['Y2']))), axis=1)

In [21]:
tracings_df.head()

Unnamed: 0,FileName,X1,Y1,X2,Y2,Frame,File_Frame,Distance
0,0X100009310A3BD7FC,51.26042,15.34896,64.93229,69.125,46,0X100009310A3BD7FC_46,55.48678
1,0X100009310A3BD7FC,50.03761,17.16784,53.36722,16.32133,46,0X100009310A3BD7FC_46,3.43553
2,0X100009310A3BD7FC,49.15738,20.40763,57.09055,18.39072,46,0X100009310A3BD7FC_46,8.18554
3,0X100009310A3BD7FC,48.53817,23.58105,59.99734,20.66771,46,0X100009310A3BD7FC_46,11.82371
4,0X100009310A3BD7FC,47.91897,26.75448,62.90413,22.94469,46,0X100009310A3BD7FC_46,15.46187


In [22]:
# returns 1st row of each File_Frame - in this case, the long-axis measurements

long_axis = tracings_df.groupby('File_Frame').nth(0)
long_axis.head()

Unnamed: 0_level_0,FileName,X1,Y1,X2,Y2,Frame,Distance
File_Frame,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0X100009310A3BD7FC_46,0X100009310A3BD7FC,51.26042,15.34896,64.93229,69.125,46,55.48678
0X100009310A3BD7FC_61,0X100009310A3BD7FC,56.0,19.54167,61.65104,62.74479,61,43.57114
0X1002E8FBACD08477_18,0X1002E8FBACD08477,52.53646,27.01562,69.48958,64.02083,18,40.70373
0X1002E8FBACD08477_3,0X1002E8FBACD08477,48.16146,20.63542,67.84896,70.40104,3,53.51836
0X1005D03EED19C65B_24,0X1005D03EED19C65B,65.84375,20.27083,80.97396,63.65625,24,45.94799


In [23]:
# 2 per video

long_axis.shape

(20048, 7)

In [24]:
# for each File_Frame, selects the 20+ short-axis measurements (all rows except the first)

short_axis = tracings_df.groupby('File_Frame').apply(lambda group: group.iloc[1:])
short_axis.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,FileName,X1,Y1,X2,Y2,Frame,File_Frame,Distance
File_Frame,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0X100009310A3BD7FC_46,1,0X100009310A3BD7FC,50.03761,17.16784,53.36722,16.32133,46,0X100009310A3BD7FC_46,3.43553
0X100009310A3BD7FC_46,2,0X100009310A3BD7FC,49.15738,20.40763,57.09055,18.39072,46,0X100009310A3BD7FC_46,8.18554
0X100009310A3BD7FC_46,3,0X100009310A3BD7FC,48.53817,23.58105,59.99734,20.66771,46,0X100009310A3BD7FC_46,11.82371
0X100009310A3BD7FC_46,4,0X100009310A3BD7FC,47.91897,26.75448,62.90413,22.94469,46,0X100009310A3BD7FC_46,15.46187
0X100009310A3BD7FC_46,5,0X100009310A3BD7FC,47.9621,29.75951,65.81092,25.22168,46,0X100009310A3BD7FC_46,18.41663


In [25]:
# 20+ per video

short_axis.shape

(404920, 8)

In [26]:
# convert each short-axis diameter to an area

short_axis['Area'] = short_axis['Distance'] \
    .apply(lambda diameter: np.pi * np.square(diameter/2))
short_axis.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,FileName,X1,Y1,X2,Y2,Frame,File_Frame,Distance,Area
File_Frame,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0X100009310A3BD7FC_46,1,0X100009310A3BD7FC,50.03761,17.16784,53.36722,16.32133,46,0X100009310A3BD7FC_46,3.43553,9.26997
0X100009310A3BD7FC_46,2,0X100009310A3BD7FC,49.15738,20.40763,57.09055,18.39072,46,0X100009310A3BD7FC_46,8.18554,52.62412
0X100009310A3BD7FC_46,3,0X100009310A3BD7FC,48.53817,23.58105,59.99734,20.66771,46,0X100009310A3BD7FC_46,11.82371,109.79872
0X100009310A3BD7FC_46,4,0X100009310A3BD7FC,47.91897,26.75448,62.90413,22.94469,46,0X100009310A3BD7FC_46,15.46187,187.76477
0X100009310A3BD7FC_46,5,0X100009310A3BD7FC,47.9621,29.75951,65.81092,25.22168,46,0X100009310A3BD7FC_46,18.41663,266.38518


In [27]:
# take the sum of all short-axis areas in a frame

area_sums = short_axis.reset_index(drop=True).groupby('File_Frame')['Area'] \
    .apply((lambda area: np.sum(area)))
area_sums.head()

File_Frame
0X100009310A3BD7FC_46   8651.58346
0X100009310A3BD7FC_61   2484.07780
0X1002E8FBACD08477_18   4207.79729
0X1002E8FBACD08477_3    7582.30172
0X1005D03EED19C65B_24   5011.09773
Name: Area, dtype: float64

In [28]:
# create a new df that includes long axis height

vol_df = long_axis.reset_index()[['File_Frame', 'FileName', 'Distance']]
vol_df = vol_df.rename(columns={'Distance': 'Height'})
vol_df.head()

Unnamed: 0,File_Frame,FileName,Height
0,0X100009310A3BD7FC_46,0X100009310A3BD7FC,55.48678
1,0X100009310A3BD7FC_61,0X100009310A3BD7FC,43.57114
2,0X1002E8FBACD08477_18,0X1002E8FBACD08477,40.70373
3,0X1002E8FBACD08477_3,0X1002E8FBACD08477,53.51836
4,0X1005D03EED19C65B_24,0X1005D03EED19C65B,45.94799


In [29]:
# add sum of short-axis areas

vol_df['Areas'] = area_sums.reset_index()['Area']

In [30]:
# combine areas and height to get volume

vol_df['LV_Vol'] = vol_df['Height'] * vol_df['Areas'] 
vol_df.head()

Unnamed: 0,File_Frame,FileName,Height,Areas,LV_Vol
0,0X100009310A3BD7FC_46,0X100009310A3BD7FC,55.48678,8651.58346,480048.51335
1,0X100009310A3BD7FC_61,0X100009310A3BD7FC,43.57114,2484.0778,108234.1028
2,0X1002E8FBACD08477_18,0X1002E8FBACD08477,40.70373,4207.79729,171273.05805
3,0X1002E8FBACD08477_3,0X1002E8FBACD08477,53.51836,7582.30172,405792.36944
4,0X1005D03EED19C65B_24,0X1005D03EED19C65B,45.94799,5011.09773,230249.8574


In [31]:
# drop unneeded columns

vol_df = vol_df.drop(['Height', 'Areas'], axis=1)
vol_df.head()

Unnamed: 0,File_Frame,FileName,LV_Vol
0,0X100009310A3BD7FC_46,0X100009310A3BD7FC,480048.51335
1,0X100009310A3BD7FC_61,0X100009310A3BD7FC,108234.1028
2,0X1002E8FBACD08477_18,0X1002E8FBACD08477,171273.05805
3,0X1002E8FBACD08477_3,0X1002E8FBACD08477,405792.36944
4,0X1005D03EED19C65B_24,0X1005D03EED19C65B,230249.8574


In [32]:
vol_df.shape

(20048, 3)

### Combining volume data
>- In the df above, each video has 2 rows - each with one left ventricular volume
>- Here we will transform the df such that each video has 1 row containing the 2 volumes side-by-side

In [33]:
# for each FileName, returns the first of two rows 

first = vol_df.groupby('FileName')['File_Frame', "LV_Vol"].nth(0)
first.head()

  first = vol_df.groupby('FileName')['File_Frame', "LV_Vol"].nth(0)


Unnamed: 0_level_0,File_Frame,LV_Vol
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1
0X100009310A3BD7FC,0X100009310A3BD7FC_46,480048.51335
0X1002E8FBACD08477,0X1002E8FBACD08477_18,171273.05805
0X1005D03EED19C65B,0X1005D03EED19C65B_24,230249.8574
0X10075961BC11C88E,0X10075961BC11C88E_108,169583.92843
0X10094BA0A028EAC3,0X10094BA0A028EAC3_137,1020073.77475


In [34]:
# for each FileName, returns the second of two rows 

second = vol_df.groupby('FileName').nth(1)
second.head()

Unnamed: 0_level_0,File_Frame,LV_Vol
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1
0X100009310A3BD7FC,0X100009310A3BD7FC_61,108234.1028
0X1002E8FBACD08477,0X1002E8FBACD08477_3,405792.36944
0X1005D03EED19C65B,0X1005D03EED19C65B_35,85805.29037
0X10075961BC11C88E,0X10075961BC11C88E_91,373724.36275
0X10094BA0A028EAC3,0X10094BA0A028EAC3_156,775376.39354


In [35]:
# combines first and second "rows" column-wise

file_df = pd.merge(first, second, how='outer', on='FileName')
file_df.head()

Unnamed: 0_level_0,File_Frame_x,LV_Vol_x,File_Frame_y,LV_Vol_y
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0X100009310A3BD7FC,0X100009310A3BD7FC_46,480048.51335,0X100009310A3BD7FC_61,108234.1028
0X1002E8FBACD08477,0X1002E8FBACD08477_18,171273.05805,0X1002E8FBACD08477_3,405792.36944
0X1005D03EED19C65B,0X1005D03EED19C65B_24,230249.8574,0X1005D03EED19C65B_35,85805.29037
0X10075961BC11C88E,0X10075961BC11C88E_108,169583.92843,0X10075961BC11C88E_91,373724.36275
0X10094BA0A028EAC3,0X10094BA0A028EAC3_137,1020073.77475,0X10094BA0A028EAC3_156,775376.39354


In [36]:
# one row per video

file_df.shape

(10024, 4)

### Categorizing volumes as EDV and ESV
>- Each video has two left ventricular volumes in this dataset, one representing end diastolic volume (EDV) and one representing end systolic volume (ESV)
>- By definition, EDV is the larger volume and ESV the smaller

In [37]:
def get_vol_type(frame_x, vol_x, frame_y, vol_y):
    '''For each row, determines which of the two volumes (and corresponding frame) represents
    EDV and which represents ESV'''
    
    if vol_x > vol_y:
        EDV_frame = frame_x
        EDV = vol_x
        ESV_frame = frame_y
        ESV = vol_y
    if vol_x < vol_y:
        ESV_frame = frame_x
        ESV = vol_x
        EDV_frame = frame_y
        EDV = vol_y
    return EDV_frame, EDV, ESV_frame, ESV

In [38]:
# categorizing each volume as either EDV or ESV

file_df[['EDV_frame', 'EDV', 'ESV_frame', 'ESV']] = file_df.apply(lambda v: get_vol_type(v['File_Frame_x'], v['LV_Vol_x'], v['File_Frame_y'], v['LV_Vol_y']), axis=1, result_type='expand')

In [39]:
file_df.head()

Unnamed: 0_level_0,File_Frame_x,LV_Vol_x,File_Frame_y,LV_Vol_y,EDV_frame,EDV,ESV_frame,ESV
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0X100009310A3BD7FC,0X100009310A3BD7FC_46,480048.51335,0X100009310A3BD7FC_61,108234.1028,0X100009310A3BD7FC_46,480048.51335,0X100009310A3BD7FC_61,108234.1028
0X1002E8FBACD08477,0X1002E8FBACD08477_18,171273.05805,0X1002E8FBACD08477_3,405792.36944,0X1002E8FBACD08477_3,405792.36944,0X1002E8FBACD08477_18,171273.05805
0X1005D03EED19C65B,0X1005D03EED19C65B_24,230249.8574,0X1005D03EED19C65B_35,85805.29037,0X1005D03EED19C65B_24,230249.8574,0X1005D03EED19C65B_35,85805.29037
0X10075961BC11C88E,0X10075961BC11C88E_108,169583.92843,0X10075961BC11C88E_91,373724.36275,0X10075961BC11C88E_91,373724.36275,0X10075961BC11C88E_108,169583.92843
0X10094BA0A028EAC3,0X10094BA0A028EAC3_137,1020073.77475,0X10094BA0A028EAC3_156,775376.39354,0X10094BA0A028EAC3_137,1020073.77475,0X10094BA0A028EAC3_156,775376.39354


In [40]:
# drop unnecessary columns

file_df = file_df.drop(['File_Frame_y', 'LV_Vol_y','File_Frame_x', 'LV_Vol_x'], axis=1)
file_df.head()

Unnamed: 0_level_0,EDV_frame,EDV,ESV_frame,ESV
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0X100009310A3BD7FC,0X100009310A3BD7FC_46,480048.51335,0X100009310A3BD7FC_61,108234.1028
0X1002E8FBACD08477,0X1002E8FBACD08477_3,405792.36944,0X1002E8FBACD08477_18,171273.05805
0X1005D03EED19C65B,0X1005D03EED19C65B_24,230249.8574,0X1005D03EED19C65B_35,85805.29037
0X10075961BC11C88E,0X10075961BC11C88E_91,373724.36275,0X10075961BC11C88E_108,169583.92843
0X10094BA0A028EAC3,0X10094BA0A028EAC3_137,1020073.77475,0X10094BA0A028EAC3_156,775376.39354


In [41]:
# remove FileNames from 'frame' columns

file_df['EDV_frame'] = file_df.EDV_frame.str.split('_').str[1]
file_df['ESV_frame'] = file_df.ESV_frame.str.split('_').str[1]

In [42]:
# calculating EF (based on calculations on tracing data)

file_df['EF'] = ((file_df['EDV'] - file_df['ESV']) / file_df['EDV']) * 100

In [43]:
file_df.head()

Unnamed: 0_level_0,EDV_frame,EDV,ESV_frame,ESV,EF
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0X100009310A3BD7FC,46,480048.51335,61,108234.1028,77.45351
0X1002E8FBACD08477,3,405792.36944,18,171273.05805,57.79293
0X1005D03EED19C65B,24,230249.8574,35,85805.29037,62.73384
0X10075961BC11C88E,91,373724.36275,108,169583.92843,54.62326
0X10094BA0A028EAC3,137,1020073.77475,156,775376.39354,23.9882


In [44]:
# EFs are all positive and have reasonable stats

file_df.describe()

Unnamed: 0,EDV,ESV,EF
count,10024.0,10024.0,10024.0
mean,632325.06795,291726.91699,55.28218
std,405148.40441,232226.20483,12.5814
min,4246.42048,1737.23461,0.84648
25%,380320.04934,153381.03681,51.14175
50%,526355.36269,222729.42283,58.67357
75%,763349.92009,354007.07303,63.55534
max,6165758.35805,3084905.25453,94.75742


## Join two datasets 

In [45]:
# merge with original dataset (video_df)

video_df = pd.merge(video_df, file_df, how='outer', on='FileName')
video_df.head()

Unnamed: 0_level_0,EF_x,ESV_x,EDV_x,FrameHeight,FrameWidth,FPS,NumberOfFrames,Split,EDV_frame,EDV_y,ESV_frame,ESV_y,EF_y
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0X100009310A3BD7FC,78.49841,14.88137,69.21053,112,112,50,174,VAL,46,480048.51335,61,108234.1028,77.45351
0X1002E8FBACD08477,59.10199,40.38388,98.74288,112,112,50,215,TRAIN,3,405792.36944,18,171273.05805,57.79293
0X1005D03EED19C65B,62.3638,14.26778,37.90973,112,112,50,104,TRAIN,24,230249.8574,35,85805.29037,62.73384
0X10075961BC11C88E,54.5451,33.14308,72.91421,112,112,55,122,TRAIN,91,373724.36275,108,169583.92843,54.62326
0X10094BA0A028EAC3,24.88774,127.58194,169.85502,112,112,52,207,VAL,137,1020073.77475,156,775376.39354,23.9882


In [46]:
# no change in number of rows - one line per video

video_df.shape

(10024, 13)

###  Comparing calculated vs original EF values

In [47]:
# compare EF columns

video_df['EF_diff'] = video_df['EF_x'] - video_df['EF_y']
video_df.head()

Unnamed: 0_level_0,EF_x,ESV_x,EDV_x,FrameHeight,FrameWidth,FPS,NumberOfFrames,Split,EDV_frame,EDV_y,ESV_frame,ESV_y,EF_y,EF_diff
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0X100009310A3BD7FC,78.49841,14.88137,69.21053,112,112,50,174,VAL,46,480048.51335,61,108234.1028,77.45351,1.0449
0X1002E8FBACD08477,59.10199,40.38388,98.74288,112,112,50,215,TRAIN,3,405792.36944,18,171273.05805,57.79293,1.30906
0X1005D03EED19C65B,62.3638,14.26778,37.90973,112,112,50,104,TRAIN,24,230249.8574,35,85805.29037,62.73384,-0.37004
0X10075961BC11C88E,54.5451,33.14308,72.91421,112,112,55,122,TRAIN,91,373724.36275,108,169583.92843,54.62326,-0.07816
0X10094BA0A028EAC3,24.88774,127.58194,169.85502,112,112,52,207,VAL,137,1020073.77475,156,775376.39354,23.9882,0.89954


In [48]:
# most videos have only slight differences between EFs - but there are very large differences

video_df['EF_diff'].describe()

count   10024.00000
mean        0.46509
std         3.07087
min       -43.69195
25%        -0.17809
50%         0.39698
75%         1.03341
max        77.26998
Name: EF_diff, dtype: float64

In [49]:
# approx 50 videos have EF differences smaller (more negative) than -5

video_df.sort_values(by='EF_diff').head(60)

Unnamed: 0_level_0,EF_x,ESV_x,EDV_x,FrameHeight,FrameWidth,FPS,NumberOfFrames,Split,EDV_frame,EDV_y,ESV_frame,ESV_y,EF_y,EF_diff
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0X2AC09763183674E8,29.22029,283.25819,400.19686,112,112,43,131,VAL,64,2492267.09752,43,675099.34981,72.91224,-43.69195
0X67F8AC58B0BAA98,37.40685,57.27966,91.51106,112,112,50,201,TRAIN,198,2508654.95196,162,501849.37582,79.99528,-42.58843
0X13D1459C51B5C32E,34.20492,50.83527,77.26303,112,112,50,107,TRAIN,43,813667.73494,25,223249.26921,72.5626,-38.35768
0X5D38D994C2490EAE,43.75999,48.98003,87.09107,112,112,50,157,TRAIN,75,1854559.55311,90,353305.14529,80.94938,-37.18939
0X67E8F2D130F1A55,45.25871,72.90714,133.18489,112,112,50,142,TEST,20,2116110.48152,111,387580.84844,81.68428,-36.42558
0X3D8353611168F743,47.4062,29.42624,55.95002,112,112,50,173,TRAIN,43,771919.35874,58,129361.42354,83.24159,-35.83539
0X526BA02D476E9274,47.98304,42.46839,81.64335,112,112,50,151,TRAIN,93,1705673.58035,73,309556.62771,81.85136,-33.86832
0X62120814160BA377,34.60873,117.73767,180.05105,112,112,50,204,TRAIN,203,2575292.91647,166,849623.00001,67.00868,-32.39996
0X280B7441A7E287B2,39.50445,15.40601,25.46635,112,112,57,183,VAL,1,282030.03667,117,80134.35557,71.58659,-32.08213
0X500FC4E8716B0A8F,46.76611,155.84356,292.75252,112,112,40,161,TEST,97,1396555.78905,63,303163.96838,78.29203,-31.52592


In [50]:
# approx 45 videos have EF differences larger than 5

video_df.sort_values(by='EF_diff', ascending=False).head(50)

Unnamed: 0_level_0,EF_x,ESV_x,EDV_x,FrameHeight,FrameWidth,FPS,NumberOfFrames,Split,EDV_frame,EDV_y,ESV_frame,ESV_y,EF_y,EF_diff
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0X354B37A25C64276F,96.96724,9.75651,321.70357,112,112,50,71,TRAIN,31,649246.71649,38,521362.95343,19.69725,77.26998
0X65E605F203321860,86.75797,15.86398,119.80025,112,112,50,67,TRAIN,32,817932.04783,53,710043.7258,13.19038,73.5676
0X973E4A9DAADDF9F,85.25287,19.36817,131.33518,112,112,50,57,TRAIN,49,535882.24302,40,408103.8556,23.84449,61.40838
0X2AD994F98C491FA6,63.87166,30.71909,85.02768,112,112,50,150,VAL,41,319634.13363,57,309638.48782,3.12721,60.74445
0X37F9E9981E207C04,60.14113,30.8991,77.52126,112,112,50,110,TRAIN,75,1324701.2342,90,1313487.91934,0.84648,59.29465
0X36C5A15AC7FC6AAA,59.49418,53.57839,132.27329,112,112,47,229,TRAIN,74,376186.6388,58,371455.4915,1.25766,58.23652
0X411E89F93DAB415A,59.77186,61.96434,154.03231,112,112,54,109,TRAIN,48,556603.00868,69,543033.44216,2.43793,57.33393
0X28980B95F9769CE7,72.72383,13.27886,48.68301,112,112,62,149,TRAIN,3,831466.69926,21,652651.36887,21.50601,51.21782
0X5B6FCBB75BF8FCB7,70.12472,27.77603,92.97328,112,112,50,105,TRAIN,84,947419.47495,69,759183.07756,19.86833,50.25639
0X4EA078CC4E65B6A3,83.40026,7.24578,43.64996,112,112,50,87,TRAIN,61,108987.92724,42,68757.06998,36.91313,46.48712


In [51]:
# dropping videos with EF_differences less than -5

video_df = video_df[(video_df['EF_diff'] >= -5)]

In [52]:
# dropping videos with EF_differences greater than 5

video_df = video_df[(video_df['EF_diff'] <= 5)]

In [53]:
# dropped a total of 95 videos

video_df.shape

(9929, 14)

In [54]:
# confirming differences are much smaller after dropping 95 most extreme videos

video_df['EF_diff'].describe()

count   9929.00000
mean       0.44306
std        1.01231
min       -4.82546
25%       -0.17345
50%        0.39770
75%        1.02736
max        4.96701
Name: EF_diff, dtype: float64

## Prepare tabular data for video processing 

In [55]:
# create a new df with info we'll need for processing videos

video_files = video_df[['Split', 'NumberOfFrames', 'EDV_frame', 'EDV_y', 
                        'ESV_frame', 'ESV_y', 'EF_y']]

In [56]:
video_files = video_files.rename(columns={'NumberOfFrames': 'NumFrames',
                    'EDV_y': 'EDV', 'ESV_y': 'ESV', 'EF_y': 'EF'})

In [57]:
video_files = video_files.reset_index()

In [58]:
video_files.head()

Unnamed: 0,FileName,Split,NumFrames,EDV_frame,EDV,ESV_frame,ESV,EF
0,0X100009310A3BD7FC,VAL,174,46,480048.51335,61,108234.1028,77.45351
1,0X1002E8FBACD08477,TRAIN,215,3,405792.36944,18,171273.05805,57.79293
2,0X1005D03EED19C65B,TRAIN,104,24,230249.8574,35,85805.29037,62.73384
3,0X10075961BC11C88E,TRAIN,122,91,373724.36275,108,169583.92843,54.62326
4,0X10094BA0A028EAC3,VAL,207,137,1020073.77475,156,775376.39354,23.9882


In [59]:
video_files.shape

(9929, 8)

In [60]:
video_files[video_files['FileName'] == '0X1A76A1A8448B456']

Unnamed: 0,FileName,Split,NumFrames,EDV_frame,EDV,ESV_frame,ESV,EF
835,0X1A76A1A8448B456,TRAIN,206,2,369574.08029,18,155478.99363,57.93022


In [61]:
video_files[video_files['FileName'] == '0X1A349D84388BD74B']

Unnamed: 0,FileName,Split,NumFrames,EDV_frame,EDV,ESV_frame,ESV,EF
820,0X1A349D84388BD74B,TRAIN,177,44,390899.45182,61,154799.88006,60.39905


In [62]:
video_files[video_files['FileName'] == '0X1AE20B8AE3B5E9EF']

Unnamed: 0,FileName,Split,NumFrames,EDV_frame,EDV,ESV_frame,ESV,EF
860,0X1AE20B8AE3B5E9EF,TRAIN,194,100,522782.36007,118,203660.68817,61.04293


### Train-test split 

In [63]:
# split into train, val, and test according to 'split' column of original dataset

train_videos = video_files[video_files['Split'] == 'TRAIN']
val_videos = video_files[video_files['Split'] == 'VAL']
test_videos = video_files[video_files['Split'] == 'TEST']

In [64]:
train_videos["Split"] = "Train"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_videos["Split"] = "Train"


In [65]:
train_videos = train_videos.reset_index(drop=True)
train_videos.tail()

Unnamed: 0,FileName,Split,NumFrames,EDV_frame,EDV,ESV_frame,ESV,EF
7387,0XFD9464CF9E40B66,Train,142,106,477658.66681,121,137513.01165,71.21103
7388,0XFDC39A88895DE1E,Train,181,42,385448.23781,63,164745.84859,57.25863
7389,0XFDFBA5702E94ABF,Train,192,100,731582.45236,117,338787.2945,53.69117
7390,0XFDFD17B7CCFE5AF,Train,268,66,390980.34985,85,145372.0487,62.81858
7391,0XFEBEEFF93F6FEB9,Train,109,25,375613.38211,38,257292.45072,31.50072


In [66]:
# dataset is approximately 75% train

train_videos.shape

(7392, 8)

In [67]:
val_videos["Split"] = "Val"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_videos["Split"] = "Val"


In [68]:
val_videos = val_videos.reset_index(drop=True)
val_videos.tail()

Unnamed: 0,FileName,Split,NumFrames,EDV_frame,EDV,ESV_frame,ESV,EF
1268,0XFBD22BB93EE05C1,Val,193,39,510239.40745,56,206763.70398,59.47712
1269,0XFCC2AF8E23AFBA8,Val,161,41,562010.4561,57,213292.24669,62.04835
1270,0XFD06AD51C6DA6E5,Val,191,47,426498.8975,62,184570.14062,56.72436
1271,0XFDB874C30A9C923,Val,184,53,613907.96153,73,493886.86006,19.55034
1272,0XFE83FF3D3B13C3A,Val,192,49,266065.06572,67,98194.09073,63.09396


In [69]:
# approx 12.5% val

val_videos.shape

(1273, 8)

In [70]:
test_videos["Split"] = "Test"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_videos["Split"] = "Test"


In [71]:
test_videos = test_videos.reset_index(drop=True)
test_videos.tail()

Unnamed: 0,FileName,Split,NumFrames,EDV_frame,EDV,ESV_frame,ESV,EF
1259,0XECF82DC9301EE77,Test,176,91,734262.51418,111,269118.28984,63.34849
1260,0XF557EF658FD13D0,Test,179,0,485878.22632,20,232627.26213,52.12231
1261,0XF6661AF354401A5,Test,221,93,610161.98585,111,213721.12554,64.97305
1262,0XF829F634971A0F7,Test,183,48,472216.64911,66,173155.67445,63.33131
1263,0XFE6E32991136338,Test,120,31,462524.95389,45,322695.8627,30.23169


In [72]:
# approx 12.5% test

test_videos.shape

(1264, 8)

## Process videos 

In [73]:
def process_df(df):
    '''Takes in a echocardiogram video clip and extracts 3 images: 
    one at end-diastole, one at end-systole, and one in between (other).'''
    
    for row in df.itertuples():
        # load and open videos
        path = "EchoNet-Dynamic/Videos/" + row.FileName + ".avi"
        cap = cv2.VideoCapture(path)
        frames = []
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:  # if frame is read correctly, ret is true
                break
            frames.append(frame)
        
        # select and save EDV frames
        EDV_image = frames[int(row.EDV_frame) - 1] # account for 0 indexing
        EDV_path = row.Split + "_Images/EDV/" + row.FileName + '_EDV.jpg'   
        tf.keras.preprocessing.image.save_img(EDV_path, np.array(EDV_image), scale=False)
        
        # select and save ESV frames
        ESV_image = frames[int(row.ESV_frame) - 1] 
        ESV_path = row.Split + "_Images/ESV/" + row.FileName + '_ESV.jpg'  
        tf.keras.preprocessing.image.save_img(ESV_path, np.array(ESV_image), scale=False)
        
        # select and save an "Other" frame
        other_frame = np.around((int(row.EDV_frame) + int(row.ESV_frame)) / 2)
        other_image = frames[int(other_frame) - 1] 
        other_path = row.Split + "_Images/Other/" + row.FileName + '_Other.jpg'
        tf.keras.preprocessing.image.save_img(other_path, np.array(other_image), scale=False)
        
        cap.release()

In [74]:
#process_df(train_videos)

In [75]:
#process_df(val_videos)

In [76]:
#process_df(test_videos)