# NN for interpretation of echocardiograms: cleaning tabular data

In [23]:
from tensorflow import keras

import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import cv2

np.set_printoptions(suppress=True) # Suppress scientific notation where possible

## Import & clean datasets 

### Importing video_df 

In [24]:
# import list of video files and their measurements/metadata
# each video is 112 x 112 pixels

video_df = pd.read_csv("EchoNet-Dynamic/FileList.csv")
video_df.head()

Unnamed: 0,FileName,EF,ESV,EDV,FrameHeight,FrameWidth,FPS,NumberOfFrames,Split
0,0X100009310A3BD7FC,78.498406,14.881368,69.210534,112,112,50,174,VAL
1,0X1002E8FBACD08477,59.101988,40.383876,98.742884,112,112,50,215,TRAIN
2,0X1005D03EED19C65B,62.363798,14.267784,37.909734,112,112,50,104,TRAIN
3,0X10075961BC11C88E,54.545097,33.143084,72.91421,112,112,55,122,TRAIN
4,0X10094BA0A028EAC3,24.887742,127.581945,169.855024,112,112,52,207,VAL


In [25]:
# 10,030 video files

video_df.shape

(10030, 9)

In [26]:
video_df = video_df.set_index('FileName')
video_df

Unnamed: 0_level_0,EF,ESV,EDV,FrameHeight,FrameWidth,FPS,NumberOfFrames,Split
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0X100009310A3BD7FC,78.498406,14.881368,69.210534,112,112,50,174,VAL
0X1002E8FBACD08477,59.101988,40.383876,98.742884,112,112,50,215,TRAIN
0X1005D03EED19C65B,62.363798,14.267784,37.909734,112,112,50,104,TRAIN
0X10075961BC11C88E,54.545097,33.143084,72.914210,112,112,55,122,TRAIN
0X10094BA0A028EAC3,24.887742,127.581945,169.855024,112,112,52,207,VAL
...,...,...,...,...,...,...,...,...
0X234005774F4CB5CD,51.724743,47.065329,97.493690,768,1040,50,127,TRAIN
0X2DC68261CBCC04AE,62.187781,26.333478,69.642772,768,1024,50,66,TRAIN
0X35291BE9AB90FB89,62.070762,49.064338,129.357561,768,1024,50,208,TRAIN
0X6C435C1B417FDE8A,59.635257,57.721170,142.998978,768,1024,50,166,TRAIN


### Importing & cleaning tracings_df

In [27]:
#import tracing measurements

tracings_df = pd.read_csv("EchoNet-Dynamic/VolumeTracings.csv")
tracings_df.head()

Unnamed: 0,FileName,X1,Y1,X2,Y2,Frame
0,0X100009310A3BD7FC.avi,51.260417,15.348958,64.932292,69.125,46
1,0X100009310A3BD7FC.avi,50.037611,17.167841,53.367222,16.32133,46
2,0X100009310A3BD7FC.avi,49.157378,20.407629,57.090549,18.390722,46
3,0X100009310A3BD7FC.avi,48.538173,23.581055,59.997339,20.667707,46
4,0X100009310A3BD7FC.avi,47.918968,26.75448,62.904129,22.944693,46


In [28]:
# 10030 videos x 2 volumes/video x 21 measurements each = 421,260
# actual df has slightly more rows than expected based on above calculation

tracings_df.shape

(425010, 6)

In [29]:
# remove ".avi" from file name to match video_df

tracings_df['FileName'] = tracings_df['FileName'].apply(lambda x: x[:-4])

In [30]:
tracings_df.head()

Unnamed: 0,FileName,X1,Y1,X2,Y2,Frame
0,0X100009310A3BD7FC,51.260417,15.348958,64.932292,69.125,46
1,0X100009310A3BD7FC,50.037611,17.167841,53.367222,16.32133,46
2,0X100009310A3BD7FC,49.157378,20.407629,57.090549,18.390722,46
3,0X100009310A3BD7FC,48.538173,23.581055,59.997339,20.667707,46
4,0X100009310A3BD7FC,47.918968,26.75448,62.904129,22.944693,46


In [31]:
tracings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 425010 entries, 0 to 425009
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   FileName  425010 non-null  object 
 1   X1        425010 non-null  float64
 2   Y1        425010 non-null  float64
 3   X2        425010 non-null  float64
 4   Y2        425010 non-null  float64
 5   Frame     425010 non-null  int64  
dtypes: float64(4), int64(1), object(1)
memory usage: 19.5+ MB


In [32]:
# tracing_df contains 10,025 videos, 5 fewer than video_df
# each video has exactly 2 frames that were used for tracing

grouped_tracings_df = tracings_df.groupby('FileName').nunique().sort_values(by='Frame')
grouped_tracings_df

Unnamed: 0_level_0,X1,Y1,X2,Y2,Frame
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0X100009310A3BD7FC,41,42,41,42,2
0X601A0FE2BA2F9F68,40,42,40,42,2
0X601B706CC84025BD,42,42,40,42,2
0X602005216EB0FAB3,42,42,40,42,2
0X6020D0C5C256684D,42,42,42,42,2
...,...,...,...,...,...
0X3869C8E146FBD123,42,42,42,42,2
0X386DAB52217592A2,42,42,42,42,2
0X386E38E6214B10A1,42,42,42,42,2
0X3860318B66CC8557,42,42,39,42,2


In [33]:
# some frames have way over 21 measurements

tracings_df.groupby(['FileName', 'Frame']).count().reset_index() \
    .sort_values(by=['X1'], ascending=False).head(10)

Unnamed: 0,FileName,Frame,X1,Y1,X2,Y2
11951,0X57AF4D24B154C573,15,168,168,168,168
6254,0X35A5E9C9075E56EE,44,147,147,147,147
6184,0X354B37A25C64276F,31,105,105,105,105
15781,0X6E02E0F24F63EFD7,121,105,105,105,105
19090,0XA20EE6C5B1F48CB,95,84,84,84,84
8772,0X44C18287CA978438,51,84,84,84,84
9000,0X46024CC33D00D4C6,33,84,84,84,84
9001,0X46024CC33D00D4C6,48,84,84,84,84
10505,0X4EA078CC4E65B6A3,61,84,84,84,84
14393,0X65E605F203321860,53,84,84,84,84


### Identifying and handling nulls 
>- Going into this section, `video_df` had 10,030 videos and `tracing_df` had 10,025 videos  
>- By the end, both dfs will have the same 10,024 videos

In [34]:
# try joining grouped version of tracings_df with video_df to identify discrepancies

merged_data = pd.merge(grouped_tracings_df, video_df, on='FileName', how='outer')
merged_data.head()

Unnamed: 0_level_0,X1,Y1,X2,Y2,Frame,EF,ESV,EDV,FrameHeight,FrameWidth,FPS,NumberOfFrames,Split
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0X100009310A3BD7FC,41.0,42.0,41.0,42.0,2.0,78.498406,14.881368,69.210534,112.0,112.0,50.0,174.0,VAL
0X601A0FE2BA2F9F68,40.0,42.0,40.0,42.0,2.0,34.528308,86.428707,132.009277,112.0,112.0,50.0,167.0,TRAIN
0X601B706CC84025BD,42.0,42.0,40.0,42.0,2.0,50.030816,71.017299,142.122192,112.0,112.0,50.0,137.0,TRAIN
0X602005216EB0FAB3,42.0,42.0,40.0,42.0,2.0,59.952807,37.31474,93.176918,112.0,112.0,50.0,153.0,TRAIN
0X6020D0C5C256684D,42.0,42.0,42.0,42.0,2.0,59.431978,29.482871,72.67515,112.0,112.0,62.0,150.0,TRAIN


In [35]:
# compare to grouped_tracings_df (10,025 rows) and video_df (10,030 rows)

merged_data.shape

(10031, 13)

In [36]:
# looks like 6 nulls in the columns from tracings_df 
# and 1 null in the columns from video_df

merged_data.isnull().sum()

X1                6
Y1                6
X2                6
Y2                6
Frame             6
EF                1
ESV               1
EDV               1
FrameHeight       1
FrameWidth        1
FPS               1
NumberOfFrames    1
Split             1
dtype: int64

In [37]:
# the 6 entries missing from tracings_df are all at the end of the merged df

merged_data.tail(6)

Unnamed: 0_level_0,X1,Y1,X2,Y2,Frame,EF,ESV,EDV,FrameHeight,FrameWidth,FPS,NumberOfFrames,Split
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0X5DD5283AC43CCDD1,,,,,,62.691876,36.284389,97.256001,768.0,1024.0,50.0,188.0,TEST
0X234005774F4CB5CD,,,,,,51.724743,47.065329,97.49369,768.0,1040.0,50.0,127.0,TRAIN
0X2DC68261CBCC04AE,,,,,,62.187781,26.333478,69.642772,768.0,1024.0,50.0,66.0,TRAIN
0X35291BE9AB90FB89,,,,,,62.070762,49.064338,129.357561,768.0,1024.0,50.0,208.0,TRAIN
0X6C435C1B417FDE8A,,,,,,59.635257,57.72117,142.998978,768.0,1024.0,50.0,166.0,TRAIN
0X5515B0BD077BE68A,,,,,,46.019994,27.260394,50.50091,768.0,1024.0,50.0,126.0,TRAIN


In [38]:
# dropping the 6 rows from video_df

video_df = video_df.drop(['0X5DD5283AC43CCDD1', '0X234005774F4CB5CD', '0X2DC68261CBCC04AE',
'0X35291BE9AB90FB89', '0X6C435C1B417FDE8A', '0X5515B0BD077BE68A'])
video_df.shape

(10024, 8)

In [39]:
# here is the one entry missing from video_df (but present in tracings_df)
# this entry is also missing a video file

merged_data[merged_data['EF'].isnull()]

Unnamed: 0_level_0,X1,Y1,X2,Y2,Frame,EF,ESV,EDV,FrameHeight,FrameWidth,FPS,NumberOfFrames,Split
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0X4F8859C8AB4DA9CB,42.0,42.0,42.0,42.0,2.0,,,,,,,,


In [40]:
# remove the one filename from tracings_df (42 rows)
# original length of tracings_df: 425,010

tracings_df = tracings_df[tracings_df['FileName'] != '0X4F8859C8AB4DA9CB']
tracings_df.shape

(424968, 6)

In [41]:
# 10,024 unique videos

tracings_df.groupby('FileName').nunique().shape

(10024, 5)

## Converting tracings to volumes 
Background on tracings:  
>- Each video has 2 frames in which the LV was traced by a human
>- For each frame, the first row is the long-axis and the rest are short axes

In [42]:
# add a new columnn that combines filename and frame

tracings_df['File_Frame'] = tracings_df['FileName'] + '_' + tracings_df['Frame'].astype(str)

In [43]:
tracings_df.head()

Unnamed: 0,FileName,X1,Y1,X2,Y2,Frame,File_Frame
0,0X100009310A3BD7FC,51.260417,15.348958,64.932292,69.125,46,0X100009310A3BD7FC_46
1,0X100009310A3BD7FC,50.037611,17.167841,53.367222,16.32133,46,0X100009310A3BD7FC_46
2,0X100009310A3BD7FC,49.157378,20.407629,57.090549,18.390722,46,0X100009310A3BD7FC_46
3,0X100009310A3BD7FC,48.538173,23.581055,59.997339,20.667707,46,0X100009310A3BD7FC_46
4,0X100009310A3BD7FC,47.918968,26.75448,62.904129,22.944693,46,0X100009310A3BD7FC_46


In [44]:
# calculating the euclidean distance between points 1 and 2 for each row

tracings_df['Distance'] = tracings_df.apply(lambda x: np.linalg.norm(np.array((x['X1'], x['Y1'])) - 
                                           np.array((x['X2'], x['Y2']))), axis=1)

In [45]:
tracings_df.head(42)

Unnamed: 0,FileName,X1,Y1,X2,Y2,Frame,File_Frame,Distance
0,0X100009310A3BD7FC,51.260417,15.348958,64.932292,69.125,46,0X100009310A3BD7FC_46,55.486781
1,0X100009310A3BD7FC,50.037611,17.167841,53.367222,16.32133,46,0X100009310A3BD7FC_46,3.435534
2,0X100009310A3BD7FC,49.157378,20.407629,57.090549,18.390722,46,0X100009310A3BD7FC_46,8.185543
3,0X100009310A3BD7FC,48.538173,23.581055,59.997339,20.667707,46,0X100009310A3BD7FC_46,11.823708
4,0X100009310A3BD7FC,47.918968,26.75448,62.904129,22.944693,46,0X100009310A3BD7FC_46,15.461873
5,0X100009310A3BD7FC,47.962105,29.759513,65.81092,25.221679,46,0X100009310A3BD7FC_46,18.416627
6,0X100009310A3BD7FC,48.167915,32.723188,68.247043,27.618326,46,0X100009310A3BD7FC_46,20.717891
7,0X100009310A3BD7FC,48.373726,35.686864,70.385311,30.090698,46,0X100009310A3BD7FC_46,22.711824
8,0X100009310A3BD7FC,48.579537,38.650539,72.523579,32.563071,46,0X100009310A3BD7FC_46,24.705757
9,0X100009310A3BD7FC,49.014039,41.556073,74.151644,35.165156,46,0X100009310A3BD7FC_46,25.937289


In [46]:
# returns 1st row of each File_Frame - in this case, the long-axis measurements

long_axis = tracings_df.groupby('File_Frame').nth(0)
long_axis.head()

Unnamed: 0_level_0,FileName,X1,Y1,X2,Y2,Frame,Distance
File_Frame,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0X100009310A3BD7FC_46,0X100009310A3BD7FC,51.260417,15.348958,64.932292,69.125,46,55.486781
0X100009310A3BD7FC_61,0X100009310A3BD7FC,56.0,19.541667,61.651042,62.744792,61,43.57114
0X1002E8FBACD08477_18,0X1002E8FBACD08477,52.536458,27.015625,69.489583,64.020833,18,40.703733
0X1002E8FBACD08477_3,0X1002E8FBACD08477,48.161458,20.635417,67.848958,70.401042,3,53.518362
0X1005D03EED19C65B_24,0X1005D03EED19C65B,65.84375,20.270833,80.973958,63.65625,24,45.947988


In [47]:
# 2 per video

long_axis.shape

(20048, 7)

In [48]:
short_axis = tracings_df.groupby('File_Frame').apply(lambda group: group.iloc[1:])
short_axis.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,FileName,X1,Y1,X2,Y2,Frame,File_Frame,Distance
File_Frame,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0X100009310A3BD7FC_46,1,0X100009310A3BD7FC,50.037611,17.167841,53.367222,16.32133,46,0X100009310A3BD7FC_46,3.435534
0X100009310A3BD7FC_46,2,0X100009310A3BD7FC,49.157378,20.407629,57.090549,18.390722,46,0X100009310A3BD7FC_46,8.185543
0X100009310A3BD7FC_46,3,0X100009310A3BD7FC,48.538173,23.581055,59.997339,20.667707,46,0X100009310A3BD7FC_46,11.823708
0X100009310A3BD7FC_46,4,0X100009310A3BD7FC,47.918968,26.75448,62.904129,22.944693,46,0X100009310A3BD7FC_46,15.461873
0X100009310A3BD7FC_46,5,0X100009310A3BD7FC,47.962105,29.759513,65.81092,25.221679,46,0X100009310A3BD7FC_46,18.416627


In [49]:
# 20+ per video

short_axis.shape

(404920, 8)

In [50]:
# convert each short-axis diameter to an area

short_axis['Area'] = short_axis['Distance'] \
    .apply(lambda diameter: np.pi * np.square(diameter/2))
short_axis.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,FileName,X1,Y1,X2,Y2,Frame,File_Frame,Distance,Area
File_Frame,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0X100009310A3BD7FC_46,1,0X100009310A3BD7FC,50.037611,17.167841,53.367222,16.32133,46,0X100009310A3BD7FC_46,3.435534,9.269969
0X100009310A3BD7FC_46,2,0X100009310A3BD7FC,49.157378,20.407629,57.090549,18.390722,46,0X100009310A3BD7FC_46,8.185543,52.624122
0X100009310A3BD7FC_46,3,0X100009310A3BD7FC,48.538173,23.581055,59.997339,20.667707,46,0X100009310A3BD7FC_46,11.823708,109.798721
0X100009310A3BD7FC_46,4,0X100009310A3BD7FC,47.918968,26.75448,62.904129,22.944693,46,0X100009310A3BD7FC_46,15.461873,187.764765
0X100009310A3BD7FC_46,5,0X100009310A3BD7FC,47.962105,29.759513,65.81092,25.221679,46,0X100009310A3BD7FC_46,18.416627,266.385178


In [51]:
# take the sum of all short-axis areas in a frame

area_sums = short_axis.reset_index(drop=True).groupby('File_Frame')['Area'] \
    .apply((lambda area: np.sum(area)))
area_sums.head()

File_Frame
0X100009310A3BD7FC_46    8651.583458
0X100009310A3BD7FC_61    2484.077801
0X1002E8FBACD08477_18    4207.797292
0X1002E8FBACD08477_3     7582.301720
0X1005D03EED19C65B_24    5011.097730
Name: Area, dtype: float64

In [52]:
# create a new df that includes long axis height

vol_df = long_axis.reset_index()[['File_Frame', 'FileName', 'Distance']]
vol_df.head()

Unnamed: 0,File_Frame,FileName,Distance
0,0X100009310A3BD7FC_46,0X100009310A3BD7FC,55.486781
1,0X100009310A3BD7FC_61,0X100009310A3BD7FC,43.57114
2,0X1002E8FBACD08477_18,0X1002E8FBACD08477,40.703733
3,0X1002E8FBACD08477_3,0X1002E8FBACD08477,53.518362
4,0X1005D03EED19C65B_24,0X1005D03EED19C65B,45.947988


In [53]:
# rename 

vol_df = vol_df.rename(columns={'Distance': 'Height'})

In [54]:
# add sum of short-axis areas

vol_df['Areas'] = area_sums.reset_index()['Area']
vol_df.head()

Unnamed: 0,File_Frame,FileName,Height,Areas
0,0X100009310A3BD7FC_46,0X100009310A3BD7FC,55.486781,8651.583458
1,0X100009310A3BD7FC_61,0X100009310A3BD7FC,43.57114,2484.077801
2,0X1002E8FBACD08477_18,0X1002E8FBACD08477,40.703733,4207.797292
3,0X1002E8FBACD08477_3,0X1002E8FBACD08477,53.518362,7582.30172
4,0X1005D03EED19C65B_24,0X1005D03EED19C65B,45.947988,5011.09773


In [55]:
# combine areas and height to get volume

vol_df['LV_Vol'] = vol_df['Height'] * vol_df['Areas'] 
vol_df.head()

Unnamed: 0,File_Frame,FileName,Height,Areas,LV_Vol
0,0X100009310A3BD7FC_46,0X100009310A3BD7FC,55.486781,8651.583458,480048.513351
1,0X100009310A3BD7FC_61,0X100009310A3BD7FC,43.57114,2484.077801,108234.102796
2,0X1002E8FBACD08477_18,0X1002E8FBACD08477,40.703733,4207.797292,171273.058049
3,0X1002E8FBACD08477_3,0X1002E8FBACD08477,53.518362,7582.30172,405792.369439
4,0X1005D03EED19C65B_24,0X1005D03EED19C65B,45.947988,5011.09773,230249.857397


In [56]:
# drop unneeded columns

vol_df = vol_df.drop(['Height', 'Areas'], axis=1)
vol_df.head()

Unnamed: 0,File_Frame,FileName,LV_Vol
0,0X100009310A3BD7FC_46,0X100009310A3BD7FC,480048.513351
1,0X100009310A3BD7FC_61,0X100009310A3BD7FC,108234.102796
2,0X1002E8FBACD08477_18,0X1002E8FBACD08477,171273.058049
3,0X1002E8FBACD08477_3,0X1002E8FBACD08477,405792.369439
4,0X1005D03EED19C65B_24,0X1005D03EED19C65B,230249.857397


In [57]:
vol_df = vol_df.reset_index()

In [58]:
# for each FileName, returns the first of two rows 

first = vol_df.groupby('FileName')['File_Frame', "LV_Vol"].nth(0)
first.head()

  first = vol_df.groupby('FileName')['File_Frame', "LV_Vol"].nth(0)


Unnamed: 0_level_0,File_Frame,LV_Vol
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1
0X100009310A3BD7FC,0X100009310A3BD7FC_46,480048.5
0X1002E8FBACD08477,0X1002E8FBACD08477_18,171273.1
0X1005D03EED19C65B,0X1005D03EED19C65B_24,230249.9
0X10075961BC11C88E,0X10075961BC11C88E_108,169583.9
0X10094BA0A028EAC3,0X10094BA0A028EAC3_137,1020074.0


In [59]:
# for each FileName, returns the second of two rows 

second = vol_df.groupby('FileName').nth(1)
second.head()

Unnamed: 0_level_0,index,File_Frame,LV_Vol
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0X100009310A3BD7FC,1,0X100009310A3BD7FC_61,108234.102796
0X1002E8FBACD08477,3,0X1002E8FBACD08477_3,405792.369439
0X1005D03EED19C65B,5,0X1005D03EED19C65B_35,85805.290368
0X10075961BC11C88E,7,0X10075961BC11C88E_91,373724.362746
0X10094BA0A028EAC3,9,0X10094BA0A028EAC3_156,775376.39354


In [60]:
# combines first and second "rows" column-wise

file_df = pd.merge(first, second, how='outer', on='FileName')
file_df.head()

Unnamed: 0_level_0,File_Frame_x,LV_Vol_x,index,File_Frame_y,LV_Vol_y
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0X100009310A3BD7FC,0X100009310A3BD7FC_46,480048.5,1,0X100009310A3BD7FC_61,108234.102796
0X1002E8FBACD08477,0X1002E8FBACD08477_18,171273.1,3,0X1002E8FBACD08477_3,405792.369439
0X1005D03EED19C65B,0X1005D03EED19C65B_24,230249.9,5,0X1005D03EED19C65B_35,85805.290368
0X10075961BC11C88E,0X10075961BC11C88E_108,169583.9,7,0X10075961BC11C88E_91,373724.362746
0X10094BA0A028EAC3,0X10094BA0A028EAC3_137,1020074.0,9,0X10094BA0A028EAC3_156,775376.39354


In [61]:
# no change in num rows

file_df.shape

(10024, 5)

In [62]:
def get_vol_type(frame_x, vol_x, frame_y, vol_y):
    '''For each row, determines which volume (and corresponding frame) represents
    EDV versus ESV'''
    
    if vol_x > vol_y:
        EDV_frame = frame_x
        EDV = vol_x
        ESV_frame = frame_y
        ESV = vol_y
    if vol_x < vol_y:
        ESV_frame = frame_x
        ESV = vol_x
        EDV_frame = frame_y
        EDV = vol_y
    return EDV_frame, EDV, ESV_frame, ESV

In [63]:
# adds 4 new columns to file_df

file_df[['EDV_frame', 'EDV', 'ESV_frame', 'ESV']] = file_df.apply(lambda v: get_vol_type(v['File_Frame_x'], v['LV_Vol_x'], v['File_Frame_y'], v['LV_Vol_y']), axis=1, result_type='expand')

In [64]:
file_df

Unnamed: 0_level_0,File_Frame_x,LV_Vol_x,index,File_Frame_y,LV_Vol_y,EDV_frame,EDV,ESV_frame,ESV
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0X100009310A3BD7FC,0X100009310A3BD7FC_46,4.800485e+05,1,0X100009310A3BD7FC_61,108234.102796,0X100009310A3BD7FC_46,4.800485e+05,0X100009310A3BD7FC_61,108234.102796
0X1002E8FBACD08477,0X1002E8FBACD08477_18,1.712731e+05,3,0X1002E8FBACD08477_3,405792.369439,0X1002E8FBACD08477_3,4.057924e+05,0X1002E8FBACD08477_18,171273.058049
0X1005D03EED19C65B,0X1005D03EED19C65B_24,2.302499e+05,5,0X1005D03EED19C65B_35,85805.290368,0X1005D03EED19C65B_24,2.302499e+05,0X1005D03EED19C65B_35,85805.290368
0X10075961BC11C88E,0X10075961BC11C88E_108,1.695839e+05,7,0X10075961BC11C88E_91,373724.362746,0X10075961BC11C88E_91,3.737244e+05,0X10075961BC11C88E_108,169583.928426
0X10094BA0A028EAC3,0X10094BA0A028EAC3_137,1.020074e+06,9,0X10094BA0A028EAC3_156,775376.393540,0X10094BA0A028EAC3_137,1.020074e+06,0X10094BA0A028EAC3_156,775376.393540
...,...,...,...,...,...,...,...,...,...
0XFDFBA5702E94ABF,0XFDFBA5702E94ABF_100,7.315825e+05,20039,0XFDFBA5702E94ABF_117,338787.294499,0XFDFBA5702E94ABF_100,7.315825e+05,0XFDFBA5702E94ABF_117,338787.294499
0XFDFD17B7CCFE5AF,0XFDFD17B7CCFE5AF_66,3.909803e+05,20041,0XFDFD17B7CCFE5AF_85,145372.048701,0XFDFD17B7CCFE5AF_66,3.909803e+05,0XFDFD17B7CCFE5AF_85,145372.048701
0XFE6E32991136338,0XFE6E32991136338_31,4.625250e+05,20043,0XFE6E32991136338_45,322695.862701,0XFE6E32991136338_31,4.625250e+05,0XFE6E32991136338_45,322695.862701
0XFE83FF3D3B13C3A,0XFE83FF3D3B13C3A_49,2.660651e+05,20045,0XFE83FF3D3B13C3A_67,98194.090732,0XFE83FF3D3B13C3A_49,2.660651e+05,0XFE83FF3D3B13C3A_67,98194.090732


In [65]:
# drop unnecessary columns

file_df = file_df.drop(['File_Frame_y', 'LV_Vol_y', 'index','File_Frame_x', 'LV_Vol_x'], axis=1)
file_df.head()

Unnamed: 0_level_0,EDV_frame,EDV,ESV_frame,ESV
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0X100009310A3BD7FC,0X100009310A3BD7FC_46,480048.5,0X100009310A3BD7FC_61,108234.102796
0X1002E8FBACD08477,0X1002E8FBACD08477_3,405792.4,0X1002E8FBACD08477_18,171273.058049
0X1005D03EED19C65B,0X1005D03EED19C65B_24,230249.9,0X1005D03EED19C65B_35,85805.290368
0X10075961BC11C88E,0X10075961BC11C88E_91,373724.4,0X10075961BC11C88E_108,169583.928426
0X10094BA0A028EAC3,0X10094BA0A028EAC3_137,1020074.0,0X10094BA0A028EAC3_156,775376.39354


In [66]:
# calculating EF (based on calculations on tracing data)

file_df['EF'] = ((file_df['EDV'] - file_df['ESV']) / file_df['EDV']) * 100

In [67]:
file_df.head()

Unnamed: 0_level_0,EDV_frame,EDV,ESV_frame,ESV,EF
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0X100009310A3BD7FC,0X100009310A3BD7FC_46,480048.5,0X100009310A3BD7FC_61,108234.102796,77.453507
0X1002E8FBACD08477,0X1002E8FBACD08477_3,405792.4,0X1002E8FBACD08477_18,171273.058049,57.792933
0X1005D03EED19C65B,0X1005D03EED19C65B_24,230249.9,0X1005D03EED19C65B_35,85805.290368,62.733836
0X10075961BC11C88E,0X10075961BC11C88E_91,373724.4,0X10075961BC11C88E_108,169583.928426,54.623261
0X10094BA0A028EAC3,0X10094BA0A028EAC3_137,1020074.0,0X10094BA0A028EAC3_156,775376.39354,23.988204


In [68]:
# EFs are all positive and have reasonable stats

file_df.describe()

Unnamed: 0,EDV,ESV,EF
count,10024.0,10024.0,10024.0
mean,632325.1,291726.9,55.282175
std,405148.4,232226.2,12.581399
min,4246.42,1737.235,0.846479
25%,380320.0,153381.0,51.141748
50%,526355.4,222729.4,58.673566
75%,763349.9,354007.1,63.555337
max,6165758.0,3084905.0,94.757421


In [69]:
# splice out FileNames

file_df['EDV_frame'] = file_df.EDV_frame.str.split('_').str[1]
file_df['ESV_frame'] = file_df.ESV_frame.str.split('_').str[1]

In [70]:
file_df.head()

Unnamed: 0_level_0,EDV_frame,EDV,ESV_frame,ESV,EF
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0X100009310A3BD7FC,46,480048.5,61,108234.102796,77.453507
0X1002E8FBACD08477,3,405792.4,18,171273.058049,57.792933
0X1005D03EED19C65B,24,230249.9,35,85805.290368,62.733836
0X10075961BC11C88E,91,373724.4,108,169583.928426,54.623261
0X10094BA0A028EAC3,137,1020074.0,156,775376.39354,23.988204


In [71]:
# merge datasets

video_df = pd.merge(video_df, file_df, how='outer', on='FileName')
video_df.head()

Unnamed: 0_level_0,EF_x,ESV_x,EDV_x,FrameHeight,FrameWidth,FPS,NumberOfFrames,Split,EDV_frame,EDV_y,ESV_frame,ESV_y,EF_y
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0X100009310A3BD7FC,78.498406,14.881368,69.210534,112,112,50,174,VAL,46,480048.5,61,108234.102796,77.453507
0X1002E8FBACD08477,59.101988,40.383876,98.742884,112,112,50,215,TRAIN,3,405792.4,18,171273.058049,57.792933
0X1005D03EED19C65B,62.363798,14.267784,37.909734,112,112,50,104,TRAIN,24,230249.9,35,85805.290368,62.733836
0X10075961BC11C88E,54.545097,33.143084,72.91421,112,112,55,122,TRAIN,91,373724.4,108,169583.928426,54.623261
0X10094BA0A028EAC3,24.887742,127.581945,169.855024,112,112,52,207,VAL,137,1020074.0,156,775376.39354,23.988204


In [72]:
# no change in number of rows
video_df.shape

(10024, 13)

###  Comparing calculated vs original EF values

In [73]:
# compare EF columns

video_df['EF_diff'] = video_df['EF_x'] - video_df['EF_y']
video_df.head()

Unnamed: 0_level_0,EF_x,ESV_x,EDV_x,FrameHeight,FrameWidth,FPS,NumberOfFrames,Split,EDV_frame,EDV_y,ESV_frame,ESV_y,EF_y,EF_diff
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0X100009310A3BD7FC,78.498406,14.881368,69.210534,112,112,50,174,VAL,46,480048.5,61,108234.102796,77.453507,1.044899
0X1002E8FBACD08477,59.101988,40.383876,98.742884,112,112,50,215,TRAIN,3,405792.4,18,171273.058049,57.792933,1.309055
0X1005D03EED19C65B,62.363798,14.267784,37.909734,112,112,50,104,TRAIN,24,230249.9,35,85805.290368,62.733836,-0.370037
0X10075961BC11C88E,54.545097,33.143084,72.91421,112,112,55,122,TRAIN,91,373724.4,108,169583.928426,54.623261,-0.078164
0X10094BA0A028EAC3,24.887742,127.581945,169.855024,112,112,52,207,VAL,137,1020074.0,156,775376.39354,23.988204,0.899537


In [74]:
# most videos have only slight differences between EFs - but there are very large differences

video_df['EF_diff'].describe()

count    10024.000000
mean         0.465091
std          3.070866
min        -43.691952
25%         -0.178093
50%          0.396985
75%          1.033410
max         77.269984
Name: EF_diff, dtype: float64

In [75]:
# approx 50 videos have EF differences smaller (more negative) than -5

video_df.sort_values(by='EF_diff').head(60)

Unnamed: 0_level_0,EF_x,ESV_x,EDV_x,FrameHeight,FrameWidth,FPS,NumberOfFrames,Split,EDV_frame,EDV_y,ESV_frame,ESV_y,EF_y,EF_diff
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0X2AC09763183674E8,29.220287,283.25819,400.196863,112,112,43,131,VAL,64,2492267.0,43,675099.3,72.912239,-43.691952
0X67F8AC58B0BAA98,37.40685,57.279656,91.511061,112,112,50,201,TRAIN,198,2508655.0,162,501849.4,79.995281,-42.588431
0X13D1459C51B5C32E,34.204924,50.835267,77.263027,112,112,50,107,TRAIN,43,813667.7,25,223249.3,72.5626,-38.357676
0X5D38D994C2490EAE,43.759989,48.980026,87.091067,112,112,50,157,TRAIN,75,1854560.0,90,353305.1,80.949377,-37.189388
0X67E8F2D130F1A55,45.258705,72.907135,133.184893,112,112,50,142,TEST,20,2116110.0,111,387580.8,81.684281,-36.425576
0X3D8353611168F743,47.406198,29.426244,55.950022,112,112,50,173,TRAIN,43,771919.4,58,129361.4,83.241588,-35.83539
0X526BA02D476E9274,47.983035,42.468391,81.643346,112,112,50,151,TRAIN,93,1705674.0,73,309556.6,81.851356,-33.868321
0X62120814160BA377,34.608729,117.737668,180.051047,112,112,50,204,TRAIN,203,2575293.0,166,849623.0,67.008685,-32.399956
0X280B7441A7E287B2,39.504455,15.40601,25.466354,112,112,57,183,VAL,1,282030.0,117,80134.36,71.586588,-32.082134
0X500FC4E8716B0A8F,46.766109,155.843559,292.752522,112,112,40,161,TEST,97,1396556.0,63,303164.0,78.292026,-31.525917


In [76]:
# approx 45 videos have EF differences larger than 5

video_df.sort_values(by='EF_diff', ascending=False).head(50)

Unnamed: 0_level_0,EF_x,ESV_x,EDV_x,FrameHeight,FrameWidth,FPS,NumberOfFrames,Split,EDV_frame,EDV_y,ESV_frame,ESV_y,EF_y,EF_diff
FileName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0X354B37A25C64276F,96.967237,9.756508,321.703567,112,112,50,71,TRAIN,31,649246.7,38,521363.0,19.697252,77.269984
0X65E605F203321860,86.757974,15.86398,119.800245,112,112,50,67,TRAIN,32,817932.0,53,710043.7,13.190377,73.567596
0X973E4A9DAADDF9F,85.252871,19.368167,131.335176,112,112,50,57,TRAIN,49,535882.2,40,408103.9,23.84449,61.408381
0X2AD994F98C491FA6,63.87166,30.719091,85.027683,112,112,50,150,VAL,41,319634.1,57,309638.5,3.127215,60.744445
0X37F9E9981E207C04,60.141132,30.899096,77.521259,112,112,50,110,TRAIN,75,1324701.0,90,1313488.0,0.846479,59.294654
0X36C5A15AC7FC6AAA,59.494177,53.578387,132.273295,112,112,47,229,TRAIN,74,376186.6,58,371455.5,1.25766,58.236517
0X411E89F93DAB415A,59.771857,61.964338,154.032311,112,112,54,109,TRAIN,48,556603.0,69,543033.4,2.437925,57.333932
0X28980B95F9769CE7,72.723828,13.278861,48.683006,112,112,62,149,TRAIN,3,831466.7,21,652651.4,21.506012,51.217816
0X5B6FCBB75BF8FCB7,70.124718,27.776028,92.973275,112,112,50,105,TRAIN,84,947419.5,69,759183.1,19.868327,50.256392
0X4EA078CC4E65B6A3,83.400255,7.245782,43.649959,112,112,50,87,TRAIN,61,108987.9,42,68757.07,36.913132,46.487123


In [77]:
# dropping videos with EF_differences less than -5

video_df = video_df[(video_df['EF_diff'] >= -5)]

In [78]:
# dropping videos with EF_differences greater than 5

video_df = video_df[(video_df['EF_diff'] <= 5)]

In [79]:
# dropped a total of 95 videos

video_df.shape

(9929, 14)

In [80]:
# confirming differences are much smaller after dropping 95 most extreme videos

video_df['EF_diff'].describe()

count    9929.000000
mean        0.443055
std         1.012311
min        -4.825463
25%        -0.173447
50%         0.397697
75%         1.027362
max         4.967009
Name: EF_diff, dtype: float64

## Prepare tabular data for video processing 

In [81]:
# create a new df with info we'll need for processing videos

video_files = video_df[['Split', 'NumberOfFrames', 'EDV_frame', 'EDV_y', 'ESV_frame', 'ESV_y', 'EF_y']]

In [82]:
video_files = video_files.rename(columns={'NumberOfFrames': 'NumFrames', 'EDV_y': 'EDV', 'ESV_y': 'ESV', 'EF_y': 'EF'})

In [83]:
video_files = video_files.reset_index()

In [84]:
video_files.head()

Unnamed: 0,FileName,Split,NumFrames,EDV_frame,EDV,ESV_frame,ESV,EF
0,0X100009310A3BD7FC,VAL,174,46,480048.5,61,108234.102796,77.453507
1,0X1002E8FBACD08477,TRAIN,215,3,405792.4,18,171273.058049,57.792933
2,0X1005D03EED19C65B,TRAIN,104,24,230249.9,35,85805.290368,62.733836
3,0X10075961BC11C88E,TRAIN,122,91,373724.4,108,169583.928426,54.623261
4,0X10094BA0A028EAC3,VAL,207,137,1020074.0,156,775376.39354,23.988204


In [85]:
video_files.shape

(9929, 8)

In [99]:
video_files[video_files['FileName'] == '0X1A76A1A8448B456']

Unnamed: 0,FileName,Split,NumFrames,EDV_frame,EDV,ESV_frame,ESV,EF
835,0X1A76A1A8448B456,TRAIN,206,2,369574.080293,18,155478.99363,57.930222


In [100]:
video_files[video_files['FileName'] == '0X1A349D84388BD74B']

Unnamed: 0,FileName,Split,NumFrames,EDV_frame,EDV,ESV_frame,ESV,EF
820,0X1A349D84388BD74B,TRAIN,177,44,390899.451822,61,154799.880058,60.399054


In [102]:
video_files[video_files['FileName'] == '0X1AE20B8AE3B5E9EF']

Unnamed: 0,FileName,Split,NumFrames,EDV_frame,EDV,ESV_frame,ESV,EF
860,0X1AE20B8AE3B5E9EF,TRAIN,194,100,522782.360074,118,203660.688171,61.04293


### Train-test split 

In [87]:
# split into train(includes train and val) and test according to division of original dataset

train_videos = video_files[video_files['Split'] == 'TRAIN']
val_videos = video_files[video_files['Split'] == 'VAL']
test_videos = video_files[video_files['Split'] == 'TEST']

In [88]:
train_videos["Split"] = "Train"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_videos["Split"] = "Train"


In [89]:
train_videos = train_videos.reset_index(drop=True)
train_videos.tail()

Unnamed: 0,FileName,Split,NumFrames,EDV_frame,EDV,ESV_frame,ESV,EF
7387,0XFD9464CF9E40B66,Train,142,106,477658.66681,121,137513.011648,71.21103
7388,0XFDC39A88895DE1E,Train,181,42,385448.237813,63,164745.848585,57.258632
7389,0XFDFBA5702E94ABF,Train,192,100,731582.452357,117,338787.294499,53.691167
7390,0XFDFD17B7CCFE5AF,Train,268,66,390980.349851,85,145372.048701,62.818579
7391,0XFEBEEFF93F6FEB9,Train,109,25,375613.382114,38,257292.450725,31.500723


In [90]:
# approximately 75% train

train_videos.shape

(7392, 8)

In [91]:
val_videos["Split"] = "Val"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_videos["Split"] = "Val"


In [92]:
val_videos = val_videos.reset_index(drop=True)
val_videos.tail()

Unnamed: 0,FileName,Split,NumFrames,EDV_frame,EDV,ESV_frame,ESV,EF
1268,0XFBD22BB93EE05C1,Val,193,39,510239.407446,56,206763.703985,59.47712
1269,0XFCC2AF8E23AFBA8,Val,161,41,562010.456102,57,213292.246691,62.048349
1270,0XFD06AD51C6DA6E5,Val,191,47,426498.8975,62,184570.140624,56.724357
1271,0XFDB874C30A9C923,Val,184,53,613907.961534,73,493886.860063,19.550341
1272,0XFE83FF3D3B13C3A,Val,192,49,266065.065718,67,98194.090732,63.093956


In [93]:
# approx 12.5% val

val_videos.shape

(1273, 8)

In [94]:
test_videos["Split"] = "Test"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_videos["Split"] = "Test"


In [95]:
test_videos = test_videos.reset_index(drop=True)
test_videos.tail()

Unnamed: 0,FileName,Split,NumFrames,EDV_frame,EDV,ESV_frame,ESV,EF
1259,0XECF82DC9301EE77,Test,176,91,734262.514179,111,269118.289836,63.348491
1260,0XF557EF658FD13D0,Test,179,0,485878.226316,20,232627.262127,52.122312
1261,0XF6661AF354401A5,Test,221,93,610161.985847,111,213721.125544,64.973051
1262,0XF829F634971A0F7,Test,183,48,472216.649112,66,173155.674447,63.331307
1263,0XFE6E32991136338,Test,120,31,462524.953885,45,322695.862701,30.231686


In [96]:
# approx 12.5% test

test_videos.shape

(1264, 8)

## Process videos 

In [97]:
import tensorflow.keras.utils as utils

In [None]:
def process_df(df):
    '''Takes in a echocardiogram video clip and extracts 3 images: 
    one at end-diastole, one at end-systole, and one in between (other).'''
    
    for row in df.itertuples():
        # load and open videos
        path = "EchoNet-Dynamic/Videos/" + row.FileName + ".avi"
        cap = cv2.VideoCapture(path)
        frames = []
        while cap.isOpened():
            ret, frame = cap.read()
            # if frame is read correctly, ret is true
            if not ret:
                break
            frames.append(frame)
        
        #print(f'Reported num of frames: {row.NumFrames}')
        #print(f'Actual num of frames: {len(frames)}')
        
        # select and save EDV frames
        #print(f'EDV frame: {row.EDV_frame}')
        EDV_image = frames[int(row.EDV_frame) - 1] # account for 0 indexing
        EDV_path = row.Split + "_Images/EDV/" + row.FileName + '_EDV.jpg'   
        tf.keras.preprocessing.image.save_img(EDV_path, np.array(EDV_image), scale=False)
        
        # select and save ESV frames
        #print(f'ESV frame: {row.ESV_frame}')
        ESV_image = frames[int(row.ESV_frame) - 1] # account for 0 indexing
        ESV_path = row.Split + "_Images/ESV/" + row.FileName + '_ESV.jpg'  
        tf.keras.preprocessing.image.save_img(ESV_path, np.array(ESV_image), scale=False)
        
        # select and save an "Other" frame
        other_frame = np.around((int(row.EDV_frame) + int(row.ESV_frame)) / 2)
        #print(f'Other frame: {other_frame}')
        other_image = frames[int(other_frame) - 1] # account for 0 indexing
        other_path = row.Split + "_Images/Other/" + row.FileName + '_Other.jpg'
        tf.keras.preprocessing.image.save_img(other_path, np.array(other_image), scale=False)
        
        cap.release()

In [None]:
process_df(train_videos.iloc[6000:8000])

In [None]:
process_df(val_videos)

In [None]:
process_df(test_videos)