IMPORTS

In [1]:
#imports
import pandas as pd
from io import StringIO
import numpy as np
import matplotlib.pyplot as plt
import os 
import numpy as np

Class for importing ECG

In [2]:
#dataclass for importing in ecg data
## channels : imports in the label for each channels in the file
### sample-freq gives the frequency here it is 1000hz
#### data actually gives the ecg data at each millisecond
class TxtFile:
    def __init__(self, filepath, verbose=False):
        self.filepath = filepath
        self.channels, self.sample_freq, self.data = self.load_file()
        if verbose: print("Channels: {}".format(self.channels))

    def load_file(self):
        with open(self.filepath) as f:
            channels, sample_freq = self.load_channels(f)
            _ = self._read_until(f, "[Data]")
            data = f.read()
            data = pd.read_table(StringIO(data), names=channels, sep=',')
            # data = self.filter_data(data)
            return channels, sample_freq, data

    def load_channels(self, file):
        channels = []
        line = self._read_until(file, "Channels exported:")
        sample_freq = int(self._read_until(file, "Sample Rate").rsplit(' ', 1)[-1].rsplit('Hz')[0])
        n_channels = int(line.split(' ')[-1])
        for n_channel in range(n_channels):
            line = self._read_until(file, "Label:")
            channel_name = line.replace('Label: ', '').rstrip()
            channels.append(channel_name)
        return channels, sample_freq

    @staticmethod
    def _read_until(file, string):
        line = file.readline()
        while string not in line:
            line = file.readline()
        return line

In [4]:
#loading in the file to see if the file path is correct 
file_path = "/rds/general/project/fsn-ai-ecg-data/live/afml/CSCL_1st/CS/CSCL1_post_cs_1.txt"
txt_file = TxtFile(file_path, verbose=True)


Channels: ['I', 'aVF', 'V1', 'V6', 'CS 1-2', 'CS 3-4', 'CS 5-6', 'CS 7-8', 'CS 9-10']


In [5]:
txt_file.data

Unnamed: 0,I,aVF,V1,V6,CS 1-2,CS 3-4,CS 5-6,CS 7-8,CS 9-10
0,64,-176,-480,-320,-32,-144,144,80,-96
1,64,-160,-480,-272,-16,-48,224,112,-128
2,48,-128,-480,-256,48,-48,128,80,64
3,48,-96,-496,-224,-48,-112,176,32,64
4,32,-48,-496,-208,-32,-128,112,0,-64
...,...,...,...,...,...,...,...,...,...
59995,-464,-1216,-16,-1184,144,48,-48,0,-48
59996,-480,-1200,-16,-1184,112,-96,32,-128,0
59997,-496,-1200,32,-1168,80,16,-48,-80,-176
59998,-496,-1168,16,-1168,-112,0,80,-80,-144


In [10]:
np.load("/rds/general/user/oe222/home/AF_ecg/Preprocessing/CS_training.npy")

array([[ -128,  -144,  -128, ...,    32,    96,   128],
       [  352,   208,   -32, ...,  -112,   -48,   -16],
       [   32,    48,   128, ...,   -32,   -32,     0],
       ...,
       [  -32,   -64,   -16, ..., -4272, -4496, -4144],
       [-3072, -1616, -1024, ...,  -144,  -112,   -96],
       [  -64,   -64,   -16, ...,  -224,  -176,  -128]])

In [7]:
txt_file.sample_freq

1000

creating a df with file path , procdure and patient numbers in the cscl_1st directory

In [8]:
import os
import pandas as pd
import re
pd.reset_option("display.max_rows")
directory = "/rds/general/project/fsn-ai-ecg-data/live/afml/CSCL_1st/CS/"

file_paths = []  # List to store file paths
patient_numbers = []  # List to store patient numbers
procedures = []  # List to store procedures

# Regular expression pattern to extract the patient number and procedure
pattern = r"CSCL(\d+)_(pre|post)_"

for root, dirs, files in os.walk(directory):
    for file in files:
        file_path = os.path.join(root, file)
        file_paths.append(file_path)

        # Extract patient number and procedure from the file name using regex
        match = re.search(pattern, file)
        if match:
            patient_number = match.group(1)
            procedure = match.group(2)
            patient_numbers.append(patient_number)
            procedures.append(procedure)
        else:
            patient_numbers.append(None)
            procedures.append(None)

# Create a DataFrame from the file_paths, patient_numbers, and procedures lists
df = pd.DataFrame({"File Path": file_paths, "Patient Number": patient_numbers, "Procedure": procedures})

print(df)




                                             File Path Patient Number  \
0    /rds/general/project/fsn-ai-ecg-data/live/afml...             48   
1    /rds/general/project/fsn-ai-ecg-data/live/afml...             60   
2    /rds/general/project/fsn-ai-ecg-data/live/afml...             15   
3    /rds/general/project/fsn-ai-ecg-data/live/afml...             81   
4    /rds/general/project/fsn-ai-ecg-data/live/afml...             49   
..                                                 ...            ...   
591  /rds/general/project/fsn-ai-ecg-data/live/afml...             29   
592  /rds/general/project/fsn-ai-ecg-data/live/afml...             41   
593  /rds/general/project/fsn-ai-ecg-data/live/afml...             71   
594  /rds/general/project/fsn-ai-ecg-data/live/afml...             72   
595  /rds/general/project/fsn-ai-ecg-data/live/afml...             73   

    Procedure  
0        post  
1         pre  
2         pre  
3         pre  
4        post  
..        ...  
591       p

Adding the patients in the cscl_redo directory

In [9]:
directory = "/rds/general/project/fsn-ai-ecg-data/live/afml/CSCL_redo/"
# Regular expression pattern to extract the patient number and procedure
pattern = r"CSCL(\d+)_(pre|post)_"

for root, dirs, files in os.walk(directory):
    for file in files:
        file_path = os.path.join(root, file)
        file_paths.append(file_path)

        # Extract patient number and procedure from the file name using regex
        match = re.search(pattern, file)
        if match:
            patient_number = match.group(1)
            procedure = match.group(2)
            patient_numbers.append(patient_number)
            procedures.append(procedure)
        else:
            patient_numbers.append(None)
            procedures.append(None)

# Create a DataFrame from the file_paths, patient_numbers, and procedures lists
df = pd.DataFrame({"File Path": file_paths, "Patient Number": patient_numbers, "Procedure": procedures})

print(df)

                                             File Path Patient Number  \
0    /rds/general/project/fsn-ai-ecg-data/live/afml...             48   
1    /rds/general/project/fsn-ai-ecg-data/live/afml...             60   
2    /rds/general/project/fsn-ai-ecg-data/live/afml...             15   
3    /rds/general/project/fsn-ai-ecg-data/live/afml...             81   
4    /rds/general/project/fsn-ai-ecg-data/live/afml...             49   
..                                                 ...            ...   
711  /rds/general/project/fsn-ai-ecg-data/live/afml...            105   
712  /rds/general/project/fsn-ai-ecg-data/live/afml...            104   
713  /rds/general/project/fsn-ai-ecg-data/live/afml...            103   
714  /rds/general/project/fsn-ai-ecg-data/live/afml...             96   
715  /rds/general/project/fsn-ai-ecg-data/live/afml...             98   

    Procedure  
0        post  
1         pre  
2         pre  
3         pre  
4        post  
..        ...  
711       p

Adding patients from the RRAF direcotry 


In [10]:
import os
import pandas as pd
import re

directory = "/rds/general/project/fsn-ai-ecg-data/live/afml/RRAF/CS/"
pattern = r"RRAF(\d+)_(pre|post)_"

file_paths = []  # List to store file paths
patient_numbers = []  # List to store patient numbers
procedures = []  # List to store procedures

existing_patient_numbers = set(df["Patient Number"]) if "Patient Number" in df.columns else set()

for root, dirs, files in os.walk(directory):
    for file in files:
        file_path = os.path.join(root, file)
        file_paths.append(file_path)

        # Extract patient number and procedure from the file name using regex
        match = re.search(pattern, file)
        if match:
            patient_number = int(match.group(1)) + 107
            while patient_number in existing_patient_numbers:
                patient_number += 107
            procedure = match.group(2)
            patient_numbers.append(int(patient_number))  # Convert to integer
            procedures.append(procedure)
        else:
            patient_numbers.append(None)
            procedures.append(None)

# Create a DataFrame from the file_paths, patient_numbers, and procedures lists
new_df = pd.DataFrame({"File Path": file_paths, "Patient Number": patient_numbers, "Procedure": procedures})

# Concatenate the new_df with the previous df
df = pd.concat([df, new_df], ignore_index=True)

print(df)

                                              File Path Patient Number  \
0     /rds/general/project/fsn-ai-ecg-data/live/afml...             48   
1     /rds/general/project/fsn-ai-ecg-data/live/afml...             60   
2     /rds/general/project/fsn-ai-ecg-data/live/afml...             15   
3     /rds/general/project/fsn-ai-ecg-data/live/afml...             81   
4     /rds/general/project/fsn-ai-ecg-data/live/afml...             49   
...                                                 ...            ...   
2411  /rds/general/project/fsn-ai-ecg-data/live/afml...          196.0   
2412  /rds/general/project/fsn-ai-ecg-data/live/afml...          181.0   
2413  /rds/general/project/fsn-ai-ecg-data/live/afml...          133.0   
2414  /rds/general/project/fsn-ai-ecg-data/live/afml...          208.0   
2415  /rds/general/project/fsn-ai-ecg-data/live/afml...          244.0   

     Procedure  
0         post  
1          pre  
2          pre  
3          pre  
4         post  
...      

In [11]:
#finding a removing rows with nas 
rows_with_nas = df[df.isna().any(axis=1)]
print("Rows with NaN values:")
print(rows_with_nas)


Rows with NaN values:
                                              File Path Patient Number  \
1709  /rds/general/project/fsn-ai-ecg-data/live/afml...            NaN   

     Procedure  
1709      None  


In [12]:
#dropping na row 
df = df.drop(1709)


In [13]:
#saving csv
df.to_csv("output.csv", index=False)

In [14]:
#looking at the counts of patients and how many ecg samples have been taken 
pd.set_option("display.max_rows", None)
# Count unique patients and their frequencies
unique_patients = df["Patient Number"].nunique()
patient_frequencies = df["Patient Number"].value_counts()

print("Number of unique patients:", unique_patients)
print("\nnumber of ecg samples per patient:")
print(patient_frequencies)
pd.reset_option("display.max_rows")

Number of unique patients: 285

number of ecg samples per patient:
197.0    20
48       10
201.0    10
163.0    10
208.0    10
260.0    10
146.0    10
302.0    10
116.0    10
226.0    10
253.0    10
249.0    10
184.0    10
296.0    10
110.0    10
140.0    10
125.0    10
155.0    10
256.0    10
278.0    10
143.0    10
281.0    10
251.0    10
219.0    10
242.0    10
115.0    10
228.0    10
233.0    10
179.0    10
203.0    10
280.0    10
190.0    10
178.0    10
199.0    10
211.0    10
160.0    10
174.0    10
172.0    10
113.0    10
159.0    10
284.0    10
263.0    10
193.0    10
266.0    10
237.0    10
252.0    10
176.0    10
216.0    10
196.0    10
170.0    10
167.0    10
202.0    10
308.0    10
132.0    10
192.0    10
177.0    10
303.0    10
200.0    10
152.0    10
217.0    10
229.0    10
188.0    10
123.0    10
129.0    10
153.0    10
269.0    10
295.0    10
111.0    10
145.0    10
257.0    10
137.0    10
204.0    10
258.0    10
235.0    10
275.0    10
240.0    10
248.0    10
114.0    

Some pateints do not even have more than one ECG reeading now I wil check hwo many people do not have pre and post readings

In [15]:
pd.set_option("display.max_rows", None)
# Group the DataFrame by 'Patient Number' and 'Procedure' and count the occurrences
procedure_counts = df.groupby(['Patient Number', 'Procedure']).size().reset_index(name='Count')

print(procedure_counts)
pd.reset_option("display.max_rows")

    Patient Number Procedure  Count
0            108.0      post      5
1            108.0       pre      5
2            110.0      post      5
3            110.0       pre      5
4            111.0      post      5
5            111.0       pre      5
6            112.0      post      5
7            112.0       pre      5
8            113.0      post      5
9            113.0       pre      5
10           114.0      post      5
11           114.0       pre      5
12           115.0      post      5
13           115.0       pre      5
14           116.0      post      5
15           116.0       pre      5
16           117.0      post      5
17           117.0       pre      5
18           118.0      post      5
19           118.0       pre      5
20           119.0      post      5
21           119.0       pre      5
22           120.0      post      5
23           120.0       pre      5
24           121.0      post      5
25           121.0       pre      5
26           122.0      post

In [16]:
import os
import pandas as pd
import re

# Rest of your code...

# Iterate through the DataFrame rows
for index, row in df.iterrows():
    filepath = row['File Path']
    txt_file = TxtFile(filepath)

    # Iterate through the channels in the TxtFile object
    for channel in txt_file.channels:
        # Create a column name based on the channel
        column_name = f"Channel_{channel}"
        
        # Add a new column to the DataFrame if it doesn't exist
        if column_name not in df.columns:
            df[column_name] = 0  # Initialize the column with zeros
        
        # Set the value to 1 if the channel is present in the TxtFile channels
        if channel in txt_file.channels:
            df.at[index, column_name] = 1

print(df) 

  df[column_name] = 0  # Initialize the column with zeros
  df[column_name] = 0  # Initialize the column with zeros
  df[column_name] = 0  # Initialize the column with zeros
  df[column_name] = 0  # Initialize the column with zeros
  df[column_name] = 0  # Initialize the column with zeros
  df[column_name] = 0  # Initialize the column with zeros
  df[column_name] = 0  # Initialize the column with zeros
  df[column_name] = 0  # Initialize the column with zeros
  df[column_name] = 0  # Initialize the column with zeros
  df[column_name] = 0  # Initialize the column with zeros
  df[column_name] = 0  # Initialize the column with zeros
  df[column_name] = 0  # Initialize the column with zeros
  df[column_name] = 0  # Initialize the column with zeros
  df[column_name] = 0  # Initialize the column with zeros
  df[column_name] = 0  # Initialize the column with zeros


                                              File Path Patient Number  \
0     /rds/general/project/fsn-ai-ecg-data/live/afml...             48   
1     /rds/general/project/fsn-ai-ecg-data/live/afml...             60   
2     /rds/general/project/fsn-ai-ecg-data/live/afml...             15   
3     /rds/general/project/fsn-ai-ecg-data/live/afml...             81   
4     /rds/general/project/fsn-ai-ecg-data/live/afml...             49   
...                                                 ...            ...   
2411  /rds/general/project/fsn-ai-ecg-data/live/afml...          196.0   
2412  /rds/general/project/fsn-ai-ecg-data/live/afml...          181.0   
2413  /rds/general/project/fsn-ai-ecg-data/live/afml...          133.0   
2414  /rds/general/project/fsn-ai-ecg-data/live/afml...          208.0   
2415  /rds/general/project/fsn-ai-ecg-data/live/afml...          244.0   

     Procedure  Channel_I  Channel_aVF  Channel_V1  Channel_V6  \
0         post          1            1       

In [17]:
#saving csv
df.to_csv("output.csv", index=False)


In [18]:
columns_no_zeros = df.loc[:, 'Channel_I':].columns[df.loc[:, 'Channel_I':].all()]
print("Columns with no zeros:")
print(columns_no_zeros)

Columns with no zeros:
Index([], dtype='object')


In [19]:
pd.set_option("display.max_rows", None)
#caclulating percentage of zero in channel collumns 
channel_columns = df.loc[:, 'Channel_I':]

zero_percentages = channel_columns.eq(0).mean() * 100
print("Percentage of zeros in each channel column:")
print(zero_percentages)
pd.reset_option("display.max_rows")


Percentage of zeros in each channel column:
Channel_I                1.656315
Channel_aVF              1.449275
Channel_V1               5.962733
Channel_V6               3.478261
Channel_CS 1-2          10.310559
Channel_CS 3-4           4.720497
Channel_CS 5-6           8.695652
Channel_CS 7-8           8.902692
Channel_CS 9-10          9.109731
Channel_CS 1,2          98.385093
Channel_CS 3,4          98.923395
Channel_CS 5,6          98.923395
Channel_CS 7,8          99.130435
Channel_CS 9,10         99.130435
Channel_II              88.902692
Channel_V2              92.380952
Channel_CS 3, 4         99.461698
Channel_CS 5 , 6        99.461698
Channel_CS 7 ,8         99.461698
Channel_CS 9, 10        99.461698
Channel_III             95.031056
Channel_aVR             96.480331
Channel_aVL             96.480331
Channel_V3              96.687371
Channel_V4              96.687371
Channel_V5              96.107660
Channel_Mapd            43.064182
Channel_Mapp            43.064182
Chan

Reading in the smaples per channel of each txt file to make sure we have 60,000 milliseconds of data for each txt file 

In [20]:
def read_samples_per_channel(file_path):
    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith("Samples per channel"):
                samples_per_channel = line.split(":")[1].strip()
                return int(samples_per_channel)
    return None 

In [21]:
# Iterate through the DataFrame and add a new column "Sample Channels"
df['Sample Channels'] = df['File Path'].apply(read_samples_per_channel)

# Display the updated DataFrame
print(df)

                                              File Path Patient Number  \
0     /rds/general/project/fsn-ai-ecg-data/live/afml...             48   
1     /rds/general/project/fsn-ai-ecg-data/live/afml...             60   
2     /rds/general/project/fsn-ai-ecg-data/live/afml...             15   
3     /rds/general/project/fsn-ai-ecg-data/live/afml...             81   
4     /rds/general/project/fsn-ai-ecg-data/live/afml...             49   
...                                                 ...            ...   
2411  /rds/general/project/fsn-ai-ecg-data/live/afml...          196.0   
2412  /rds/general/project/fsn-ai-ecg-data/live/afml...          181.0   
2413  /rds/general/project/fsn-ai-ecg-data/live/afml...          133.0   
2414  /rds/general/project/fsn-ai-ecg-data/live/afml...          208.0   
2415  /rds/general/project/fsn-ai-ecg-data/live/afml...          244.0   

     Procedure  Channel_I  Channel_aVF  Channel_V1  Channel_V6  \
0         post          1            1       

  df['Sample Channels'] = df['File Path'].apply(read_samples_per_channel)


In [22]:
# Check for rows with sample channels not equal to 60,000
invalid_rows = df[df['Sample Channels'] != 60000]

# Check if there are any invalid rows
if not invalid_rows.empty:
    print("Rows without 60,000 sample channels:")
    print(invalid_rows)
    num_invalid_rows = len(invalid_rows)
    print("Number of rows without 60,000 sample channels:", num_invalid_rows)
    
    # Remove the invalid rows from the DataFrame
    df = df.drop(invalid_rows.index)

    print("Invalid rows removed.")

# Print the updated DataFrame
print(df.shape)




Rows without 60,000 sample channels:
                                              File Path Patient Number  \
31    /rds/general/project/fsn-ai-ecg-data/live/afml...             83   
250   /rds/general/project/fsn-ai-ecg-data/live/afml...             26   
449   /rds/general/project/fsn-ai-ecg-data/live/afml...             12   
533   /rds/general/project/fsn-ai-ecg-data/live/afml...             61   
585   /rds/general/project/fsn-ai-ecg-data/live/afml...             48   
...                                                 ...            ...   
2335  /rds/general/project/fsn-ai-ecg-data/live/afml...          250.0   
2349  /rds/general/project/fsn-ai-ecg-data/live/afml...          298.0   
2350  /rds/general/project/fsn-ai-ecg-data/live/afml...          118.0   
2356  /rds/general/project/fsn-ai-ecg-data/live/afml...          228.0   
2380  /rds/general/project/fsn-ai-ecg-data/live/afml...          297.0   

     Procedure  Channel_I  Channel_aVF  Channel_V1  Channel_V6  \
31      

In [23]:
#saving the final df 
df.to_csv('output.csv', index=False)

In [35]:
pd.reset_option("display.max_rows")

In [98]:
import pandas as pd
from io import StringIO

class TxtFile:
    def __init__(self, filepath, verbose=False):
        self.filepath = filepath
        self.channels, self.sample_freq, self.data = self.load_file()
        if verbose: print("Channels: {}".format(self.channels))

    def load_file(self):
        with open(self.filepath) as f:
            channels, sample_freq = self.load_channels(f)
            _ = self._read_until(f, "[Data]")
            data = f.read()
            data = pd.read_table(StringIO(data), names=channels, sep=',')
            # data = self.filter_data(data)
            return channels, sample_freq, data

    def load_channels(self, file):
        channels = []
        line = self._read_until(file, "Channels exported:")
        sample_freq = int(self._read_until(file, "Sample Rate").rsplit(' ', 1)[-1].rsplit('Hz')[0])
        n_channels = int(line.split(' ')[-1])
        for n_channel in range(n_channels):
            line = self._read_until(file, "Label:")
            channel_name = line.replace('Label: ', '').rstrip()
            channels.append(channel_name)
        return channels, sample_freq

    @staticmethod
    def _read_until(file, string):
        line = file.readline()
        while string not in line:
            line = file.readline()
        return line

    def get_data_shape(self):
        return self.data.shape


csv with all 60,000 samples exactly widled down to 2347 samples 

In [99]:
df["DataShape"] = df["File Path"].apply(lambda x: TxtFile(x).get_data_shape())
df.to_csv('output.csv', index=False)

In [100]:
# Remove rows with data shape other than (60000, *)
df = df[df["DataShape"].apply(lambda shape: shape[0] == 60000)]
print(df.shape)
# Save the updated DataFrame to a CSV file
df.to_csv('output.csv', index=False)

(2346, 119)


In [101]:
#creating a final_df with 1 with each of these channels 

# Specify the channel columns
channel_columns = ['Channel_I', 'Channel_aVF', 'Channel_V1', 'Channel_V6', 'Channel_CS 3-4']

# Filter rows that have a value of 1 in all channel columns
rows_with_ones = df[(df[channel_columns] == 1).all(axis=1)]

# Create a new DataFrame with the specified channel columns and additional columns
additional_columns = ['File Path', 'Patient Number', 'Procedure']
final_df = rows_with_ones[channel_columns + additional_columns].copy()

# Print the new DataFrame
print("Number of rows with a value of 1 in all channel columns:", len(final_df))
print("New DataFrame with the specified channel columns and additional columns:")
print(final_df)



Number of rows with a value of 1 in all channel columns: 2070
New DataFrame with the specified channel columns and additional columns:
      Channel_I  Channel_aVF  Channel_V1  Channel_V6  Channel_CS 3-4  \
0             1            1           1           1               1   
1             1            1           1           1               1   
2             1            1           1           1               1   
3             1            1           1           1               1   
4             1            1           1           1               1   
...         ...          ...         ...         ...             ...   
2411          1            1           1           1               1   
2412          1            1           1           1               1   
2413          1            1           1           1               1   
2414          1            1           1           1               1   
2415          1            1           1           1               1   



In [102]:
#checking pateint sample counts
#looking at the counts of patients and how many ecg samples have been taken 
pd.set_option("display.max_rows", None)
# Count unique patients and their frequencies
unique_patients = final_df["Patient Number"].nunique()
patient_frequencies = final_df["Patient Number"].value_counts()

print("Number of unique patients:", unique_patients)
print("\nnumber of ecg samples per patient:")
print(patient_frequencies)
pd.reset_option("display.max_rows")

Number of unique patients: 259

number of ecg samples per patient:
125.0    10
204.0    10
288.0    10
122.0    10
183.0    10
294.0    10
131.0    10
158.0    10
253.0    10
249.0    10
184.0    10
185.0    10
168.0    10
187.0    10
236.0    10
181.0    10
110.0    10
264.0    10
135.0    10
140.0    10
157.0    10
265.0    10
201.0    10
42       10
238.0    10
152.0    10
243.0    10
256.0    10
111.0    10
145.0    10
143.0    10
138.0    10
191.0    10
120.0    10
193.0    10
200.0    10
303.0    10
217.0    10
192.0    10
132.0    10
308.0    10
167.0    10
170.0    10
141.0    10
252.0    10
199.0    10
237.0    10
266.0    10
263.0    10
283.0    10
284.0    10
159.0    10
211.0    10
280.0    10
129.0    10
286.0    10
166.0    10
206.0    10
285.0    10
208.0    10
223.0    10
156.0    10
304.0    10
257.0    10
224.0    10
258.0    10
262.0    10
1        10
70       10
126.0    10
188.0    10
51       10
7        10
241.0    10
151.0    10
33       10
20       10
155.0    

In [103]:
pd.reset_option("display.max_rows")

In [104]:
import pandas as pd

# Count unique patients and their frequencies
unique_patients = final_df["Patient Number"].nunique()
patient_frequencies = final_df["Patient Number"].value_counts()

print("Number of unique patients:", unique_patients)
print("\nNumber of ECG samples per patient:")
print(patient_frequencies)

# Split the frequencies based on percentages
split_frequencies = [
    patient_frequencies.sample(frac=0.7, random_state=1),
    patient_frequencies.sample(frac=0.2, random_state=2),
    patient_frequencies.sample(frac=0.1, random_state=3)
]

# Print the split frequencies
split_labels = ["70%", "20%", "10%"]
for label, frequencies in zip(split_labels, split_frequencies):
    print("\n{} of patients ({} samples):".format(label, frequencies.sum()))
    print(frequencies)

# Create separate lists of patients for each split
split_70 = split_frequencies[0].index.tolist()
split_20 = split_frequencies[1].index.tolist()
split_10 = split_frequencies[2].index.tolist()

# Print the separate lists
print("\nList of patients in the 70% split:")
print(split_70)

print("\nList of patients in the 20% split:")
print(split_20)

print("\nList of patients in the 10% split:")
print(split_10)


Number of unique patients: 259

Number of ECG samples per patient:
125.0    10
204.0    10
288.0    10
122.0    10
183.0    10
         ..
12        3
61        2
54        2
255.0     2
87        1
Name: Patient Number, Length: 259, dtype: int64

70% of patients (1438 samples):
289.0     5
193.0    10
273.0     5
79        5
66        5
         ..
127.0     9
114.0    10
202.0     9
42       10
116.0     9
Name: Patient Number, Length: 181, dtype: int64

20% of patients (428 samples):
143.0    10
55        4
162.0     9
184.0    10
287.0     5
118.0     9
88        5
200.0    10
248.0    10
187.0    10
122.0    10
203.0     9
179.0     9
8        10
129.0    10
198.0     5
32       10
43       10
51       10
79        5
148.0     9
62        5
167.0    10
97        5
44       10
16        5
163.0     9
225.0     7
98        4
139.0    10
258.0    10
75       10
238.0    10
262.0    10
26        3
57        5
157.0    10
111.0    10
280.0    10
119.0     9
151.0    10
190.0     9
102 

In [105]:
import pandas as pd

# Create new DataFrames based on the patient numbers
df_70 = final_df[final_df["Patient Number"].isin(split_70)]
df_20 = final_df[final_df["Patient Number"].isin(split_20)]
df_10 = final_df[final_df["Patient Number"].isin(split_10)]

# Print the new DataFrames
print("New DataFrame for the 70% split:")
print(df_70)

print("\nNew DataFrame for the 20% split:")
print(df_20)

print("\nNew DataFrame for the 10% split:")
print(df_10)

New DataFrame for the 70% split:
      Channel_I  Channel_aVF  Channel_V1  Channel_V6  Channel_CS 3-4  \
1             1            1           1           1               1   
2             1            1           1           1               1   
3             1            1           1           1               1   
4             1            1           1           1               1   
5             1            1           1           1               1   
...         ...          ...         ...         ...             ...   
2409          1            1           1           1               1   
2410          1            1           1           1               1   
2411          1            1           1           1               1   
2413          1            1           1           1               1   
2414          1            1           1           1               1   

                                              File Path Patient Number  \
1     /rds/general/project/f

In [116]:
import numpy as np
import pandas as pd

# Create a list to store the arrays for each DataFrame
arrays_I_aVF_V1 = []
array_CS_3_4 = []

# Iterate through each file path in each DataFrame
for i, filepath in enumerate(df_70['File Path']):
    txt_file = TxtFile(filepath)
    
    # Extract the data for channels I, aVF, V1, V6
    channels_I_aVF_V1 = ['I', 'aVF', 'V1', 'V6']
    data_I_aVF_V1 = txt_file.data[channels_I_aVF_V1].to_numpy()
    
    # Check the shape of the data array for consistency
    if len(arrays_I_aVF_V1) > 0 and data_I_aVF_V1.shape != arrays_I_aVF_V1[0].shape:
        raise ValueError("Inconsistent shape for channels I, aVF, V1, V6. Please check the data.")
    
    arrays_I_aVF_V1.append(data_I_aVF_V1)
    
    # Extract the data for channel CS 3-4
    channel_CS_3_4 = 'CS 3-4'
    data_CS_3_4 = txt_file.data[channel_CS_3_4].to_numpy()
    array_CS_3_4.append(data_CS_3_4)
    

# Stack the arrays together using np.stack
array_I_aVF_V1 = np.stack(arrays_I_aVF_V1, axis=0)
array_CS_3_4 = np.stack(array_CS_3_4, axis=0)

# Print the shape of the final arrays
print("Shape of array_I_aVF_V1:", array_I_aVF_V1.shape)
print("Shape of array_CS_3_4:", array_CS_3_4.shape)




Shape of array_I_aVF_V1: (1438, 60000, 4)
Shape of array_CS_3_4: (1438, 60000)


In [117]:
#making training dataset
ECG_training = np.reshape(array_I_aVF_V1, (43140, 2000, 4))
CS_training = np.reshape(array_CS_3_4,(43140,2000))

In [122]:
import numpy as np
np.save('ECG_training.npy', ECG_training)


np.save('CS_training.npy', CS_training)

In [118]:
# Create a list to store the arrays for each DataFrame
arrays_I_aVF_V1 = []
array_CS_3_4 = []

# Iterate through each file path in each DataFrame
for i, filepath in enumerate(df_20['File Path']):
    txt_file = TxtFile(filepath)
    
    # Extract the data for channels I, aVF, V1, V6
    channels_I_aVF_V1 = ['I', 'aVF', 'V1', 'V6']
    data_I_aVF_V1 = txt_file.data[channels_I_aVF_V1].to_numpy()
    
    # Check the shape of the data array for consistency
    if len(arrays_I_aVF_V1) > 0 and data_I_aVF_V1.shape != arrays_I_aVF_V1[0].shape:
        raise ValueError("Inconsistent shape for channels I, aVF, V1, V6. Please check the data.")
    
    arrays_I_aVF_V1.append(data_I_aVF_V1)
    
    # Extract the data for channel CS 3-4
    channel_CS_3_4 = 'CS 3-4'
    data_CS_3_4 = txt_file.data[channel_CS_3_4].to_numpy()
    array_CS_3_4.append(data_CS_3_4)
    

# Stack the arrays together using np.stack
array_I_aVF_V1 = np.stack(arrays_I_aVF_V1, axis=0)
array_CS_3_4 = np.stack(array_CS_3_4, axis=0)

# Print the shape of the final arrays
print("Shape of array_I_aVF_V1:", array_I_aVF_V1.shape)
print("Shape of array_CS_3_4:", array_CS_3_4.shape)

Shape of array_I_aVF_V1: (428, 60000, 4)
Shape of array_CS_3_4: (428, 60000)


In [119]:
#making training dataset
ECG_testing = np.reshape(array_I_aVF_V1, (12840, 2000, 4))
CS_testing = np.reshape(array_CS_3_4,(12840,2000))

In [123]:
import numpy as np
np.save('ECG_testing.npy', ECG_testing)


np.save('CS_testing.npy', CS_testing)

In [120]:
arrays_I_aVF_V1 = []
array_CS_3_4 = []

# Iterate through each file path in each DataFrame
for i, filepath in enumerate(df_10['File Path']):
    txt_file = TxtFile(filepath)
    
    # Extract the data for channels I, aVF, V1, V6
    channels_I_aVF_V1 = ['I', 'aVF', 'V1', 'V6']
    data_I_aVF_V1 = txt_file.data[channels_I_aVF_V1].to_numpy()
    
    # Check the shape of the data array for consistency
    if len(arrays_I_aVF_V1) > 0 and data_I_aVF_V1.shape != arrays_I_aVF_V1[0].shape:
        raise ValueError("Inconsistent shape for channels I, aVF, V1, V6. Please check the data.")
    
    arrays_I_aVF_V1.append(data_I_aVF_V1)
    
    # Extract the data for channel CS 3-4
    channel_CS_3_4 = 'CS 3-4'
    data_CS_3_4 = txt_file.data[channel_CS_3_4].to_numpy()
    array_CS_3_4.append(data_CS_3_4)
    

# Stack the arrays together using np.stack
array_I_aVF_V1 = np.stack(arrays_I_aVF_V1, axis=0)
array_CS_3_4 = np.stack(array_CS_3_4, axis=0)

# Print the shape of the final arrays
print("Shape of array_I_aVF_V1:", array_I_aVF_V1.shape)
print("Shape of array_CS_3_4:", array_CS_3_4.shape)

Shape of array_I_aVF_V1: (205, 60000, 4)
Shape of array_CS_3_4: (205, 60000)


In [121]:
#making training dataset
ECG_validation = np.reshape(array_I_aVF_V1, (6150, 2000, 4))
CS_validation = np.reshape(array_CS_3_4,(6150,2000))

In [124]:
import numpy as np
np.save('ECG_validation.npy', ECG_validation)

np.save('CS_validation.npy', CS_validation)