In [4]:
import csv

# Function to parse the time from the timestamp string
def parse_time(timestamp):
    return timestamp.split()[0]

# Function to parse the heart rate from the record
def parse_heart_rate(record):
    heart_rate_index = record.find('Heart Rate')
    if heart_rate_index != -1:
        return int(record[heart_rate_index + len('Heart Rate: '):].split()[0])
    else:
        return None

# Function to parse the conductivity value from the record
def parse_conductivity(record):
    return int(record.split()[-1])

# Function to generate data points with average heart rate and conductivity readings for each second
def generate_data_points(data):
    data_points = []
    current_second = None
    last_heart_rate = None
    conductivities = []

    for timestamp, value in data:
        second = parse_time(timestamp)
        if interval_1_start <= second <= interval_2_end:  # Check if the timestamp is within the specified intervals
            if second != current_second:
                if current_second is not None:
                    avg_heart_rate = last_heart_rate if last_heart_rate is not None else 0
                    avg_conductivity = round(sum(conductivities) / len(conductivities), 1)
                    data_points.append((timestamp, avg_heart_rate, avg_conductivity, label_for_interval(current_second)))
                current_second = second
                conductivities = []

            heart_rate = parse_heart_rate(value)
            if heart_rate is not None:
                last_heart_rate = heart_rate
            else:
                conductivity = parse_conductivity(value)
                conductivities.append(conductivity)

    # Process the last second of data
    if current_second is not None:
        avg_heart_rate = last_heart_rate if last_heart_rate is not None else 0
        avg_conductivity = round(sum(conductivities) / len(conductivities), 1)
        data_points.append((timestamp, avg_heart_rate, avg_conductivity, label_for_interval(current_second)))

    return data_points

# Function to determine the label based on the timestamp
def label_for_interval(timestamp):
    interval_1_start = "16:27:50"
    interval_1_end = "16:37:50"
    interval_2_start = "16:39:15"
    interval_2_end = "16:49:15"

    if interval_1_start <= timestamp <= interval_1_end:
        return 1
    elif interval_2_start <= timestamp <= interval_2_end:
        return 0
    else:
        return None

# Read the CSV file
data = []
with open('arduinoReadingspani.csv', newline='') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        timestamp = parse_time(row[0])
        value = row[0]
        data.append((timestamp, value))

# Define the intervals
interval_1_start = "16:27:50"
interval_1_end = "16:37:50"
interval_2_start = "16:39:15"
interval_2_end = "16:49:15"

# Generate data points with average heart rate and conductivity readings for each second
data_points = generate_data_points(data)

# Print the generated data points with labels


In [5]:
# Create a new dataset by concatenating values within each 5-second interval
new_dataset = []
current_interval = None
heart_rates = []
conductivities = []

# get meand and sd from data to normalise it
hr_mean = 67.6 
hr_std = 7.0 
cond_mean = 300.7 
cond_std = 25.3

for timestamp, *data in data_points:
    interval = int(timestamp[-2:]) // 5  # Determine the interval based on the seconds (0-11)
    if current_interval is None:
        current_interval = interval
    elif interval != current_interval:
        concatenated_data = heart_rates + conductivities + [label]
        new_dataset.append((timestamp, concatenated_data))
        heart_rates = []
        conductivities = []
        current_interval = interval

    heart_rates.extend(data[:len(data) // 2])  # Extract heart rates
    conductivities.extend(data[len(data) // 2:-1])  # Extract conductivities
    label = data[-1]

# Add the last interval data
if heart_rates:
    concatenated_data = heart_rates + conductivities + [label]
    new_dataset.append((timestamp, concatenated_data))

# Compute mean and standard deviation for heart rate and conductivity data
heart_rate_values = [data_point[1] for _, data_point in new_dataset]
conductivity_values = [data_point[2] for _, data_point in new_dataset]

hr_mean = round(sum(heart_rate_values) / len(heart_rate_values), 1)
hr_std = round((sum((x - hr_mean) ** 2 for x in heart_rate_values) / len(heart_rate_values)) ** 0.5, 1)

con_mean = round(sum(conductivity_values) / len(conductivity_values), 1)
con_std = round((sum((x - con_mean) ** 2 for x in conductivity_values) / len(conductivity_values)) ** 0.5, 1)

# Normalize heart rate and conductivity data
normalized_data = []
for timestamp, data_point in new_dataset:
    normalized_hr = [round((x - hr_mean) / hr_std, 1) for x in data_point[:len(data_point) // 2]]  # Normalize heart rate data
    normalized_con = [round((x - con_mean) / con_std, 1) for x in data_point[len(data_point) // 2:-1]]  # Normalize conductivity data
    label = data_point[-1]
    normalized_data.append((timestamp, normalized_hr + normalized_con + [label]))

normalized_data = normalized_data[1:len(normalized_data) -2]
# Print the normalized data

for data in normalized_data:
    if data[1][-1] == None:
        print(data)


('16:37:55', [-0.1, 0.0, 0.0, 0.0, 0.0, 28.9, 28.9, 28.9, 28.9, 28.9, None])
('16:38:00', [0.0, -0.1, -0.1, 0.0, 0.1, 28.9, 28.9, 28.9, 28.9, 28.9, None])
('16:38:05', [0.1, 0.1, 0.1, 0.1, 0.1, 28.9, 28.9, 28.9, 28.9, 29.0, None])
('16:38:10', [0.1, 0.1, 0.1, 0.1, 0.1, 29.0, 29.0, 29.0, 29.0, 29.0, None])
('16:38:15', [0.1, 0.1, 0.2, 0.2, 0.2, 29.0, 29.0, 29.0, 29.1, 29.1, None])
('16:38:20', [0.1, 0.1, 0.1, 0.0, 0.0, 29.1, 29.1, 29.1, 29.1, 29.1, None])
('16:38:25', [0.1, 0.1, 0.1, 0.1, 0.1, 29.1, 29.1, 29.1, 29.1, 29.1, None])
('16:38:30', [0.2, 0.2, 0.2, 0.2, 0.1, 29.1, 29.1, 29.0, 29.1, 29.1, None])
('16:38:35', [0.1, 0.2, 0.3, 0.4, 0.4, 29.1, 29.0, 28.7, 28.5, 28.5, None])
('16:38:40', [0.5, 0.5, 0.5, 0.5, 0.6, 28.4, 28.3, 28.3, 28.1, 28.0, None])
('16:38:45', [0.5, 0.5, 0.5, 0.5, 0.5, 28.1, 28.1, 28.1, 28.2, 28.1, None])
('16:38:50', [0.4, 0.4, 0.4, 0.2, 0.2, 27.9, 27.5, 27.2, 26.8, 26.2, None])
('16:38:55', [0.1, 0.1, 0.1, 0.1, 0.1, 26.2, 26.6, 26.5, 26.1, 25.4, None])
('16:39:0

In [6]:
norm_data = []
for i in range(len(normalized_data)):
    norm_data.append(normalized_data[i][1])
sud_norm_data = norm_data[:119]
rest_norm_data = norm_data[136:]

data = sud_norm_data + rest_norm_data

labels = [d[-1] for d in data]
print(len(labels), len(data))

239 239


In [7]:
import random
import sklearn
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

strat_train_set = []
strat_test_set = []

for train_index, test_index in split.split(data, labels):
    strat_train_set = [data[index] for index in train_index]
    strat_test_set = [data[index] for index in test_index]

In [8]:
train_labels = [d[-1] for d in strat_train_set]
test_labels = [d[-1] for d in strat_test_set]
for d in strat_test_set:
    d.pop()
for d in strat_train_set:
    d.pop()
train_set = [d for d in strat_train_set]
test_set = [d for d in strat_test_set]
print(train_set)

[[-0.1, -0.1, -0.1, -0.1, -0.1, 21.8, 21.9, 22.0, 22.1, 22.1], [0.1, 0.1, 0.2, 0.4, 0.4, 27.1, 26.8, 26.3, 26.0, 25.8], [0.1, 0.1, 0.1, 0.1, 0.1, 27.9, 27.9, 28.0, 28.0, 28.2], [0.2, 0.1, 0.1, 0.0, -0.1, 23.1, 23.2, 23.3, 23.3, 23.4], [-0.1, -0.1, -0.1, -0.1, -0.1, 27.3, 27.3, 27.3, 27.1, 27.1], [-0.5, -0.5, -0.4, -0.4, -0.4, 22.7, 22.6, 23.5, 22.5, 22.8], [0.6, 0.6, 0.5, 0.5, 0.5, 27.7, 27.7, 27.7, 27.7, 27.9], [-0.6, -0.6, -0.2, -0.2, -0.2, 28.2, 28.2, 28.2, 28.2, 28.3], [0.0, 0.0, 0.0, 0.1, 0.1, 28.2, 28.3, 28.3, 28.3, 28.4], [0.1, 0.1, 0.2, 0.2, 0.1, 28.3, 28.3, 28.4, 28.3, 28.5], [0.5, 0.4, 0.4, 0.3, 0.3, 28.5, 28.6, 28.6, 28.7, 28.6], [0.1, 0.1, 0.1, 0.1, 0.1, 29.0, 28.8, 28.5, 28.2, 28.0], [-1.3, -1.3, -1.6, -1.6, -1.6, 24.5, 24.5, 24.4, 24.0, 23.9], [-0.2, -0.3, -0.3, 0.2, 0.2, 22.8, 22.9, 23.0, 23.2, 23.3], [-0.6, -0.6, -0.6, -0.6, -0.6, 24.0, 24.0, 24.1, 24.1, 24.1], [-0.2, -0.2, -0.2, -0.2, -0.2, 24.2, 24.3, 24.5, 24.7, 24.9], [0.0, -0.1, -0.1, -0.1, -0.2, 25.8, 35.5, 34.3, 

LogReg

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Instantiate Logistic Regression model
log_reg = LogisticRegression()

# Fit the model to the training data
log_reg.fit(train_set, train_labels)

# Predict on the test data
predictions = log_reg.predict(test_set)

# Evaluate the accuracy of the model
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)


Accuracy: 0.9375


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


decision tree

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Instantiate Decision Tree classifier
tree_clf = DecisionTreeClassifier(random_state=42)

# Fit the model to the training data
tree_clf.fit(train_set, train_labels)

# Predict on the test data
predictions = tree_clf.predict(test_set)

# Evaluate the accuracy of the model
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)


Accuracy: 0.9583333333333334


Now try without heart-rate

In [19]:
conductivity_only_test_set = [d[5:] for d in test_set]
conductivity_only_train_set = [d[5:] for d in train_set]

In [21]:
# Instantiate Decision Tree classifier
tree_clf = DecisionTreeClassifier(random_state=42)

# Fit the model to the training data
tree_clf.fit(conductivity_only_train_set, train_labels)

# Predict on the test data
predictions = tree_clf.predict(conductivity_only_test_set)

# Evaluate the accuracy of the model
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.9375


In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Instantiate Logistic Regression model
log_reg = LogisticRegression()

# Fit the model to the training data
log_reg.fit(conductivity_only_train_set, train_labels)

# Predict on the test data
predictions = log_reg.predict(conductivity_only_test_set)

# Evaluate the accuracy of the model
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)


Accuracy: 0.9375


Now try only using heart rate

In [23]:
hr_only_test_set = [d[:5] for d in test_set]
hr_only_train_set = [d[:5] for d in train_set]

In [24]:
# Instantiate Decision Tree classifier
tree_clf = DecisionTreeClassifier(random_state=42)

# Fit the model to the training data
tree_clf.fit(hr_only_train_set, train_labels)

# Predict on the test data
predictions = tree_clf.predict(hr_only_test_set)

# Evaluate the accuracy of the model
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)

Accuracy: 0.5416666666666666


In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Instantiate Logistic Regression model
log_reg = LogisticRegression()

# Fit the model to the training data
log_reg.fit(hr_only_train_set, train_labels)

# Predict on the test data
predictions = log_reg.predict(hr_only_test_set)

# Evaluate the accuracy of the model
accuracy = accuracy_score(test_labels, predictions)
print("Accuracy:", accuracy)


Accuracy: 0.6041666666666666
