In [10]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go

#all sklearn imports
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

1. Dataset Generation

In [11]:
# --- 1. LOADING THE DATA ---
# Feature names: whitespace-separated file without a header row; it is read
# into two columns named "idx" and "feature".
features_df = pd.read_csv(
    "UCI HAR Dataset//UCI HAR Dataset//features.txt",
    sep="\\s+",
    header=None,
    names=["idx", "feature"],
)

# Suffix EVERY feature name (not only the repeated ones) with its row
# position. Row positions are unique, so the resulting names are unique too
# and can safely be used as DataFrame column names.
row_suffix = features_df.index.astype(str)
features_df["feature"] = features_df["feature"].astype(str) + "_" + row_suffix
feature_names = list(features_df["feature"])

# Activity labels: lookup table from numeric activity ID to activity name.
activity_labels_df = pd.read_csv(
    "UCI HAR Dataset/UCI HAR Dataset/activity_labels.txt",
    sep="\\s+",
    header=None,
    names=["id", "activity"],
)

activity_map = dict(zip(activity_labels_df["id"], activity_labels_df["activity"]))

In [13]:
# Load the train/test feature matrices and label vectors.
# Every file is whitespace-separated and has no header row, so the read
# options are shared; only the path and the column names differ.
read_opts = {"sep": "\\s+", "header": None}

X_train = pd.read_csv(
    "UCI HAR Dataset//UCI HAR Dataset//train//X_train.txt", names=feature_names, **read_opts
)
y_train = pd.read_csv(
    "UCI HAR Dataset//UCI HAR Dataset//train//y_train.txt", names=["Activity"], **read_opts
)
X_test = pd.read_csv(
    "UCI HAR Dataset//UCI HAR Dataset//test//X_test.txt", names=feature_names, **read_opts
)
y_test = pd.read_csv(
    "UCI HAR Dataset//UCI HAR Dataset//test//y_test.txt", names=["Activity"], **read_opts
)

# Replace the numeric activity IDs with their readable names.
for label_frame in (y_train, y_test):
    label_frame["Activity"] = label_frame["Activity"].map(activity_map)

In [14]:
# --- 2. CONVERT MULTI-CLASS TO BINARY ---
def to_binary_label(activity):
    """Collapse an activity name into the binary active/inactive target.

    Parameters
    ----------
    activity : object
        Activity name, e.g. "WALKING" or "SITTING".

    Returns
    -------
    int
        1 (active) for the three walking activities, 0 (inactive) for
        anything else, including values that are not activity names.
    """
    walking_activities = ("WALKING", "WALKING_UPSTAIRS", "WALKING_DOWNSTAIRS")
    return 1 if activity in walking_activities else 0

# Add the binary target (1 = active, 0 = inactive) next to the activity name
# in both label frames.
y_train["Binary"] = y_train["Activity"].map(to_binary_label)
y_test["Binary"] = y_test["Activity"].map(to_binary_label)

In [15]:
# Class-balance check on the training labels.
# Both classes (1 = active, 0 = inactive) have a substantial number of
# examples, so no rebalancing is applied.
y_train["Binary"].value_counts()

Binary
0    4067
1    3285
Name: count, dtype: int64

2. Reduction of the Number of Features

In [16]:
# Scale -> PCA -> SVC chained into one estimator, so the three steps are
# always fitted together on whatever data the pipeline is given.
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    # Reduce from 561 -> 50 features. random_state pins the randomized SVD
    # solver that scikit-learn may select automatically for data of this
    # size, so the projection (and every score built on it) is reproducible.
    ("pca", PCA(n_components=50, random_state=42)),
    ("svc", SVC()),
])

3. Hyperparameter Tuning (GridSearchCV)

In [17]:
# Hyperparameter search space for the "svc" step of the pipeline.
# One sub-grid per kernel (3 models to compare): 4 linear + 12 poly + 9 rbf
# = 25 candidates in total.
linear_grid = {
    "svc__kernel": ["linear"],
    "svc__C": [0.1, 1, 10, 100],
}

poly_grid = {
    "svc__kernel": ["poly"],
    "svc__C": [0.1, 1],
    "svc__degree": [2, 3],
    "svc__gamma": [0.001, 0.01, 0.1],
}

rbf_grid = {
    "svc__kernel": ["rbf"],
    "svc__C": [0.1, 1, 10],
    "svc__gamma": [0.001, 0.01, 0.1],
}

param_grid = [linear_grid, poly_grid, rbf_grid]

# Cross-validated search over every candidate in param_grid, with the full
# scaler -> PCA -> SVC pipeline as the estimator.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="accuracy",  # the target is binary (active vs. inactive)
    cv=3,  # 3-fold to save time (5-fold if feasible)
    n_jobs=-1,  # use all available cores
    verbose=1,
)

# Train on the binary labels only. `.values` already yields the 1-D label
# array the estimator expects, so no `.ravel()` is needed.
grid_search.fit(X_train, y_train["Binary"].values)

# Overall best model. The messages use plain spaces; the previous literals
# contained the visible-space glyph (U+2423), which was printed verbatim.
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

Fitting 3 folds for each of 25 candidates, totalling 75 fits
Best␣parameters: {'svc__C': 0.1, 'svc__degree': 3, 'svc__gamma': 0.01, 'svc__kernel': 'poly'}
Best␣cross-validation␣accuracy: 0.9993200054399565


5. Evaluate and interpret results using confusion matrices and classification metrics

In [18]:
# Compare the three kernels: for each one keep the candidate with the
# highest mean cross-validation accuracy.
#
# No retraining is needed for this: after fitting, the GridSearchCV object
# exposes the per-candidate results through its `cv_results_` attribute.
cv_results = grid_search.cv_results_

# kernel name -> {"params": candidate parameters, "score": mean CV accuracy}
best_models = {}

# "params" and "mean_test_score" are parallel sequences, so zip() pairs each
# candidate with its score.
for candidate, mean_score in zip(cv_results["params"], cv_results["mean_test_score"]):
    kernel = candidate["svc__kernel"]
    incumbent = best_models.get(kernel)

    # First candidate seen for this kernel, or strictly better than the
    # current best (on a tie the earlier candidate is kept).
    if incumbent is None or mean_score > incumbent["score"]:
        best_models[kernel] = {"params": candidate, "score": mean_score}

for kernel, result in best_models.items():
    print(f"Best {kernel} Model results: ")
    print(result['params'])
    print(f"Accuracy: {result['score']}")

# The scores of the three kernels are extremely close to each other, so a
# single confusion matrix (for the overall best model) is enough for the
# analysis that follows.

Best linear Model results: 
{'svc__C': 1, 'svc__kernel': 'linear'}
Accuracy: 0.9990477855768991
Best poly Model results: 
{'svc__C': 0.1, 'svc__degree': 3, 'svc__gamma': 0.01, 'svc__kernel': 'poly'}
Accuracy: 0.9993200054399565
Best rbf Model results: 
{'svc__C': 1, 'svc__gamma': 0.001, 'svc__kernel': 'rbf'}
Accuracy: 0.9993199499301966


In [19]:
# Quick look at the binary test labels (pandas truncates the display).
y_test.loc[:, "Binary"]

0       0
1       0
2       0
3       0
4       0
       ..
2942    1
2943    1
2944    1
2945    1
2946    1
Name: Binary, Length: 2947, dtype: int64

In [20]:
# Evaluate on the held-out test set: take the best model found by the grid
# search, predict the test labels and build the confusion matrix.
best_model = grid_search.best_estimator_

# Keep `y_test` intact. It used to be overwritten here with a single column,
# which made this cell (and any cell reading y_test["Binary"]) fail with a
# KeyError when run a second time.
# Both label vectors are cast to float so that they share one dtype.
y_test_binary = y_test["Binary"].astype(float)
y_pred = best_model.predict(X_test).astype(float)

# Use the union of true and predicted labels and hand it to
# confusion_matrix explicitly, so the matrix shape always matches the
# row/column labels of the DataFrame below.
labels = np.unique(np.concatenate([y_test_binary.to_numpy(), y_pred]))

# Rows are the actual labels, columns the predicted labels.
c_matrix = confusion_matrix(y_test_binary, y_pred, labels=labels)

c_matrix_df = pd.DataFrame(c_matrix, index=labels, columns=labels)

# Recall the binary encoding: 1 is active, 0 is inactive.
c_matrix_df


Unnamed: 0,0.0,1.0
0.0,1558,2
1.0,0,1387


In [21]:
# Heatmap of the confusion matrix: rows are actual labels, columns are
# predicted labels, and each cell is annotated with its count.
fig = go.Figure()

fig.add_trace(go.Heatmap(
    z=c_matrix_df.values,
    x=c_matrix_df.columns,
    y=c_matrix_df.index,
    colorscale='blues',
    hoverinfo="z",
    texttemplate="%{text}",
    text=c_matrix_df.values
))

fig.update_layout(
    # The title also reports the test-set accuracy: correctly classified
    # samples (the diagonal) divided by all samples.
    title=f"Confusion Matrix of Best Model: Accuracy = {np.trace(c_matrix_df)/c_matrix_df.values.sum()}",
    xaxis_title="Predicted Label",
    yaxis_title="Actual Label",
    width=1000,
    height=750
)

# The class labels are numeric, so the axes would otherwise be continuous and
# show meaningless fractional ticks; treat them as categories instead.
fig.update_xaxes(type="category")
# Reverse the y axis so the first class is on the top row, matching the
# layout of the confusion-matrix table.
fig.update_yaxes(type="category", autorange="reversed")

fig.show()