# **Data Mining for Confidence Level Detection**

Lydia Lonzarich and Katie Park
CPSC 322-01, Fall 2025

# Import Libraries

In [53]:
import importlib

import mysklearn.myclassifiers
importlib.reload(mysklearn.myclassifiers)
from mysklearn.myclassifiers import MyRandomForestClassifier, MyDecisionTreeClassifier, MyNaiveBayesClassifier, MyDecisionTreeSolo


import mysklearn.myutils
importlib.reload(mysklearn.myutils)
import mysklearn.myutils as myutils

import mysklearn.myevaluation
importlib.reload(mysklearn.myevaluation)
import mysklearn.myevaluation as myevaluation

import mysklearn.mypytable
importlib.reload(mysklearn.mypytable)
from mysklearn.mypytable import MyPyTable 

# Load the Confidence Level Dataset

In [54]:
# Load the raw (unnormalized) dataset and keep it untouched for reference.
confidence_raw_data = MyPyTable().load_from_file("confidence_features.csv")
# Work on a deep copy so the raw table is not affected by the normalization
# and discretization performed below.
confidence = confidence_raw_data.new_deep_copy()

# Attributes used as features. The first six are treated as continuous and
# the remaining three as categorical.
all_attributes = ["body_lean_x", "shoulder_center_x", "hip_center_x", "spine_angle", "head_tilt_angle", "shoulder_slope", "head_direction", "arm_position", "posture"]
continuous_attributes = all_attributes[:6]
categorical_attributes = all_attributes[6:]

# Normalize only the continuous (float-valued) columns.
confidence.normalize_columns(continuous_attributes)

# Column positions of the selected attributes within the table.
att_indices = [confidence.column_names.index(att) for att in all_attributes]

# Discretize every non-string value in the selected columns so that the
# continuous attributes become categorical. String values are left as they are.
att_index_set = set(att_indices)  # set membership instead of a list scan per cell
for row in confidence.data:
    for val_index, value in enumerate(row):
        if val_index in att_index_set and not isinstance(value, str):
            row[val_index] = myutils.my_discretizer(value)

# Print the value -> count frequencies of each selected column.
for col in confidence.get_columns(all_attributes):
    print(myutils.get_frequency(col))

# confidence.compute_summary_statistics(continuous_attributes).pretty_print()



{5: 2591, 4: 978, 6: 1508, 3: 259, 7: 377, 8: 175, 9: 12, 2: 33, 1: 10, 10: 2, 0: 4}
{6: 1914, 5: 2499, 4: 487, 3: 203, 7: 368, 8: 296, 9: 62, 2: 53, 1: 49, 0: 10, 10: 8}
{6: 1996, 7: 607, 5: 2230, 4: 654, 3: 148, 8: 172, 9: 43, 10: 8, 2: 57, 1: 27, 0: 7}
{5: 2019, 6: 1936, 4: 727, 7: 727, 8: 265, 3: 206, 9: 15, 2: 33, 10: 6, 1: 7, 0: 8}
{5: 4723, 10: 364, 0: 117, 4: 622, 6: 52, 9: 56, 1: 15}
{1: 2032, 0: 991, 2: 1319, 3: 680, 4: 361, 5: 274, 6: 132, 7: 59, 8: 44, 9: 53, 10: 4}
{'Looking Straight': 4323, 'Center': 552, 'Looking Right': 420, 'Looking Left': 654}
{'Partially Open': 2981, 'Closed Arms': 900, 'Open Arms': 2068}
{'Upright': 3521, 'Stiff': 2045, 'Slouched': 383}


# Implement Random Forest for Classification

In [55]:
# Feature matrix: one list per instance holding only the selected attribute
# columns. Label vector: the "confidence_label" column.
X = []
for row in confidence.data:
    X.append([row[idx] for idx in att_indices])
y = confidence.get_column("confidence_label")

# Random forest configured with the best N, M, and F parameters found.
forest_params = {"N": 20, "M": 5, "F": 4}
myForest = MyRandomForestClassifier(**forest_params)

# fit() is handed the full dataset; per the classifier's design it performs its
# own internal train/test split and trains on the internal train portion.
myForest.fit(X, y)

# predict() is called without arguments, so predictions are produced for the
# classifier's internal test set.
y_preds = myForest.predict()

# Accuracy on the internal test set.
acc = myevaluation.accuracy_score(myForest.y_test, y_preds)
print(acc)


0.5519877675840978


In [56]:
# # define X_train and y_train datasets
# X = [[row[idx] for idx in att_indices] for row in confidence.data]
# y = confidence.get_column("confidence_label")



# # compute k fold cross validation with k=10 folds to evaluate model performance with different N, M, and F...
# # attempt 1: N=20, M=5, F=4
# acc, err_rate, precision, recall, f1, y_trues, y_preds = myutils.cross_val_predict(X, y, 10, lambda: MyRandomForestClassifier(N = 20, M = 5, F = 4), True)
# print("Performance metrics for N=20, M=5, F=4...")
# print("accuracy: ", acc, ", error rate: ", err_rate, ", precision: ", precision, ", recall: ", recall, ", f1: ", f1)

# # attempt 2: N=20, M=7, F=2
# acc, err_rate, precision, recall, f1, y_trues, y_preds = myutils.cross_val_predict(X, y, 10, lambda: MyRandomForestClassifier(N = 20, M = 7, F = 2), True)
# print("Performance metrics for N=20, M=7, F=2...")
# print("accuracy: ", acc, ", error rate: ", err_rate, ", precision: ", precision, ", recall: ", recall, ", f1: ", f1)




# # create a random forest classifier instance using the best N, M, and F parameters found.
# myForest = MyRandomForestClassifier(N = 20, M = 5, F = 4)

# # train the random forest classifier on our train data.
# myForest.fit(myForest.X_train, myForest.y_train)

# # generate confidence label predictions using our random forest classifier.
# y_preds = myForest.predict()

# # display accuracy of random forest classifier.
# print(myevaluation.accuracy_score(myForest.y_test, y_preds))



# Implement Decision Tree for Classification

In [57]:
# Feature matrix (selected attribute columns only) and label vector.
X = []
for row in confidence.data:
    X.append([row[idx] for idx in att_indices])
y = confidence.get_column("confidence_label")

# Distinct class labels, passed to the multiclass metric functions below.
labels = list(set(y))

# 10-fold cross validation of the decision tree, to evaluate performance across
# different train/test subsets of the data.
cv_results = myutils.cross_val_predict(X, y, 10, MyDecisionTreeClassifier, True)
acc, err_rate, precision, recall, f1, y_trues, y_preds = cv_results

print("K-FOLD CROSS VALIDATION (k=10) RESULTS...")
for caption, score in (
    ("(Avg) Accuracy: ", acc),
    ("(Avg) Error Rate: ", err_rate),
    ("(Avg) Precision: ", precision),
    ("(Avg) Recall: ", recall),
    ("(Avg) F1 score: ", f1),
):
    print(caption, score)

# Hold-out evaluation. This reuses the train/test split stored on `myForest`,
# so the random forest cell must have been run first. Sharing the split keeps
# the classifier comparison fair.
myTree = MyDecisionTreeClassifier()
myTree.fit(myForest.X_train, myForest.y_train)
y_pred = myTree.predict(myForest.X_test)

# Hold-out performance metrics for the decision tree.
print("")
print("==========================================")
print("DECISION TREE CLASSIFIER RESULTS...")
acc = myevaluation.accuracy_score(myForest.y_test, y_pred)
precision = myevaluation.multiclass_precision_score(myForest.y_test, y_pred, labels=labels)
recall = myevaluation.multiclass_recall_score(myForest.y_test, y_pred, labels=labels)
f1 = myevaluation.multiclass_f1_score(myForest.y_test, y_pred, labels=labels)
for caption, score in (
    ("Accuracy: ", acc),
    ("Precision: ", precision),
    ("Recall: ", recall),
    ("F1-Score: ", f1),
):
    print(caption, score)


K-FOLD CROSS VALIDATION (k=10) RESULTS...
(Avg) Accuracy:  0.7334087140057289
(Avg) Error Rate:  0.26659128599427107
(Avg) Precision:  0.7912925659509351
(Avg) Recall:  0.7334087140057289
(Avg) F1 score:  0.6856578735156569

DECISION TREE CLASSIFIER RESULTS...
Accuracy:  0.5530071355759429
Precision:  0.37763793297131554
Recall:  0.5530071355759429
F1-Score:  0.4418278072120188


# Implement Naive Bayes for Classification

In [58]:
# Feature matrix (selected attribute columns only) and label vector.
X = []
for row in confidence.data:
    X.append([row[idx] for idx in att_indices])
y = confidence.get_column("confidence_label")

# Distinct class labels, passed to the multiclass metric functions below.
labels = list(set(y))

# 10-fold cross validation of naive Bayes: average accuracy, error rate,
# precision, recall, and F1 over the train/test splits of the data.
cv_results = myutils.cross_val_predict(X, y, 10, MyNaiveBayesClassifier, True)
acc, err_rate, precision, recall, f1, y_trues, y_preds = cv_results

print("K-FOLD CROSS VALIDATION (k=10) RESULTS...")
for caption, score in (
    ("(Avg) Accuracy: ", acc),
    ("(Avg) Error Rate: ", err_rate),
    ("(Avg) Precision: ", precision),
    ("(Avg) Recall: ", recall),
    ("(Avg) F1 score: ", f1),
):
    print(caption, score)

# Hold-out evaluation. This reuses the train/test split stored on `myForest`,
# so the random forest cell must have been run first. Sharing the split keeps
# the classifier comparison fair.
my_nb = MyNaiveBayesClassifier()
my_nb.fit(myForest.X_train, myForest.y_train)
y_pred = my_nb.predict(myForest.X_test)

# Hold-out performance metrics for naive Bayes.
print("")
print("==========================================")
print("NAIVE BAYES CLASSIFIER RESULTS...")
acc = myevaluation.accuracy_score(myForest.y_test, y_pred)
precision = myevaluation.multiclass_precision_score(myForest.y_test, y_pred, labels=labels)
recall = myevaluation.multiclass_recall_score(myForest.y_test, y_pred, labels=labels)
f1 = myevaluation.multiclass_f1_score(myForest.y_test, y_pred, labels=labels)
for caption, score in (
    ("Accuracy: ", acc),
    ("Precision: ", precision),
    ("Recall: ", recall),
    ("F1-Score: ", f1),
):
    print(caption, score)



K-FOLD CROSS VALIDATION (k=10) RESULTS...
(Avg) Accuracy:  0.7426554098195889
(Avg) Error Rate:  0.2573445901804111
(Avg) Precision:  0.7389945247869342
(Avg) Recall:  0.7426554098195889
(Avg) F1 score:  0.7331011558614058

NAIVE BAYES CLASSIFIER RESULTS...
Accuracy:  0.5688073394495413
Precision:  0.40590627281503944
Recall:  0.5688073394495413
F1-Score:  0.4663228934302336


# **Introduction**

### **Dataset Used**

### **Findings**

In [3]:
# Interview dataset: a small sanity check for MyDecisionTreeClassifier.
# Each entry pairs one training instance (four categorical attribute values)
# with its class label.
interview_examples = [
    (["Senior", "Java", "no", "no"], "False"),
    (["Senior", "Java", "no", "yes"], "False"),
    (["Mid", "Python", "no", "no"], "True"),
    (["Junior", "Python", "no", "no"], "True"),
    (["Junior", "R", "yes", "no"], "True"),
    (["Junior", "R", "yes", "yes"], "False"),
    (["Mid", "R", "yes", "yes"], "True"),
    (["Senior", "Python", "no", "no"], "False"),
    (["Senior", "R", "yes", "no"], "True"),
    (["Junior", "Python", "yes", "no"], "True"),
    (["Senior", "Python", "yes", "yes"], "True"),
    (["Mid", "Python", "no", "yes"], "True"),
    (["Mid", "Java", "yes", "no"], "True"),
    (["Junior", "Python", "no", "yes"], "False"),
]

# Split the paired examples into the parallel feature / label lists that fit() takes.
X_train_interview = [features for features, _ in interview_examples]
y_train_interview = [label for _, label in interview_examples]

# Two unseen instances to classify.
X_test_interview = [
    ["Junior", "Java", "yes", "no"],
    ["Junior", "Java", "yes", "yes"],
]

# Train a tree on the interview data, then show its predictions and the
# learned tree structure.
test = MyDecisionTreeClassifier()
test.fit(X_train_interview, y_train_interview)

print(test.predict(X_test_interview))
print(test.tree)



current N: 20
['Attribute', 'att2', ['Value', 'no', ['Leaf', 'False', 5, 9]], ['Value', 'yes', ['Leaf', 'True', 4, 9]]]
current N: 20
['Attribute', 'att2', ['Value', 'no', ['Leaf', 'False', 5, 9]], ['Value', 'yes', ['Leaf', 'True', 4, 9]]]
current N: 20
['Attribute', 'att1', ['Value', 'Java', ['Leaf', 'False', 3, 9]], ['Value', 'Python', ['Leaf', 'True', 3, 9]], ['Value', 'R', ['Attribute', 'att0', ['Value', 'Junior', ['Leaf', 'False', 1, 3]], ['Value', 'Senior', ['Leaf', 'True', 2, 3]]]]]
current N: 20
['Attribute', 'att2', ['Value', 'no', ['Leaf', 'False', 5, 9]], ['Value', 'yes', ['Attribute', 'att0', ['Value', 'Junior', ['Leaf', 'False', 3, 4]], ['Value', 'Senior', ['Leaf', 'True', 1, 4]]]]]
current N: 20
['Attribute', 'att1', ['Value', 'Java', ['Leaf', 'False', 1, 9]], ['Value', 'Python', ['Leaf', 'True', 1, 9]], ['Value', 'R', ['Attribute', 'att0', ['Value', 'Junior', ['Leaf', 'False', 1, 7]], ['Value', 'Mid', ['Leaf', 'True', 4, 7]], ['Value', 'Senior', ['Leaf', 'True', 2, 7]]]]