# ICE 7: Diagnostic Metrics
#### Course: HUDK 4050, Week 10
#### Author: Madeline Maeloa

### For this ICE, I will be using a confusion matrix to evaluate the model I built in ACA2. 

## Import and Clean

In [1]:
# Importing necessary modules
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

In [2]:
# Read the ACA2 training observations from the CSV file in the working directory
# and show the resulting table as the cell output
trainingData = pd.read_csv(filepath_or_buffer="aca2_dataset_training.csv")
trainingData

Unnamed: 0,UNIQUEID,SCHOOL,Class,GRADE,CODER,STUDENTID,Gender,OBSNUM,totalobs-forsession,Activity,ONTASK,TRANSITIONS,NumACTIVITIES,FORMATchanges,NumFORMATS,Obsv/act,Transitions/Durations,Total Time
0,34880,A,T8T,2,Z,965160,1,12,224,Wholecarpet,Y,1,2,1,2,1390.500000,0.000664,1027
1,32344,B,T9U,3,Z,11665,1,11,225,Individual,Y,2,3,2,3,890.333333,0.002545,213
2,14774,B,T9Q,0,Z,187441,0,21,374,Smallgroup,Y,3,4,1,2,770.500000,0.001439,2078
3,19468,A,T8S,1,Z,402837,0,17,320,Wholedesks,Y,5,6,0,1,592.333333,0.001407,3229
4,31570,B,T9T,2,Z,300647,1,2,35,Wholedesks,Y,2,3,1,2,870.000000,0.005882,260
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22179,30762,B,T9Q,0,Z,749212,1,5,95,Wholecarpet,Y,2,3,2,3,969.000000,0.002959,652
22180,14611,B,T9Q,0,Z,87701,0,12,211,Smallgroup,N,3,4,1,2,770.500000,0.001439,708
22181,20432,A,T8VQ,4,Z,130859,0,6,95,Smallgroup,N,1,2,0,1,1060.000000,0.000472,362
22182,38264,E,T6Q,0,Y,874565,1,10,147,Wholedesks,Y,1,2,1,2,1204.500000,0.000838,60


In [3]:
# Keep only the columns the classifier uses: one predictor (NumACTIVITIES)
# and the label to be predicted (ONTASK), with all rows retained
classifier_columns = ['NumACTIVITIES','ONTASK']
classData = trainingData.loc[:, classifier_columns]
classData

Unnamed: 0,NumACTIVITIES,ONTASK
0,2,Y
1,3,Y
2,4,Y
3,6,Y
4,3,Y
...,...,...
22179,3,Y
22180,4,N
22181,2,N
22182,2,Y


## Naive Bayes Model from ACA2

In [None]:
# Here we are creating the same Naive Bayes Model that was done in ACA2

In [4]:
# Bring in the Gaussian naive Bayes classifier, then separate classData into
# the predictors (every column except ONTASK) and the label column (ONTASK)
from sklearn.naive_bayes import GaussianNB

ONTASK = classData.loc[:, 'ONTASK']
Xs_NB = classData.drop(columns = 'ONTASK')

In [5]:
# Create the naive Bayes classifier and fit it on every row of the data
# (fit() returns the fitted estimator, so it can be assigned directly);
# the bare name on the last line shows the estimator as the cell output
classNBModel = GaussianNB().fit(Xs_NB, ONTASK)
classNBModel

GaussianNB()

In [6]:
# Use predict() on the same Xs the model was trained on to get a predicted Y for every row
# Compare each prediction with the actual ONTASK value in the same row (element-wise).
# A membership test such as `item in ONTASK_pred` only asks whether the label appears
# anywhere in the predictions, which is not a row-by-row comparison.
# Results are True (1) or False (0)
# Take summation of them and divide by length of the list to get accuracy
# Note: this is in-sample accuracy, because the model is scored on the rows it was fit on
ONTASK_pred = classNBModel.predict(Xs_NB)
performance = (ONTASK_pred == ONTASK.to_numpy()).tolist()
print('The accuracy is', sum(performance)/len(performance)*100, '%')

The accuracy is 67.3368193292463 %


## Model Evaluation

In [15]:
# We will be using a confusion matrix to check model performance
# True positive = you predicted positive and the actual value is positive
# True negative = you predicted negative and the actual value is negative
# False positive (type 1 error) = you predicted positive but the actual value is negative
# False negative (type 2 error) = you predicted negative but the actual value is positive
# Accuracy = (TP+TN)/total

#### Step 1: split data into 2 sets (80:20)

In [9]:
from sklearn.model_selection import train_test_split

# Predictors (every column except the label) and the ONTASK label to predict
Xs_TNB = classData.drop('ONTASK', axis = 1)
ONTASK_TNB = classData['ONTASK']

# Hold out 20% of the rows for testing. random_state fixes the shuffle so the
# split -- and therefore the confusion matrix and accuracy computed from it --
# is the same on every Restart & Run All instead of changing with each execution.
xs_TNB_training, xs_TNB_test, y_TNB_training, y_TNB_test = train_test_split(
    Xs_TNB, ONTASK_TNB, test_size = 0.2, random_state = 42
)

#### Step 2: retrain model with just training data

In [10]:
# Start from a fresh classifier and fit it on the training split only, so the
# held-out test rows play no part in fitting. This replaces the earlier
# classNBModel that was fit on the full data.
classNBModel = GaussianNB().fit(xs_TNB_training, y_TNB_training)
classNBModel

GaussianNB()

#### Step 3: feed the Xs in the testing dataset and obtain the predicted Ys

In [16]:
# Predict an ONTASK label for every row of the held-out test predictors
ONTASK_pred_NB = classNBModel.predict(X = xs_TNB_test)

#### Step 4: compare the predicted Ys with what is actually in the testing dataset (the ground truth) and obtain the confusion matrix

In [13]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate the actual test labels against the predicted labels.
# In scikit-learn's layout, rows are the actual classes and columns are the
# predicted classes; with no `labels` argument the classes are in sorted order.
cm_NB = confusion_matrix(y_true = y_TNB_test, y_pred = ONTASK_pred_NB)
print(cm_NB)

[[   0 1460]
 [   0 2977]]


#### Step 5: obtain accuracy score

In [14]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label equals the actual label,
# converted to a percentage for the printed report
ac_NB = accuracy_score(y_true = y_TNB_test, y_pred = ONTASK_pred_NB)
accuracy_pct_NB = ac_NB*100
print("The accuracy for the naive bayes model", accuracy_pct_NB, "%")

The accuracy for the naive bayes model 67.09488393058372 %


In [None]:
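# The accuracy for the naive bayes model is about 67%
# This is higher than randomly guessing between the two classes (around 50%), but that comparison is misleading
# In the confusion matrix shown above, the first column is all zeros: every test observation was predicted as the same class
# So the model behaves like a blind guess that always picks the most common ONTASK value,
# and its accuracy (about 67%) is simply the share of that class in the test set, not evidence that NumACTIVITIES predicts on task behavior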