# Evaluation of ICC parameters and correctness probabilities.

## Diagnostic


In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

# Read the observed responses and the model-predicted probabilities, using
# the 'SUBJECT' column of each file as the row index.
actual_df = pd.read_csv('DiagnosticResults_8q_36students.csv').set_index('SUBJECT')
predicted_df = pd.read_csv('DiagnosticProbabilities_8q.csv').set_index('SUBJECT')

# Put the observed rows in the same SUBJECT order as the predictions.
# NOTE(review): only rows are aligned here; columns are paired by position,
# so both files are assumed to list the items in the same order -- confirm.
actual_df = actual_df.loc[predicted_df.index]

# Collapse both tables (row by row) into 1-D arrays for the metric functions.
actual_values = actual_df.to_numpy().ravel().astype(int)
predicted_probs = predicted_df.to_numpy().ravel().astype(float)

def compute_log_likelihood(actual, predicted, epsilon=1e-15):
    """Return the total Bernoulli log-likelihood of the observed outcomes.

    Parameters
    ----------
    actual : array-like
        Observed binary outcomes (0 or 1).
    predicted : array-like
        Predicted probabilities of outcome 1, paired element-wise with
        ``actual``.
    epsilon : float, optional
        Probabilities are clipped to ``[epsilon, 1 - epsilon]`` so that
        ``log(0)`` is never evaluated.

    Returns
    -------
    float
        Sum over all observations of
        ``actual * log(p) + (1 - actual) * log(1 - p)``.
    """
    # Coerce to arrays so plain Python lists work as well as ndarrays
    # (``1 - actual`` is not defined for a list).
    actual = np.asarray(actual, dtype=float)
    predicted = np.clip(np.asarray(predicted, dtype=float), epsilon, 1 - epsilon)
    per_observation = actual * np.log(predicted) + (1 - actual) * np.log(1 - predicted)
    return np.sum(per_observation)


def compute_brier_score(actual, predicted):
    """Return the Brier score: the mean squared error of the probabilities.

    Parameters
    ----------
    actual : array-like
        Observed binary outcomes (0 or 1).
    predicted : array-like
        Predicted probabilities, paired element-wise with ``actual``.

    Returns
    -------
    float
        Mean of ``(predicted - actual) ** 2`` over all observations.
    """
    # Coerce to arrays so plain Python lists work as well as ndarrays
    # (list - list is not defined).
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    return np.mean((predicted - actual) ** 2)

def compute_auc(actual, predicted):
    """Return the ROC AUC of ``predicted`` scores against ``actual`` labels.

    Delegates directly to ``sklearn.metrics.roc_auc_score``.
    """
    auc_value = roc_auc_score(actual, predicted)
    return auc_value


# Score the diagnostic predictions against the observed responses.
log_likelihood = compute_log_likelihood(actual_values, predicted_probs)
brier_score = compute_brier_score(actual_values, predicted_probs)
auc = compute_auc(actual_values, predicted_probs)

# Report the three metrics, one per line.
print(
    f'Log-Likelihood: {log_likelihood}',
    f'Brier Score: {brier_score}',
    f'AUC: {auc}',
    sep='\n',
)


Log-Likelihood: -195.28165819912815
Brier Score: 0.2428195590277778
AUC: 0.7066822066822067


### Explanation of the Metrics
Log-Likelihood: The sum of the log-probabilities that the model assigns to the actual outcomes. Higher (less negative) values indicate better model fit.

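Brier Score: The mean squared difference between the predicted probabilities and the actual 0/1 outcomes. It ranges from 0 to 1, where 0 indicates perfect predictions; always predicting 0.5 yields 0.25, so an informative model should score below that.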

AUC: Ranges from 0 to 1, where 1 indicates perfect discrimination between classes and 0.5 corresponds to chance-level ranking. It evaluates the model's ability to rank positive instances higher than negative ones.

### Overall Assessment of the reliability of Abilities and Probabilities:

The metrics collectively suggest that the IRT model's predicted probabilities are reasonably reliable:

- Moderate predictive power: the model captures meaningful patterns but isn't perfect.
- Support for model validity: the acceptable AUC and moderate Brier Score support using these probabilities and abilities in further analysis.


## Preliminary minitest

In [10]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

# Read the observed preliminary responses and the model-predicted
# probabilities, using the 'SUBJECT' column of each file as the row index.
actual_prelim_df = pd.read_csv('PreliminaryResults_8q.csv').set_index('SUBJECT')
predicted_prelim_df = pd.read_csv('PreliminaryProbabilities_8q.csv').set_index('SUBJECT')
# Other probability files referenced by earlier, commented-out loads:
#   'PreliminaryProbabilities_8q_GUESS4.csv'
#   'PreliminaryProbabilities_8q_from14q.csv'

# Put the observed rows in the same SUBJECT order as the predictions.
# NOTE(review): only rows are aligned here; columns are paired by position,
# so both files are assumed to list the items in the same order -- confirm.
actual_prelim_df = actual_prelim_df.loc[predicted_prelim_df.index]

# Collapse both tables (row by row) into 1-D arrays for the metric functions.
actual_prelim_values = actual_prelim_df.to_numpy().ravel().astype(int)
predicted_prelim_probs = predicted_prelim_df.to_numpy().ravel().astype(float)


def compute_log_likelihood(actual, predicted, epsilon=1e-15):
    """Return the total Bernoulli log-likelihood of the observed outcomes.

    Parameters
    ----------
    actual : array-like
        Observed binary outcomes (0 or 1).
    predicted : array-like
        Predicted probabilities of outcome 1, paired element-wise with
        ``actual``.
    epsilon : float, optional
        Probabilities are clipped to ``[epsilon, 1 - epsilon]`` so that
        ``log(0)`` is never evaluated.

    Returns
    -------
    float
        Sum over all observations of
        ``actual * log(p) + (1 - actual) * log(1 - p)``.
    """
    # Coerce to arrays so plain Python lists work as well as ndarrays
    # (``1 - actual`` is not defined for a list).
    actual = np.asarray(actual, dtype=float)
    predicted = np.clip(np.asarray(predicted, dtype=float), epsilon, 1 - epsilon)
    per_observation = actual * np.log(predicted) + (1 - actual) * np.log(1 - predicted)
    return np.sum(per_observation)


def compute_brier_score(actual, predicted):
    """Return the Brier score: the mean squared error of the probabilities.

    Parameters
    ----------
    actual : array-like
        Observed binary outcomes (0 or 1).
    predicted : array-like
        Predicted probabilities, paired element-wise with ``actual``.

    Returns
    -------
    float
        Mean of ``(predicted - actual) ** 2`` over all observations.
    """
    # Coerce to arrays so plain Python lists work as well as ndarrays
    # (list - list is not defined).
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    return np.mean((predicted - actual) ** 2)

def compute_auc(actual, predicted):
    """Return the ROC AUC of ``predicted`` scores against ``actual`` labels.

    Delegates directly to ``sklearn.metrics.roc_auc_score``.
    """
    auc_value = roc_auc_score(actual, predicted)
    return auc_value


# Score the preliminary predictions against the observed responses.
log_likelihood_prelim = compute_log_likelihood(actual_prelim_values, predicted_prelim_probs)
brier_score_prelim = compute_brier_score(actual_prelim_values, predicted_prelim_probs)
auc_prelim = compute_auc(actual_prelim_values, predicted_prelim_probs)

# Report the three metrics, one per line.
print(
    f'Log-Likelihood (Preliminary): {log_likelihood_prelim}',
    f'Brier Score (Preliminary): {brier_score_prelim}',
    f'AUC (Preliminary): {auc_prelim}',
    sep='\n',
)



Log-Likelihood (Preliminary): -229.30410976518783
Brier Score (Preliminary): 0.26157686875
AUC (Preliminary): 0.550570234181582


### Explanation of the Metrics
Log-Likelihood: The sum of the log-probabilities that the model assigns to the actual outcomes. Higher (less negative) values indicate better model fit.

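Brier Score: The mean squared difference between the predicted probabilities and the actual 0/1 outcomes. It ranges from 0 to 1, where 0 indicates perfect predictions; always predicting 0.5 yields 0.25, so an informative model should score below that.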

AUC: Ranges from 0 to 1, where 1 indicates perfect discrimination between classes and 0.5 corresponds to chance-level ranking. It evaluates the model's ability to rank positive instances higher than negative ones.

### Diagnostic Performance and Preliminary Performance are correlated 

We also confirmed that performance on the two minitests is correlated:

Perf [MINI DIAG] and Perf [MINI PREL]	r=0.36	p=.033


