In [None]:
# import libraries
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from tqdm import tqdm
import random
import openai

from sklearn.metrics import classification_report, roc_curve, roc_auc_score, confusion_matrix, accuracy_score, f1_score, cohen_kappa_score
from sklearn.model_selection import GroupKFold, train_test_split, cross_val_score, StratifiedKFold
from sklearn import tree, metrics
from sklearn.utils import resample
import xgboost

from sklearn.metrics import roc_auc_score, precision_score, recall_score
from sklearn.utils import resample

from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.optimizers import Adam, SGD, RMSprop, Adadelta, Adagrad, Adamax, Nadam, Ftrl
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Seed every random number generator the notebook draws from, not only
# Python's `random`: sklearn's `resample` falls back to NumPy's global RNG
# when no random_state is given, and Keras weight initialisation / shuffling
# use TensorFlow's RNG.
SEED = 20
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

## Label Frequency

The code below reads in the relevant ground-truth SRL label files including associated student utterances. The English data set is available via DataShop:

https://pslcdatashop.web.cmu.edu/DatasetInfo?datasetId=5371 <br>
https://pslcdatashop.web.cmu.edu/DatasetInfo?datasetId=5820

The German data set is not available due to data protection terms in the initial data collection, in line with relevant regulations.

In [None]:
# Stack the two English label files and the German label file into one frame.
df_combined = pd.concat([
    pd.read_csv('think_aloud_english_chem.csv'),
    pd.read_csv('think_aloud_english_logic.csv'),
    pd.read_csv('German-Labels.csv')
])

In [None]:
# Rows whose `row` value is missing are tagged 'German'; all other rows 'English'.
df_combined['sample'] = df_combined.row.map(lambda x: 'German' if pd.isna(x) else 'English')

In [None]:
# Whitespace-delimited word count per utterance.
# NOTE(review): `s.split()` assumes every utterance is a string — confirm there are no missing values.
df_combined['n_words'] = df_combined.utterance_combined.map(lambda s: len(s.split()))

In [None]:
# Mean utterance length (in words) per platform, rounded to two decimals.
round(df_combined.groupby(['platform'])['n_words'].mean(), 2)

In [None]:
# Mean utterance length (in words) per sample (English / German), rounded to two decimals.
round(df_combined.groupby(['sample'])['n_words'].mean(), 2)

In [None]:
# Mean utterance length (in words) per sample and platform, rounded to two decimals.
round(df_combined.groupby(['sample', 'platform'])['n_words'].mean(), 2)

In [None]:
def print_summary(f='think_aloud_english_logic.csv', filter_platform=None):
    """Print the utterance count and the share of each label in a label file.

    Parameters:
    f (str): Path of the CSV file holding the label columns
        'process', 'plan', 'act' and 'wrong'.
    filter_platform (str or None): If given, only rows whose 'platform'
        column equals this value are summarised.

    String labels are binarised ('Yes' -> 1, any other string -> 0);
    non-string values are left as they are. When no rows remain after
    filtering, only the (zero) utterance count is printed instead of
    raising a ZeroDivisionError.
    """
    def get_perc(decimal):
        return f'{round(decimal*100, 2)}%'

    dd = pd.read_csv(f)
    if filter_platform is not None:
        dd = dd[dd['platform']==filter_platform].copy()
    for var in ['process', 'plan', 'act', 'wrong']:
        dd[var] = dd[var].map(lambda x: x if not isinstance(x, str) else 1 if x=='Yes' else 0)

    n = dd.shape[0]
    print(f'N Utterances: {n}')
    if n == 0:
        # Nothing to report; the percentages below would divide by zero.
        return

    # (printed title, label column) in the order of the report
    report_rows = [
        ('Process', 'process'),
        ('Plan', 'plan'),
        ('Enact', 'act'),
        ('Realizing errors', 'wrong'),
    ]
    for title, column in report_rows:
        print(f'% {title}: {get_perc(sum(dd[column].values==1)/n)}')

In [None]:
# Label frequencies: English chemistry file, Stoich platform only.
print_summary('think_aloud_english_chem.csv', filter_platform='Stoich')

In [None]:
# Label frequencies: English chemistry file, ORCCA platform only.
print_summary('think_aloud_english_chem.csv', filter_platform='ORCCA')

In [None]:
# Label frequencies: English logic file, all rows.
print_summary('think_aloud_english_logic.csv')

In [None]:
# Label frequencies: German file, Stoich platform only.
print_summary('German-Labels.csv', filter_platform='Stoich')

In [None]:
# Label frequencies: German file, ORCCA platform only.
print_summary('German-Labels.csv', filter_platform='ORCCA')

### OpenAI Embedding Model

The following code defines helper functions that compute text embeddings of student utterances, which are later used as model input features.

In this example, we create and store embeddings in the following manner to avoid recomputing embeddings each time:


```
text = df['utterance_combined']

X = []
for r in tqdm(text):
    emb = get_embedding(r)
    review_emb = tf.reshape(emb, [-1]).numpy()
    X.append(review_emb)
X = np.array(X)
np.save('file.npy', X)
```

In [None]:
def read_key(path='token_conrad.txt'):
    """Return the API key stored in the text file at `path`.

    Leading and trailing whitespace (including the final newline) is removed.
    """
    with open(path, 'r') as key_file:
        return key_file.read().strip()

In [None]:
# Authenticate the openai client with the key stored on disk.
openai.api_key = read_key(path='token_conrad.txt')


def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector of `text` as produced by `model`.

    Newlines in `text` are replaced by spaces before the request is sent.
    NOTE(review): `openai.Embedding.create` looks like the pre-1.0 openai
    client interface — confirm against the installed package version.
    """
    single_line = text.replace("\n", " ")
    response = openai.Embedding.create(input=[single_line], model=model)
    return response['data'][0]['embedding']

In [None]:
# Label file to work with; the commented-out lines select alternative files.
df = pd.read_csv('think_aloud_english_chem.csv')
#df = pd.read_csv("German-Labels.csv")
#df = pd.read_csv("think_aloud_all_platforms_2023Nov27.csv")

### Machine Learning SRL Prediction Model

Below, we run the model training and evaluation procedure described in our paper for each data subset separately (e.g., all English data or all Chemistry data).

In [None]:
def prep_data(f="think_aloud_all_platforms_2023Nov27.csv", student_col='anon_student_id'):
    """Load a label file and prepare student-grouped 5-fold cross-validation.

    Parameters:
    f (str): Path of the label CSV.
    student_col (str): Name of the column identifying the student.

    Returns:
    tuple: (df, gkf, groups) where `df` is the loaded frame with the label
        columns 'process', 'plan', 'act', 'wrong' binarised ('Yes' -> 1, any
        other string -> 0, non-strings unchanged), `gkf` is a
        GroupKFold(n_splits=5) splitter, and `groups` is a float array with
        one entry per row: the index of the first row belonging to that
        row's student.
    """
    df = pd.read_csv(f) # this csv contains all valid input, in attempt level
    for var in ['process', 'plan', 'act', 'wrong']:
        df[var] = df[var].map(lambda x: x if not isinstance(x, str) else 1 if x=='Yes' else 0)

    # Group id = index of the student's first row. Collect ids in a list and
    # convert once, instead of growing an array with np.append row by row.
    first_row_of_student = dict()
    group_ids = [
        first_row_of_student.setdefault(s_id, index)
        for index, s_id in zip(df.index, df[student_col])
    ]
    groups = np.array(group_ids, dtype=float)

    # Set up the splitter with 5 splits
    gkf = GroupKFold(n_splits = 5)
    return df, gkf, groups

In [None]:
def _build_classifier(n_features):
    """Build and compile the feed-forward binary classifier (two hidden layers of 28 ReLU units)."""
    model = Sequential()
    model.add(Dense(28, input_shape=(n_features,), activation='relu'))
    model.add(Dense(28, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=0.01)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['acc'])
    return model


def train_model(df, gkf, groups, X, label='wrong', modelfile='english-te3-wrong.h5'):
    """Cross-validate a classifier for one label, then fit and save a final model.

    Parameters:
    df (DataFrame): Label frame; `df[label]` is the binary target and the
        first column is carried into the prediction table.
    gkf: Splitter exposing `split(X, y, groups=...)`.
    groups (array-like): Group id per row, passed to the splitter.
    X (ndarray): Feature matrix with one row per row of `df`.
    label (str): Name of the target column.
    modelfile (str): Path the final model is saved to.

    Returns:
    tuple: (pred, roc_auc_scores_all). `pred` stacks, over all test folds,
        the first column of `df`, the true label and the predicted
        probability (columns 0, 1, 2). `roc_auc_scores_all` is an array with
        one ROC AUC per fold.

    The final model is a freshly initialised network fitted on all of
    (X, y), so it does not inherit the weights of the last CV fold.
    """
    y = df[label]
    n_features = X.shape[1]  # derived from the data instead of being hard-coded

    num_epochs = 30
    batch_size = 10
    fit_kwargs = dict(
        epochs=num_epochs,
        validation_split=0.1,
        shuffle=True,
        batch_size=batch_size,
        verbose=0)

    # per-fold storage, combined once after the loop
    fold_aucs = []
    fold_predictions = []

    for train_index, test_index in gkf.split(X, y, groups=groups):
        # Get the training and test data from the dataset for this group
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model = _build_classifier(n_features)
        model.fit(X_train, y_train, **fit_kwargs)

        # test classifier on this round of testing group
        predictions = model.predict(X_test)

        fold_predictions.append(pd.concat([
            df.iloc[test_index, [0]].reset_index(drop=True),  # row number
            pd.DataFrame(y_test).reset_index(drop=True),
            pd.DataFrame(predictions)],
            ignore_index=True, axis=1))

        fold_aucs.append(roc_auc_score(y_test, predictions))

    # DataFrame.append no longer exists in pandas >= 2.0; concatenate once.
    if fold_predictions:
        pred = pd.concat(fold_predictions, ignore_index=True)
    else:
        pred = pd.DataFrame()
    roc_auc_scores_all = np.array(fold_aucs, dtype=float)

    # Train a fresh model on the full data set and save it for later
    final_model = _build_classifier(n_features)
    final_model.fit(X, y, **fit_kwargs)
    final_model.save(modelfile)

    return pred, roc_auc_scores_all

In [None]:
def train_procedure(f_df, f_X, student_col='anon_student_id', label='act', modelfile='tmp.h5'):
    """Load labels and precomputed embeddings, then run `train_model` for one label.

    Parameters:
    f_df (str): Path of the label CSV, passed to `prep_data`.
    f_X (str): Path of the .npy file loaded as the feature matrix.
    student_col (str): Student-id column, passed to `prep_data`.
    label (str): Target column, passed to `train_model`.
    modelfile (str): Path the trained model is saved to.

    Returns:
    tuple: The (predictions, per-fold AUCs) pair returned by `train_model`.
    """
    labels_df, splitter, student_groups = prep_data(f_df, student_col)
    embeddings = np.load(f_X)
    predictions, fold_aucs = train_model(
        labels_df, splitter, student_groups, embeddings,
        label=label, modelfile=modelfile)
    return predictions, fold_aucs

In [None]:
# One entry per training subset:
# (label CSV, precomputed embedding .npy, student-id column, model name prefix).
model_pairs = [
    ('think_aloud_all_platforms_2023Nov27.csv', 'X-text-embedding-small-955-English.npy', 'anon_student_id', 'english-all'),
    ('think_aloud_english_chem.csv', 'X-text-embedding-small-631-English-chem.npy', 'anon_student_id', 'english-chem'),
    ('think_aloud_english_logic.csv', 'X-text-embedding-small-324-English-logic.npy', 'anon_student_id', 'english-logic'),
    ('German-Labels.csv', 'X-text-embedding-small-584-German.npy', 'user', 'german-chem'),
    ('think_chem_all_languages.csv', 'X-text-embedding-small-1215-chem-alldata.npy', 'anon_student_id', 'all-chem'),
    ('think_all_all_platforms_all_languages.csv', 'X-text-embedding-small-1539-alldata.npy', 'anon_student_id', 'all-all')
]

In [None]:
# Train and save one model per (subset, label) combination; each model is
# written to '<prefix>-<label>.h5' and its CV predictions / AUCs are kept.
model_results = []
for f_df, f_X, student_col, ref in model_pairs:
    for label in ['process', 'plan', 'act', 'wrong']:  
        reference = ref+'-'+label
        pred_english_wrong, auc_english_wrong = train_procedure(f_df, f_X, student_col=student_col,
                                                                label=label, modelfile=f'{reference}.h5')
        model_results.append((reference, pred_english_wrong, auc_english_wrong))

In [None]:
# One row per trained model: mean and standard deviation of the per-fold AUCs (3 decimals).
df_cv_res = pd.concat([pd.DataFrame([{'ref': ref, 'auc_mean': np.round(aucs.mean(), 3), 'auc_std': np.round(aucs.std(), 3)}]) for ref, _, aucs in model_results])

In [None]:
# Preview the first rows of the cross-validation summary.
df_cv_res.head()

## Transferability

The same modeling procedure, using precomputed embedding files for training and testing, is used to investigate the accuracy of training a model on one subset and then applying it to another. DeLong tests are used to compare AUC values.

## Transfer 1: English all to German Chem vs. English Chem to German Chem

In [None]:
def load_and_apply_model(model_path, X_test):
    """Load the saved model at `model_path` and return its predictions for `X_test`."""
    return load_model(model_path).predict(X_test)

In [None]:
def add_predictions_and_export(df, predictions, var='wrong'):
    """Attach predicted scores and thresholded labels for `var` to `df`.

    Two columns are written into `df` (in place): '<var>_pred_num' holds each
    score rounded to three decimals and '<var>_pred' holds 1 where the score
    is greater than 0.5 and 0 otherwise. The same `df` object is returned.
    """
    scores = predictions.flatten()
    df[f'{var}_pred_num'] = list(map(lambda score: round(score, 3), scores))
    df[f'{var}_pred'] = [int(score > 0.5) for score in scores]
    return df

In [None]:
def bootstrap_auc_ci(labels, predictions, n_bootstraps=1000, alpha=0.05, round_digits=True):
    """
    Calculate the ROC AUC score and a bootstrap percentile confidence interval.

    Parameters:
    labels (array-like): True binary labels.
    predictions (array-like): Target scores, can either be probability estimates of the positive class, confidence values, or binary decisions.
    n_bootstraps (int): Number of bootstrap samples to draw.
    alpha (float): Significance level for the CI.
    round_digits (bool): If True, return a formatted string with three decimals instead of a tuple.

    Returns:
    str or tuple: 'AUC = ..., CI..% = [.., ..]' when `round_digits` is True,
        otherwise (ROC AUC score, lower bound of CI, upper bound of CI).

    Raises:
    ValueError: If no bootstrap sample contained both classes.
    """
    # Work on flat NumPy arrays so that positional indexing below behaves the
    # same for lists, pandas Series (any index) and (n, 1) model outputs.
    labels = np.asarray(labels).ravel()
    predictions = np.asarray(predictions).ravel()

    # Calculate the ROC AUC score
    roc_auc = roc_auc_score(labels, predictions)

    # Generate bootstrap samples
    bootstrapped_scores = []
    for _ in range(n_bootstraps):
        # Resample with replacement
        indices = resample(range(len(labels)))
        if len(np.unique(labels[indices])) < 2:
            # Skip this sample if it does not contain both classes
            continue
        bootstrapped_scores.append(roc_auc_score(labels[indices], predictions[indices]))

    if not bootstrapped_scores:
        raise ValueError('No bootstrap sample contained both classes; cannot compute a confidence interval.')

    # Calculate the confidence interval
    lower_bound = np.percentile(bootstrapped_scores, 100 * alpha / 2)
    upper_bound = np.percentile(bootstrapped_scores, 100 * (1 - alpha / 2))

    if round_digits:
        # Label the interval with the requested level (95 for the default alpha).
        ci_level = f'{100 * (1 - alpha):g}'
        return f'AUC = {roc_auc:.3f}, CI{ci_level}% = [{lower_bound:.3f}, {upper_bound:.3f}]'

    return roc_auc, lower_bound, upper_bound

In [None]:
from sklearn.metrics import roc_auc_score
from scipy.stats import norm
import numpy as np

def auc_confidence_interval(y_true, y_scores, alpha=0.05, round_digits=True):
    """
    Calculate the ROC AUC score and a normal-approximation confidence interval.

    The variance is the closed-form approximation based on
    Q1 = AUC / (2 - AUC) and Q2 = 2 * AUC^2 / (1 + AUC) (the Hanley & McNeil
    formula), not DeLong's covariance estimate.

    Parameters:
    y_true (array-like): True binary labels (0/1).
    y_scores (array-like): Target scores, can either be probability estimates of the positive class, confidence values, or binary decisions.
    alpha (float): Significance level for the CI.
    round_digits (bool): If True, return a formatted string with three decimals instead of a tuple.

    Returns:
    str or tuple: 'AUC = ..., CI..% = [.., ..]' when `round_digits` is True,
        otherwise (ROC AUC score, lower bound of CI, upper bound of CI).
    """
    y_true = np.asarray(y_true).ravel()

    # Compute the ROC AUC score
    auc = roc_auc_score(y_true, y_scores)

    # Class counts: n1 positives, n2 negatives
    n1 = y_true.sum()
    n2 = len(y_true) - n1
    q1 = auc / (2 - auc)
    q2 = 2 * auc ** 2 / (1 + auc)

    auc_var = (auc * (1 - auc) + (n1 - 1) * (q1 - auc ** 2) + (n2 - 1) * (q2 - auc ** 2)) / (n1 * n2)
    auc_std = np.sqrt(auc_var)

    # Calculate the confidence interval, bounded to [0, 1]
    z = norm.ppf(1 - alpha / 2)
    lower_bound = max(0, auc - z * auc_std)
    upper_bound = min(1, auc + z * auc_std)

    if round_digits:
        # Label the interval with the requested level (95 for the default alpha).
        ci_level = f'{100 * (1 - alpha):g}'
        return f'AUC = {auc:.3f}, CI{ci_level}% = [{lower_bound:.3f}, {upper_bound:.3f}]'

    return auc, lower_bound, upper_bound


In [None]:
import numpy as np
import scipy.stats
from scipy import stats

# AUC comparison adapted from
# https://github.com/Netflix/vmaf/
def compute_midrank(x):
    """Computes midranks.

    Tied values share the average of the 1-based ranks they occupy.
    NaN entries never compare equal, so each NaN forms its own group (the
    tie scan always advances by at least one element and therefore
    terminates on NaN input).

    Args:
       x - a 1D numpy array
    Returns:
       array of midranks
    """
    order = np.argsort(x)
    sorted_x = x[order]
    n = len(x)
    sorted_ranks = np.zeros(n, dtype=float)
    start = 0
    while start < n:
        # [start, stop) is the run of values equal to sorted_x[start]
        stop = start + 1
        while stop < n and sorted_x[stop] == sorted_x[start]:
            stop += 1
        sorted_ranks[start:stop] = 0.5 * (start + stop - 1)
        start = stop
    midranks = np.empty(n, dtype=float)
    # Note(kazeevn) +1 is due to Python using 0-based indexing
    # instead of 1-based in the AUC formula in the paper
    midranks[order] = sorted_ranks + 1
    return midranks


def compute_midrank_weight(x, sample_weight):
    """Computes weighted midranks.

    Each element receives the mean, over its group of tied values, of the
    cumulative sample weight in sorted order. NaN entries never compare
    equal, so each NaN forms its own group (the tie scan always advances by
    at least one element and therefore terminates on NaN input).

    Args:
       x - a 1D numpy array
       sample_weight - a 1D numpy array of weights aligned with x
    Returns:
       array of midranks
    """
    order = np.argsort(x)
    sorted_x = x[order]
    cumulative_weight = np.cumsum(sample_weight[order])
    n = len(x)
    sorted_ranks = np.zeros(n, dtype=float)
    start = 0
    while start < n:
        # [start, stop) is the run of values equal to sorted_x[start]
        stop = start + 1
        while stop < n and sorted_x[stop] == sorted_x[start]:
            stop += 1
        sorted_ranks[start:stop] = cumulative_weight[start:stop].mean()
        start = stop
    midranks = np.empty(n, dtype=float)
    midranks[order] = sorted_ranks
    return midranks


def fastDeLong(predictions_sorted_transposed, label_1_count, sample_weight):
    """Dispatch to the weighted or unweighted fast DeLong implementation.

    The weighted variant is used whenever `sample_weight` is not None.
    Returns whatever the selected implementation returns.
    """
    if sample_weight is not None:
        return fastDeLong_weights(predictions_sorted_transposed, label_1_count, sample_weight)
    return fastDeLong_no_weights(predictions_sorted_transposed, label_1_count)


def fastDeLong_weights(predictions_sorted_transposed, label_1_count, sample_weight):
    """
    The fast version of DeLong's method for computing the covariance of
    unadjusted AUC, with per-example sample weights.
    Args:
       predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
          sorted such as the examples with label "1" are first
       label_1_count: number of examples with label "1" (the leading columns)
       sample_weight: 1D array of weights, in the same example order as the
          columns of predictions_sorted_transposed
    Returns:
       (AUC value, DeLong covariance)
    Reference:
     @article{sun2014fast,
       title={Fast Implementation of DeLong's Algorithm for
              Comparing the Areas Under Correlated Receiver Operating Characteristic Curves},
       author={Xu Sun and Weichao Xu},
       journal={IEEE Signal Processing Letters},
       volume={21},
       number={11},
       pages={1389--1393},
       year={2014},
       publisher={IEEE}
     }
    """
    # Short variables are named as they are in the paper
    # m: number of positives, n: number of negatives, k: number of classifiers
    m = label_1_count
    n = predictions_sorted_transposed.shape[1] - m
    positive_examples = predictions_sorted_transposed[:, :m]
    negative_examples = predictions_sorted_transposed[:, m:]
    k = predictions_sorted_transposed.shape[0]

    # Weighted midranks within positives (tx), within negatives (ty) and over all examples (tz)
    tx = np.empty([k, m], dtype=float)
    ty = np.empty([k, n], dtype=float)
    tz = np.empty([k, m + n], dtype=float)
    for r in range(k):
        tx[r, :] = compute_midrank_weight(positive_examples[r, :], sample_weight[:m])
        ty[r, :] = compute_midrank_weight(negative_examples[r, :], sample_weight[m:])
        tz[r, :] = compute_midrank_weight(predictions_sorted_transposed[r, :], sample_weight)
    total_positive_weights = sample_weight[:m].sum()
    total_negative_weights = sample_weight[m:].sum()
    # Outer product of positive and negative weights: one weight per (positive, negative) pair
    pair_weights = np.dot(sample_weight[:m, np.newaxis], sample_weight[np.newaxis, m:])
    total_pair_weights = pair_weights.sum()
    aucs = (sample_weight[:m]*(tz[:, :m] - tx)).sum(axis=1) / total_pair_weights
    # Per-example structural components for positives (v01) and negatives (v10)
    v01 = (tz[:, :m] - tx[:, :]) / total_negative_weights
    v10 = 1. - (tz[:, m:] - ty[:, :]) / total_positive_weights
    sx = np.cov(v01)
    sy = np.cov(v10)
    delongcov = sx / m + sy / n
    return aucs, delongcov


def fastDeLong_no_weights(predictions_sorted_transposed, label_1_count):
    """
    The fast version of DeLong's method for computing the covariance of
    unadjusted AUC.
    Args:
       predictions_sorted_transposed: a 2D numpy.array[n_classifiers, n_examples]
          sorted such as the examples with label "1" are first
       label_1_count: number of examples with label "1" (the leading columns)
    Returns:
       (AUC value, DeLong covariance)
    Reference:
     @article{sun2014fast,
       title={Fast Implementation of DeLong's Algorithm for
              Comparing the Areas Under Correlated Receiver Operating
              Characteristic Curves},
       author={Xu Sun and Weichao Xu},
       journal={IEEE Signal Processing Letters},
       volume={21},
       number={11},
       pages={1389--1393},
       year={2014},
       publisher={IEEE}
     }
    """
    # Short variables are named as they are in the paper
    # m: number of positives, n: number of negatives, k: number of classifiers
    m = label_1_count
    n = predictions_sorted_transposed.shape[1] - m
    positive_examples = predictions_sorted_transposed[:, :m]
    negative_examples = predictions_sorted_transposed[:, m:]
    k = predictions_sorted_transposed.shape[0]

    # Midranks within positives (tx), within negatives (ty) and over all examples (tz)
    tx = np.empty([k, m], dtype=float)
    ty = np.empty([k, n], dtype=float)
    tz = np.empty([k, m + n], dtype=float)
    for r in range(k):
        tx[r, :] = compute_midrank(positive_examples[r, :])
        ty[r, :] = compute_midrank(negative_examples[r, :])
        tz[r, :] = compute_midrank(predictions_sorted_transposed[r, :])
    # AUC from the rank sum of the positives
    aucs = tz[:, :m].sum(axis=1) / m / n - float(m + 1.0) / 2.0 / n
    # Per-example structural components for positives (v01) and negatives (v10)
    v01 = (tz[:, :m] - tx[:, :]) / n
    v10 = 1.0 - (tz[:, m:] - ty[:, :]) / m
    sx = np.cov(v01)
    sy = np.cov(v10)
    delongcov = sx / m + sy / n
    return aucs, delongcov


def calc_pvalue(aucs, sigma):
    """Computes log10 of the two-sided p-value for the difference of two AUCs.

    Args:
       aucs: 1D array of AUCs
       sigma: AUC DeLong covariances
    Returns:
       log10(pvalue)
    """
    # Contrast vector selecting the difference between the two AUCs
    contrast = np.array([[1, -1]])
    difference_variance = np.dot(np.dot(contrast, sigma), contrast.T)
    z = np.abs(np.diff(aucs)) / np.sqrt(difference_variance)
    # log10(2 * sf(z)), evaluated in log space
    log_sf = scipy.stats.norm.logsf(z, loc=0, scale=1)
    return np.log10(2) + log_sf / np.log(10)


def compute_ground_truth_statistics(ground_truth, sample_weight):
    """Return the ordering that puts label-1 examples first, plus related statistics.

    Args:
       ground_truth: array of labels; must contain exactly the values 0 and 1
       sample_weight: per-example weights, or None
    Returns:
       (order, label_1_count, ordered_sample_weight) where `order` sorts the
       examples so that label 1 comes first, `label_1_count` is the number
       of label-1 examples and `ordered_sample_weight` is `sample_weight`
       rearranged by `order` (None when no weights were given).
    """
    assert np.array_equal(np.unique(ground_truth), [0, 1])
    order = (-ground_truth).argsort()
    label_1_count = int(ground_truth.sum())
    ordered_sample_weight = sample_weight[order] if sample_weight is not None else None
    return order, label_1_count, ordered_sample_weight


def delong_roc_variance(ground_truth, predictions, sample_weight=None):
    """
    Computes the ROC AUC and its DeLong variance for a single set of predictions
    Args:
       ground_truth: np.array of 0 and 1
       predictions: np.array of floats of the probability of being class 1
       sample_weight: optional per-example weights
    Returns:
       (AUC value, DeLong covariance)
    """
    stats_for_labels = compute_ground_truth_statistics(ground_truth, sample_weight)
    order, label_1_count, ordered_sample_weight = stats_for_labels
    # One classifier only: add a leading axis and put the label-1 examples first
    predictions_sorted_transposed = predictions[np.newaxis, order]
    aucs, delongcov = fastDeLong(
        predictions_sorted_transposed, label_1_count, ordered_sample_weight)
    assert len(aucs) == 1, "There is a bug in the code, please forward this to the developers"
    return aucs[0], delongcov


def delong_auc_ci(y_true, y_pred, alpha=.95):
    """Return the ROC AUC and its DeLong confidence interval as a string.

    Parameters:
    y_true (array-like): True binary labels (0/1).
    y_pred (array-like): Predicted scores for the positive class.
    alpha (float): Confidence level of the interval (0.95 gives a 95% CI).
        Note that this is the coverage, not a significance level.

    Returns:
    str: '<auc>: [<lower> <upper>]', with both bounds clipped to [0, 1].
    """
    # Accept pandas Series / lists as well as arrays: the DeLong helpers
    # index positionally into NumPy arrays.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    auc, auc_cov = delong_roc_variance(y_true, y_pred)

    auc_std = np.sqrt(auc_cov)
    # Lower and upper tail quantiles, e.g. [0.025, 0.975] for alpha=0.95
    lower_upper_q = np.abs(np.array([0, 1]) - (1 - alpha) / 2)

    ci = stats.norm.ppf(
        lower_upper_q,
        loc=auc,
        scale=auc_std)

    # An AUC lies in [0, 1]; clip both ends of the normal-approximation interval.
    ci = np.clip(ci, 0, 1)

    return f'{auc}: {ci}'

In [None]:
# Evaluation target: German labels (binarised by prep_data) and their precomputed embeddings.
f_df = 'German-Labels.csv'
f_X = 'X-text-embedding-small-584-German.npy'
student_col='user'
df, gkf, groups = prep_data(f_df, student_col)
X = np.load(f_X)

In [None]:
# Apply the saved 'english-all' models to the loaded target data and collect AUCs.
# NOTE(review): this cell uses bootstrap_auc_ci with unflattened predictions,
# whereas the other transfer cells use delong_auc_ci — confirm this is intended.
transfer_aucs = []
df_out = pd.read_csv(f_df)
# Binarise the label columns: 'Yes' -> 1, any other string -> 0, non-strings unchanged.
for var in ['process', 'plan', 'act', 'wrong']:
    df_out[var] = df_out[var].map(lambda x: x if not isinstance(x, str) else 1 if x=='Yes' else 0)
for label in ['process', 'plan', 'act', 'wrong']:  
    f_model = f'german-chem-{label}.h5'
    predictions = load_and_apply_model(f_model.replace('german-chem', 'english-all'), X)
    df_out = add_predictions_and_export(df_out, predictions, var=label)
    transfer_aucs.append((f'german-chem-{label}-using-english-all', bootstrap_auc_ci(df[label], predictions)))
# Export, per label, the rows where the thresholded prediction disagrees with the label.
for label in ['process', 'plan', 'act', 'wrong']:  
    df_out[df_out[label]!=df_out[f'{label}_pred']][[c for c in df_out if label in c or c=='utterance_combined' or c=='platform']].to_csv(f_df.replace('.csv', f'-with-predictions-using-english-all-inconsistent-{label}.csv'), index=False)

In [None]:
# Tabulate the (reference, AUC with CI) pairs collected in `transfer_aucs`.
pd.concat([pd.DataFrame([{'ref': ref, 'auc': auc}]) for ref, auc in transfer_aucs])

In [None]:
# Apply the saved 'english-chem' models to the loaded target data and collect AUCs with DeLong CIs.
transfer_aucs = []
df_out = pd.read_csv(f_df)
# Binarise the label columns: 'Yes' -> 1, any other string -> 0, non-strings unchanged.
for var in ['process', 'plan', 'act', 'wrong']:
    df_out[var] = df_out[var].map(lambda x: x if not isinstance(x, str) else 1 if x=='Yes' else 0)
for label in ['process', 'plan', 'act', 'wrong']:  
    f_model = f'german-chem-{label}.h5'
    predictions = load_and_apply_model(f_model.replace('german-chem', 'english-chem'), X)
    df_out = add_predictions_and_export(df_out, predictions, var=label)
    transfer_aucs.append((f'german-chem-{label}-using-english-chem', delong_auc_ci(df[label], predictions.flatten())))
# Export, per label, the rows where the thresholded prediction disagrees with the label.
for label in ['process', 'plan', 'act', 'wrong']:  
    df_out[df_out[label]!=df_out[f'{label}_pred']][[c for c in df_out if label in c or c=='utterance_combined' or c=='platform']].to_csv(f_df.replace('.csv', f'-with-predictions-using-english-chem-inconsistent-{label}.csv'), index=False)

In [None]:
# Tabulate the (reference, AUC with CI) pairs collected in `transfer_aucs`.
pd.concat([pd.DataFrame([{'ref': ref, 'auc': auc}]) for ref, auc in transfer_aucs])

In [None]:
# Evaluation target: German labels (binarised by prep_data) and their precomputed embeddings.
f_df = 'German-Labels.csv'
f_X = 'X-text-embedding-small-584-German.npy'
student_col='user'
df, gkf, groups = prep_data(f_df, student_col)
X = np.load(f_X)

In [None]:
# Apply the saved 'german-chem' models to the loaded target data and collect AUCs with DeLong CIs.
# NOTE(review): the saved german-chem models are fitted on the full German data in
# train_model, so these AUCs are presumably in-sample — confirm before comparing
# them with the transfer results.
transfer_aucs = []
df_out = pd.read_csv(f_df)
# Binarise the label columns: 'Yes' -> 1, any other string -> 0, non-strings unchanged.
for var in ['process', 'plan', 'act', 'wrong']:
    df_out[var] = df_out[var].map(lambda x: x if not isinstance(x, str) else 1 if x=='Yes' else 0)
for label in ['process', 'plan', 'act', 'wrong']:  
    f_model = f'german-chem-{label}.h5'
    predictions = load_and_apply_model(f_model, X)
    df_out = add_predictions_and_export(df_out, predictions, var=label)
    transfer_aucs.append((f'german-chem-{label}-using-german-chem', delong_auc_ci(df[label], predictions.flatten())))
# Export, per label, the rows where the thresholded prediction disagrees with the label.
for label in ['process', 'plan', 'act', 'wrong']:  
    df_out[df_out[label]!=df_out[f'{label}_pred']][[c for c in df_out if label in c or c=='utterance_combined' or c=='platform']].to_csv(f_df.replace('.csv', f'-with-predictions-using-german-chem-inconsistent-{label}.csv'), index=False)

In [None]:
# Tabulate the (reference, AUC with CI) pairs collected in `transfer_aucs`.
pd.concat([pd.DataFrame([{'ref': ref, 'auc': auc}]) for ref, auc in transfer_aucs])

In [None]:
# Apply the saved 'english-logic' models to the loaded target data and collect AUCs with DeLong CIs.
transfer_aucs = []
df_out = pd.read_csv(f_df)
# Binarise the label columns: 'Yes' -> 1, any other string -> 0, non-strings unchanged.
for var in ['process', 'plan', 'act', 'wrong']:
    df_out[var] = df_out[var].map(lambda x: x if not isinstance(x, str) else 1 if x=='Yes' else 0)
for label in ['process', 'plan', 'act', 'wrong']:  
    f_model = f'german-chem-{label}.h5'
    predictions = load_and_apply_model(f_model.replace('german-chem', 'english-logic'), X)
    df_out = add_predictions_and_export(df_out, predictions, var=label)
    transfer_aucs.append((f'german-chem-{label}-using-english-logic', delong_auc_ci(df[label], predictions.flatten())))
# Export, per label, the rows where the thresholded prediction disagrees with the label.
for label in ['process', 'plan', 'act', 'wrong']:  
    df_out[df_out[label]!=df_out[f'{label}_pred']][[c for c in df_out if label in c or c=='utterance_combined' or c=='platform']].to_csv(f_df.replace('.csv', f'-with-predictions-using-english-logic-inconsistent-{label}.csv'), index=False)

In [None]:
# Tabulate the (reference, AUC with CI) pairs collected in `transfer_aucs`.
pd.concat([pd.DataFrame([{'ref': ref, 'auc': auc}]) for ref, auc in transfer_aucs])

# Generalizability by platform

In [None]:
# Evaluation target: English Stoich labels (binarised by prep_data) and their precomputed embeddings.
f_df = 'english-stoich.csv'
f_X = 'X-text-embedding-small-469-English-Stoich.npy'
student_col='anon_student_id'
df, gkf, groups = prep_data(f_df, student_col)
X = np.load(f_X)

In [None]:
# Apply the saved 'german-chem' models to the loaded target data and collect AUCs with DeLong CIs.
transfer_aucs = []
df_out = pd.read_csv(f_df)
# Binarise the label columns: 'Yes' -> 1, any other string -> 0, non-strings unchanged.
for var in ['process', 'plan', 'act', 'wrong']:
    df_out[var] = df_out[var].map(lambda x: x if not isinstance(x, str) else 1 if x=='Yes' else 0)
for label in ['process', 'plan', 'act', 'wrong']:  
    f_model = f'english-chem-{label}.h5'
    predictions = load_and_apply_model(f_model.replace('english-chem', 'german-chem'), X)
    df_out = add_predictions_and_export(df_out, predictions, var=label)
    transfer_aucs.append((f'english-chem-stoich-{label}-using-german-chem', delong_auc_ci(df[label], predictions.flatten())))
# Export, per label, the rows where the thresholded prediction disagrees with the label.
for label in ['process', 'plan', 'act', 'wrong']:  
    df_out[df_out[label]!=df_out[f'{label}_pred']][[c for c in df_out if label in c or c=='utterance_combined' or c=='platform']].to_csv(f_df.replace('.csv', f'-with-predictions-using-german-chem-inconsistent-{label}.csv'), index=False)

In [None]:
# Tabulate the (reference, AUC with CI) pairs collected in `transfer_aucs`.
pd.concat([pd.DataFrame([{'ref': ref, 'auc': auc}]) for ref, auc in transfer_aucs])

In [None]:
# Evaluation target: English ORCCA labels (binarised by prep_data) and their precomputed embeddings.
f_df = 'english-orcca.csv'
f_X = 'X-text-embedding-small-162-English-ORCCA.npy'
student_col='anon_student_id'
df, gkf, groups = prep_data(f_df, student_col)
X = np.load(f_X)

In [None]:
# Apply the saved 'german-chem' models to the loaded target data and collect AUCs with DeLong CIs.
transfer_aucs = []
df_out = pd.read_csv(f_df)
# Binarise the label columns: 'Yes' -> 1, any other string -> 0, non-strings unchanged.
for var in ['process', 'plan', 'act', 'wrong']:
    df_out[var] = df_out[var].map(lambda x: x if not isinstance(x, str) else 1 if x=='Yes' else 0)
for label in ['process', 'plan', 'act', 'wrong']:  
    f_model = f'english-chem-{label}.h5'
    predictions = load_and_apply_model(f_model.replace('english-chem', 'german-chem'), X)
    df_out = add_predictions_and_export(df_out, predictions, var=label)
    transfer_aucs.append((f'english-chem-ORCCA-{label}-using-german-chem', delong_auc_ci(df[label], predictions.flatten())))
# Export, per label, the rows where the thresholded prediction disagrees with the label.
for label in ['process', 'plan', 'act', 'wrong']:  
    df_out[df_out[label]!=df_out[f'{label}_pred']][[c for c in df_out if label in c or c=='utterance_combined' or c=='platform']].to_csv(f_df.replace('.csv', f'-with-predictions-using-german-chem-inconsistent-{label}.csv'), index=False)

In [None]:
# Tabulate the (reference, AUC with CI) pairs collected in `transfer_aucs`.
pd.concat([pd.DataFrame([{'ref': ref, 'auc': auc}]) for ref, auc in transfer_aucs])

In [None]:
# Evaluation target: German Stoich labels (binarised by prep_data) and their precomputed embeddings.
f_df = 'german-stoich.csv'
f_X = 'X-text-embedding-small-439-German-Stoich.npy'
student_col='user'
df, gkf, groups = prep_data(f_df, student_col)
X = np.load(f_X)

In [None]:
# Apply the saved 'english-chem' models to the loaded target data and collect AUCs with DeLong CIs.
transfer_aucs = []
df_out = pd.read_csv(f_df)
# Binarise the label columns: 'Yes' -> 1, any other string -> 0, non-strings unchanged.
for var in ['process', 'plan', 'act', 'wrong']:
    df_out[var] = df_out[var].map(lambda x: x if not isinstance(x, str) else 1 if x=='Yes' else 0)
for label in ['process', 'plan', 'act', 'wrong']:  
    f_model = f'german-chem-{label}.h5'
    predictions = load_and_apply_model(f_model.replace('german-chem', 'english-chem'), X)
    df_out = add_predictions_and_export(df_out, predictions, var=label)
    transfer_aucs.append((f'german-chem-stoich-{label}-using-english-chem', delong_auc_ci(df[label], predictions.flatten())))
# Export, per label, the rows where the thresholded prediction disagrees with the label.
for label in ['process', 'plan', 'act', 'wrong']:  
    df_out[df_out[label]!=df_out[f'{label}_pred']][[c for c in df_out if label in c or c=='utterance_combined' or c=='platform']].to_csv(f_df.replace('.csv', f'-with-predictions-using-english-chem-inconsistent-{label}.csv'), index=False)

In [None]:
# Tabulate the (reference, AUC with CI) pairs collected in `transfer_aucs`.
pd.concat([pd.DataFrame([{'ref': ref, 'auc': auc}]) for ref, auc in transfer_aucs])

In [None]:
# Evaluation target: German ORCCA labels (binarised by prep_data) and their precomputed embeddings.
f_df = 'german-ORCCA.csv'
f_X = 'X-text-embedding-small-145-German-ORCCA.npy'
student_col='user'
df, gkf, groups = prep_data(f_df, student_col)
X = np.load(f_X)

In [None]:
# Apply the saved 'english-chem' models to the loaded target data and collect AUCs with DeLong CIs.
transfer_aucs = []
df_out = pd.read_csv(f_df)
# Binarise the label columns: 'Yes' -> 1, any other string -> 0, non-strings unchanged.
for var in ['process', 'plan', 'act', 'wrong']:
    df_out[var] = df_out[var].map(lambda x: x if not isinstance(x, str) else 1 if x=='Yes' else 0)
for label in ['process', 'plan', 'act', 'wrong']:  
    f_model = f'german-chem-{label}.h5'
    predictions = load_and_apply_model(f_model.replace('german-chem', 'english-chem'), X)
    df_out = add_predictions_and_export(df_out, predictions, var=label)
    transfer_aucs.append((f'german-chem-ORCCA-{label}-using-english-chem', delong_auc_ci(df[label], predictions.flatten())))
# Export, per label, the rows where the thresholded prediction disagrees with the label.
for label in ['process', 'plan', 'act', 'wrong']:  
    df_out[df_out[label]!=df_out[f'{label}_pred']][[c for c in df_out if label in c or c=='utterance_combined' or c=='platform']].to_csv(f_df.replace('.csv', f'-with-predictions-using-english-chem-inconsistent-{label}.csv'), index=False)

In [None]:
# Tabulate the (reference, AUC with CI) pairs collected in `transfer_aucs`.
pd.concat([pd.DataFrame([{'ref': ref, 'auc': auc}]) for ref, auc in transfer_aucs])

## Transfer 2: German chem to English chem, logic, all (three transfers)

In [None]:
# Evaluation target: English chemistry labels (binarised by prep_data) and their precomputed embeddings.
f_df = 'think_aloud_english_chem.csv'
f_X = 'X-text-embedding-small-631-English-chem.npy'
student_col='anon_student_id'
df, gkf, groups = prep_data(f_df, student_col)
X = np.load(f_X)

In [None]:
# Apply the saved 'german-chem' models to the loaded target data and collect AUCs with DeLong CIs.
transfer_aucs = []
df_out = pd.read_csv(f_df)
# Binarise the label columns: 'Yes' -> 1, any other string -> 0, non-strings unchanged.
for var in ['process', 'plan', 'act', 'wrong']:
    df_out[var] = df_out[var].map(lambda x: x if not isinstance(x, str) else 1 if x=='Yes' else 0)
for label in ['process', 'plan', 'act', 'wrong']:  
    f_model = f'german-chem-{label}.h5'
    predictions = load_and_apply_model(f_model, X)
    df_out = add_predictions_and_export(df_out, predictions, var=label)
    transfer_aucs.append((f'english-chem-{label}-using-german-chem', delong_auc_ci(df[label], predictions.flatten())))
# Export, per label, the rows where the thresholded prediction disagrees with the label.
for label in ['process', 'plan', 'act', 'wrong']:  
    df_out[df_out[label]!=df_out[f'{label}_pred']][[c for c in df_out if label in c or c=='utterance_combined' or c=='platform']].to_csv(f_df.replace('.csv', f'-with-predictions-using-german-chem-inconsistent-{label}.csv'), index=False)

In [None]:
# Tabulate the (reference, AUC with CI) pairs collected in `transfer_aucs`.
pd.concat([pd.DataFrame([{'ref': ref, 'auc': auc}]) for ref, auc in transfer_aucs])

In [None]:
# Evaluation target: English logic labels (binarised by prep_data) and their precomputed embeddings.
f_df = 'think_aloud_english_logic.csv'
f_X = 'X-text-embedding-small-324-English-logic.npy'
student_col='anon_student_id'
df, gkf, groups = prep_data(f_df, student_col)
X = np.load(f_X)

In [None]:
# Apply the saved 'german-chem' models to the loaded target data and collect AUCs with DeLong CIs.
transfer_aucs = []
df_out = pd.read_csv(f_df)
# Binarise the label columns: 'Yes' -> 1, any other string -> 0, non-strings unchanged.
for var in ['process', 'plan', 'act', 'wrong']:
    df_out[var] = df_out[var].map(lambda x: x if not isinstance(x, str) else 1 if x=='Yes' else 0)
for label in ['process', 'plan', 'act', 'wrong']:  
    f_model = f'german-chem-{label}.h5'
    predictions = load_and_apply_model(f_model, X)
    df_out = add_predictions_and_export(df_out, predictions, var=label)
    transfer_aucs.append((f'english-logic-{label}-using-german-chem', delong_auc_ci(df[label], predictions.flatten())))
# Export, per label, the rows where the thresholded prediction disagrees with the label.
for label in ['process', 'plan', 'act', 'wrong']:  
    df_out[df_out[label]!=df_out[f'{label}_pred']][[c for c in df_out if label in c or c=='utterance_combined' or c=='platform']].to_csv(f_df.replace('.csv', f'-with-predictions-using-german-chem-inconsistent-{label}.csv'), index=False)

In [None]:
# Tabulate the (reference, AUC with CI) pairs collected in `transfer_aucs`.
pd.concat([pd.DataFrame([{'ref': ref, 'auc': auc}]) for ref, auc in transfer_aucs])

In [None]:
# Evaluation target: all English labels (binarised by prep_data) and their precomputed embeddings.
f_df = 'think_aloud_all_platforms_2023Nov27.csv'
f_X = 'X-text-embedding-small-955-English.npy'
student_col='anon_student_id'
df, gkf, groups = prep_data(f_df, student_col)
X = np.load(f_X)

In [None]:
# Apply the saved 'german-chem' models to the loaded target data and collect AUCs with DeLong CIs.
transfer_aucs = []
df_out = pd.read_csv(f_df)
# Binarise the label columns: 'Yes' -> 1, any other string -> 0, non-strings unchanged.
for var in ['process', 'plan', 'act', 'wrong']:
    df_out[var] = df_out[var].map(lambda x: x if not isinstance(x, str) else 1 if x=='Yes' else 0)
for label in ['process', 'plan', 'act', 'wrong']:  
    f_model = f'german-chem-{label}.h5'
    predictions = load_and_apply_model(f_model, X)
    df_out = add_predictions_and_export(df_out, predictions, var=label)
    transfer_aucs.append((f'english-all-{label}-using-german-chem', delong_auc_ci(df[label], predictions.flatten())))
# Export, per label, the rows where the thresholded prediction disagrees with the label.
for label in ['process', 'plan', 'act', 'wrong']:  
    df_out[df_out[label]!=df_out[f'{label}_pred']][[c for c in df_out if label in c or c=='utterance_combined' or c=='platform']].to_csv(f_df.replace('.csv', f'-with-predictions-using-german-chem-inconsistent-{label}.csv'), index=False)

In [None]:
# Tabulate the (reference, AUC with CI) pairs collected in `transfer_aucs`.
pd.concat([pd.DataFrame([{'ref': ref, 'auc': auc}]) for ref, auc in transfer_aucs])