# Investigating impact of different window sizes on the model performance in classifying collaboration quality
This source code accompanies the LAK'23 paper titled **Impact of window size on the generalizability of collaboration quality
estimation models developed using Multimodal Learning Analytics**.

In [4]:
import warnings
from datetime import datetime

warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from numpy import mean,nanmean
from numpy import std,nanstd

from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold  
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import make_scorer 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score 
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from tabulate import tabulate

In [8]:
# Load the context table and keep only the rows of sessions 91, 98 and 99
context_data = pd.read_csv('../data/CoTrack_dataset_2022_features_v2.csv')
context_data = context_data[context_data['session'].isin([91, 98, 99])]

# Add contextual data
def addContextData(df):
    """Return the standardised feature columns of ``df``.

    The columns kept are those listed in the module-level ``feature_cols``
    that are also present in ``df``, excluding ``'group'``. They are scaled
    with ``StandardScaler`` and returned as a new DataFrame with a fresh
    default index.

    NOTE(review): despite the name, no context columns are added to the
    returned frame; only the scaled feature columns are returned, exactly as
    before.
    NOTE(review): the scaler is fitted on every row of ``df``, so a caller
    that cross-validates afterwards scales training folds with statistics
    that include the held-out rows. Confirm this is acceptable.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding (a subset of) the columns named in ``feature_cols``.

    Returns
    -------
    pandas.DataFrame
        Scaled feature columns, in the order they appear in ``feature_cols``.
    """
    available = set(df.columns)
    # Keep the order declared in feature_cols. Building the list from a set
    # gives an order that can differ between interpreter runs, and the column
    # order influences a seeded random forest.
    final_feature_cols = [
        col for col in dict.fromkeys(feature_cols)
        if col in available and col != 'group'
    ]

    data_without_context = df[final_feature_cols]
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(data_without_context)
    return pd.DataFrame(scaled_data, columns=data_without_context.columns)


In [16]:
# Columns for features and labels.
# One mean and one sd column per user-level measure, followed by the group column.
feature_cols = [
    'user_{}_{}'.format(measure, stat)
    for measure in ('add', 'del', 'self', 'speak', 'turns', 'us', 'yes')
    for stat in ('mean', 'sd')
] + ['group']

context_cols = [
    'language',
    'learning_design',
    'time',
    'students',
    'class',
    'teacher',
    'subject',
]

# Target columns
label_cols = ['ARG', 'CF', 'CO', 'CQ', 'ITO', 'KE', 'SMU', 'STR']

# Features to remove from processed dataset
cols_diff = [
    'index',
    'frame',
    'group',
    'session',
    'write_text',
    'user_speech',
    'students',
]

# Dimensions of collaboration quality
dimensions = ['ARG', 'CF', 'CO', 'CQ', 'ITO', 'KE', 'STR', 'SMU']

In [14]:
# Binarize the rating scores of collaboration quality and its dimensions
def binarize_dim(v):
    """Return 1 when half of the rating ``v`` is strictly positive, otherwise 0."""
    return 1 if v / 2 > 0 else 0
    
def binarize_cq(v):
    """Return 1 when ``v / 7`` is strictly greater than 0.5, otherwise 0.

    NOTE(review): the divisor presumably reflects CQ being a total over seven
    dimensions; confirm against the rating scheme.
    """
    return 1 if v / 7 > .5 else 0

def binarize(data):
    """Add a binary ``<name>_bi`` column for CQ and each rated dimension.

    ``CQ`` is thresholded with ``binarize_cq``; ``ARG``, ``CO``, ``SMU``,
    ``KE``, ``STR``, ``ITO`` and ``CF`` are thresholded with ``binarize_dim``.
    The new columns are written into ``data`` itself, which is also returned.
    """
    data['CQ_bi'] = data['CQ'].map(binarize_cq)

    # Same insertion order as before, so the resulting column order is unchanged
    for name in ('ARG', 'CO', 'SMU', 'KE', 'STR', 'ITO', 'CF'):
        data[name + '_bi'] = data[name].map(binarize_dim)

    return data

## Predicting collaboration quality and its dimensions

We used a random forest classifier, following state-of-the-art research in MMLA, where random forests have been found to be among the best-performing models for classifying aspects of collaboration from multimodal data.

In [None]:
# To store train and test performance measures
train_kappa = []
test_kappa = []

# Performance measures to compute
scoring = ['kappa','accuracy','accuracy2','f1','recall','auc']

# Scorer function for each performance measure.
# NOTE(review): make_scorer is called without requesting probabilities or
# decision values, so 'auc' is presumably computed from predicted class labels
# rather than scores. Confirm this is intended before changing it, because it
# alters the reported numbers.
scorers = {'kappa':make_scorer(cohen_kappa_score,greater_is_better=True),
           'accuracy':make_scorer(accuracy_score,greater_is_better=True),
           'accuracy2':make_scorer(balanced_accuracy_score,greater_is_better=True),
           'f1':make_scorer(f1_score,greater_is_better=True),
           'recall':make_scorer(recall_score,greater_is_better=True),
           'auc':make_scorer(roc_auc_score,greater_is_better=True)}

# First row holds the column names; one row per (level, dimension, window) follows
results = [['level','dimension','window','kappa','accuracy','accuracy2','f1','recall','auc']]
start_time  = datetime.now()


def _summarise_scores(scores):
    """Format every metric in ``scoring`` as 'mean(sd)' over the CV folds, skipping NaN folds."""
    summary = []
    for name in scoring:
        fold_scores = scores['test_' + name]
        # nanmean/nanstd are the names imported from numpy at the top of the file
        summary.append('{:.2f}({:.2f})'.format(nanmean(fold_scores), nanstd(fold_scores)))
    return summary


# Trying different window size datasets
for window in [30,60,90,120,180,240]:
    features_df = pd.read_csv('../data/CoTrack_dataset_2022_features_labels_v2_rolling_{}.csv'.format(window))
    # The session id is the part of the group id before the first underscore
    features_df['session'] = features_df.group.map(lambda x: int(x.split('_')[0]))

    # We used a subset of data where students were communicating in Estonian.
    features_df = features_df.loc[features_df.session.isin([91,98,99])]
    features_df.reset_index(inplace=True)

    # Add contextual data
    features = addContextData(features_df)

    # Binarize the target labels (on a copy, since binarize() writes new columns into its argument)
    labels = binarize(features_df[label_cols].copy())

    # Build model for each dimension
    for dim in dimensions:
        y = labels[dim + '_bi'].to_numpy()

        cv_schemes = [
            # Shuffled 10-fold CV (instance generalizability)
            ('instance', KFold(n_splits=10,shuffle=True,random_state=0).split(features,y)),
            # Leave-one-session-out CV (context generalizability)
            ('session', LeaveOneGroupOut().split(X=features,groups=features_df['session'])),
        ]

        for level, cv in cv_schemes:
            # A fresh, identically seeded random forest for each evaluation scheme
            clf = RandomForestClassifier(n_estimators=200,random_state = 11850)
            scores = cross_validate(clf,X=features, y=y, cv=cv,scoring=scorers)
            results.append([level,dim,window] + _summarise_scores(scores))

end_time = datetime.now()
print('Current time:{}, Execution time:{}'.format(end_time,end_time-start_time))
print(tabulate(results))

In [None]:
# Build the results table (the first row of `results` is the header, so skip it) and save it
columns = [
    'level', 'dimension', 'window',
    'kappa', 'accuracy', 'accuracy2', 'f1', 'recall', 'auc',
]
df = pd.DataFrame(data=results[1:], columns=columns)
df.to_csv('random_forest_different_windows_91_98_99_sessions.csv', index=False)

# Performance table with AUC metric

The code below tabulates the AUC of the random forest models developed using datasets of different window sizes.

In [None]:
# Full name of each collaboration-quality dimension, used as the row label
acroynm = {'ARG':'Argumentation',
          'CF':'Collaboration flow',
          'CO':'Cooperative orientation',
          'CQ':'Collaboration quality',
          'ITO':'Individual task orientation',
          'KE':'Knowledge exchange',
          'STR':'Structuring problem solving',
          'SMU':'Sustaining mutual understanding'}

METRIC = 'auc'

# One table per evaluation level: a row per dimension, a column per window size
for level in ['instance','session']:
    level_df = df.loc[df['level'] == level,:]
    met_table = []
    for dim in dimensions:
        dim_df = level_df.loc[level_df['dimension'] == dim,:]
        temp = [acroynm[dim]]
        for window in [30,60,90,120,180,240]:
            # Each cell is stored as the string 'mean(sd)'
            val = dim_df.loc[dim_df['window']==window,METRIC].iloc[0]
            mean_txt, _, rest = val.partition('(')
            avg = 100 * float(mean_txt)
            # Named `sd` rather than `std` so the `std` imported from numpy is not shadowed
            sd = 100 * float(rest.split(')')[0])
            temp.append('{:.0f} ({:.0f})'.format(avg,sd))
        met_table.append(temp)
    print('\n')
    temp_df = pd.DataFrame.from_records(met_table,columns=['label','30','60','90','120','180','240'])
    styler = temp_df.style
    # Styler.hide_index() was removed in pandas 2.0; use Styler.hide(axis='index') where available
    styler = styler.hide(axis='index') if hasattr(styler, 'hide') else styler.hide_index()
    display(styler)

## Plotting the coefficient of variation

In [None]:
## Plotting coefficient of variance
def plotTwoBar(x_data,y_data1,y_data2,width=10,title=None):
    """Draw two line series against window size on the current matplotlib figure.

    Parameters
    ----------
    x_data : sequence
        Window sizes; used as x positions and as x tick labels.
    y_data1 : sequence
        Values plotted with the legend label "instance".
    y_data2 : sequence
        Values plotted with the legend label "context".
    width : int, optional
        Unused. Kept so existing calls that pass it keep working; the previous
        implementation overwrote it and only used it for bar offsets that were
        never drawn.
    title : str, optional
        Plot title. When omitted, the title is built from the module-level
        variable ``dim`` (the previous behaviour), which must then exist.
    """
    if title is None:
        # Backward-compatible default: read the dimension name from the caller's global loop variable
        title = '{} dimension'.format(dim)

    yticks = np.arange(0,.2,.02)
    yticks_labels = ['{:.2f}'.format(item) for item in yticks]

    plt.xticks(x_data,x_data)
    plt.yticks(yticks,yticks_labels)
    plt.ylim([0,.20])
    plt.plot(x_data,y_data1,label="instance",marker='o')
    plt.plot(x_data,y_data2,label="context",marker='o')
    plt.xlabel('Window size')
    plt.ylabel('Coefficient of variance')
    plt.title(title)
    plt.legend()
    plt.tight_layout()

In [None]:
METRIC = 'auc'
windows = [30,60,90,120,180,240]

# One figure per dimension: sd/mean of METRIC across CV folds, per window size and level.
# The loop variable must stay named `dim`: plotTwoBar reads it for the plot title.
for dim in dimensions:
    win_df = df.loc[df['dimension']==dim,:]
    plt.figure()
    ratios = {'instance': [], 'session': []}
    for window in windows:
        dim_df = win_df.loc[win_df['window'] == window,:]

        for level in ['instance','session']:
            # Each cell is stored as the string 'mean(sd)'
            val = dim_df.loc[dim_df['level'] == level,METRIC].iloc[0]
            mean_txt, _, rest = val.partition('(')
            avg = float(mean_txt)
            # Named `sd` rather than `std` so the `std` imported from numpy is not shadowed
            sd = float(rest.split(')')[0])
            # A mean of exactly 0 would raise ZeroDivisionError; record NaN instead
            ratios[level].append(sd/avg if avg != 0 else float('nan'))
    plotTwoBar(windows,ratios['instance'],ratios['session'])

## Plotting window-wise performance graph
The following code visualizes the models' performance for each dimension across the different window sizes.

In [None]:
df = pd.read_csv('random_forest_different_windows_91_98_99_sessions.csv')

# Metric column to plot. The title, y-axis label and file name are all derived
# from this one name; previously the `accuracy2` column was plotted under AUC labels.
METRIC = 'auc'

yticks = np.arange(0,1.1,.1)
# Tick positions are fractions; the labels show them as percentages
yticks_labels=['{:.2f}'.format(item) for item in 100* yticks]

for dim in dimensions:
    arg_df = df.loc[df['dimension'] == dim,:]

    plt.figure()
    for level in ['instance','session']:
        arg_ins_df = arg_df.loc[arg_df['level'] == level,:]

        x = []
        y = []
        error= []
        for row in arg_ins_df.itertuples():
            # Each cell is stored as the string 'mean(sd)'
            val = getattr(row, METRIC)
            mean_txt, _, rest = val.partition('(')
            x.append(row.window)
            y.append(float(mean_txt))
            error.append(float(rest.split(')')[0]))

        plt.errorbar(x,y,error,marker='o',capsize=3,label='context' if level=='session' else 'instance')

    # Axis setup does not depend on the level, so it is done once per figure
    plt.xticks([30,60,90,120,180,240])
    plt.ylim([0,1])
    plt.yticks(yticks,yticks_labels)

    label = '{} classification performance ({})'.format(dim,METRIC)
    figure_file = '{}_lak23_window_{}.png'.format(dim,METRIC)

    plt.xlabel('window size')
    plt.ylabel('{} (%)'.format(METRIC.upper()))
    plt.title(label)

    plt.legend()
    plt.tight_layout()
    plt.savefig(figure_file,format='png')
    plt.show()