In [2]:
from openai import OpenAI
import pandas as pd
import numpy as np
import re
from collections import Counter
import random
import math
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import classification_report

In [3]:
def extract_first_number(s):
    """
    Extracts the first integer from a string, ignoring decimals.

    Args:
        s: The input string.

    Returns:
        The first integer found in the string, or None if no integer is found.
    """
    match = re.search(r'\d+', s)  # Matches only integers
    if match:
        return int(match.group())
    else:
        return None


## Find modes ###
def find_mode(data):
    """
    Finds the mode(s) of a list or pandas Series.
    
    Args:
        data: A list or pandas Series of values.
    
    Returns:
        A single mode if there are multiple modes;
        otherwise returns the mode(s) of the input data.
        Returns an empty list if the input is empty.
    """
    counts = Counter(data)
    max_count = max(counts.values())
    modes = [key for key, value in counts.items() if value == max_count]
    
    # If there are multiple modes, choose one randomly
    if len(modes) > 1:
        return random.choice(modes)
    
    return modes[0]

def SMD(truevalues,prediction):
    mt=truevalues.mean()
    mp=prediction.mean()
    stdt=truevalues.var()
    stdp=prediction.var()
    z= (mt-mp)/math.sqrt((stdt+stdt)/2)

    return(z)

In [5]:
path = 'C:/Users/mingf/Documents/bokeli/semester 8/AS_Multiple LLM/'
role = "You are a helpful rater in science education. You should rate students' responses based on the scoring guide and only return the scores.\n"

item='VR1'
scoring_prompt = pd.read_csv(path+'scoring prompts.csv')
scoring_prompt = scoring_prompt.loc[scoring_prompt.Item == item,'Prompt'].values[0]
resp = pd.read_excel(path+'All Graded Data from LPS3 2022_ Undergraduates.xlsx',sheet_name=item)
resp.replace({'Final': {'0a': 0}},inplace=True)
resp.replace({'Final': {'0b': 0}},inplace=True)
resp['Final']=resp['Final'].astype(int)



  resp.replace({'Final': {'0b': 0}},inplace=True)


In [4]:
resp.iloc[:, 2] = resp.iloc[:, 2].replace('0a', 0)
resp.iloc[:, 2] = resp.iloc[:, 2].replace('0b', 0)
resp.iloc[:, 3] = resp.iloc[:, 3].replace('0a', 0)
resp.iloc[:, 3] = resp.iloc[:, 3].replace('0b', 0)

resp['H2HA'] = (resp.iloc[:,2] == resp.iloc[:,3]).astype(float)
resp.loc[resp.iloc[:,2].isna(),'H2HA'] = np.nan
resp.loc[resp.iloc[:,3].isna(),'H2HA'] = np.nan

  resp.iloc[:, 2] = resp.iloc[:, 2].replace('0b', 0)
  resp.iloc[:, 3] = resp.iloc[:, 3].replace('0b', 0)


In [29]:

gpt_client = OpenAI(api_key='Input your own key')

In [30]:
temprs = [0,1]
nrep=5


for tempr in temprs:
    scP = []
    for index, row in resp.iterrows():
    #print(index)
        scoring_prompt_resp = scoring_prompt+'\n'+'RESPONSE:'+str(row[item])+'. ->'
        scL=[]
        for rp in range(nrep):
        
            gpt_scores = gpt_client.chat.completions.create(
                              model="gpt-4o-2024-11-20",
                              temperature=tempr,
                              n=1,
                              messages=[
                                {"role": "system", "content": role},
                                {"role": "user", "content": scoring_prompt_resp}
                              ])
            scL.append(gpt_scores.choices[0].message.content)

        scP.append(scL)

    gpt_scD = pd.DataFrame(scP,columns=['output_'+str(i )for i in range(nrep)])

    gpt_scD = pd.concat([gpt_scD,gpt_scD.apply(lambda col: [extract_first_number(i) for i in col]).\
    rename(columns=dict(zip(['output_'+str(i )for i in range(nrep)],['score_'+str(i )for i in range(nrep)])))], axis=1)

    gpt_scD['mode'] = gpt_scD.iloc[:,nrep:(2*nrep)].apply(find_mode,axis=1)

    gpt_scD.to_csv(path+'results/'+item+'_GPT_tempr'+str(tempr)+'.csv')



In [31]:
print(f"QWK: {cohen_kappa_score(gpt_scD['mode'],resp.Final,weights='quadratic')}")
print(f"Accuracy: {accuracy_score(gpt_scD['mode'],resp.Final)}")
print(f"rmse: {root_mean_squared_error(gpt_scD['mode'],resp.Final)}")
print(f"SMD: {SMD(prediction=gpt_scD['mode'],truevalues=resp.Final)}")
print(classification_report(gpt_scD['mode'],resp.Final))

QWK: 0.6437145763697137
Accuracy: 0.5837004405286343
rmse: 0.6897614987154126
SMD: 0.4711002503544767
              precision    recall  f1-score   support

           0       0.90      0.65      0.75        97
           1       0.69      0.33      0.45       212
           2       0.47      0.94      0.63       135
           3       0.38      0.50      0.43        10

    accuracy                           0.58       454
   macro avg       0.61      0.61      0.57       454
weighted avg       0.66      0.58      0.57       454



In [11]:
ds_client = OpenAI(api_key="Input your own key", base_url="https://api.deepseek.com")

In [35]:
temprs = [0,1]
nrep=5


for tempr in temprs:
    scP = []
    for index, row in resp.iterrows():
    #print(index)
        scoring_prompt_resp = scoring_prompt+'\n'+'RESPONSE:'+str(row[item])+'. ->'
        scL=[]
        for rp in range(nrep):
        
            ds_scores = ds_client.chat.completions.create(
                              model="deepseek-chat",
                              temperature=tempr,
                              n=1,
                              messages=[
                                {"role": "system", "content": role},
                                {"role": "user", "content": scoring_prompt_resp}
                              ])
            scL.append(ds_scores.choices[0].message.content)

        scP.append(scL)

    ds_scD = pd.DataFrame(scP,columns=['output_'+str(i )for i in range(nrep)])

    ds_scD = pd.concat([ds_scD,ds_scD.apply(lambda col: [extract_first_number(i) for i in col]).\
    rename(columns=dict(zip(['output_'+str(i )for i in range(nrep)],['score_'+str(i )for i in range(nrep)])))], axis=1)

    ds_scD['mode'] = ds_scD.iloc[:,nrep:(2*nrep)].apply(find_mode,axis=1)

    ds_scD.to_csv(path+'results/'+item+'_deepseek_tempr'+str(tempr)+'.csv')



In [5]:
qw_client = OpenAI(
    api_key="Input your own key", 
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)


In [7]:
temprs = [0,1]
nrep=5


for tempr in temprs:
    scP = []
    for index, row in resp.iterrows():
    #print(index)
        scoring_prompt_resp = scoring_prompt+'\n'+'RESPONSE:'+str(row[item])+'. ->'
        scL=[]
        for rp in range(nrep):
        
            qw_scores = qw_client.chat.completions.create(
                              model="qwen-plus",
                              temperature=tempr,
                              n=1,
                              messages=[
                                {"role": "system", "content": role},
                                {"role": "user", "content": scoring_prompt_resp}
                              ])
            scL.append(qw_scores.choices[0].message.content)

        scP.append(scL)

    qw_scD = pd.DataFrame(scP,columns=['output_'+str(i )for i in range(nrep)])

    qw_scD = pd.concat([qw_scD,qw_scD.apply(lambda col: [extract_first_number(i) for i in col]).\
    rename(columns=dict(zip(['output_'+str(i )for i in range(nrep)],['score_'+str(i )for i in range(nrep)])))], axis=1)

    qw_scD['mode'] = qw_scD.iloc[:,nrep:(2*nrep)].apply(find_mode,axis=1)

    qw_scD.to_csv(path+'results/'+item+'_qwen_tempr'+str(tempr)+'.csv')



In [13]:
print(f"QWK: {cohen_kappa_score(qw_scD['mode'],resp.Final,weights='quadratic')}")
print(f"Accuracy: {accuracy_score(qw_scD['mode'],resp.Final)}")
print(f"rmse: {root_mean_squared_error(qw_scD['mode'],resp.Final)}")
print(f"SMD: {SMD(prediction=qw_scD['mode'],truevalues=resp.Final)}")
print(classification_report(qw_scD['mode'],resp.Final))

QWK: 0.4425231296739678
Accuracy: 0.4043478260869565
rmse: 0.8444190181781018
SMD: 0.6327597196944021
              precision    recall  f1-score   support

           0       0.61      0.65      0.63        54
           1       0.84      0.33      0.47       330
           2       0.19      0.55      0.28        76
           3       0.00      0.00      0.00         0

    accuracy                           0.40       460
   macro avg       0.41      0.38      0.35       460
weighted avg       0.71      0.40      0.46       460



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [23]:
qw_scD_1 = pd.DataFrame({'output': [i[0] for i in scP],'prediction':[extract_first_number(i[0]) for i in scP],'tlabel':resp.Final})

In [24]:
print(f"QWK: {cohen_kappa_score(qw_scD_1['prediction'],qw_scD_1['tlabel'],weights='quadratic')}")
print(f"Accuracy: {accuracy_score(qw_scD_1['prediction'],qw_scD_1['tlabel'])}")
print(f"rmse: {root_mean_squared_error(qw_scD_1['prediction'],qw_scD_1['tlabel'])}")
print(f"SMD: {SMD(prediction=qw_scD_1['prediction'],truevalues=qw_scD_1['tlabel'])}")
print(classification_report(qw_scD_1['prediction'],qw_scD_1['tlabel']))

QWK: 0.7985629648304551
Accuracy: 0.9404255319148936
rmse: 0.26896175477549755
SMD: 0.0
              precision    recall  f1-score   support

           0       0.82      0.93      0.87        57
           1       0.99      0.95      0.97       391
           2       0.63      0.86      0.73        22

    accuracy                           0.94       470
   macro avg       0.81      0.91      0.86       470
weighted avg       0.95      0.94      0.94       470



In [26]:
print(classification_report(qw_scD_1['prediction'],scD_0['mode']))

              precision    recall  f1-score   support

           0       0.87      0.96      0.92        57
           1       0.97      0.97      0.97       391
           2       0.86      0.55      0.67        22

    accuracy                           0.95       470
   macro avg       0.90      0.83      0.85       470
weighted avg       0.95      0.95      0.95       470

