In [1]:
import pandas as pd
import scipy.stats as stats
from statsmodels.stats.contingency_tables import mcnemar


H1.1.1: There is a statistically significant correlation between contextual understanding rating and accuracy of guesses for groups of concrete, abstract, emotional words.

In [18]:
df = pd.read_csv('Taboo_Game_Data(v2).csv')

#Filter rows based on wordGroup
word_groups_to_include = ['simple', 'abstract', 'emotion']
filtered_df = df[df['wordGroup'].isin(word_groups_to_include)]

contextual_rating = filtered_df['rating']
accuracy = filtered_df['guessed']

#Calculate Kendall's Tau
tau, p_value = stats.kendalltau(contextual_rating, accuracy)

#Print results
print(f"Kendall's Tau: {tau}")
print(f"P-value: {p_value}")

if p_value < 0.05:
    print("There is a statistically significant correlation between contextual understanding rating and accuracy of guesses for groups of concrete, abstract, emotional words.")
else:
    print("There is no statistically significant correlation between contextual understanding rating and accuracy of guesses for groups of concrete, abstract, emotional words.")


Kendall's Tau: 0.5643479263927054
P-value: 1.2267223021626354e-32
There is a statistically significant correlation between contextual understanding rating and accuracy of guesses for groups of concrete, abstract, emotional words.


H1.1.2: There is a statistically significant correlation between contextual understanding rating and accuracy of guesses for groups of IT words.

In [19]:
df = pd.read_csv('Taboo_Game_Data(v2).csv')

#Calculate the mean of itKnowledge 
unique_names_df = df.drop_duplicates(subset='Name')
mean_it_knowledge = unique_names_df['itKnowledge'].mean()

#Separate data based on level of IT knowledge
low_it_df = df[df['itKnowledge'] < mean_it_knowledge]
high_it_df = df[df['itKnowledge'] > mean_it_knowledge]

#Calculate Kendall's Tau for low IT group
low_it_contextual_rating = low_it_df['rating']
low_it_accuracy = low_it_df['guessed']
tau, p_value = stats.kendalltau(low_it_contextual_rating,low_it_accuracy)

#Print results
print(f"Kendall's Tau for low IT group: {tau}")
print(f"P-value: {p_value}")

if p_value < 0.05:
    print("There is a statistically significant correlation between contextual understanding rating" 
          + " and accuracy of guesses for group of IT words among people with low IT knowledge.")
else:
    print("There is no statistically significant correlation between contextual understanding rating" 
            +" and accuracy of guesses for group for groups of IT words among people with low IT knowledge.")


#Calculate Kendall's Tau for high IT group
high_it_contextual_rating = high_it_df['rating']
high_it_accuracy = high_it_df['guessed']
tau, p_value = stats.kendalltau(high_it_contextual_rating, high_it_accuracy)

#Print results
print(f"Kendall's Tau for high IT group: {tau}")
print(f"P-value: {p_value}")

if p_value < 0.05:
    print("There is a statistically significant correlation between contextual understanding rating" 
          + " and accuracy of guesses for group of IT words among people with high IT knowledge.")
else:
    print("There is no statistically significant correlation between contextual understanding rating" 
            +" and accuracy of guesses for group for groups of IT words among people with high IT knowledge.")    


Kendall's Tau for low IT group: 0.6108773035001068
P-value: 8.082989366002255e-25
There is a statistically significant correlation between contextual understanding rating and accuracy of guesses for group of IT words among people with low IT knowledge.
Kendall's Tau for high IT group: 0.579241870030421
P-value: 3.9066529680510135e-24
There is a statistically significant correlation between contextual understanding rating and accuracy of guesses for group of IT words among people with high IT knowledge.


H1.2.1: There is a statistically significant correlation between contextual understanding rating and TTS for groups of concrete, abstract, emotional words.

In [4]:
df = pd.read_csv('Taboo_Game_Data(v2).csv')

#Filter rows based on wordGroup
word_groups_to_include = ['simple', 'abstract', 'emotion']
filtered_df = df[df['wordGroup'].isin(word_groups_to_include)]

#Filter rows where TTS is not null 
guessed_true_df = filtered_df[filtered_df['guessed'] == True]
contextual_rating = guessed_true_df['rating']
TTS = guessed_true_df['timeToSuccess']

# Calculate the Pearson correlation coefficient and the p-value
correlation_coefficient, p_value = stats.pearsonr(contextual_rating, TTS)

# Print the results
print(f"Pearson Correlation Coefficient: {correlation_coefficient}")
print(f"P-Value: {p_value}")

if p_value < 0.05:
    print("There is a statistically significant correlation between contextual understanding rating" +
            " and TTS for groups of concrete, abstract, emotional words.")
else:
    print("There is no statistically significant correlation between contextual understanding rating" + 
          "and TTS for groups of concrete, abstract, emotional words.")


Pearson Correlation Coefficient: -0.24380955681648733
P-Value: 0.0004712358666319909
There is a statistically significant correlation between contextual understanding rating and TTS for groups of concrete, abstract, emotional words.


H1.2.2: There is a statistically significant correlation between contextual understanding rating and TTS for group of IT words.

In [5]:
df = pd.read_csv('Taboo_Game_Data(v2).csv')

#Calculate the mean of itKnowledge 
unique_names_df = df.drop_duplicates(subset='Name')
mean_it_knowledge = unique_names_df['itKnowledge'].mean()

#Separate data based on level of IT knowledge
low_it_df = df[df['itKnowledge'] < mean_it_knowledge]
high_it_df = df[df['itKnowledge'] > mean_it_knowledge]

#Filter rows based on wordGroup
word_groups_to_include = ['IT']

filtered_low_it_df = low_it_df[low_it_df['wordGroup'].isin(word_groups_to_include)]
filtered_high_it_df = high_it_df[high_it_df['wordGroup'].isin(word_groups_to_include)]

#Filter rows where TTS is not null and calculate the Pearson correlation coefficient and the p-value for low IT group 
guessed_true_low_it_df = filtered_low_it_df[filtered_low_it_df['guessed'] == True]
low_it_rating = guessed_true_low_it_df['rating']
low_it_TTS = guessed_true_low_it_df['timeToSuccess']

correlation_coefficient, p_value = stats.pearsonr(low_it_rating, low_it_TTS)

# Print the results
print(f"Pearson Correlation Coefficient: {correlation_coefficient}")
print(f"P-Value: {p_value}")

if p_value < 0.05:
    print("There is a statistically significant correlation between contextual understanding rating and TTS for group IT words among people with limited IT knowledge.")
else:
    print("There is no statistically significant correlation between contextual understanding rating and TTS for group of IT words among people with limited IT knowledge.")

#Filter rows where TTS is not null and calculate the Pearson correlation coefficient and the p-value for high IT group 

guessed_true_high_it_df = filtered_high_it_df[filtered_high_it_df['guessed'] == True]
high_it_rating = guessed_true_high_it_df['rating']
high_it_TTS = guessed_true_high_it_df['timeToSuccess']

correlation_coefficient, p_value = stats.pearsonr(high_it_rating, high_it_TTS)

# Print the results
print(f"\nPearson Correlation Coefficient: {correlation_coefficient}")
print(f"P-Value: {p_value}")

if p_value < 0.05:
    print("There is a statistically significant correlation between contextual understanding rating and TTS for group IT words among people with good IT knowledge.")
else:
    print("There is no statistically significant correlation between contextual understanding rating and TTS for group of IT words among people with good IT knowledge.")



Pearson Correlation Coefficient: -0.26207276942192403
P-Value: 0.41058384873962206
There is no statistically significant correlation between contextual understanding rating and TTS for group of IT words among people with limited IT knowledge.

Pearson Correlation Coefficient: -0.34189028101050317
P-Value: 0.06947866946942606
There is no statistically significant correlation between contextual understanding rating and TTS for group of IT words among people with good IT knowledge.


H2.1: There is a statistically significant difference in the rating of contextual understanding in ChatGPT-generated and human-generated prompts for concrete words. 

In [6]:
data = pd.read_csv('Taboo_Game_Data(v2).csv')

concrete_df = data[data['wordGroup'].isin(['simple'])]

gpt_concrete_df = concrete_df[concrete_df['generatedBy'].isin(['gpt'])]
human_concrete_df = concrete_df[concrete_df['generatedBy'].isin(['human'])]

ChatGPT_rating = gpt_concrete_df['rating']
Human_rating = human_concrete_df['rating']

#Perform Wilcoxon signed-rank test
statistic, p_value = stats.wilcoxon(ChatGPT_rating, Human_rating)

# Output the test statistic and p-value
print(f"Wilcoxon statistic: {statistic}")
print(f"P-value: {p_value}")

# Interpret the results based on the p-value
if p_value < 0.05:
    print("There is a statistically significant difference between the paired samples.")
else:
    print("There is no statistically significant difference between the paired samples.")

Wilcoxon statistic: 297.5
P-value: 0.0008794839441877213
There is a statistically significant difference between the paired samples.


H2.2: There is a statistically significant difference in the rating of contextual understanding in ChatGPT-generated and human-generated prompts for abstract words.

In [7]:
data = pd.read_csv('Taboo_Game_Data(v2).csv')

abstract_df = data[data['wordGroup'].isin(['abstract'])]

#Filter data based on prompt provider, only legit pairs are kept
gpt_abstract_df = abstract_df[abstract_df['generatedBy'].isin(['gpt'])]
unique_name_gpt_df = gpt_abstract_df.drop_duplicates(subset='Name',keep=False)

human_abstract_df = abstract_df[abstract_df['generatedBy'].isin(['human'])]
unique_name_human_df = human_abstract_df.drop_duplicates(subset='Name',keep=False)

#Join two DF based on the same name 
combined_abstract_df = unique_name_gpt_df.merge(unique_name_human_df, on='Name', suffixes=('_gpt', '_human'))
ChatGPT_rating = combined_abstract_df['rating_gpt']
Human_rating = combined_abstract_df['rating_human']

#Perform Wilcoxon signed-rank test
statistic, p_value = stats.wilcoxon(ChatGPT_rating, Human_rating,alternative='two-sided', mode='approx')

# Output the test statistic and p-value
print(f"Wilcoxon statistic: {statistic}")
print(f"P-value: {p_value}")

# Interpret the results based on the p-value
if p_value < 0.05:
    print("There is a statistically significant difference between the paired samples.")
else:
    print("There is no statistically significant difference between the paired samples.")

Wilcoxon statistic: 63.5
P-value: 6.4175256917547336e-06
There is a statistically significant difference between the paired samples.


H2.3: There is a statistically significant difference in the rating of contextual understanding in ChatGPT-generated and human-generated prompts for words describing emotional states.

In [8]:
data = pd.read_csv('Taboo_Game_Data(v2).csv')

emotion_df = data[data['wordGroup'].isin(['emotion'])]

gpt_emotion_df = emotion_df[emotion_df['generatedBy'].isin(['gpt'])]
unique_name_gpt_df = gpt_emotion_df.drop_duplicates(subset='Name', keep = False)

human_emotion_df = emotion_df[emotion_df['generatedBy'].isin(['human'])]
unique_name_human_df = human_emotion_df.drop_duplicates(subset='Name', keep = False)

#Join two DF based on the same name 
combined_emotion_df = unique_name_gpt_df.merge(unique_name_human_df, on='Name', suffixes=('_gpt', '_human'))

ChatGPT_rating = combined_emotion_df['rating_gpt']
Human_rating = combined_emotion_df['rating_human']

#Perform Wilcoxon signed-rank test
statistic, p_value = stats.wilcoxon(ChatGPT_rating, Human_rating,alternative='two-sided', mode='approx')

# Output the test statistic and p-value
print(f"Wilcoxon statistic: {statistic}")
print(f"P-value: {p_value}")

# Interpret the results based on the p-value
if p_value < 0.05:
    print("There is a statistically significant difference between the paired samples.")
else:
    print("There is no statistically significant difference between the paired samples.")

Wilcoxon statistic: 58.0
P-value: 5.5679593164750926e-05
There is a statistically significant difference between the paired samples.


H2.4: There is a statistically significant difference in the rating of contextual understanding in ChatGPT-generated and human-generated prompts for IT-specific words.

In [9]:
data = pd.read_csv('Taboo_Game_Data(v2).csv')

IT_df = data[data['wordGroup'].isin(['IT'])]

gpt_IT_df = IT_df[IT_df['generatedBy'].isin(['gpt'])]
human_IT_df = IT_df[IT_df['generatedBy'].isin(['human'])]

ChatGPT_rating = gpt_IT_df['rating']
Human_rating = human_IT_df['rating']

#Perform Wilcoxon signed-rank test
statistic, p_value = stats.wilcoxon(ChatGPT_rating, Human_rating)

# Output the test statistic and p-value
print(f"Wilcoxon statistic: {statistic}")
print(f"P-value: {p_value}")

# Interpret the results based on the p-value
if p_value < 0.05:
    print("There is a statistically significant difference between the paired samples.")
else:
    print("There is no statistically significant difference between the paired samples.")

Wilcoxon statistic: 51.5
P-value: 1.334285717309862e-05
There is a statistically significant difference between the paired samples.


H3.1.1: In terms of accuracy, there is a statistically significant difference between the responses to ChatGPT-generated and human-generated prompts for concrete words.

In [10]:
data = pd.read_csv('Taboo_Game_Data(v2).csv')

#Filter results 
df = data[data['wordGroup'].isin(['simple'])]

gpt_df = df[df['generatedBy'].isin(['gpt'])]
human_df = df[df['generatedBy'].isin(['human'])]

#Create contigency table
combined_df = gpt_df.merge(human_df, on='Name', suffixes=('_gpt', '_human'))
contingency_table = pd.crosstab(combined_df['guessed_gpt'], combined_df['guessed_human'])

# Perform McNemar's test
result = mcnemar(contingency_table,exact=True)

# Output the test statistic and p-value
print(f"McNemar's test statistic: {result.statistic}")
print(f"P-value: {result.pvalue}")

# Interpret the results based on the p-value
if result.pvalue < 0.05:
    print("There is a statistically significant difference between the paired samples.")
else:
    print("There is no statistically significant difference between the paired samples.")


McNemar's test statistic: 9.0
P-value: 0.08715855330228806
There is no statistically significant difference between the paired samples.


H3.1.2: In terms of accuracy, there is a statistically significant difference between the responses to ChatGPT-generated and human-generated prompts for abstract words.

In [11]:
data = pd.read_csv('Taboo_Game_Data(v2).csv')

df = data[data['wordGroup'].isin(['abstract'])]

#Filter the data and ensure there are only legit pairs 
gpt_df = df[df['generatedBy'].isin(['gpt'])]
unique_name_gpt_df = gpt_df.drop_duplicates(subset='Name',keep=False)

human_df = df[df['generatedBy'].isin(['human'])]
unique_name_human_df = human_df.drop_duplicates(subset='Name',keep=False)

combined_df = unique_name_gpt_df.merge(unique_name_human_df, on='Name', suffixes=('_gpt', '_human'))

contingency_table = pd.crosstab(combined_df['guessed_gpt'], combined_df['guessed_human'])

# Perform McNemar's test
result = mcnemar(contingency_table,exact=True)

# Output the test statistic and p-value
print(f"McNemar's test statistic: {result.statistic}")
print(f"P-value: {result.pvalue}")

# Interpret the results based on the p-value
if result.pvalue < 0.05:
    print("There is a statistically significant difference between the paired samples.")
else:
    print("There is no statistically significant difference between the paired samples.")


McNemar's test statistic: 6.0
P-value: 0.005924612283706665
There is a statistically significant difference between the paired samples.


H3.1.3: In terms of accuracy, there is a statistically significant difference between the responses to ChatGPT-generated and human-generated prompts for words describing emotional states.

In [12]:
data = pd.read_csv('Taboo_Game_Data(v2).csv')


df = data[data['wordGroup'].isin(['emotion'])]

#Filter the data and ensure there are only legit pairs 
gpt_df = df[df['generatedBy'].isin(['gpt'])]
unique_name_gpt_df = gpt_df.drop_duplicates(subset='Name',keep=False)

human_df = df[df['generatedBy'].isin(['human'])]
unique_name_human_df = human_df.drop_duplicates(subset='Name',keep=False)

#Create contigency table
combined_df = unique_name_gpt_df.merge(unique_name_human_df, on='Name', suffixes=('_gpt', '_human'))
contingency_table = pd.crosstab(combined_df['guessed_gpt'], combined_df['guessed_human'])

# Perform McNemar's test
result = mcnemar(contingency_table,exact=True)

# Output the test statistic and p-value
print(f"McNemar's test statistic: {result.statistic}")
print(f"P-value: {result.pvalue}")

# Interpret the results based on the p-value
if result.pvalue < 0.05:
    print("There is a statistically significant difference between the paired samples.")
else:
    print("There is no statistically significant difference between the paired samples.")


McNemar's test statistic: 3.0
P-value: 0.0025768280029296875
There is a statistically significant difference between the paired samples.


H3.1.4: In terms of accuracy, there is a statistically significant difference between the responses to ChatGPT-generated and human-generated prompts for IT words.

In [13]:
data = pd.read_csv('Taboo_Game_Data(v2).csv')

#Filter results 
df = data[data['wordGroup'].isin(['IT'])]

gpt_df = df[df['generatedBy'].isin(['gpt'])]
human_df = df[df['generatedBy'].isin(['human'])]

#Create contigency table
combined_df = gpt_df.merge(human_df, on='Name', suffixes=('_gpt', '_human'))
contingency_table = pd.crosstab(combined_df['guessed_gpt'], combined_df['guessed_human'])

# Perform McNemar's test
result = mcnemar(contingency_table,exact=True)

# Output the test statistic and p-value
print(f"McNemar's test statistic: {result.statistic}")
print(f"P-value: {result.pvalue}")

# Interpret the results based on the p-value
if result.pvalue < 0.05:
    print("There is a statistically significant difference between the paired samples.")
else:
    print("There is no statistically significant difference between the paired samples.")


McNemar's test statistic: 2.0
P-value: 6.604194641113281e-05
There is a statistically significant difference between the paired samples.


H3.2.1: In terms of time to succeed (TTS), there is a satistically significant difference between responses to ChatGPT-generated and human-generated prompts for concrete words. 

In [14]:
data = pd.read_csv('Taboo_Game_Data(v2).csv')

#Filter results 
df = data[data['wordGroup'].isin(['simple'])]

#Filter results where TTS is not null
true_df = df[df['guessed'] == True]

#Filter results according to prompt provider
gpt_df = true_df[true_df['generatedBy'].isin(['gpt'])]
human_df = true_df[true_df['generatedBy'].isin(['human'])]

#Filter TTS 
combined_df = gpt_df.merge(human_df, on='Name', suffixes=('_gpt', '_human'))
GPT_TTS = combined_df['timeToSuccess_gpt']
Human_TTS = combined_df['timeToSuccess_human']

#Perform paired t-test
t_statistic, p_value_t = stats.ttest_rel(GPT_TTS, Human_TTS)

# Display the results
print("T-Statistic:", t_statistic)
print("P-Value:", p_value_t)

# Determine if the difference is statistically significant  
if p_value_t < 0.05:
    print("There is a statistically significant difference between the paired samples.")
else:
    print("There is no statistically significant difference between the paired samples.")

#Perform Wilcoxon signed-rank test
statistic, p_value = stats.wilcoxon(GPT_TTS, Human_TTS)

# Output the test statistic and p-value
print(f"Wilcoxon statistic: {statistic}")
print(f"P-value: {p_value}")

# Interpret the results based on the p-value
if p_value < 0.05:
    print("There is a statistically significant difference between the paired samples. \n")
else:
    print("There is no statistically significant difference between the paired samples. \n")

T-Statistic: 2.1476602341757767
P-Value: 0.04204887704884888
There is a statistically significant difference between the paired samples.
Wilcoxon statistic: 82.0
P-value: 0.029578447341918945
There is a statistically significant difference between the paired samples. 



H3.2.2: In terms of time to succeed (TTS), there is a satistically significant difference between responses to ChatGPT-generated and human-generated prompts for abstract words. 

In [15]:
data = pd.read_csv('Taboo_Game_Data(v2).csv')

#Filter results 
df = data[data['wordGroup'].isin(['abstract'])]

#Filter results where TTS is not null
true_df = df[df['guessed'] == True]

#Filter results according to prompt provider and ensure there are only legit pairs
gpt_df = true_df[true_df['generatedBy'].isin(['gpt'])]
unique_name_gpt_df = gpt_df.drop_duplicates(subset='Name',keep=False)

human_df = true_df[true_df['generatedBy'].isin(['human'])]
unique_name_human_df = human_df.drop_duplicates(subset='Name',keep=False)

#Filter TTS 
combined_df = unique_name_gpt_df.merge(unique_name_human_df, on='Name', suffixes=('_gpt', '_human'))
GPT_TTS = combined_df['timeToSuccess_gpt']
Human_TTS = combined_df['timeToSuccess_human']

#Perform paired t-test
t_statistic, p_value_t = stats.ttest_rel(GPT_TTS, Human_TTS)

# Display the results
print("T-Statistic:", t_statistic)
print("P-Value:", p_value_t)

# Determine if the difference is statistically significant
if p_value_t < 0.05:
    print("There is a statistically significant difference between the paired samples.")
else:
    print("There is no statistically significant difference between the paired samples.")
    
#Perform Wilcoxon signed-rank test
statistic, p_value = stats.wilcoxon(GPT_TTS, Human_TTS)

# Output the test statistic and p-value
print(f"Wilcoxon statistic: {statistic}")
print(f"P-value: {p_value}")

# Interpret the results based on the p-value
if p_value < 0.05:
    print("There is a statistically significant difference between the paired samples.")
else:
    print("There is no statistically significant difference between the paired samples.")

T-Statistic: 1.0798153684065404
P-Value: 0.30333165290828507
There is no statistically significant difference between the paired samples.
Wilcoxon statistic: 27.0
P-value: 0.38037109375
There is no statistically significant difference between the paired samples.


H3.2.3: In terms of time to succeed (TTS), there is a satistically significant difference between responses to ChatGPT-generated and human-generated prompts for words describing emotional states. 

In [16]:
data = pd.read_csv('Taboo_Game_Data(v2).csv')

#Filter results 
df = data[data['wordGroup'].isin(['emotion'])]

#Filter results where TTS is not null
true_df = df[df['guessed'] == True]

#Filter results according to prompt provider
gpt_df = true_df[true_df['generatedBy'].isin(['gpt'])]
unique_name_gpt_df = gpt_df.drop_duplicates(subset='Name',keep=False)

human_df = true_df[true_df['generatedBy'].isin(['human'])]
unique_name_human_df = human_df.drop_duplicates(subset='Name',keep=False)

#Filter TTS 
combined_df = unique_name_gpt_df.merge(unique_name_human_df, on='Name', suffixes=('_gpt', '_human'))
GPT_TTS = combined_df['timeToSuccess_gpt']
Human_TTS = combined_df['timeToSuccess_human']

#Perform Wilcoxon signed-rank test
statistic, p_value = stats.wilcoxon(GPT_TTS, Human_TTS)

# Output the test statistic and p-value
print(f"Wilcoxon statistic: {statistic}")
print(f"P-value: {p_value}")

# Interpret the results based on the p-value
if p_value < 0.05:
    print("There is a statistically significant difference between the paired samples.")
else:
    print("There is no statistically significant difference between the paired samples.")

Wilcoxon statistic: 5.0
P-value: 0.009765625
There is a statistically significant difference between the paired samples.


H3.2.4: In terms of time to succeed (TTS), there is a satistically significant difference between responses to ChatGPT-generated and human-generated prompts for IT-specific words. 

In [17]:
data = pd.read_csv('Taboo_Game_Data(v2).csv')

#Filter results 
df = data[data['wordGroup'].isin(['IT'])]

#Filter results where TTS is not null
true_df = df[df['guessed'] == True]

#Filter results according to prompt provider
gpt_df = true_df[true_df['generatedBy'].isin(['gpt'])]
human_df = true_df[true_df['generatedBy'].isin(['human'])]

#Filter TTS 
combined_df = gpt_df.merge(human_df, on='Name', suffixes=('_gpt', '_human'))
GPT_TTS = combined_df['timeToSuccess_gpt']
Human_TTS = combined_df['timeToSuccess_human']

#Perform Wilcoxon signed-rank test
statistic, p_value = stats.wilcoxon(GPT_TTS, Human_TTS)

# Output the test statistic and p-value
print(f"Wilcoxon statistic: {statistic}")
print(f"P-value: {p_value}")

# Interpret the results based on the p-value
if p_value < 0.05:
    print("There is a statistically significant difference between the paired samples.")
else:
    print("There is no statistically significant difference between the paired samples.")

Wilcoxon statistic: 4.0
P-value: 0.02734375
There is a statistically significant difference between the paired samples.
