Some metrics to evaluate the human study. 

The basic structure of the study:
Split into two parts. In each part 4 conditions are tested: 
    
    Questions 0-10 Two Layer Neural Network
    Questions 11-20 Markov Chain Model
    Questions 21-30 Real Song Text
    Questions 31-40 Random Text
    
The first tests for the naturalness of the sentence. 
The second part compares the 

What do we want to know?  
Is there a correlation between a natural sentence and a human sentence?  

The Spearman Correlation results suggest a statistically significant correlation between a sentence that is percieved as natural and perceived as written by a human.  

Are our sentences significantly different from real or random text? t-test shows a significant difference between all conditions expect tl and mc 


Is there correlation to the demographic?  


Something about classification?  

In [1]:
#Data cleaning

In [2]:
import pandas as pd

In [3]:
data=pd.read_csv("study_results.csv",sep=';')

In [4]:
#filter for only completed studies
compl=data[data['Teilnahmestatus']=="teilgenommen und beendet"]

#drop all columns with text only
compl=compl.dropna(axis=1, how='all')

In [5]:
#drop all the meta data
compl=compl.drop(['_Antwort-ID', 'Resume-Code',"Start","Datum und Zeit","Teilnahmestatus"], axis=1)

#drop all questions about the participants
onlyanswers=compl.drop(['1. How old are you?',"Beginner (1) - Native Language (7)","I dont listen to rap at all (1) - I hear a lot of rap music (7)","No prior experience with machine generated text (1) - Frequent interaction with machine generated text (7)"],axis=1)

In [6]:
#The scale changed for the second half of the study (mainly to avoid bias) to make sure participants noticed 
#there was an extra testquestion. Four participants didnt answer the test question right and were excluded. 

onlyanswers=onlyanswers[onlyanswers['7']!=1.0]


In [7]:
#At the beginning of each part were test questions that should help familarise the participants with the setup and 
#make sure that they understood the task.
#Answers below a certain value would indicate that the participant had difficulties with the task.
#Some participants answered one of the four test question incorrectly but looking at their data 
#there is no indication that they had problems with the task overall.
#For this reason they were kept in the evaluation. 

#Extract the test questions
testquestion = onlyanswers[["Completely Unatural (1) - Completely Natural (7)", "Completely Unatural (1) - Completely Natural (7).1","Written by a human (1) - Written by a machine (7)","Written by a human (1) - Written by a machine (7).1"]].copy()


testquestion=testquestion.rename(columns={"Completely Unatural (1) - Completely Natural (7)": "RealNat", "Completely Unatural (1) - Completely Natural (7).1": "RandomNat","Written by a human (1) - Written by a machine (7)":"RealComp","Written by a human (1) - Written by a machine (7).1":"RandomComp"})
print("RealNat")
print(testquestion[testquestion.RealNat <3.0])
print("RandomNat")
print(testquestion[testquestion.RandomNat>5.0])
print("RealComp")
print(testquestion[testquestion.RealComp >5.0])
print("RandomComp")
print(testquestion[testquestion.RandomComp<3.0])


RealNat
    RealNat  RandomNat  RealComp  RandomComp
15        2          3         2           6
RandomNat
    RealNat  RandomNat  RealComp  RandomComp
20        5          7         5           3
RealComp
   RealNat  RandomNat  RealComp  RandomComp
8        3          1         6           7
RandomComp
   RealNat  RandomNat  RealComp  RandomComp
2        4          2         3           2


In [8]:
#Getting different dataframes

In [9]:
def frame_natural(inputframe):
    #Naturalness:

    #get all column names that belong to the natural part
    nat_col = [col for col in inputframe.columns if 'Natural' in col]

    #make a new dataframe
    natural= inputframe[nat_col].copy()

    #the order of the naturalness questions
    natorder=[30,23,7,9,25,26,12,19,8,20,3,33,28,14,36,24,37,5,11,39,35,10,2,32,38,6,13,17,40,15,34,4,29,16,18,31,21,22,1,27]

    #rename columns
    natural.columns = natural.columns[:2].tolist() + natorder

    #drop the first to columns they contain the test data
    natural=natural.drop(["Completely Unatural (1) - Completely Natural (7)","Completely Unatural (1) - Completely Natural (7).1"],axis=1)

    #sort the questions in their original order
    natural = natural.reindex(sorted(natural.columns), axis=1)
    return natural

In [10]:
natural=frame_natural(onlyanswers)

In [11]:
natural.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,31,32,33,34,35,36,37,38,39,40
0,6,5.0,2,3,4,2,3,3,3,4,...,1,2,1,1,1,2.0,2,2,2,1
1,7,3.0,5,6,5,4,2,4,5,3,...,1,2,2,1,1,2.0,1,1,2,1
2,3,5.0,3,3,5,4,3,4,2,4,...,3,4,3,4,3,4.0,3,3,4,3
4,5,6.0,5,4,6,5,2,5,5,6,...,6,2,2,2,2,5.0,2,2,5,2
5,5,5.0,5,3,3,4,4,4,3,5,...,4,4,5,5,4,4.0,3,4,4,5


In [12]:
def frame_comparison(inputframe):
    #Comparison:

    #get all column names that belong to the natural part
    comp_col = [col for col in inputframe.columns if 'human' in col]

    #make a new dataframe
    compare= inputframe[comp_col].copy()

    #the order of the naturalness questions
    comporder=[9,18,39,28,27,7,31,10,32,20,8,40,33,1,11,25,2,38,34,24,19,26,30,35,29,36,22,23,3,21,17,13,12,6,15,14,4,16,5,37]

    #rename columns
    compare.columns = compare.columns[:2].tolist() + comporder

    #drop the first to columns they contain the test data
    compare=compare.drop(["Written by a human (1) - Written by a machine (7)","Written by a human (1) - Written by a machine (7).1"],axis=1)

    #sort the questions in their original order
    compare = compare.reindex(sorted(compare.columns), axis=1)
    
    #In the second part of the study the scale flipped to avoid bias. But to compare the to parts here values we changed back. 
    #So 7 means written by a human, 1 written by a machine. 
    compare=compare.replace([1.0, 2.0, 3.0, 4.0,5.0,6.0,7.0], [7.0, 6.0, 5.0, 4.0,3.0,2.0,1.0])
    
    return compare

In [13]:
compare=frame_comparison(onlyanswers)

In [14]:

print(compare)

     1    2    3    4    5    6    7    8    9    10  ...   31   32   33   34  \
0   6.0  3.0  3.0  3.0  6.0  3.0  2.0  2.0  4.0  3.0  ...  1.0  3.0  1.0  1.0   
1   6.0  2.0  3.0  4.0  5.0  3.0  2.0  5.0  2.0  3.0  ...  2.0  1.0  1.0  1.0   
2   5.0  3.0  5.0  6.0  5.0  6.0  5.0  6.0  6.0  4.0  ...  6.0  6.0  5.0  5.0   
4   6.0  6.0  6.0  5.0  2.0  6.0  6.0  6.0  5.0  6.0  ...  6.0  6.0  5.0  2.0   
5   4.0  4.0  5.0  4.0  5.0  3.0  4.0  3.0  5.0  NaN  ...  5.0  5.0  4.0  5.0   
6   2.0  6.0  5.0  3.0  3.0  3.0  4.0  4.0  6.0  3.0  ...  1.0  3.0  3.0  4.0   
7   2.0  2.0  2.0  4.0  1.0  2.0  2.0  2.0  1.0  1.0  ...  1.0  2.0  1.0  1.0   
8   1.0  1.0  2.0  3.0  2.0  3.0  1.0  3.0  1.0  1.0  ...  1.0  1.0  1.0  3.0   
9   2.0  6.0  6.0  2.0  5.0  3.0  2.0  2.0  2.0  6.0  ...  6.0  6.0  6.0  6.0   
10  5.0  2.0  5.0  6.0  3.0  3.0  2.0  5.0  4.0  2.0  ...  1.0  1.0  1.0  1.0   
12  3.0  4.0  3.0  4.0  4.0  4.0  3.0  4.0  3.0  3.0  ...  3.0  3.0  3.0  3.0   
13  3.0  3.0  3.0  3.0  2.0 

In [15]:
compare.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,31,32,33,34,35,36,37,38,39,40
0,6.0,3.0,3.0,3.0,6.0,3.0,2.0,2.0,4.0,3.0,...,1.0,3.0,1.0,1.0,2.0,2.0,3.0,2.0,2.0,2.0
1,6.0,2.0,3.0,4.0,5.0,3.0,2.0,5.0,2.0,3.0,...,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
2,5.0,3.0,5.0,6.0,5.0,6.0,5.0,6.0,6.0,4.0,...,6.0,6.0,5.0,5.0,5.0,5.0,6.0,5.0,6.0,5.0
4,6.0,6.0,6.0,5.0,2.0,6.0,6.0,6.0,5.0,6.0,...,6.0,6.0,5.0,2.0,2.0,6.0,5.0,2.0,3.0,5.0
5,4.0,4.0,5.0,4.0,5.0,3.0,4.0,3.0,5.0,,...,5.0,5.0,4.0,5.0,,4.0,5.0,5.0,3.0,3.0


In [16]:

print("There are",natural.isna().sum().sum(),"questions with NAN in the natural set")
print("There are",compare.isna().sum().sum(),"questions with NAN in the comparison set")

There are 4 questions with NAN in the natural set
There are 6 questions with NAN in the comparison set


In [17]:
#Some participants skipped questions for easier evaluation the mean of the other participants for this question is added
natural=natural.apply(lambda x: x.fillna(int(x.mean())),axis=0)
compare=compare.apply(lambda x: x.fillna(int(x.mean())),axis=0)

In [18]:
def split_models(inputframe):
    
    tl=inputframe.iloc[:, : 10]
    mc=inputframe.iloc[:, 10 :20]
    real=inputframe.iloc[:, 20:30]
    rand=inputframe.iloc[:, 30:40]
    allframe=[tl,mc,real,rand]
    return tl,mc,real,rand,allframe

In [19]:
tlnat,mcnat,realnat,randnat,allnat=split_models(natural)
tlcom,mccom,realcom,randcom,allcom=split_models(compare)


In [20]:
print(tlnat)
print(tlcom)

    1    2   3   4   5   6   7   8   9   10
0    6  5.0   2   3   4   2   3   3   3   4
1    7  3.0   5   6   5   4   2   4   5   3
2    3  5.0   3   3   5   4   3   4   2   4
4    5  6.0   5   4   6   5   2   5   5   6
5    5  5.0   5   3   3   4   4   4   3   5
6    5  3.0   2   3   3   1   3   3   6   4
7    2  1.0   2   4   3   1   1   3   2   2
8    1  2.0   4   2   2   1   1   2   1   1
9    6  3.0   3   2   6   6   2   6   2   3
10   5  1.0   5   5   3   4   2   5   5   2
12   3  4.0   4   3   4   3   3   4   3   3
13   4  5.0   2   4   2   2   2   5   1   2
14   2  3.0   2   3   2   3   2   3   5   5
15   5  3.0   3   4   3   3   1   2   2   2
18   7  7.0   7   7   7   7   2   7   7   7
19   6  3.0   5   5   6   3   4   3   2   3
20   2  2.0   3   3   2   5   5   4   5   3
21   4  6.0   3   7   4   1   4   6   5   4
22   3  3.0   3   4   2   4   2   2   3   4
23   5  4.0   5   3   6   5   2   2   4   5
     1    2    3    4    5    6    7    8    9    10
0   6.0  3.0  3.0  3.0 

In [21]:
tlall=pd.concat([tlnat, tlcom], axis=1)
mcall=pd.concat([mcnat, mccom], axis=1)
realall=pd.concat([realnat, realcom], axis=1)
randall=pd.concat([randnat, randcom], axis=1)

allsorted=[tlall,mcall,realall,randall]

In [22]:
#Getting mean and std

In [23]:
def mean_and_std(dataframe):
    mean=round((dataframe.mean().mean()),2)
    std=round((dataframe.stack().std()),2)
    return mean,std

In [24]:
print("Mean and Std for Naturalness:")
for frame in allnat:
    print(mean_and_std(frame))

print()
print("Mean and Std for Comparison:")
for frame in allcom:
    print(mean_and_std(frame))
    
print()
print("Mean and Std for both:")
for frame in allsorted:
    print(mean_and_std(frame))

Mean and Std for Naturalness:
(3.64, 1.59)
(3.58, 1.48)
(5.3, 1.46)
(2.51, 1.45)

Mean and Std for Comparison:
(3.8, 1.73)
(3.91, 1.6)
(5.55, 1.58)
(2.79, 1.7)

Mean and Std for both:
(3.71, 1.66)
(3.75, 1.55)
(5.43, 1.53)
(2.65, 1.59)


In [25]:
#Getting ANOVA and Tukey 

In [26]:
index=range(1,41)
namena=pd.Series(["tlna","mcna","randna","realna"])
namena=namena.repeat(10)
namena.index = index

meanna=natural.mean()

combnat=pd.concat([namena,meanna ], axis=1,ignore_index=True)

combnat.columns = ['Method', 'Metascore']

In [27]:
index=range(1,41)
namecom=pd.Series(["tlcom","mccom","randcom","realcom"])
namecom=namecom.repeat(10)
namecom.index = index

meancom=compare.mean()

combcom=pd.concat([namecom,meancom], axis=1,ignore_index=True)
combcom.columns=["Method","Metascore"]

In [28]:
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [29]:
lm=ols("Metascore~Method",data=combnat).fit()
table=sm.stats.anova_lm(lm)
print(table)

lm=ols("Metascore~Method",data=combcom).fit()
table=sm.stats.anova_lm(lm)
print(table)

            df     sum_sq    mean_sq          F        PR(>F)
Method     3.0  39.957688  13.319229  68.092768  6.551074e-15
Residual  36.0   7.041750   0.195604        NaN           NaN
            df    sum_sq    mean_sq          F        PR(>F)
Method     3.0  39.04025  13.013417  66.277569  9.878368e-15
Residual  36.0   7.06850   0.196347        NaN           NaN


In [50]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [53]:
tukeynat = pairwise_tukeyhsd(endog=combnat['Metascore'],
                          groups=combnat['Method'],
                          alpha=0.05)

tukeycom = pairwise_tukeyhsd(endog=combcom['Metascore'],
                          groups=combcom['Method'],
                          alpha=0.05)

In [56]:
print(tukeynat)
print()
print(tukeycom)

Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj  lower   upper  reject
---------------------------------------------------
  mcna randna     1.72 0.001  1.1873  2.2527   True
  mcna realna   -1.075 0.001 -1.6077 -0.5423   True
  mcna   tlna     0.05   0.9 -0.4827  0.5827  False
randna realna   -2.795 0.001 -3.3277 -2.2623   True
randna   tlna    -1.67 0.001 -2.2027 -1.1373   True
realna   tlna    1.125 0.001  0.5923  1.6577   True
---------------------------------------------------

 Multiple Comparison of Means - Tukey HSD, FWER=0.05 
 group1  group2 meandiff p-adj  lower   upper  reject
-----------------------------------------------------
  mccom randcom     1.64 0.001  1.1063  2.1737   True
  mccom realcom   -1.115 0.001 -1.6487 -0.5813   True
  mccom   tlcom   -0.115   0.9 -0.6487  0.4187  False
randcom realcom   -2.755 0.001 -3.2887 -2.2213   True
randcom   tlcom   -1.755 0.001 -2.2887 -1.2213   True
realcom   tlcom      1.0 0.001  0.4663  1.5337 

In [30]:
#Spearmann Correlation between Naturalness and Written by a human

In [31]:
from scipy import stats
realcorr=stats.spearmanr(realnat.stack(),realcom.stack())
print(realcorr)
randcorr=stats.spearmanr(randnat.stack(),randcom.stack())
print(randcorr)

#The Spearman Correlation results suggest a statistically significant correlation between a sentence that is percieved as natural 
#and perceived as written by a human.

SpearmanrResult(correlation=0.5484845330623838, pvalue=4.191372566150168e-17)
SpearmanrResult(correlation=0.6074746058123445, pvalue=1.4691614334082227e-21)


In [32]:
#T-Test for all conditions

In [33]:
from scipy import stats

In [34]:
def t_test(a,b):
        stat,pvalue=stats.ttest_ind(a,b)
        return stat,pvalue

1. tl-tl
2. tl-mc
3. tl-real
4. tl-rand

5. mc-tl
6. mc-mc
7. mc-real
8. mc-rand

9. real-tl
10. real-mc
11. real-real
12. real-rand

13. rand-tl
14. rand-mc
15. rand-real
16. rand-rand

In [35]:
counter=0
for framea in allsorted:
    
    for frameb in allsorted:
        counter= counter+1
        stat,pvalue=t_test(framea.stack(),frameb.stack())
        print(counter,stat,pvalue)
        print()
        if pvalue>0.05 and pvalue!=1.0:
            print("Not statistically significant")
            print()



1 0.0 1.0

2 -0.28613490532648755 0.7748490612220023

Not statistically significant

3 -15.179186531955269 6.5912361326893736e-46

4 9.253394974720063 1.9537843447779377e-19

5 0.28613490532648755 0.7748490612220023

Not statistically significant

6 0.0 1.0

7 -15.432440347506489 3.251594991291461e-47

8 9.870479755293582 9.276245938327111e-22

9 15.179186531955269 6.5912361326893736e-46

10 15.432440347506489 3.251594991291461e-47

11 0.0 1.0

12 25.19184803560781 1.69315421167061e-103

13 -9.253394974720063 1.9537843447779377e-19

14 -9.870479755293582 9.276245938327111e-22

15 -25.19184803560781 1.69315421167061e-103

16 0.0 1.0



In [36]:
#This part is still unfinished

In [37]:
noknow=compl[compl['No prior experience with machine generated text (1) - Frequent interaction with machine generated text (7)']<4.0]
moreknow=compl[compl['No prior experience with machine generated text (1) - Frequent interaction with machine generated text (7)']>4.0]

In [38]:
natural_noknow=frame_natural(noknow)
compare_noknow=frame_comparison(noknow)

natural_moreknow=frame_natural(moreknow)
compare_moreknow=frame_comparison(moreknow)




In [39]:
print(natural_noknow)
print(natural)

    1    2   3   4   5   6   7   8   9   10  ...  31  32  33  34  35   36  37  \
2    3  5.0   3   3   5   4   3   4   2   4  ...   3   4   3   4   3  4.0   3   
5    5  5.0   5   3   3   4   4   4   3   5  ...   4   4   5   5   4  4.0   3   
8    1  2.0   4   2   2   1   1   2   1   1  ...   2   1   1   1   1  1.0   1   
11   2  5.0   5   1   7   4   6   4   6   4  ...   4   7   4   4   6  4.0   5   
12   3  4.0   4   3   4   3   3   4   3   3  ...   3   4   3   4   4  3.0   4   
13   4  5.0   2   4   2   2   2   5   1   2  ...   2   3   3   1   4  2.0   4   
18   7  7.0   7   7   7   7   2   7   7   7  ...   1   1   1   1   1  2.0   2   
19   6  3.0   5   5   6   3   4   3   2   3  ...   2   4   1   2   6  3.0   5   
21   4  6.0   3   7   4   1   4   6   5   4  ...   1   2   1   2   1  1.0   1   

    38  39  40  
2    3   4   3  
5    4   4   5  
8    1   1   2  
11   4   7   3  
12   4   4   4  
13   3   1   6  
18   1   1   1  
19   2   5   1  
21   1   2   2  

[9 rows x 40 colum

In [40]:
print("There are",natural_noknow.isna().sum().sum(),"questions with NAN in the natural no machine knowledge set")
print("There are",compare_noknow.isna().sum().sum(),"questions with NAN in the comparison no machine knowledge set")
print("There are",natural_moreknow.isna().sum().sum(),"questions with NAN in the natural with machine knowledge set")
print("There are",compare_moreknow.isna().sum().sum(),"questions with NAN in the comparison with machine knowledge set")

There are 1 questions with NAN in the natural no machine knowledge set
There are 5 questions with NAN in the comparison no machine knowledge set
There are 0 questions with NAN in the natural with machine knowledge set
There are 0 questions with NAN in the comparison with machine knowledge set


In [41]:
#Some participants skipped questions for easier evaluation the mean of the other participants for this question is added
natural_noknow=natural_noknow.apply(lambda x: x.fillna(int(x.mean())),axis=0)
compare_noknow=compare_noknow.apply(lambda x: x.fillna(int(x.mean())),axis=0)
natural_moreknow=natural_moreknow.apply(lambda x: x.fillna(int(x.mean())),axis=0)
compare_moreknow=compare_moreknow.apply(lambda x: x.fillna(int(x.mean())),axis=0)

In [42]:
t,_,_,_,allnoknow_nat=split_models(natural_noknow)
_,_,_,_,allnoknow_com=split_models(compare_noknow)

s,_,_,_,allmoreknow_nat=split_models(natural_moreknow)
_,_,_,_,allmoreknow_com=split_models(compare_moreknow)

alllist=[allnoknow_nat,allnoknow_com,allmoreknow_nat,allmoreknow_com]

In [43]:
#print(allnoknow_nat)
print(t)
print(s)

    1    2   3   4   5   6   7   8   9   10
2    3  5.0   3   3   5   4   3   4   2   4
5    5  5.0   5   3   3   4   4   4   3   5
8    1  2.0   4   2   2   1   1   2   1   1
11   2  5.0   5   1   7   4   6   4   6   4
12   3  4.0   4   3   4   3   3   4   3   3
13   4  5.0   2   4   2   2   2   5   1   2
18   7  7.0   7   7   7   7   2   7   7   7
19   6  3.0   5   5   6   3   4   3   2   3
21   4  6.0   3   7   4   1   4   6   5   4
    1    2   3   4   5   6   7   8   9   10
0    6  5.0   2   3   4   2   3   3   3   4
4    5  6.0   5   4   6   5   2   5   5   6
9    6  3.0   3   2   6   6   2   6   2   3
10   5  1.0   5   5   3   4   2   5   5   2
14   2  3.0   2   3   2   3   2   3   5   5
20   2  2.0   3   3   2   5   5   4   5   3
22   3  3.0   3   4   2   4   2   2   3   4
23   5  4.0   5   3   6   5   2   2   4   5


In [44]:

for lists in alllist:
    print("new list")
    for frame in lists:
        print("new frame")
        print(mean_and_std(frame))

new list
new frame
(3.89, 1.76)
new frame
(3.54, 1.47)
new frame
(5.03, 1.53)
new frame
(2.88, 1.59)
new list
new frame
(3.71, 1.74)
new frame
(3.7, 1.52)
new frame
(5.2, 1.58)
new frame
(3.01, 1.6)
new list
new frame
(3.69, 1.41)
new frame
(4.03, 1.41)
new frame
(5.53, 1.36)
new frame
(2.84, 1.5)
new list
new frame
(4.17, 1.64)
new frame
(4.41, 1.52)
new frame
(5.51, 1.75)
new frame
(3.19, 1.89)


In [45]:
for frame in allnoknow_com:
    print(mean_and_std(frame))

(3.71, 1.74)
(3.7, 1.52)
(5.2, 1.58)
(3.01, 1.6)


In [46]:
for frame in allmoreknow_com:
    print(mean_and_std(frame))

(4.17, 1.64)
(4.41, 1.52)
(5.51, 1.75)
(3.19, 1.89)


In [47]:
 for a,b in zip(allnoknow_nat,allmoreknow_nat):
        print(t_test(a.stack(),b.stack()))

(0.8153589682677646, 0.4160212972508185)
(-2.1658939187904447, 0.03172904374661525)
(-2.201632305546412, 0.029054046506510004)
(0.1693385202613752, 0.8657341044401139)


In [48]:
 for a,b in zip(allnoknow_com,allmoreknow_com):
        print(t_test(a.stack(),b.stack()))

(-1.7785671681391513, 0.07711926893556227)
(-3.0500333791154803, 0.002659498376250776)
(-1.2232245519927765, 0.22295791077108487)
(-0.6580854064379996, 0.5113840158381089)
