# sample generator for embedded questions
This notebook is for creating the random sample .txt files for the experiment.

In [93]:
import pandas as pd
import numpy as np

 # Contents
 1. [Constrain the data set to stimuli set](#Constrain-the-dataset-to-stimuli-set)
 2. [Figuring out the distribution of factors per list](#Figuring-out-the-distribution-of-factors-per-list)
 3. [Figure out how to collapse the matrix verb columns](#Figure-out-how-to-collapse-the-matrix-verb-columns)
 4. [Add in the paraphrases](#Add-in-the-paraphrases)
 5. [Controls](#Controls)
 6. [Balancing factors](#Balancing-factors)
     1. [Modal Balancing](#Modal-Balancing)
     2. [Wh Balancing](#Wh-Balancing)
         1. [Who](#Who)
         2. [What](#What)
         3. [Where](#Where)
         4. [When](#When)
         5. [How](#How)
         6. [Why](#Why)
 7. [Generating-random-samples](#Generating-random-samples)
     1. [First Iteration](#First-Iteration)
     2. [Second Iteration](#Second-Itreation)
     3. [Third Iteration](#Third-Iteration)
     4. [Fourth Iteration](#Fourth-Iteration)
     5. [Fifth Iteration](#Fifth-Iteration)
     6. [Sixth Iteration](#Sixth-Iteration)
     7. [Final Set](#Final-Set)
 8. [Pilot Samples](#Pilot-Samples)

In [94]:
# import the database file from the TGrep2 searching
df = pd.read_csv("../results/swbd.tab", sep='\t', engine='python')

In [95]:
# This makes the display show more info
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [340]:
df.pivot_table(index=['QuestionType'], values="Question", aggfunc=len).groupby(["QuestionType"]).Question.transform(lambda x: x/len(df)).reset_index()

Unnamed: 0,QuestionType,Question
0,adjunct,0.075988
1,cleft,0.063732
2,embadjunct,0.237082
3,embedded,0.165899
4,exclamation,0.003334
5,fragment,0.012354
6,relative,0.135111
7,root,0.168546
8,subject,0.137857


# Constrain the dataset to stimuli set
for experimental mock-up

First we have to remove the questions that we don;t want to include:
1. embedded questions only
2. no degree questions
3. no identity questions
4. generally only monomorphemic wh-phrases
5. only who-, what-, where-, when-, how-, and why-questions

In [108]:
critical = df[(df['QuestionType'] == 'embedded') # only root questions
              & 
              (df['DegreeQ'] == 'no' ) # no degree questions
              &
              (df['IdentityQ'] == "no") # no identity questions
              &
              (df['WhPhaseType'] == "monomorphemic") # no identity questions
              &
              (df['Wh'].isin(['how','How','where','Where','who','Who','what','What','why','Why','when','When']))] # just these wh-words]

In [109]:
len(critical)

1073

### Figuring out how many lists

1073/30 = 35.76

35 lists x 30 = 1050

1073-1050 = 23

35 lists of 30, 1 list of 23


### Number of participants
36 lists x 30 participants per list = 1080 participants

# Figuring out the distribution of factors per list

In [110]:
critical.pivot_table(index=['ModalPresent'], values="Question", aggfunc=len).groupby(["ModalPresent"]).Question.transform(lambda x: x/len(critical)).reset_index()

Unnamed: 0,ModalPresent,Question
0,no,0.846226
1,yes,0.153774


In [111]:
critical.pivot_table(index=['Wh'], values="Question", aggfunc=len).groupby(["Wh"]).Question.transform(lambda x: x/len(critical)).reset_index()

Unnamed: 0,Wh,Question
0,how,0.269338
1,what,0.416589
2,when,0.025163
3,where,0.158434
4,who,0.053122
5,why,0.077353


In [112]:
critical.pivot_table(index=['Wh','ModalPresent'], values="Question", aggfunc=len).groupby(["Wh"]).Question.transform(lambda x: x/len(critical)*100).reset_index()

Unnamed: 0,Wh,ModalPresent,Question
0,how,no,23.112768
1,how,yes,3.821062
2,what,no,36.346692
3,what,yes,5.312209
4,when,no,1.95713
5,when,yes,0.55918
6,where,no,11.835974
7,where,yes,4.007456
8,who,no,5.125815
9,who,yes,0.186393


# Figure out how to collapse the matrix verb columns

In [113]:
critical = critical.assign(Matrix = critical.MatrixPredVerb.astype(str) + ' ' + \
                critical.MatrixPredOther.astype(str) + ' ' +\
  critical.MatrixPredParticle.astype(str))

In [114]:
df['ColumnA'] = df[df.columns[1:]].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1)

In [76]:
critical.columns

Index(['Item_ID', 'Sentence', 'HaveNeedTo', 'Finite', 'ModalPresent',
       'QuestionType', 'DegreeQ', 'SubjectAuxInv', 'WhAll', 'MatrixNegPresent',
       'EmbeddedNegPresent', 'SbarNomPresent', 'QuantifiedSubject',
       'QuantifiedPredicate', 'Wh', 'MatrixNegation', 'InvertedAuxVerb',
       'MatrixPredAux', 'MatrixPredVerb', 'MatrixPredOther',
       'MatrixPredParticle', 'MatrixPred2', 'Modal', 'EmbeddedNegation',
       'Verb1', 'Verb2', 'Verb3', 'DeterminerSubject', 'DeterminerNonSubject',
       'FullWhPhrase', 'JustMatrixClause', 'DeterminerSubjPresent',
       'DeterminerNonSubjPresent', 'WhNode', 'WhParse', 'Question',
       'SentenceParse', 'WhPhaseType', 'IdentityQ', 'Matrix'],
      dtype='object')

In [77]:
critical.Matrix

8                 know nan nan
28                know nan nan
58                know nan nan
65              wonder nan nan
66          understand nan nan
69                know nan nan
70              wonder nan nan
77              nan nan passed
111               know nan nan
114               hear nan nan
143               know nan nan
167               knew nan nan
180              nan nan dress
186               tell nan nan
192          wondering nan nan
222               know nan nan
233                nan nan nan
252            nan nan depends
260                 is nan nan
263               know nan nan
267          wondering nan nan
273               know nan nan
281                nan nan nan
290               says nan nan
295               said nan nan
298               know nan nan
325               like nan nan
327                 go nan nan
334            analyze nan nan
336              going nan nan
373               know nan nan
376            explain nan nan
377     

In [47]:
def verb_label(df):
    if df["MatrixPredVerb"] != "NaN":
        df["Matrix"] = df.loc(["MatrixPredVerb"])
    elif df["MatrixPredOther"] != "NaN":
        df["Matrix"] = df["MatrixPredOther"]
    elif df["MatrixPredParticle"] != "NaN":
        df["Matrix"] = df["MatrixPredParticle"]

In [49]:
critical["Matrix"] == ""

8        False
28       False
58       False
65       False
66       False
69       False
70       False
77       False
111      False
114      False
143      False
167      False
180      False
186      False
192      False
222      False
233      False
252      False
260      False
263      False
267      False
273      False
281      False
290      False
295      False
298      False
325      False
327      False
334      False
336      False
373      False
376      False
377      False
379      False
392      False
397      False
407      False
420      False
429      False
445      False
448      False
449      False
451      False
452      False
462      False
463      False
464      False
476      False
485      False
491      False
520      False
523      False
530      False
535      False
545      False
564      False
572      False
577      False
581      False
583      False
594      False
602      False
659      False
660      False
661      False
663      False
689      F

In [53]:
critical["Matrix"] = critical["MatrixPredVerb"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  critical["Matrix"] = critical["MatrixPredVerb"]


In [52]:
critical.apply(lambda x: verb_label(x))

KeyError: 'MatrixPredVerb'

In [21]:
critical.columns

Index(['Item_ID', 'Sentence', 'HaveNeedTo', 'Finite', 'ModalPresent',
       'QuestionType', 'DegreeQ', 'SubjectAuxInv', 'WhAll', 'MatrixNegPresent',
       'EmbeddedNegPresent', 'SbarNomPresent', 'QuantifiedSubject',
       'QuantifiedPredicate', 'Wh', 'MatrixNegation', 'InvertedAuxVerb',
       'MatrixPredAux', 'MatrixPredVerb', 'MatrixPredOther',
       'MatrixPredParticle', 'MatrixPred2', 'Modal', 'EmbeddedNegation',
       'Verb1', 'Verb2', 'Verb3', 'DeterminerSubject', 'DeterminerNonSubject',
       'FullWhPhrase', 'JustMatrixClause', 'DeterminerSubjPresent',
       'DeterminerNonSubjPresent', 'WhNode', 'WhParse', 'Question',
       'SentenceParse', 'WhPhaseType', 'IdentityQ'],
      dtype='object')

# Add in the paraphrases
this should take as input the entire constrained dataframe from the above section, and then generate the paraphrases

For Who questions: Who is a person...? / Who is some person...? / "Who is every person..." / "Who is the person..."

In [115]:
# read in df with contexts
cntxts = pd.read_csv("swbd_contexts.csv")

In [116]:
cntxts = cntxts.drop(columns="FollowingContext")

In [117]:
# get the indixes from critical
crit_index = critical.Item_ID

In [136]:
critical.columns

Index(['Item_ID', 'Sentence', 'HaveNeedTo', 'Finite', 'ModalPresent',
       'QuestionType', 'DegreeQ', 'SubjectAuxInv', 'WhAll', 'WhAdvP',
       'MatrixNegPresent', 'EmbeddedNegPresent', 'SbarNomPresent',
       'QuantifiedSubject', 'QuantifiedPredicate', 'Wh', 'MatrixSubject',
       'MatrixNegation', 'InvertedAuxVerb', 'MatrixPredAux', 'MatrixPredVerb',
       'MatrixPredOther', 'MatrixPredParticle', 'MatrixPred2', 'Modal',
       'EmbeddedSubject', 'EmbeddedNegation', 'Verb1', 'Verb2', 'Verb3',
       'DeterminerSubject', 'DeterminerNonSubject', 'FullWhPhrase',
       'JustMatrixClause', 'DeterminerSubjPresent', 'DeterminerNonSubjPresent',
       'WhNode', 'WhParse', 'Question', 'SentenceParse', 'WhPhaseType',
       'IdentityQ', 'Matrix'],
      dtype='object')

### Merge back in Wh and ModalPresent colums

In [137]:
df_WhMod = critical[["Item_ID","Wh","ModalPresent","Question"]].rename(columns={"Item_ID": "TGrepID"})

In [119]:
# subset to the items that are just the ones filtered in the previos section

# otherwise, if using the database file with contexts directly in there, then this step
# is not necessary
df_valid = cntxts[cntxts["TGrepID"].isin(set(crit_index))]

In [120]:
len(df_valid)

1073

In [121]:
# Merge
df_valid = df_valid.merge(df_WhMod, how = 'inner', indicator=False)

In [122]:
len(df_valid)

1073

In [139]:
df_valid.columns

Index(['TGrepID', 'EntireSentence', 'PreceedingContext', 'Wh', 'ModalPresent',
       'Question'],
      dtype='object')

In [123]:
df_valid.pivot_table(index=['Wh'], values="EntireSentence", aggfunc=len).groupby(["Wh"]).EntireSentence.transform(lambda x: x/len(df_valid)).reset_index()

Unnamed: 0,Wh,EntireSentence
0,how,0.269338
1,what,0.416589
2,when,0.025163
3,where,0.158434
4,who,0.053122
5,why,0.077353


In [140]:
who = df_valid[df_valid["Wh"] == "who"]
where = df_valid[df_valid["Wh"] == "where"]
how = df_valid[df_valid["Wh"] == "how"]
when = df_valid[df_valid["Wh"] == "when"]
why = df_valid[df_valid["Wh"] == "why"]
what = df_valid[df_valid["Wh"] == "what"]

In [141]:
who["AResponse"] = "...who is a person...?"
# who["SomeResponse"] = "Who is some person...?"
who["AllResponse"] = "...who is every person...?"
who["TheResponse"] = "...who is the person...?"


where["AResponse"] = "...what is a place...?"
# where["SomeResponse"] = "What is some place...?"
where["AllResponse"] = "...what is every place...?"
where["TheResponse"] = "...what is the place...?"


how["AResponse"] = "...what is a way...?"
# how["SomeResponse"] = "What is some way...?"
how["AllResponse"] = "...what is every way...?"
how["TheResponse"] = "...what is the way...?"

when["AResponse"] = "...what is a time...?"
# when["SomeResponse"] = "What is some time...?"
when["AllResponse"] = "...what is every time...?"
when["TheResponse"] = "...what is the time...?"


why["AResponse"] = "...what is a reason...?"
# why["SomeResponse"] = "What is some reason...?"
why["AllResponse"] = "...what is every reason...?"
why["TheResponse"] = "...what is the reason...?"


what["AResponse"] = "...what is a thing...?"
# what["SomeResponse"] = "What is some thing...?"
what["AllResponse"] = "...what is every thing...?"
what["TheResponse"] = "...what is the thing...?"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  who["AResponse"] = "...who is a person...?"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  who["AllResponse"] = "...who is every person...?"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  who["TheResponse"] = "...who is the person...?"
A value is trying to be set on a copy of a slice from a DataFra

In [142]:
df_final = pd.concat([who,where,how,why,when,what])

In [143]:
len(df_final)

1073

# Split EntireSentence on Question 
This is necessary because we need to bold the question

In [158]:
# split EntireSentence
df_final["Matrix"] = df_final.apply(lambda x: x['EntireSentence'].replace(x['Question'],"").strip(),axis=1)

In [163]:
# split the punctuation from Matrix and add it to Question
df_final["Matrix2"] = df_final.apply(lambda x: x['Matrix'].replace(r"\.|\?").strip(), axis=1)

TypeError: replace expected at least 2 arguments, got 1

In [160]:
df_final.Matrix

1                                                                                                                                                                                                                                                                                                                                                                                                                                                      do you know ?
10                                                                                                                                                                                                                                                                                                                                                                                                                                           and he, sho-, he knew ,
39                                                                                            

Unnamed: 0,TGrepID,EntireSentence,PreceedingContext,Wh,ModalPresent,Question,AResponse,AllResponse,TheResponse,Matrix
1,770:13,"do you know who the guy *ich*-3 was *t*-2 that *t*-1 was playing the, uh, the, the wagon driver?","###but i, i don't think 0 he really knew 0 it was going *-1 to be as big as it was *?*.###i think 0 it was something that he really wanted *-2 to do *t*-1.###he wanted *-1 to direct it,###he wanted *-1 to, to star in it.###you know, he, he enjoyed the story line###and i think 0 he just really w-, he really wanted it.###and whether it, whether it won all kinds of awards or whether it just was okay at the box office, i think 0 he would have been happy.###because i think that, i think 0 he did a good job### and, and the self-satisfaction 0 he got *t*-2 out of it is much greater than any awards that they can give *t*-1 him.###speakera51.",who,no,"who the guy *ich*-3 was *t*-2 that *t*-1 was playing the, uh, the, the wagon driver",...who is a person...?,...who is every person...?,...who is the person...?,do you know ?
10,2512:34,"and he, sho-, he knew who *t*-1 worked there,","###speakera17.###uh-huh.###speakerb18.###we wore shorts.###we didn't know 0 that was a big no, no.###speakera19.###oh no.###speakerb20.###and the supervisor from another area came up to us###and, you know, this area where record retention is *t*-1, there's only a few people that *t*-2 work there,",who,no,who *t*-1 worked there,...who is a person...?,...who is every person...?,...who is the person...?,"and he, sho-, he knew ,"
39,7535:30,"and, uh, it *exp*-4 doesn't matter who they put *t*-3 in office, they're going *-1 to fall subject to, uh, the pressure that the lobbyists can put *t*-2,","###but, uh,###speakera30.###yeah,###that's true that the different places in america that, uh, you know, different issues would be a lot more important than say in another place.###speakerb31.###yeah,###i can understand why, why some of the, the rural areas, the voter turnout isn't as much because it does seem sometimes like the lobbyists in d c are like controlling things for the, the midwest *t*-1,###speakera32.###uh-huh.###speakerb33.",who,no,who they put *t*-3 in office,...who is a person...?,...who is every person...?,...who is the person...?,"and, uh, it *exp*-4 doesn't matter , they're going *-1 to fall subject to, uh, the pressure that the lobbyists can put *t*-2,"
43,7772:19,i don't even know who *t*-1 won the pennant last year.,"###and a lot more games, right?###speakerb14.###yeah,###that's exactly right.###speakera15.###uh-huh.###speakerb16.###but i think,###* let 0's see,###the teams that *t*-1 were there last year were, see, somebody from california,",who,no,who *t*-1 won the pennant last year,...who is a person...?,...who is every person...?,...who is the person...?,i don't even know .
45,7981:19,"you don't even know who *-1 to payoff *t*-2, huh?.","###speakerb90.###yeah,###speakera91.###most of the time.###speakerb92.###but the politics, the politics gets worse in the small towns sometimes.###speakera93.###oh man, in dallas you don't even know who *t*-1's in, in administration,###there's so many of them.###speakerb94.",who,no,who *-1 to payoff *t*-2,...who is a person...?,...who is every person...?,...who is the person...?,"you don't even know , huh?."


# Controls

In [147]:
controls = pd.read_csv("../../experiments/clean_corpus/controls.csv")

In [148]:
# Add columns to make merging datasets easier
controls["Wh"] = "none"
controls["ModalPresent"] = "no"
controls["Question"] = controls["EntireSentence"]

In [149]:
controls = controls[["TGrepID","EntireSentence","PreceedingContext","Question","Wh","ModalPresent","AResponse","AllResponse","TheResponse"]]

# Balancing factors

In [327]:
len(df_final)/32

31.09375

In [141]:
# 32 lists of 30, 1 list of 35
30*32 +35

995

## Modal Balancing
- Lists 1-29: 4
    - List 1-15: 4
    - List 16-29: 4
- List 30-32: 1
- List 33: 2

In [144]:
df_final.groupby(["ModalPresent"])["ModalPresent"].count()

ModalPresent
no     874
yes    121
Name: ModalPresent, dtype: int64

Modals

In [145]:
121/33

3.6666666666666665

In [161]:
# modal leftovers
121 - 4*29

5

## Wh Balancing

In [306]:
df_final.groupby(["Wh"])["Wh"].count()

Wh
how      184
what     585
when      15
where     97
who       54
why       60
Name: Wh, dtype: int64

### When
- List 1-15: 1

### Who
- List 1-15: 2
- List 15-21: 2
- List 22-33: 1

In [189]:
54/33

1.6363636363636365

In [197]:
21*2 + 12

54

### Why
- List 1-15: 2
- List 16-21: 2
- List 22-27: 2
- List 28-33: 1

In [199]:
60/33

1.8181818181818181

In [205]:
27*2 + 6

60

### Where
- List 1-15: 3
- List 16-31: 3
- List 32-33: 2

In [208]:
97/33

2.9393939393939394

In [212]:
31*3 + 2*2

97

### How
- List 1-15: 6
- List 16-27: 6
- List 28-31: 4
- List 32-33: 3

In [264]:
27*6 + 4*4 + 2*3

184

### What
- Lists 1-15: 16
- List 16-29: 18**
- Lists 30-32: 16**
- List 33: 15**

In [328]:
29*18+3*16+1*15

585

In [330]:
15*16 + 6*17 + 6*18 + 2*21 + 2*21 + 1*23 + 1*28

557

In [331]:
585-557

28

Lists 1-15 (30):
    1 when
    2 who
    2 why
    3 where
    6 how
    16 what

List 16-21 (30):
    0 when
    2 who
    2 why
    3 where
    6 how
    17 what


List 22-27 (30):
    0 when
    1 who
    2 why
    3 where
    6 how
    18 what

List 28-29 (30):
    0 when
    1 who
    1 why
    3 where
    4 how
    21 what

List 30-31 (30):
    0 when
    1 who
    1 why
    3 where
    4 how
    21 what

List 32 (30):
    0 when
    1 who
    1 why
    2 where
    3 how
    23 what

List 33 (35):
    1 who
    1 why
    2 where
    3 how
    28 what


In [398]:
df_final = pd.concat([who,where,how,why,when,what])

# Generating random samples

## First Iteration

In [399]:
for n in range(1,16):
    mod_sample = df_final[df_final["ModalPresent"] == "yes"].sample(4)

    i = len(mod_sample[mod_sample["Wh"] == "why"])
    j = len(mod_sample[mod_sample["Wh"] == "when"])
    k = len(mod_sample[mod_sample["Wh"] == "what"])
    l = len(mod_sample[mod_sample["Wh"] == "how"])
    m = len(mod_sample[mod_sample["Wh"] == "who"])
    o = len(mod_sample[mod_sample["Wh"] == "where"])
    df_final = df_final.drop(mod_sample.index)

    when_sample = df_final[df_final["Wh"] == "when"].sample(1-j)
    df_final = df_final.drop(when_sample.index)  

    who_sample = df_final[
        (df_final["Wh"] == "who") &
        (df_final["ModalPresent"] == "no")
    ].sample(2-m)
    df_final = df_final.drop(who_sample.index)    
    
    why_sample = df_final[
        (df_final["Wh"] == "why") &
        (df_final["ModalPresent"] == "no")
    ].sample(2-i)
    df_final = df_final.drop(why_sample.index)
    
    where_sample = df_final[
        (df_final["Wh"] == "where") &
        (df_final["ModalPresent"] == "no")
    ].sample(3-o)
    df_final = df_final.drop(where_sample.index)

    how_sample = df_final[
        (df_final["Wh"] == "how") &
        (df_final["ModalPresent"] == "no")
    ].sample(6-l)
    df_final = df_final.drop(how_sample.index)    
    
    what_sample = df_final[
        (df_final["Wh"] == "what") &
        (df_final["ModalPresent"] == "no")
    ].sample(16-k)
    df_final = df_final.drop(what_sample.index)
    
    total = pd.concat([mod_sample,why_sample,when_sample,what_sample,how_sample,who_sample,where_sample,controls])

    # save to file
    filename = f"../../experiments/clean_corpus/03_experiment/corpus_{n}.txt".format(n=n)
    total.to_csv(filename,header=True,sep="\t",index=False)

38

In [400]:
len(df_final)

545

In [396]:
df_final.groupby(["Wh"])["Wh"].count()

Wh
how       94
what     345
where     52
who       24
why       30
Name: Wh, dtype: int64

who
9/2 = 4.5

who
9/2 = 3.5

how
49/2 = 24.5

In [533]:
df_final.groupby(["ModalPresent"])["ModalPresent"].count()

ModalPresent
no     52
yes    13
Name: ModalPresent, dtype: int64

In [506]:
13/2

6.5

## Second iteration

In [401]:
for n in range(16,22):
    mod_sample = df_final[df_final["ModalPresent"] == "yes"].sample(4)

    i = len(mod_sample[mod_sample["Wh"] == "why"])
    k = len(mod_sample[mod_sample["Wh"] == "what"])
    l = len(mod_sample[mod_sample["Wh"] == "how"])
    m = len(mod_sample[mod_sample["Wh"] == "who"])
    o = len(mod_sample[mod_sample["Wh"] == "where"])
    df_final = df_final.drop(mod_sample.index)

    who_sample = df_final[
        (df_final["Wh"] == "who") &
        (df_final["ModalPresent"] == "no")
    ].sample(2-m)
    df_final = df_final.drop(who_sample.index)    
    
    why_sample = df_final[
        (df_final["Wh"] == "why") &
        (df_final["ModalPresent"] == "no")
    ].sample(2-i)
    df_final = df_final.drop(why_sample.index)
    
    where_sample = df_final[
        (df_final["Wh"] == "where") &
        (df_final["ModalPresent"] == "no")
    ].sample(3-o)
    df_final = df_final.drop(where_sample.index)

    how_sample = df_final[
        (df_final["Wh"] == "how") &
        (df_final["ModalPresent"] == "no")
    ].sample(6-l)
    df_final = df_final.drop(how_sample.index)    
    
    what_sample = df_final[
        (df_final["Wh"] == "what") &
        (df_final["ModalPresent"] == "no")
    ].sample(17-k)
    df_final = df_final.drop(what_sample.index)
    
    total = pd.concat([mod_sample,why_sample,what_sample,how_sample,who_sample,where_sample,controls])

    # save to file
    filename = f"../../experiments/clean_corpus/03_experiment/corpus_{n}.txt".format(n=n)
    total.to_csv(filename,header=True,sep="\t",index=False)

In [29]:
23+4+3

30

In [402]:
len(df_final)

365

In [107]:
df_final.groupby(["ModalPresent"])["ModalPresent"].count()

ModalPresent
no     874
yes    121
Name: ModalPresent, dtype: int64

In [110]:
995/30

33.166666666666664

In [108]:
df_final.groupby(["Wh"])["Wh"].count()

Wh
how      184
what     585
when      15
where     97
who       54
why       60
Name: Wh, dtype: int64

## Third iteration

In [403]:
for n in range(22,28):
    mod_sample = df_final[df_final["ModalPresent"] == "yes"].sample(4)

    i = len(mod_sample[mod_sample["Wh"] == "why"])
    k = len(mod_sample[mod_sample["Wh"] == "what"])
    l = len(mod_sample[mod_sample["Wh"] == "how"])
    m = len(mod_sample[mod_sample["Wh"] == "who"])
    o = len(mod_sample[mod_sample["Wh"] == "where"])
    df_final = df_final.drop(mod_sample.index)

    who_sample = df_final[
        (df_final["Wh"] == "who") &
        (df_final["ModalPresent"] == "no")
    ].sample(1-m)
    df_final = df_final.drop(who_sample.index)    
    
    why_sample = df_final[
        (df_final["Wh"] == "why") &
        (df_final["ModalPresent"] == "no")
    ].sample(2-i)
    df_final = df_final.drop(why_sample.index)
    
    where_sample = df_final[
        (df_final["Wh"] == "where") &
        (df_final["ModalPresent"] == "no")
    ].sample(3-o)
    df_final = df_final.drop(where_sample.index)

    how_sample = df_final[
        (df_final["Wh"] == "how") &
        (df_final["ModalPresent"] == "no")
    ].sample(6-l)
    df_final = df_final.drop(how_sample.index)    
    
    what_sample = df_final[
        (df_final["Wh"] == "what") &
        (df_final["ModalPresent"] == "no")
    ].sample(18-k)
    df_final = df_final.drop(what_sample.index)
    
    total = pd.concat([mod_sample,why_sample,what_sample,how_sample,who_sample,where_sample,controls])

    # save to file
    filename = f"../../experiments/clean_corpus/03_experiment/corpus_{n}.txt".format(n=n)
    total.to_csv(filename,header=True,sep="\t",index=False)

In [None]:
len(d_final)

## Fourth Iteration

In [404]:
for n in range(28,30):
    mod_sample = df_final[df_final["ModalPresent"] == "yes"].sample(4)

    i = len(mod_sample[mod_sample["Wh"] == "why"])
    k = len(mod_sample[mod_sample["Wh"] == "what"])
    l = len(mod_sample[mod_sample["Wh"] == "how"])
    m = len(mod_sample[mod_sample["Wh"] == "who"])
    o = len(mod_sample[mod_sample["Wh"] == "where"])
    df_final = df_final.drop(mod_sample.index)

    who_sample = df_final[
        (df_final["Wh"] == "who") &
        (df_final["ModalPresent"] == "no")
    ].sample(1-m)
    df_final = df_final.drop(who_sample.index)    
    
    why_sample = df_final[
        (df_final["Wh"] == "why") &
        (df_final["ModalPresent"] == "no")
    ].sample(1-i)
    df_final = df_final.drop(why_sample.index)
    
    where_sample = df_final[
        (df_final["Wh"] == "where") &
        (df_final["ModalPresent"] == "no")
    ].sample(3-o)
    df_final = df_final.drop(where_sample.index)

    how_sample = df_final[
        (df_final["Wh"] == "how") &
        (df_final["ModalPresent"] == "no")
    ].sample(4-l)
    df_final = df_final.drop(how_sample.index)    
    
    what_sample = df_final[
        (df_final["Wh"] == "what") &
        (df_final["ModalPresent"] == "no")
    ].sample(21-k)
    df_final = df_final.drop(what_sample.index)
    
    total = pd.concat([mod_sample,why_sample,what_sample,how_sample,who_sample,where_sample,controls])

    # save to file
    filename = f"../../experiments/clean_corpus/03_experiment/corpus_{n}.txt".format(n=n)
    total.to_csv(filename,header=True,sep="\t",index=False)

In [None]:
len(d_final)

## Fifth Iteration

In [405]:
for n in range(30,32):
    mod_sample = df_final[df_final["ModalPresent"] == "yes"].sample(1)

    i = len(mod_sample[mod_sample["Wh"] == "why"])
    k = len(mod_sample[mod_sample["Wh"] == "what"])
    l = len(mod_sample[mod_sample["Wh"] == "how"])
    m = len(mod_sample[mod_sample["Wh"] == "who"])
    o = len(mod_sample[mod_sample["Wh"] == "where"])
    df_final = df_final.drop(mod_sample.index)

    who_sample = df_final[
        (df_final["Wh"] == "who") &
        (df_final["ModalPresent"] == "no")
    ].sample(1-m)
    df_final = df_final.drop(who_sample.index)    
    
    why_sample = df_final[
        (df_final["Wh"] == "why") &
        (df_final["ModalPresent"] == "no")
    ].sample(1-i)
    df_final = df_final.drop(why_sample.index)
    
    where_sample = df_final[
        (df_final["Wh"] == "where") &
        (df_final["ModalPresent"] == "no")
    ].sample(3-o)
    df_final = df_final.drop(where_sample.index)

    how_sample = df_final[
        (df_final["Wh"] == "how") &
        (df_final["ModalPresent"] == "no")
    ].sample(4-l)
    df_final = df_final.drop(how_sample.index)    
    
    what_sample = df_final[
        (df_final["Wh"] == "what") &
        (df_final["ModalPresent"] == "no")
    ].sample(21-k)
    df_final = df_final.drop(what_sample.index)
    
    total = pd.concat([mod_sample,why_sample,what_sample,how_sample,who_sample,where_sample,controls])

    # save to file
    filename = f"../../experiments/clean_corpus/03_experiment/corpus_{n}.txt".format(n=n)
    total.to_csv(filename,header=True,sep="\t",index=False)

In [407]:
len(df_final)

65

## Sixth Iteration

In [408]:
for n in range(32,33):
    mod_sample = df_final[df_final["ModalPresent"] == "yes"].sample(1)

    i = len(mod_sample[mod_sample["Wh"] == "why"])
    k = len(mod_sample[mod_sample["Wh"] == "what"])
    l = len(mod_sample[mod_sample["Wh"] == "how"])
    m = len(mod_sample[mod_sample["Wh"] == "who"])
    o = len(mod_sample[mod_sample["Wh"] == "where"])
    df_final = df_final.drop(mod_sample.index)

    who_sample = df_final[
        (df_final["Wh"] == "who") &
        (df_final["ModalPresent"] == "no")
    ].sample(1-m)
    df_final = df_final.drop(who_sample.index)    
    
    why_sample = df_final[
        (df_final["Wh"] == "why") &
        (df_final["ModalPresent"] == "no")
    ].sample(1-i)
    df_final = df_final.drop(why_sample.index)
    
    where_sample = df_final[
        (df_final["Wh"] == "where") &
        (df_final["ModalPresent"] == "no")
    ].sample(2-o)
    df_final = df_final.drop(where_sample.index)

    how_sample = df_final[
        (df_final["Wh"] == "how") &
        (df_final["ModalPresent"] == "no")
    ].sample(3-l)
    df_final = df_final.drop(how_sample.index)    
    
    what_sample = df_final[
        (df_final["Wh"] == "what") &
        (df_final["ModalPresent"] == "no")
    ].sample(23-k)
    df_final = df_final.drop(what_sample.index)
    
    total = pd.concat([mod_sample,why_sample,what_sample,how_sample,who_sample,where_sample,controls])

    # save to file
    filename = f"../../experiments/clean_corpus/03_experiment/corpus_{n}.txt".format(n=n)
    total.to_csv(filename,header=True,sep="\t",index=False)

In [410]:
len(df_final)

35

## Final Set

In [411]:
last = pd.concat([df_final,controls])

In [412]:
last.to_csv("../../experiments/clean_corpus/03_experiment/corpus_33.txt",header=True,sep="\t",index=False)

# Pilot Samples

In [40]:
df_final.pivot_table(index=['Wh'], values="EntireSentence", aggfunc=len).groupby(["Wh"]).EntireSentence.transform(lambda x: x/len(df_final)).reset_index()

Unnamed: 0,Wh,EntireSentence
0,how,0.256433
1,what,0.411713
2,when,0.023957
3,where,0.150843
4,which,0.032831
5,who,0.050577
6,why,0.073647


In [164]:
eq_pilot = df_final.sample(10,random_state=666)

In [166]:
eqp = pd.concat([eq_pilot,controls])

In [167]:
eqp.to_csv("../../experiments/clean_corpus/04_experiment/pilot.txt",header=True,sep="\t",index=False)

99.5

In [125]:
mod_sample = df_final[df_final["ModalPresent"] == "yes"].sample(1)

i = len(mod_sample[mod_sample["Wh"] == "why"])
j = len(mod_sample[mod_sample["Wh"] == "when"])
k = len(mod_sample[mod_sample["Wh"] == "what"])
l = len(mod_sample[mod_sample["Wh"] == "how"])
m = len(mod_sample[mod_sample["Wh"] == "who"])
n = len(mod_sample[mod_sample["Wh"] == "where"])
n = len(mod_sample[mod_sample["Wh"] == "which"])
df_final = df_final.drop(mod_sample.index)

why_sample = df_final[
    (df_final["Wh"] == "why") &
    (df_final["ModalPresent"] == "no")
                     ].sample(1-i)
df_final = df_final.drop(why_sample.index)

when_sample = df_final[
    (df_final["Wh"] == "when") &
    (df_final["ModalPresent"] == "no")].sample(1-j)
df_final = df_final.drop(when_sample.index)

what_sample = df_final[
    (df_final["Wh"] == "what") &
    (df_final["ModalPresent"] == "no")].sample(5-k)
df_final = df_final.drop(what_sample.index)

how_sample = df_final[
    (df_final["Wh"] == "how") &
    (df_final["ModalPresent"] == "no")
                     ].sample(1-l)
df_final = df_final.drop(how_sample.index)

who_sample = df_final[
    (df_final["Wh"] == "who") &
    (df_final["ModalPresent"] == "no")].sample(1-m)
df_final = df_final.drop(who_sample.index)

where_sample = df_final[
    (df_final["Wh"] == "where") &
    (df_final["ModalPresent"] == "no")].sample(1-n)
df_final = df_final.drop(where_sample.index)


In [128]:
total = pd.concat([mod_sample,why_sample,when_sample,what_sample,how_sample,who_sample,where_sample,controls])

# save to file


In [129]:
total.to_csv("../../experiments/clean_corpus/04_experiment/pilot.txt",header=True,sep="\t",index=False)