# sample generator for when, why, what
This notebook is for creating the random sample .txt files for the experiment.

In [1]:
import pandas as pd
import numpy as np

 ## Contents
 1. [Create separate file with contexts](#Create-separate-file-with-contexts)
 2. [Create separate file without contexts](#Create-separate-file-without-contexts)
 3. [Creating the files for the experiment](#Creating-files-for-the-experiment)
 4. [Automate Paraphrase Generator](#Automate-pharaphrase-generator)


In [3]:
# import the database file from the TGrep2 searching
df = pd.read_csv("../results/swbd.tab", sep='\t', engine='python')

In [4]:
# This makes the display show more info
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

# Creating the files for the experiment

## Constrain dataset
for experimental mock-up

First we have to remove the questions that we don;t want to include:
1. non-embedded or root questions
2. no degree questions
3. no identity questions
4. generally only monomorphemic wh-phrases
5. only what-, when-, and why-questions

In [7]:
# distribution before 

In [9]:
df.columns

Index(['Item_ID', 'Sentence', 'HaveNeedTo', 'Finite', 'ModalPresent',
       'QuestionType', 'DegreeQ', 'SubjectAuxInv', 'WhAll', 'MatrixNegPresent',
       'EmbeddedNegPresent', 'QuantifiedSubject', 'QuantifiedPredicate', 'Wh',
       'MatrixNegation', 'MatrixDoSupport', 'MatrixPred1', 'MatrixPred2',
       'MatVerbPart', 'Modal', 'EmbeddedNegation', 'Verb1', 'Verb2', 'Verb3',
       'DeterminerSubject', 'DeterminerNonSubject', 'FullWhPhrase',
       'DeterminerSubjPresent', 'DeterminerNonSubjPresent', 'WhNode',
       'WhParse', 'Question', 'SentenceParse', 'WhPhaseType', 'IdentityQ'],
      dtype='object')

In [5]:
df.pivot_table(index=['QuestionType'], values="Question", aggfunc=len).groupby(["QuestionType"]).Question.transform(lambda x: x/len(df)).reset_index()

Unnamed: 0,QuestionType,Question
0,adjunct,0.075988
1,cleft,0.063732
2,embadjunct,0.237082
3,embedded,0.165899
4,exclamation,0.003334
5,fragment,0.012354
6,relative,0.135111
7,root,0.168546
8,subject,0.137857


In [6]:
df.pivot_table(index=['QuestionType'], values="Question", aggfunc=len).groupby(["QuestionType"]).Question.transform(lambda x: x/len(df)).reset_index()

Unnamed: 0,QuestionType,Question
0,adjunct,0.075988
1,cleft,0.063732
2,embadjunct,0.237082
3,embedded,0.165899
4,exclamation,0.003334
5,fragment,0.012354
6,relative,0.135111
7,root,0.168546
8,subject,0.137857


In [7]:
critical = df[(df['QuestionType'] == 'root') # only root questions
              & 
              (df['DegreeQ'] == 'no' ) # no degree questions
              &
              (df['IdentityQ'] == "no") # no identity questions
              &
              (df['Wh'].isin(['what','What','why','Why','when','When'])) # just these wh-words
              &
              (df['WhPhaseType'] == 'monomorphemic') # monomorphic wh only (might get anything not go by degQ)
             ]

In [11]:
# Number of lists (30 items per list)
len(critical)/30

22.0

In [8]:
critical.pivot_table(index=['ModalPresent'], values="Question", aggfunc=len).groupby(["ModalPresent"]).Question.transform(lambda x: x/len(critical)).reset_index()

Unnamed: 0,ModalPresent,Question
0,no,0.877273
1,yes,0.122727


In [9]:
critical.pivot_table(index=['Wh'], values="Question", aggfunc=len).groupby(["Wh"]).Question.transform(lambda x: x/len(critical)).reset_index()

Unnamed: 0,Wh,Question
0,what,0.886364
1,when,0.022727
2,why,0.090909


In [12]:
critical.pivot_table(index=['Wh','ModalPresent'], values="Question", aggfunc=len).groupby(["Wh"]).Question.transform(lambda x: x/len(critical)*100).reset_index()

Unnamed: 0,Wh,ModalPresent,Question
0,what,no,78.484848
1,what,yes,10.151515
2,when,no,2.121212
3,when,yes,0.151515
4,why,no,7.121212
5,why,yes,1.969697


## Paraphrase Generator
this should take as input the entire constrained dataframe from the above section, and then generate the paraphrases

For Who questions: Who is a person...? / Who is some person...? / "Who is every person..." / "Who is the person..."

In [13]:
# read in df with contexts
cntxts = pd.read_csv("swbd_contexts.csv")

In [14]:
cntxts = cntxts.drop(columns="FollowingContext")

In [15]:
# get the indixes from critical
crit_index = critical.Item_ID

Merge back in Wh and ModalPresent colums

In [16]:
df_WhMod = critical[["Item_ID","Wh","ModalPresent"]].rename(columns={"Item_ID": "TGrepID"})

In [17]:
# subset to the items that are just the ones filtered in the previos section

# otherwise, if using the database file with contexts directly in there, then this step
# is not necessary
df_valid = cntxts[cntxts["TGrepID"].isin(set(crit_index))]

In [18]:
len(df_valid)

660

In [19]:
df_valid = df_valid.merge(df_WhMod, how = 'inner', indicator=False)

In [20]:
len(df_valid)

660

In [21]:
df_valid.pivot_table(index=['Wh'], values="EntireSentence", aggfunc=len).groupby(["Wh"]).EntireSentence.transform(lambda x: x/len(df_valid)).reset_index()

Unnamed: 0,Wh,EntireSentence
0,what,0.886364
1,when,0.022727
2,why,0.090909


In [22]:
when = df_valid[df_valid["Wh"] == "when"]
why = df_valid[df_valid["Wh"] == "why"]
what = df_valid[df_valid["Wh"] == "what"]

In [23]:
when["AResponse"] = "What is a time...?"
# when["SomeResponse"] = "What is some time...?"
when["AllResponse"] = "What is every time...?"
when["TheResponse"] = "What is the time...?"


why["AResponse"] = "What is a reason...?"
# why["SomeResponse"] = "What is some reason...?"
why["AllResponse"] = "What is every reason...?"
why["TheResponse"] = "What is the reason...?"


what["AResponse"] = "What is a thing...?"
# what["SomeResponse"] = "What is some thing...?"
what["AllResponse"] = "What is every thing...?"
what["TheResponse"] = "What is the thing...?"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  when["AResponse"] = "What is a time...?"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  when["AllResponse"] = "What is every time...?"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  when["TheResponse"] = "What is the time...?"
A value is trying to be set on a copy of a slice from a DataFrame.
Try u

In [24]:
df_final = pd.concat([why,when,what])

In [25]:
len(df_final)

660

In [26]:
df_final.pivot_table(index=['Wh'], values="EntireSentence", aggfunc=len).groupby(["Wh"]).EntireSentence.transform(lambda x: x/len(df_final)).reset_index()

Unnamed: 0,Wh,EntireSentence
0,what,0.886364
1,when,0.022727
2,why,0.090909


In [27]:
df_final.pivot_table(index=['ModalPresent'], values="EntireSentence", aggfunc=len).groupby(["ModalPresent"]).EntireSentence.transform(lambda x: x/len(df_final)).reset_index()

Unnamed: 0,ModalPresent,EntireSentence
0,no,0.877273
1,yes,0.122727


## Controls

In [30]:
controls = pd.read_csv("../../experiments/clean_corpus/controls.csv")

In [31]:
controls["Wh"] = "none"
controls["ModalPresent"] = "no"

In [32]:
controls = controls[["TGrepID","EntireSentence","PreceedingContext","Wh","ModalPresent","AResponse","AllResponse","TheResponse"]]

## Create randomly sampled files

### TEST SAMPLE

In [34]:
df_final.groupby(["ModalPresent"])["ModalPresent"].count()

ModalPresent
no     579
yes     81
Name: ModalPresent, dtype: int64

In [35]:
81/22

3.6818181818181817

In [41]:
# modal leftovers
81 - 19*4

5

In [42]:
df_final.groupby(["Wh"])["Wh"].count()

Wh
what    585
when     15
why      60
Name: Wh, dtype: int64

In [44]:
# what
585/22

26.59090909090909

In [46]:
# what leftovers
585 - 22*26

13

In [47]:
# when
22-15

7

In [48]:
# why
60/22

2.727272727272727

In [52]:
60 - 19*3

3

In [55]:
60 - 15*3

15

Lists 1-15:
    1 when
    3 why
    26 what

remaining:
    0 when
    195 what
    15 why
    
Lists 16-19
    0 when
    3 why
    26 what

remaining:
    0 when
    195 what
    15 why
    
Lists 20-22
    0 when

In [79]:
df_final = pd.concat([who,where,how])

In [491]:
print(df_final.groupby(["ModalPresent"])["ModalPresent"].count())

ModalPresent
no     295
yes     40
Name: ModalPresent, dtype: int64


In [62]:
3*2 + (22-6)*1

22

In [19]:
len(test)

335

In [20]:
len(df_final)

335

In [25]:
# ids = pd.DataFrame(columns=df_final.columns)
for n in range(1,16):
#     print(df_final.groupby(["ModalPresent"])["ModalPresent"].count())
    mod_sample = df_final[df_final["ModalPresent"] == "yes"].sample(1)

    i = len(mod_sample[mod_sample["Wh"] == "why"])
    j = len(mod_sample[mod_sample["Wh"] == "when"])
    k = len(mod_sample[mod_sample["Wh"] == "what"])
    df_final = df_final.drop(mod_sample.index)

    why_sample = df_final[
        (df_final["Wh"] == "why") &
        (df_final["ModalPresent"] == "no")
                         ].sample(15-i)
    df_final = df_final.drop(how_sample.index)

    when_sample = df_final[
        (df_final["Wh"] == "when") &
        (df_final["ModalPresent"] == "no")].sample(5-j)
    df_final = df_final.drop(who_sample.index)

    what_sample = df_final[
        (df_final["Wh"] == "what") &
        (df_final["ModalPresent"] == "no")].sample(10-k)
    df_final = df_final.drop(where_sample.index)

    total = pd.concat([mod_sample,why_sample,when_sample,what_sample,controls])
    
    # save to file
    filename = f"../../experiments/clean_corpus/03_experiment/corpus_{n}.txt".format(n=n)
    total.to_csv(filename,header=True,sep="\t",index=False)

In [26]:
len(df_final)

65

In [27]:
df_final.groupby(["Wh"])["Wh"].count()

Wh
how      49
where     7
who       9
Name: Wh, dtype: int64

who
9/2 = 4.5

who
9/2 = 3.5

how
49/2 = 24.5

In [533]:
df_final.groupby(["ModalPresent"])["ModalPresent"].count()

ModalPresent
no     52
yes    13
Name: ModalPresent, dtype: int64

In [506]:
13/2

6.5

Second iteration

In [28]:
for n in range(10,11):
    mod_sample = df_final[df_final["ModalPresent"] == "yes"].sample(6)
#     print(len(mod_sample))
    i = len(mod_sample[mod_sample["Wh"] == "how"])
    j = len(mod_sample[mod_sample["Wh"] == "who"])
    k = len(mod_sample[mod_sample["Wh"] == "where"])
    df_final = df_final.drop(mod_sample.index)
#     print(len(df_final))

    how_sample = df_final[
        (df_final["Wh"] == "how") &
        (df_final["ModalPresent"] == "no")
                         ].sample(23-i)
    df_final = df_final.drop(how_sample.index)
    print(len(df_final))

    who_sample = df_final[
        (df_final["Wh"] == "who") &
        (df_final["ModalPresent"] == "no")].sample(4-j)
    df_final = df_final.drop(who_sample.index)
    print(len(df_final))

    where_sample = df_final[
        (df_final["Wh"] == "where") &
        (df_final["ModalPresent"] == "no")].sample(3-k)
    df_final = df_final.drop(where_sample.index)

    total = pd.concat([mod_sample,how_sample,who_sample,where_sample,controls])
    print(len(total))
#     save to file
    filename = f"../../experiments/clean_corpus/corpus_{n}.txt".format(n=n)
    total.to_csv(filename,header=True,sep="\t",index=False)

39
37
36


In [29]:
23+4+3

30

In [30]:
len(df_final)

35

In [31]:
df_final.groupby(["ModalPresent"])["ModalPresent"].count()

ModalPresent
no     28
yes     7
Name: ModalPresent, dtype: int64

In [32]:
df_final.groupby(["Wh"])["Wh"].count()

Wh
how      26
where     4
who       5
Name: Wh, dtype: int64

In [33]:
last = pd.concat([df_final,controls])

In [34]:
last.to_csv("../../experiments/clean_corpus/corpus_11.txt",header=True,sep="\t",index=False)

## Pilot Samples

### Pilot1

In [63]:
# since df_final changed, it's a different sample set now.
# pilot = df_final.sample(10, random_state=333)
pilot = df_final.sample(10, random_state=333)

In [65]:
cont = controls.drop(columns=["Wh","ModalPresent"])

In [66]:
pilott = pd.concat([pilot,cont])

In [68]:
pilott.to_csv("../../experiments/clean_corpus/03_experiment/pilot1.txt",header=True,sep="\t",index=False)