# Import Packages

In [2]:
!pip install inflect

Collecting inflect
  Downloading inflect-7.0.0-py3-none-any.whl (34 kB)
Installing collected packages: inflect
Successfully installed inflect-7.0.0
[0m

In [3]:
import pandas as pd
import nltk
import re
import inflect
p = inflect.engine()

# Load Data

In [4]:
which_object_relatives = pd.read_csv("Data/object_relative_which.csv", header = None, sep = "\t")
which_object_relatives.columns = ["original_sentence"]
which_object_relatives["relative_pronoun"] = "which" #assign relative pronoun

In [5]:
which_object_relatives_animate_np2 = pd.read_csv("Data/which_relative_clause_animate_NP2.csv", header = None, sep = "\t")
which_object_relatives_animate_np2.columns = ["original_sentence"]
which_object_relatives_animate_np2["relative_pronoun"] = "which" #assign relative pronoun

In [6]:
print("Number of sentences with which: ",len(which_object_relatives) + len(which_object_relatives_animate_np2))

Number of sentences with which:  124


In [7]:
who_object_relatives = pd.read_csv("Data/object_relative_who.csv", header = None, sep = "\t")
who_object_relatives.columns = ["original_sentence"]
who_object_relatives["relative_pronoun"] = "who" #assign relative pronoun
print("Number of sentences with who: ",len(who_object_relatives)) 

Number of sentences with who:  280


# Preprocess data

In [8]:
def preprocess(sentence, pronoun):

    tokens = nltk.word_tokenize(sentence)
    tag = nltk.pos_tag(tokens)
    
    for i in range(len(tag)):
        if (tag[i][1] == "WDT" or tag[i][1] == "IN") and tag[i][0] == "that":
            tokens[i] = pronoun #exchange "that" with either "who" or "which"
        if tag[i][0] == "and":
            tokens[i] = "of" #exchange "and" with "of", as "and" only appears between NP1 and NP2 in the current material

    tokens = ' '.join(tokens)
    tokens = re.sub(",", "", tokens) #delete commas
    tokens = re.sub(";", "", tokens) #delete semicolons 

    return tokens

In [9]:
#apply preprocessing
which_object_relatives["original_sentence"] = which_object_relatives.apply(lambda x: preprocess(x["original_sentence"], x["relative_pronoun"]), 
                        axis = 1)
which_object_relatives_animate_np2["original_sentence"] = which_object_relatives_animate_np2.apply(lambda x: preprocess(x["original_sentence"], 
                        x["relative_pronoun"]), axis = 1)
who_object_relatives["original_sentence"] = who_object_relatives.apply(lambda x: preprocess(x["original_sentence"], x["relative_pronoun"]), 
                        axis = 1)

In [10]:
which_object_relatives.head()

Unnamed: 0,original_sentence,relative_pronoun
0,The tourists admired the museum of the city which,which
1,John smashed the car of the company which,which
2,Several men moved the machines of the factorie...,which
3,The brokers sold the stocks of the fund which,which
4,The governor bought some books for the library...,which


# Retrieve Words for Human and Non-Human Referents

In [12]:
def extract_referents(words):
    
    word_list = []
    unique_words = ""

    token = nltk.word_tokenize(words)
    token_lower = [i.lower() for i in token]
    tag = nltk.pos_tag(token_lower)
    
    #extract NP2
    for i in range(len(tag)):
        if (tag[i][1] == "NN" or tag[i][1] == "NNS") and (tag[i+1][0] == "who" or tag[i+1][0] == "which"):
            word_list.append(tag[i][0])
    
    #keep only unique words
    for i in word_list:
        if i not in unique_words:
            unique_words += i
    
    return unique_words

In [13]:
#create the lists with animate and inanimate nouns
which_words = list(which_object_relatives["original_sentence"].apply(extract_referents))
who_words = list(who_object_relatives["original_sentence"].apply(extract_referents))

In [14]:
#add more nouns to the lists
extra_which_words = ["palace", "hotel", "hospital", "inheritence", "group", "band", "office", "town", "neighborhood", "book", "wedding", 
                     "party", "conference", "reunion", "university", "laboratory", "school", "class", "salon"]
extra_who_words = ["creator", "writer", "author"]
which_words.extend(extra_which_words)
who_words.extend(extra_who_words)

# Combine Data

In [15]:
data = pd.concat([which_object_relatives, which_object_relatives_animate_np2, who_object_relatives]) #combine the datasets
data["transformed_sentence"], data["N1"], data["N2"], data["exchanged_N2"] = " ", " ", " ", " "

#source: https://stackoverflow.com/questions/45685254/how-to-efficiently-assign-unique-id-to-individuals-with-multiple-entries-based-o
data = data.assign(id=(data["original_sentence"]).astype("category").cat.codes) #assign sentence id 
data = data.sort_values("id") #sort the dataset by sentence id

data.head()

Unnamed: 0,original_sentence,relative_pronoun,transformed_sentence,N1,N2,exchanged_N2,id
268,A client looked at the clerk of the solicitor who,who,,,,,0
262,A client noticed the hairdresser of the actres...,who,,,,,1
278,A customer frowned at the assistant of the pha...,who,,,,,2
265,A fan looked at the guitarist of the singer who,who,,,,,3
274,A man asked for the apprentice of the builder who,who,,,,,4


In [16]:
print("Number of sentences:", len(data))

Number of sentences: 404


In [17]:
#source: https://www.geeksforgeeks.org/find-duplicate-rows-in-a-dataframe-based-on-all-or-selected-columns/
duplicates = data[data.duplicated()] #retrieve duplicates
duplicates

Unnamed: 0,original_sentence,relative_pronoun,transformed_sentence,N1,N2,exchanged_N2,id
121,Someone shot the servant of the actress who,who,,,,,138
2,The nurse trusted the doctor of the teacher who,who,,,,,280


In [18]:
#source: https://www.geeksforgeeks.org/python-pandas-dataframe-drop_duplicates/
data = (data.drop_duplicates()) #remove one of the duplicate sentences
print("Number of unique sentences:", len(data))

Number of unique sentences: 402


# Retrieve Referents of NP1 and NP2

In [19]:
def get_referents(sent):
    
    N1 = ""
    N2 = ""
    
    tokens = nltk.word_tokenize(sent)
    tag = nltk.pos_tag(tokens)
    
    #retrieve N1
    for i in range(len(tag)):
        if tag[i-1][0] == "the" and tag[i+1][0] == "of": #retrieve the token between "the" and "of"
            N1 += tag[i][0]
    
    #retrieve N2
    for i in range(len(tag)):
        if tag[i][0] == "who" or tag[i][0] == "which": #retrieve the token preceding the relativizer
            N2 += tag[i-1][0]
            
    return pd.Series([N1, N2])

In [20]:
#retrieve N1 and N2 for every sentence
data[["N1", "N2"]] = data["original_sentence"].apply(get_referents)

In [21]:
data.head()

Unnamed: 0,original_sentence,relative_pronoun,transformed_sentence,N1,N2,exchanged_N2,id
268,A client looked at the clerk of the solicitor who,who,,clerk,solicitor,,0
262,A client noticed the hairdresser of the actres...,who,,hairdresser,actress,,1
278,A customer frowned at the assistant of the pha...,who,,assistant,pharmacist,,2
265,A fan looked at the guitarist of the singer who,who,,guitarist,singer,,3
274,A man asked for the apprentice of the builder who,who,,apprentice,builder,,4


# Exchange N2

In [22]:
#source: https://www.geeksforgeeks.org/python-word-similarity-using-spacy/
import spacy
import en_core_web_lg

nlp = en_core_web_lg.load()

which_tokens = []
who_tokens = []

#retrieve vectors for the inanimate referents
for i in which_words:
    token = nlp(i)
    which_tokens.append(token)

#retrieve vectors for the animate referents
for i in who_words:
    token = nlp(i)
    who_tokens.append(token)

In [23]:
#source: https://www.geeksforgeeks.org/python-word-similarity-using-spacy/
def most_similar(target, relative_pronoun):
    
    similarities = {}
    
    token = nlp(target)
    
    #retrieve an animate referent for N2, when the relativizer is "which"
    if relative_pronoun == "which":
        for j in who_tokens:
            result = j.similarity(token)
            similarities[j] = result
            
    #retrieve an inanimate referent for N2, when the relativizer is "who"
    if relative_pronoun == "who":
        for j in which_tokens:
            result = j.similarity(token) 
            similarities[j] = result

    #source: https://datagy.io/python-get-dictionary-key-with-max-value/
    most_similar_word = str(max(similarities, key = similarities.get)) #choose the referent that is the most similar to N1

    return most_similar_word

In [24]:
#retrieve the new N2 that is the most similar to N1 and has the opposing animacy feature
data["exchanged_N2"] = data.apply(lambda x: most_similar(x["N1"], x["relative_pronoun"]), 
                        axis = 1)

  result = j.similarity(token)
  result = j.similarity(token)


In [25]:
data.tail()

Unnamed: 0,original_sentence,relative_pronoun,transformed_sentence,N1,N2,exchanged_N2,id
89,Yesterday I dropped the tray of the perfume which,which,,tray,perfume,cook,397
172,Yesterday I met the friend of the councillor who,who,,friend,councillor,house,398
133,Yesterday I met with the girlfriend of the cou...,who,,girlfriend,councillor,wedding,399
110,Yesterday I saw the consultant of the director...,who,,consultant,director,company,400
82,Zack recognized the daughter of the shopkeeper...,who,,daughter,shopkeepers,house,401


In [26]:
def exchange_token(tokens,target, similar):
    
    revised_sentences = [ ]
    
    token = nltk.word_tokenize(tokens)
    
    for items in token:
        revised_sentences.append(items)
    
    for words in range(len(revised_sentences)):
        if revised_sentences[words] == target: #find the original N2
            revised_sentences[words] = similar #exhange it with the new referent
                
    revised_sentences = ' '.join(revised_sentences)

    return revised_sentences

In [27]:
#exhange N2 with the new referent
data["transformed_sentence"] = data.apply(lambda x: exchange_token(
                                x["original_sentence"], x["N2"], x["exchanged_N2"]), 
                                axis=1)

In [28]:
data.head()

Unnamed: 0,original_sentence,relative_pronoun,transformed_sentence,N1,N2,exchanged_N2,id
268,A client looked at the clerk of the solicitor who,who,A client looked at the clerk of the office who,clerk,solicitor,office,0
262,A client noticed the hairdresser of the actres...,who,A client noticed the hairdresser of the salon who,hairdresser,actress,salon,1
278,A customer frowned at the assistant of the pha...,who,A customer frowned at the assistant of the lab...,assistant,pharmacist,laboratory,2
265,A fan looked at the guitarist of the singer who,who,A fan looked at the guitarist of the band who,guitarist,singer,band,3
274,A man asked for the apprentice of the builder who,who,A man asked for the apprentice of the college who,apprentice,builder,college,4


# Create Singular and Plural Version

In [29]:
#split dataset into singular and plural version
singular_N1 = data.copy()
plural_N1 = data.copy()

In [30]:
plural_N1.head()

Unnamed: 0,original_sentence,relative_pronoun,transformed_sentence,N1,N2,exchanged_N2,id
268,A client looked at the clerk of the solicitor who,who,A client looked at the clerk of the office who,clerk,solicitor,office,0
262,A client noticed the hairdresser of the actres...,who,A client noticed the hairdresser of the salon who,hairdresser,actress,salon,1
278,A customer frowned at the assistant of the pha...,who,A customer frowned at the assistant of the lab...,assistant,pharmacist,laboratory,2
265,A fan looked at the guitarist of the singer who,who,A fan looked at the guitarist of the band who,guitarist,singer,band,3
274,A man asked for the apprentice of the builder who,who,A man asked for the apprentice of the college who,apprentice,builder,college,4


In [31]:
def singular_transformation(sent):
    
    tokens = nltk.word_tokenize(sent)
    tag = nltk.pos_tag(tokens)
    
    #add "was" to the end of the sentence
    for i in range(len(tag)):
        if tag[i][0] == "who" or tag[i][0] == "which":
            tokens.append("was")
            
            for j in range(len(tag)):
                #if N1 is singular, pluralize N2
                if tag[j][1] == "NN" and tag[j+1][0] == "of":
                    if tag[i-1][1] == "NN":
                        tokens[i-1] = p.plural(tokens[i-1])
                
                #if N1 is plural, singuralize N1 and pluralize N2
                if tag[j][1] == "NNS" and tag[j+1][0] == "of":
                    tokens[j] = p.singular_noun(tokens[j])
                    if tag[i-1][1] == "NN":
                        tokens[i-1] = p.plural(tokens[i-1])
    
    tokens = ' '.join(tokens)
    
    return tokens

In [32]:
def plural_transformation(data):
    
    tokens = nltk.word_tokenize(data)
    tag = nltk.pos_tag(tokens)
    
    #add "were" to the end of the sentence
    for i in range(len(tag)):
        if tag[i][0] == "who" or tag[i][0] == "which":
            tokens.append("were")
            
            for j in range(len(tag)):
                #if N1 is plural, singuralize N2
                if tag[j][1] == "NNS" and tag[j+1][0] == "of":
                    if tag[i-1][1] == "NNS":
                        tokens[i-1] = p.singular_noun(tokens[i-1])
                
                #if N1 is singular, pluralize N1 and singuralize N2
                if tag[j][1] == "NN" and tag[j+1][0] == "of":
                    tokens[j] = p.plural(tokens[j])
                    if tag[i-1][1] == "NNS":
                        tokens[i-1] = p.singular_noun(tokens[i-1])
                        
    tokens = ' '.join(tokens)

    return tokens

In [33]:
#assign number changes to the sentences, so N1 and N2 always differ in number
singular_N1["transformed_sentence"] = singular_N1["transformed_sentence"].apply(singular_transformation)
plural_N1["transformed_sentence"] = plural_N1["transformed_sentence"].apply(plural_transformation)

In [34]:
#assign number condition
singular_N1["number"] = "singular"
plural_N1["number"] = "plural"

In [35]:
plural_N1.head()

Unnamed: 0,original_sentence,relative_pronoun,transformed_sentence,N1,N2,exchanged_N2,id,number
268,A client looked at the clerk of the solicitor who,who,A client looked at the clerks of the office wh...,clerk,solicitor,office,0,plural
262,A client noticed the hairdresser of the actres...,who,A client noticed the hairdressers of the salon...,hairdresser,actress,salon,1,plural
278,A customer frowned at the assistant of the pha...,who,A customer frowned at the assistants of the la...,assistant,pharmacist,laboratory,2,plural
265,A fan looked at the guitarist of the singer who,who,A fan looked at the guitarists of the band who...,guitarist,singer,band,3,plural
274,A man asked for the apprentice of the builder who,who,A man asked for the apprentices of the college...,apprentice,builder,college,4,plural


# Save the Transformed Data

In [36]:
transformed_data = pd.concat([singular_N1, plural_N1]) #combine data

In [37]:
print("Overall number of sentences:", len(transformed_data))

Overall number of sentences: 804


In [38]:
#adjust the order of the columns
transformed_data = transformed_data[["id", "original_sentence", "transformed_sentence", "relative_pronoun",
                                     "number","N1", "N2", "exchanged_N2"]]

In [39]:
#check for duplicate sentences
duplicates = transformed_data[transformed_data.transformed_sentence.duplicated()]
duplicates

Unnamed: 0,id,original_sentence,transformed_sentence,relative_pronoun,number,N1,N2,exchanged_N2
125,12,A student stared at the friend of the teacher who,A student stared at the friend of the houses w...,who,singular,friend,teacher,house
48,166,The cleaning lady noticed the chief of the pla...,The cleaning lady noticed the chief of the off...,who,singular,chief,players,office
235,168,The clerk asked for the consultants of the eco...,The clerk asked for the consultant of the comp...,who,singular,consultants,economist,company
251,173,The coach looked at the physiotherapist of the...,The coach looked at the physiotherapist of the...,who,singular,physiotherapist,players,laboratory
258,186,The dean was thinking about the researchers of...,The dean was thinking about the researcher of ...,who,singular,researchers,professor,laboratory
13,192,The director congratulated the instructor of t...,The director congratulated the instructor of t...,who,singular,instructor,schoolboy,laboratory
270,230,The inspector glanced at the deputy of the pol...,The inspector glanced at the deputy of the off...,who,singular,deputy,policeman,office
158,247,The journalist interviewed the daughter of the...,The journalist interviewed the daughter of the...,who,singular,daughter,deputy,house
124,297,The police arrested the sister of the porter who,The police arrested the sister of the houses w...,who,singular,sister,porter,house
241,301,The professor frowned at the technicians of th...,The professor frowned at the technician of the...,who,singular,technicians,physicist,laboratory


In [40]:
transformed_data.drop_duplicates(subset = ["transformed_sentence"], keep = "first", inplace = True) #drop the duplicates

In [41]:
print("Number of unique sentences:", len(transformed_data))

Number of unique sentences: 780


In [171]:
output_file_path = "Data/transformed_data.csv"
transformed_data.to_csv(output_file_path, index = False) #save the adjusted data