### Getting the verb lemmas

In [3]:
def get_non_single_token_verbs_verbose(model, tokenizer, verbs):
    """
    Identify which verbs in a list are not tokenized as single tokens by the given model's tokenizer,
    and print how each verb is tokenized.
    
    Args:
        model: The language model (not directly used in this function, but included for context).
        tokenizer: The tokenizer corresponding to the model.
        verbs (list of str): A list of verb lemmas to check.
    
    Returns:
        list: A list of verbs that are not tokenized as single tokens.
    """
    non_single_token_verbs = []

    for verb in verbs:
        # Tokenize the verb
        tokenized_verb = tokenizer.tokenize(verb)
        
        # Print tokenization details
        print(f"Verb: {verb} -> Tokens: {tokenized_verb}")
        
        # Check if the verb is split into multiple tokens
        if len(tokenized_verb) > 1:
            non_single_token_verbs.append(verb)

    return non_single_token_verbs


In [5]:
from transformers import RobertaTokenizer, RobertaForMaskedLM
import re


# Load RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")
model = RobertaForMaskedLM.from_pretrained("roberta-large")

# List of verbs
fname = "C:/Users/pcass/manres2vec/data/english_manner_result.csv"
fh = open(fname, "r")
content = fh.readlines()
rootlist = []
p = re.compile("^(.*?),")
for line in content:
    if p.match(line): #for English (no freq counts)
        rootlist.append(p.match(line).group(1))
# Get verbs that are not tokenized as single words
non_single_token_verbs = get_non_single_token_verbs_verbose(model, tokenizer, rootlist)

print("Verbs not tokenized as single tokens:")
print(non_single_token_verbs)


Verb: eat -> Tokens: ['eat']
Verb: admit -> Tokens: ['ad', 'mit']
Verb: approach -> Tokens: ['appro', 'ach']
Verb: arrive -> Tokens: ['ar', 'rive']
Verb: bash -> Tokens: ['bash']
Verb: bellow -> Tokens: ['b', 'ellow']
Verb: break -> Tokens: ['break']
Verb: clean -> Tokens: ['clean']
Verb: clear -> Tokens: ['clear']
Verb: come -> Tokens: ['come']
Verb: cover -> Tokens: ['cover']
Verb: dance -> Tokens: ['d', 'ance']
Verb: declare -> Tokens: ['decl', 'are']
Verb: destroy -> Tokens: ['destroy']
Verb: devour -> Tokens: ['dev', 'our']
Verb: die -> Tokens: ['die']
Verb: empty -> Tokens: ['empty']
Verb: enter -> Tokens: ['enter']
Verb: faint -> Tokens: ['f', 'aint']
Verb: fall -> Tokens: ['fall']
Verb: fill -> Tokens: ['fill']
Verb: flutter -> Tokens: ['fl', 'utter']
Verb: freeze -> Tokens: ['free', 'ze']
Verb: go -> Tokens: ['go']
Verb: hit -> Tokens: ['hit']
Verb: increase -> Tokens: ['incre', 'ase']
Verb: jog -> Tokens: ['j', 'og']
Verb: jump -> Tokens: ['jump']
Verb: kill -> Tokens: ['kill

In [7]:
from transformers import BertTokenizer, BertForMaskedLM

# Load BERT tokenizer and model
tokenizer_bert = BertTokenizer.from_pretrained("bert-large-uncased")
model_bert = BertForMaskedLM.from_pretrained("bert-large-uncased")

# List of verbs

# Check which verbs are not tokenized as single tokens by BERT
non_single_token_verbs_bert = get_non_single_token_verbs_verbose(model_bert, tokenizer_bert, rootlist)

print("BERT - Verbs not tokenized as single tokens:")
print(non_single_token_verbs_bert)


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Verb: eat -> Tokens: ['eat']
Verb: admit -> Tokens: ['admit']
Verb: approach -> Tokens: ['approach']
Verb: arrive -> Tokens: ['arrive']
Verb: bash -> Tokens: ['bash']
Verb: bellow -> Tokens: ['bell', '##ow']
Verb: break -> Tokens: ['break']
Verb: clean -> Tokens: ['clean']
Verb: clear -> Tokens: ['clear']
Verb: come -> Tokens: ['come']
Verb: cover -> Tokens: ['cover']
Verb: dance -> Tokens: ['dance']
Verb: declare -> Tokens: ['declare']
Verb: destroy -> Tokens: ['destroy']
Verb: devour -> Tokens: ['dev', '##our']
Verb: die -> Tokens: ['die']
Verb: empty -> Tokens: ['empty']
Verb: enter -> Tokens: ['enter']
Verb: faint -> Tokens: ['faint']
Verb: fall -> Tokens: ['fall']
Verb: fill -> Tokens: ['fill']
Verb: flutter -> Tokens: ['flutter']
Verb: freeze -> Tokens: ['freeze']
Verb: go -> Tokens: ['go']
Verb: hit -> Tokens: ['hit']
Verb: increase -> Tokens: ['increase']
Verb: jog -> Tokens: ['jo', '##g']
Verb: jump -> Tokens: ['jump']
Verb: kill -> Tokens: ['kill']
Verb: laugh -> Tokens: ['la

In [4]:
from transformers import BertTokenizer, BertForMaskedLM
import re
fname = "C:/Users/pcass/manres2vec/data/english_manres_past_tense.csv"
fh = open(fname, "r")
content = fh.readlines()
rootlist = []
p = re.compile("^(.*?),")
for line in content:
    if p.match(line): #for English (no freq counts)
        rootlist.append(p.match(line).group(1))


# Load BERT tokenizer and model
del rootlist[0]
rootlist.insert(0, 'ate')
tokenizer_bert = BertTokenizer.from_pretrained("bert-large-uncased")
model_bert = BertForMaskedLM.from_pretrained("bert-large-uncased")

# List of verbs

# Check which verbs are not tokenized as single tokens by BERT
non_single_token_verbs_bert = get_non_single_token_verbs_verbose(model_bert, tokenizer_bert, rootlist)

print("BERT - Verbs not tokenized as single tokens:")
print(non_single_token_verbs_bert)


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Verb: ate -> Tokens: ['ate']
Verb: admitted -> Tokens: ['admitted']
Verb: approached -> Tokens: ['approached']
Verb: arrived -> Tokens: ['arrived']
Verb: bashed -> Tokens: ['bash', '##ed']
Verb: bellowed -> Tokens: ['bellowed']
Verb: broke -> Tokens: ['broke']
Verb: cleaned -> Tokens: ['cleaned']
Verb: cleared -> Tokens: ['cleared']
Verb: came -> Tokens: ['came']
Verb: covered -> Tokens: ['covered']
Verb: danced -> Tokens: ['danced']
Verb: declared -> Tokens: ['declared']
Verb: destroyed -> Tokens: ['destroyed']
Verb: devoured -> Tokens: ['dev', '##oured']
Verb: died -> Tokens: ['died']
Verb: emptied -> Tokens: ['emptied']
Verb: entered -> Tokens: ['entered']
Verb: fainted -> Tokens: ['faint', '##ed']
Verb: fell -> Tokens: ['fell']
Verb: filled -> Tokens: ['filled']
Verb: fluttered -> Tokens: ['fluttered']
Verb: froze -> Tokens: ['froze']
Verb: went -> Tokens: ['went']
Verb: hit -> Tokens: ['hit']
Verb: increased -> Tokens: ['increased']
Verb: jogged -> Tokens: ['jogged']
Verb: jumped 

In [21]:
from transformers import BertTokenizer, BertForMaskedLM
import re
import pandas as pd
import pandas as pd
df = pd.read_csv("C:/Users/pcass/dl_manres/data/manres_verbs.csv")
rootlist = list(df['verb'])
# fname = "C:/Users/pcass/dl_manres/data/manres_verbs.csv"
# fh = open(fname, "r")
# content = fh.readlines()
# rootlist = []
# p = re.compile("^(.*?),")
# for line in content:
#     if p.match(line): #for English (no freq counts)
#         rootlist.append(p.match(line).group(1))

#print(rootlist)
# Load BERT tokenizer and model
#del rootlist[0]
#rootlist.insert(0, 'ate')
tokenizer_bert = BertTokenizer.from_pretrained("bert-large-uncased")
model_bert = BertForMaskedLM.from_pretrained("bert-large-uncased")

# List of verbs

# Check which verbs are not tokenized as single tokens by BERT
non_single_token_verbs_bert = get_non_single_token_verbs_verbose(model_bert, tokenizer_bert, rootlist)

print("BERT - Verbs not tokenized as single tokens:")
print(non_single_token_verbs_bert)

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Verb: ate -> Tokens: ['ate']
Verb: admitted -> Tokens: ['admitted']
Verb: approached -> Tokens: ['approached']
Verb: arrived -> Tokens: ['arrived']
Verb: bellowed -> Tokens: ['bellowed']
Verb: broke -> Tokens: ['broke']
Verb: cleaned -> Tokens: ['cleaned']
Verb: crawled -> Tokens: ['crawled']
Verb: cleared -> Tokens: ['cleared']
Verb: came -> Tokens: ['came']
Verb: covered -> Tokens: ['covered']
Verb: danced -> Tokens: ['danced']
Verb: declared -> Tokens: ['declared']
Verb: destroyed -> Tokens: ['destroyed']
Verb: died -> Tokens: ['died']
Verb: emptied -> Tokens: ['emptied']
Verb: entered -> Tokens: ['entered']
Verb: fell -> Tokens: ['fell']
Verb: filled -> Tokens: ['filled']
Verb: fluttered -> Tokens: ['fluttered']
Verb: froze -> Tokens: ['froze']
Verb: went -> Tokens: ['went']
Verb: hopped -> Tokens: ['hopped']
Verb: hit -> Tokens: ['hit']
Verb: increased -> Tokens: ['increased']
Verb: jogged -> Tokens: ['jogged']
Verb: jumped -> Tokens: ['jumped']
Verb: killed -> Tokens: ['killed']


### Identifying word senses

In [7]:
s = 'break.v.1'
print(wn.synset(s).definition())
print(wn.synset(s).examples())

terminate
['She interrupted her pregnancy', 'break a lucky streak', 'break the cycle of poverty']


In [79]:
import nltk
#nltk.download('wordnet')
from nltk.corpus import wordnet as wn

verb = wn.synsets('yell', pos=wn.VERB)  # Get all verb senses of 'break'
for sense in verb:
    #if 'break' in sense.name():
     print(f"{sense.name()}: {sense.definition()}")
     print(f"Examples: {sense.examples()}\n")


shout.v.02: utter a sudden loud cry
Examples: ['she cried with pain when the doctor inserted the needle', "I yelled to her from the window but she couldn't hear me"]

yell.v.02: utter or declare in a very loud voice
Examples: ["You don't have to yell--I can hear you just fine"]



In [26]:
eat = wn.lemma('eat.v.0.eat')
print(eat.key())
wn.lemma_from_key(eat.key())

eat%2:37:00::


Lemma('eat.v.04.eat')

In [3]:
import nltk
#nltk.download('wordnet')
from nltk.corpus import wordnet as wn

# selected senses
target_senses = ['break.v.02', 'break.v.03', 'break.v.04', 'break.v.05', 'break.v.01', 'break.v.15']

# Retrieve all verb senses for "break"
verb_senses = wn.synsets('break', pos=wn.VERB)

# Iterate through the senses and match with the target senses
for sense in verb_senses:
    if sense.name() in target_senses:
        print(f"Sense: {sense.name()} - {sense.definition()}")
        print("Examples:")
        for example in sense.examples():
            print(f"  - {example}")
        print("\n")


Sense: break.v.02 - become separated into pieces or fragments
Examples:
  - The figurine broke
  - The freshly baked loaf fell apart


Sense: break.v.03 - render inoperable or ineffective
Examples:
  - You broke the alarm clock when you took it apart!


Sense: break.v.04 - ruin completely
Examples:
  - He busted my radio!


Sense: break.v.05 - destroy the integrity of; usually by force; cause to separate into pieces or fragments
Examples:
  - He broke the glass plate
  - She broke the match




### Create mini-corpus

In [124]:
import re

def find_sentences_with_sense(file_path, target_word, target_wnsns, pos_tag='VB'):
    """
    Search a SemCor file for sentences containing a specific word, part of speech, and WordNet sense.

    Args:
        file_path (str): Path to the SemCor file.
        target_word (str): The token to search for (e.g., "broke").
        pos_tag (str): The part of speech tag (e.g., "VB" for verbs).
        target_wnsns (list): The target WordNet senses number (e.g., "1" for sense 01).
    
    Returns:
        list: A list of sentences (str) containing the target word with the specified POS and sense.
    """
    sentences = []
    with open(file_path, 'r', encoding='utf-8') as file:
        current_sentence = ""
        inside_sentence = False
        for line in file:
            # Check if this is the start of a sentence
            if re.match(r"<s\s+snum=\d+>", line):
                inside_sentence = True
                current_sentence = ""

            # Check if this is the end of a sentence
            elif "</s>" in line:
                inside_sentence = False
                if current_sentence:
                    sentences.append(current_sentence.strip())
                current_sentence = ""

            # If inside a sentence, add the line to the current sentence
            elif inside_sentence:
                current_sentence += line.strip() + " "

    # Filter sentences containing the target word, pos, and sense
    matching_sentences = []
    for sentence in sentences:
        # Use a regex to find words matching the target criteria
        senses_pattern = "|".join(map(re.escape, target_wnsns))
        pattern = re.compile(
            rf'<wf[^>]*pos={pos_tag}[^>]*lemma={target_word}[^>]*wnsn=({senses_pattern})[^>]*>[^<]+</wf>'
        )
        if pattern.search(sentence):
            # Remove XML tags to extract the plain text sentence
            plain_sentence = re.sub(r"<[^>]+>", "", sentence)
            plain_sentence = plain_sentence.replace("_", " ")
            matching_sentences.append(plain_sentence.strip())
    
    return matching_sentences

# Example usage
file_path = "../data/corpus/semcor3.0/brown1/tagfiles/br-a01"
target_word = "say"
pos_tag = "VB"
target_wnsns = ['1','2','3']

sentences = find_sentences_with_sense(file_path, target_word, target_wnsns, pos_tag)
for i, sent in enumerate(sentences):
    print(f"Sentence {i + 1}: {sent}")


Sentence 1: The Fulton County Grand Jury said Friday an investigation of Atlanta 's recent primary election produced `` no evidence '' that any irregularities took place .
Sentence 2: The jury further said in term end presentments that the City Executive Committee , which had over-all charge of the election , `` deserves the praise and thanks of the City of Atlanta '' for the manner in which the election was conducted .
Sentence 3: `` Only a relative handful of such reports was received '' , the jury said , `` considering the widespread interest in the election , the number of voters and the size of this city '' .
Sentence 4: The jury said it did find that many of Georgia 's registration and election laws `` are outmoded or inadequate and often ambiguous '' .
Sentence 5: The grand jury commented on a number of other topics , among them the Atlanta and Fulton County purchasing departments which it said `` are well operated and follow generally accepted practices which inure to the best 

In [128]:
import os

def gather_sentences_with_sense(base_dir, target_word, target_wnsns,  pos_tag='VB'):
    """
    Gathers all sentences containing a specific word with a specific WordNet sense
    across all files in the semcor3.0 directory.

    Args:
        base_dir (str): Path to the semcor3.0 directory.
        pos_tag (str): Part-of-speech tag (e.g., "VB").
        target_wnsn (str): Target WordNet sense number (e.g., "1").
        target_word (str): The exact token to search for (e.g., "broke").

    Returns:
        List[str]: A list of all sentences that match the criteria.
    """
    all_sentences = []

    # Traverse the subdirectories (brown1, brown2, brownv)
    for folder in ["brown1", "brown2", "brownv"]:
        tagfiles_dir = os.path.join(base_dir, folder, "tagfiles")

        # Ensure the tagfiles directory exists
        if not os.path.exists(tagfiles_dir):
            print(f"Directory not found: {tagfiles_dir}")
            continue

        # Iterate through all files in the tagfiles directory
        for filename in os.listdir(tagfiles_dir):
            file_path = os.path.normpath(os.path.join(tagfiles_dir, filename))

            # Skip if not a file
            if not os.path.isfile(file_path):
                continue

            # Call the find_sentences_with_sense function and extend the results
            try:
                sentences = find_sentences_with_sense(file_path, target_word, target_wnsns, pos_tag)
                all_sentences.extend(sentences)
            except Exception as e:
                print(f"Error processing file {file_path}: {e}")

    return all_sentences


# Base directory containing the semcor3.0 corpus
base_dir = "../data/corpus/semcor3.0"

# Parameters for the search
pos_tag = "VB"        # Part-of-speech tag (e.g., verb)
target_wnsns = ['1','2','3']     # WordNet sense number
target_word = "break" # The exact word token to match

# Call the function to gather all matching sentences
#find_sentences_with_sense(file_path, target_word, pos_tag, target_wnsn)
sentences = gather_sentences_with_sense(base_dir, target_word, target_wnsns)

# Output the number of sentences found and some examples
print(f"Found {len(sentences)} sentences.")
for sentence in sentences[:5]:
    print(sentence)


Found 113 sentences.
If the Orioles are to break their losing streak within the next two days , it will have to be at the expense of the American League champion New York Yankees , who come in here tomorrow for a night game and a single test Sunday afternoon .
He hopes to melt off an additional eight pounds before the Flock breaks camp three weeks hence .
The crowd at the twenty-first annual K. of C. Games , final indoor meet of the season , got a thrill a few minutes earlier when a slender , bespectacled woman broke the one week old world record in the half-mile run .
He broke that boy ( Air Force fullback Nick Arshinkoff ) in two and knocked him loose from the football '' .
`` He timed it just right and broke through there before the boy ( halfback Terry Isaacson ) had time to turn around .


In [137]:
import pandas as pd
def get_examples_from_file(filepath):

    df_senses = pd.read_csv(filepath, dtype='str')
    exs_from_verb = {}
    
    for i, row in df_senses.iterrows():
        verb = row['verb']
        senses = list(row[['sense_1', 'sense_2', 'sense_3', 'sense_4', 'sense_5']].dropna())
        sentences = gather_sentences_with_sense(base_dir="../data/corpus/semcor3.0", target_word=verb, target_wnsns=senses)
        exs_from_verb[verb] = sentences

    
    df_examples = pd.DataFrame.from_dict(exs_from_verb, orient='index')
    
    max_columns = df_examples.shape[1]
    column_names = ["example_" + str(i) for i in range(max_columns)]

    df_examples
    df_examples.fillna("", inplace=True)

    
    df_examples.columns = column_names

    df_examples.to_csv("../data/diagnostics/example_usage/manres_examples.csv", index=True)

    return df_examples


df_examples = get_examples_from_file("../data/manres_senses.csv")
    

In [142]:
df_examples.iloc[:, 1:].apply(lambda row: row.astype(bool).sum(), axis=1)

eat           73
admit         55
approach      44
arrive        70
bellow         4
break         67
clean         34
crawl         25
clear         28
come         642
cover         56
dance         28
declare       61
destroy       67
die          124
empty          9
enter        145
fall         106
fill          77
flutter        0
freeze        14
go           627
hop          100
hit           43
increase     135
jog            0
jump          23
kill          81
laugh         53
melt           5
near          10
open          82
pour          25
proclaim      24
propose       47
remove        95
rise          72
roll          22
rub           15
run          175
say         1888
scream        19
shatter       10
scrub          4
shake         65
shout         48
spin          13
sweep         20
swim          12
walk         180
whisper       18
wipe          20
yell          16
dtype: int64

In [170]:
df = pd.read_csv('../data/corpus/desert_island_discs_corpus/whisper_concordance_ukwac_full_20250129005846.csv')

def combine_text_columns(df):
    """
    Combine the 'Left', 'KWIC', and 'Right' columns of a DataFrame into a new column 'Full_Sentence'.

    Args:
        df (pd.DataFrame): The DataFrame with 'Left', 'KWIC', and 'Right' columns.

    Returns:
        pd.DataFrame: The original DataFrame with a new column 'Full_Sentence' added.
    """
    # Combine the columns with a space in between
    df['Example'] = df['Left'].str.strip() + " " + df['KWIC'].str.strip() + " " + df['Right'].str.strip()
    return df

# Combine text into the new column
df = combine_text_columns(df)

# Display the DataFrame
df = df.drop(['Left', 'KWIC', 'Right'], axis=1)
df.to_csv('../data/corpus/desert_island_discs_corpus/whisper.csv')

In [7]:
import pandas as pd
df_examples = pd.read_csv("../data/diagnostics/example_usage/manres_examples.csv", index_col=0)
df_examples

Unnamed: 0,example_0,example_1,example_2,example_3,example_4,example_5,example_6,example_7,example_8,example_9,...,example_1879,example_1880,example_1881,example_1882,example_1883,example_1884,example_1885,example_1886,example_1887,example_1888
eat,"The dialogue is sharp , witty and candid - typ...",`` Along with the fruit they did also fall und...,"In any case , though they had been promised im...","And unlike other fruits , one cannot eat the s...",Since strong nociceptive stimuli produce an ex...,There are signs that it has struggled for days...,They ate the cafeteria food with its orange sa...,"He had eaten almost nothing on the crested , t...","`` You do n't eat enough , honey .","He ate what he felt like , slept as much or as...",...,,,,,,,,,,
admit,The Protestants themselves are the first to ad...,"It should be admitted , too , that there is a ...",The Woonsocket Patriot admitted that John Brow...,Although admitting Brown 's guilt on legal gro...,Rousseau had to admit that though he could n't...,"I am getting deaf , I must admit it '' .",To act otherwise would be to admit his helples...,"But , admit or not , Bonner was helpless .","`` I was wrong '' , I admitted .",I took a deep breath and an even deeper swallo...,...,,,,,,,,,,
approach,I think that we are here also talking of the k...,"If an automobile were approaching him , he wou...","Also , planetary gravitational attraction incr...","On C , from the point P at **f to the point Q ...",As s approaches T the square will be outside C...,MacPherson boldly approached the fortified ado...,"The patrol snaked around in back of the cave ,...","Another man approached , this one fully dressed .","The veterans , idling on their benches in the ...","While this was being discussed , we saw the mi...",...,,,,,,,,,,
arrive,"Hansen arrived just before nightfall , two hou...","Since arriving here , however , I have formed ...","This , of course , depends on the character of...",When we arrive at the events concerned in the ...,"As the robbers leave the looted train , the fi...",We do not arrive at spatial images by means of...,"Arriving at daybreak , they found Julio in his...","Traveling all night , Clark and twelve men arr...",A half hour later the Vice President arrived .,"The music arrived , taking him , its rhythm st...",...,,,,,,,,,,
bellow,"It was Mr. Jack , bellowing out in the hall .","`` Got the upstairs guy '' , he bellows .",He bellowed orders and watched the alert respo...,"While he was at it , the philosopher of the Kr...",Musmanno bellowed to his Italian crewmen .,"Another choice of record, please, Dennis. Well...",I think he was possibly the most entertaining ...,"Her name was Dan Jokes, and she provided in on...",And then a voice came from the loudspeaker the...,He had a show with this fantastic five-inch di...,...,,,,,,,,,,
break,He broke that boy ( Air Force fullback Nick Ar...,"End Gene Raesz , who broke a hand in the Owl '...","Tonight at 8 o'clock the Cardinals , who gave ...",The company which performed the Pulitzer Prize...,"a train rushes straight at the audience , or a...","It must be remembered , however , that there a...",The argument against this last approach is com...,Not a bullet touched Cook who was nearer the a...,When they were refused entrance to his brother...,"With a supreme effort , he broke it off .",...,,,,,,,,,,
clean,The brush moves up and down and is small enoug...,The brushes can be cleaned and sterilized by b...,"No , other than cleaning out the pores and mak...","It is brought to packing houses , cleaned and ...",An area may have been partially logged and req...,His problem then became one of restraining the...,Who would clean up the mess when the war was o...,Clean the place up .,"`` Cleaned all them Rebs out'n the hills , the...",Whenever the place was cleaned or a meal serve...,...,,,,,,,,,,
crawl,From the east to the west coast of the Korean ...,The crawling men tried to rise and fell again .,The third crawling man forced himself erect .,Then he began to crawl again .,If only there was a clock for him to crawl aga...,He turned slowly and began to crawl back up th...,Watson pounded to the crawling man and stopped...,"How far could it be , Watson thought bleakly ,...",`` Too many of our writers seem to be interest...,But this is a public park and it 's a city ord...,...,,,,,,,,,,
clear,In doing so science has unquestionably cleared...,"Some shrubs may be of good landscaping value ,...",He was merely clearing a way to what he had to...,"After the storm , the sky cleared blue and coo...","It was a delayed moon , but now the sky had cl...","After a moment of thought , her mind cleared a...",Dear Sirs : Let me begin by clearing up any po...,"It did something to clear the ground , but it ...",Some years later the bank handling the Mercer ...,It occurred to me that you might be interested...,...,,,,,,,,,,
come,The TEA estimated there would be 182 scholasti...,If the Orioles are to break their losing strea...,While working out in Sylvania a swelling devel...,Moritz came to Texas in 1954 but his freshman ...,Because of its important game with Arkansas co...,"Rabb , the former Louisiana State field genera...",Chico Ruiz made a spectacular play on Alusik '...,He came all the way around on Gardner 's hit b...,"The world 's best golfer , shooting below par ...","This knowledge has come in handy , too .",...,,,,,,,,,,


In [6]:
import pandas as pd
import os

def update_example_sentences(df_examples_path, verbs):
    """
    Update df_examples.csv by adding sentences from verb-specific CSV files into empty example columns.

    Args:
        df_examples_path (str): Path to df_examples.csv.
        verb_files (list): List of paths to verb-specific CSV files (e.g., ['whisper.csv', 'yell.csv']).
    """
    # Load df_examples.csv
    df_examples = pd.read_csv(df_examples_path, index_col=0)

    # Get example columns
    example_cols = [col for col in df_examples.columns if col.startswith("example_")]
    v_files = [os.path.join("../data/corpus/desert_island_discs_corpus/", filepath +'.csv') for filepath in verbs]

    for verb, verb_file in (zip(verbs, v_files)):
        try:
            # Extract the verb name from the filename (e.g., 'whisper.csv' -> 'whisper')
            print(verb)
            # Load the verb-specific file
            df_verb = pd.read_csv(verb_file, encoding='cp1252')
    
            if verb not in df_examples.index:
                print(f"Skipping {verb} - not found in df_examples.")
                continue
    
            # Find the number of existing examples for this verb
            existing_examples = df_examples.loc[verb, example_cols].notna().sum()
    
            # If already 20 examples, skip
            if existing_examples >= 20:
                print(f"{verb} already has 20 examples. Skipping.")
                continue
    
            # Extract sentences from the 'Text' column
            new_sentences = df_verb['Example'].dropna().tolist()
    
            # Determine how many new sentences can be added
            remaining_slots = 20 - existing_examples
            sentences_to_add = new_sentences[:remaining_slots]  # Limit to available slots
    
            # Fill empty example columns
            for i, col in enumerate(example_cols):
                if pd.isna(df_examples.loc[verb, col]):  # If empty, add sentence
                    if i < len(sentences_to_add):
                        df_examples.at[verb, col] = sentences_to_add[i]
                    else:
                        break
        except Exception as e:
            print(f"Skipping {verb_file} due to encoding or read error: {e}")
            continue

    # Save the updated df_examples.csv
    
    df_examples.to_csv("../data/diagnostics/example_usage/manres_examples_complete.csv")
    return(df_examples)
    print(f"Updated {df_examples_path} successfully.")

# Example usage:
df = update_example_sentences("../data/diagnostics/example_usage/manres_examples.csv", ["empty", "flutter", "freeze", "melt", "jog", "near", "rub", "scream", "shatter", "spin", "swim", "whisper", "yell"])


empty
flutter
freeze
melt
Skipping ../data/corpus/desert_island_discs_corpus/melt.csv due to encoding or read error: [Errno 2] No such file or directory: '../data/corpus/desert_island_discs_corpus/melt.csv'
jog
near
rub
scream
scream already has 20 examples. Skipping.
shatter
spin
swim
whisper
Skipping ../data/corpus/desert_island_discs_corpus/whisper.csv due to encoding or read error: 'charmap' codec can't decode byte 0x9d in position 34792: character maps to <undefined>
yell


In [25]:
df_examples_path = "../data/diagnostics/example_usage/manres_examples.csv" 
verb_files = ["empty", "flutter.csv", "freeze.csv", "jog.csv", "near.csv", "rub.csv", "scream.csv", "shatter.csv", "spin.csv", "swim.csv", "whisper.csv", "yell.csv"]
print([os.path.join("../data/corpus/desert_island_discs_corpus/", filepath + '.csv') for filepath in verb_files])

df_examples = pd.read_csv(df_examples_path, index_col=0)

['../data/corpus/desert_island_discs_corpus/empty.csv', '../data/corpus/desert_island_discs_corpus/flutter.csv.csv', '../data/corpus/desert_island_discs_corpus/freeze.csv.csv', '../data/corpus/desert_island_discs_corpus/jog.csv.csv', '../data/corpus/desert_island_discs_corpus/melt.csv.csv', '../data/corpus/desert_island_discs_corpus/near.csv.csv', '../data/corpus/desert_island_discs_corpus/rub.csv.csv', '../data/corpus/desert_island_discs_corpus/scream.csv.csv', '../data/corpus/desert_island_discs_corpus/shatter.csv.csv', '../data/corpus/desert_island_discs_corpus/spin.csv.csv', '../data/corpus/desert_island_discs_corpus/swim.csv.csv', '../data/corpus/desert_island_discs_corpus/whisper.csv.csv', '../data/corpus/desert_island_discs_corpus/yell.csv.csv']


In [21]:
df_examples.index

Index(['eat', 'admit', 'approach', 'arrive', 'bellow', 'break', 'clean',
       'crawl', 'clear', 'come', 'cover', 'dance', 'declare', 'destroy', 'die',
       'empty', 'enter', 'fall', 'fill', 'flutter', 'freeze', 'go', 'hop',
       'hit', 'increase', 'jog', 'jump', 'kill', 'laugh', 'melt', 'near',
       'open', 'pour', 'proclaim', 'propose', 'remove', 'rise', 'roll', 'rub',
       'run', 'say', 'scream', 'shatter', 'scrub', 'shake', 'shout', 'spin',
       'sweep', 'swim', 'walk', 'whisper', 'wipe', 'yell'],
      dtype='object')

In [20]:
df_examples_path = "../data/diagnostics/example_usage/manres_examples.csv"
verb_files = ["empty.csv", "flutter.csv", "freeze.csv", "jog.csv", "melt.csv", "near.csv", "rub.csv", "scream.csv", "shatter.csv", "spin.csv", "swim.csv", "whisper.csv", "yell.csv"]


df_examples = pd.read_csv(df_examples_path, index_col=0)

    # Get example columns
#example_cols = [col for col in df_examples.columns if col.startswith("example_")]
v_files = [os.path.join("../data/corpus/desert_island_discs_corpus/", filepath) for filepath in verb_files]

for verb, verb_file in (zip(verb_files, v_files)):
        # Extract the verb name from the filename (e.g., 'whisper.csv' -> 'whisper')
    print(verb,  verb_file)
    verb = verb_file.replace('.csv', '')

empty.csv ../data/corpus/desert_island_discs_corpus/empty.csv
flutter.csv ../data/corpus/desert_island_discs_corpus/flutter.csv
freeze.csv ../data/corpus/desert_island_discs_corpus/freeze.csv
jog.csv ../data/corpus/desert_island_discs_corpus/jog.csv
melt.csv ../data/corpus/desert_island_discs_corpus/melt.csv
near.csv ../data/corpus/desert_island_discs_corpus/near.csv
rub.csv ../data/corpus/desert_island_discs_corpus/rub.csv
scream.csv ../data/corpus/desert_island_discs_corpus/scream.csv
shatter.csv ../data/corpus/desert_island_discs_corpus/shatter.csv
spin.csv ../data/corpus/desert_island_discs_corpus/spin.csv
swim.csv ../data/corpus/desert_island_discs_corpus/swim.csv
whisper.csv ../data/corpus/desert_island_discs_corpus/whisper.csv
yell.csv ../data/corpus/desert_island_discs_corpus/yell.csv


In [14]:
X = []
y = []
for verb, embedding in embeddings.items():
    X.append(embedding)
    y.append(verb)


    

In [20]:
X

[array([-0.9022919 , -0.39300144, -0.4231461 , ..., -0.16986285,
         0.24384359, -0.27937418], dtype=float32),
 array([-0.46495014, -0.4137651 , -0.36234015, ..., -0.48298556,
         0.07146331,  0.11682661], dtype=float32),
 array([-0.27853656, -0.13335775, -0.28377387, ..., -0.6003925 ,
        -0.0046896 , -0.11468933], dtype=float32),
 array([-0.49035662, -0.13851042, -0.23269132, ..., -0.5813302 ,
         0.43353248, -0.26170322], dtype=float32),
 array([-0.4535616 , -0.24775176, -0.06907265, ..., -0.38574827,
         0.04116874, -0.07826604], dtype=float32),
 array([-0.59016913, -0.2726337 ,  0.05222418, ..., -0.1777431 ,
         0.13128963, -0.15662393], dtype=float32),
 array([-0.89900255, -0.09798597, -0.13783084, ..., -0.6872528 ,
         0.24963847,  0.13631806], dtype=float32),
 array([ 0.06586035, -0.49369922, -0.39447886, ..., -0.53766084,
         0.17356706, -0.11079353], dtype=float32),
 array([-0.9003893 , -0.07531289, -0.23934555, ..., -0.34078363,
       

In [12]:
import pickle
embedding_file = '../data/embs/verb_embeddings.pkl'
with open(embedding_file, "rb") as f:
    embeddings = pickle.load(f)

for verb in embeddings.keys()
embeddings['whisper'].shape

(1024,)

In [1]:
import chardet
file_path = "../data/diagnostics/example_usage/manres_examples_complete.csv"

# Read a portion of the file to detect encoding
with open(file_path, "rb") as f:
    raw_data = f.read(100000)  # Read first 100KB
    result = chardet.detect(raw_data)

print(f"Detected encoding: {result['encoding']}")

Detected encoding: ascii


In [14]:
file_name = "bert-base-uncased_6_verb_embeddings.pkl"
embs_model = file_name.split("_")[0]
embs_layer = file_name.split("_")[1]
model 
print(str.upper(model))

BERT-BASE-UNCASED


In [3]:
import pickle 
def load_embeddings(embedding_file):
    with open(embedding_file, "rb") as f:
        embeddings = pickle.load(f)
    return embeddings

file = "../data/embs/base_verb_embeddings.pkl"

embs = load_embeddings(file)
embs.keys()



dict_keys(['eat', 'admit', 'approach', 'arrive', 'bellow', 'break', 'clean', 'crawl', 'clear', 'come', 'cover', 'dance', 'declare', 'destroy', 'die', 'empty', 'enter', 'fall', 'fill', 'flutter', 'freeze', 'go', 'hop', 'hit', 'increase', 'jog', 'jump', 'kill', 'laugh', 'melt', 'near', 'open', 'pour', 'proclaim', 'propose', 'remove', 'rise', 'roll', 'rub', 'run', 'say', 'scream', 'shatter', 'scrub', 'shake', 'shout', 'spin', 'sweep', 'swim', 'walk', 'whisper', 'wipe', 'yell'])