### Getting the verb lemmas

In [3]:
def get_non_single_token_verbs_verbose(model, tokenizer, verbs):
    """
    Identify which verbs in a list are not tokenized as single tokens by the given model's tokenizer,
    and print how each verb is tokenized.
    
    Args:
        model: The language model (not directly used in this function, but included for context).
        tokenizer: The tokenizer corresponding to the model.
        verbs (list of str): A list of verb lemmas to check.
    
    Returns:
        list: A list of verbs that are not tokenized as single tokens.
    """
    non_single_token_verbs = []

    for verb in verbs:
        # Tokenize the verb
        tokenized_verb = tokenizer.tokenize(verb)
        
        # Print tokenization details
        print(f"Verb: {verb} -> Tokens: {tokenized_verb}")
        
        # Check if the verb is split into multiple tokens
        if len(tokenized_verb) > 1:
            non_single_token_verbs.append(verb)

    return non_single_token_verbs


In [5]:
from transformers import RobertaTokenizer, RobertaForMaskedLM
import re


# Load RoBERTa tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")
model = RobertaForMaskedLM.from_pretrained("roberta-large")

# List of verbs
fname = "C:/Users/pcass/manres2vec/data/english_manner_result.csv"
fh = open(fname, "r")
content = fh.readlines()
rootlist = []
p = re.compile("^(.*?),")
for line in content:
    if p.match(line): #for English (no freq counts)
        rootlist.append(p.match(line).group(1))
# Get verbs that are not tokenized as single words
non_single_token_verbs = get_non_single_token_verbs_verbose(model, tokenizer, rootlist)

print("Verbs not tokenized as single tokens:")
print(non_single_token_verbs)


Verb: eat -> Tokens: ['eat']
Verb: admit -> Tokens: ['ad', 'mit']
Verb: approach -> Tokens: ['appro', 'ach']
Verb: arrive -> Tokens: ['ar', 'rive']
Verb: bash -> Tokens: ['bash']
Verb: bellow -> Tokens: ['b', 'ellow']
Verb: break -> Tokens: ['break']
Verb: clean -> Tokens: ['clean']
Verb: clear -> Tokens: ['clear']
Verb: come -> Tokens: ['come']
Verb: cover -> Tokens: ['cover']
Verb: dance -> Tokens: ['d', 'ance']
Verb: declare -> Tokens: ['decl', 'are']
Verb: destroy -> Tokens: ['destroy']
Verb: devour -> Tokens: ['dev', 'our']
Verb: die -> Tokens: ['die']
Verb: empty -> Tokens: ['empty']
Verb: enter -> Tokens: ['enter']
Verb: faint -> Tokens: ['f', 'aint']
Verb: fall -> Tokens: ['fall']
Verb: fill -> Tokens: ['fill']
Verb: flutter -> Tokens: ['fl', 'utter']
Verb: freeze -> Tokens: ['free', 'ze']
Verb: go -> Tokens: ['go']
Verb: hit -> Tokens: ['hit']
Verb: increase -> Tokens: ['incre', 'ase']
Verb: jog -> Tokens: ['j', 'og']
Verb: jump -> Tokens: ['jump']
Verb: kill -> Tokens: ['kill

In [7]:
from transformers import BertTokenizer, BertForMaskedLM

# Load BERT tokenizer and model
tokenizer_bert = BertTokenizer.from_pretrained("bert-large-uncased")
model_bert = BertForMaskedLM.from_pretrained("bert-large-uncased")

# List of verbs

# Check which verbs are not tokenized as single tokens by BERT
non_single_token_verbs_bert = get_non_single_token_verbs_verbose(model_bert, tokenizer_bert, rootlist)

print("BERT - Verbs not tokenized as single tokens:")
print(non_single_token_verbs_bert)


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Verb: eat -> Tokens: ['eat']
Verb: admit -> Tokens: ['admit']
Verb: approach -> Tokens: ['approach']
Verb: arrive -> Tokens: ['arrive']
Verb: bash -> Tokens: ['bash']
Verb: bellow -> Tokens: ['bell', '##ow']
Verb: break -> Tokens: ['break']
Verb: clean -> Tokens: ['clean']
Verb: clear -> Tokens: ['clear']
Verb: come -> Tokens: ['come']
Verb: cover -> Tokens: ['cover']
Verb: dance -> Tokens: ['dance']
Verb: declare -> Tokens: ['declare']
Verb: destroy -> Tokens: ['destroy']
Verb: devour -> Tokens: ['dev', '##our']
Verb: die -> Tokens: ['die']
Verb: empty -> Tokens: ['empty']
Verb: enter -> Tokens: ['enter']
Verb: faint -> Tokens: ['faint']
Verb: fall -> Tokens: ['fall']
Verb: fill -> Tokens: ['fill']
Verb: flutter -> Tokens: ['flutter']
Verb: freeze -> Tokens: ['freeze']
Verb: go -> Tokens: ['go']
Verb: hit -> Tokens: ['hit']
Verb: increase -> Tokens: ['increase']
Verb: jog -> Tokens: ['jo', '##g']
Verb: jump -> Tokens: ['jump']
Verb: kill -> Tokens: ['kill']
Verb: laugh -> Tokens: ['la

In [4]:
from transformers import BertTokenizer, BertForMaskedLM
import re
fname = "C:/Users/pcass/manres2vec/data/english_manres_past_tense.csv"
fh = open(fname, "r")
content = fh.readlines()
rootlist = []
p = re.compile("^(.*?),")
for line in content:
    if p.match(line): #for English (no freq counts)
        rootlist.append(p.match(line).group(1))


# Load BERT tokenizer and model
del rootlist[0]
rootlist.insert(0, 'ate')
tokenizer_bert = BertTokenizer.from_pretrained("bert-large-uncased")
model_bert = BertForMaskedLM.from_pretrained("bert-large-uncased")

# List of verbs

# Check which verbs are not tokenized as single tokens by BERT
non_single_token_verbs_bert = get_non_single_token_verbs_verbose(model_bert, tokenizer_bert, rootlist)

print("BERT - Verbs not tokenized as single tokens:")
print(non_single_token_verbs_bert)


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Verb: ate -> Tokens: ['ate']
Verb: admitted -> Tokens: ['admitted']
Verb: approached -> Tokens: ['approached']
Verb: arrived -> Tokens: ['arrived']
Verb: bashed -> Tokens: ['bash', '##ed']
Verb: bellowed -> Tokens: ['bellowed']
Verb: broke -> Tokens: ['broke']
Verb: cleaned -> Tokens: ['cleaned']
Verb: cleared -> Tokens: ['cleared']
Verb: came -> Tokens: ['came']
Verb: covered -> Tokens: ['covered']
Verb: danced -> Tokens: ['danced']
Verb: declared -> Tokens: ['declared']
Verb: destroyed -> Tokens: ['destroyed']
Verb: devoured -> Tokens: ['dev', '##oured']
Verb: died -> Tokens: ['died']
Verb: emptied -> Tokens: ['emptied']
Verb: entered -> Tokens: ['entered']
Verb: fainted -> Tokens: ['faint', '##ed']
Verb: fell -> Tokens: ['fell']
Verb: filled -> Tokens: ['filled']
Verb: fluttered -> Tokens: ['fluttered']
Verb: froze -> Tokens: ['froze']
Verb: went -> Tokens: ['went']
Verb: hit -> Tokens: ['hit']
Verb: increased -> Tokens: ['increased']
Verb: jogged -> Tokens: ['jogged']
Verb: jumped 

In [21]:
from transformers import BertTokenizer, BertForMaskedLM
import re
import pandas as pd
import pandas as pd
df = pd.read_csv("C:/Users/pcass/dl_manres/data/manres_verbs.csv")
rootlist = list(df['verb'])
# fname = "C:/Users/pcass/dl_manres/data/manres_verbs.csv"
# fh = open(fname, "r")
# content = fh.readlines()
# rootlist = []
# p = re.compile("^(.*?),")
# for line in content:
#     if p.match(line): #for English (no freq counts)
#         rootlist.append(p.match(line).group(1))

#print(rootlist)
# Load BERT tokenizer and model
#del rootlist[0]
#rootlist.insert(0, 'ate')
tokenizer_bert = BertTokenizer.from_pretrained("bert-large-uncased")
model_bert = BertForMaskedLM.from_pretrained("bert-large-uncased")

# List of verbs

# Check which verbs are not tokenized as single tokens by BERT
non_single_token_verbs_bert = get_non_single_token_verbs_verbose(model_bert, tokenizer_bert, rootlist)

print("BERT - Verbs not tokenized as single tokens:")
print(non_single_token_verbs_bert)

Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Verb: ate -> Tokens: ['ate']
Verb: admitted -> Tokens: ['admitted']
Verb: approached -> Tokens: ['approached']
Verb: arrived -> Tokens: ['arrived']
Verb: bellowed -> Tokens: ['bellowed']
Verb: broke -> Tokens: ['broke']
Verb: cleaned -> Tokens: ['cleaned']
Verb: crawled -> Tokens: ['crawled']
Verb: cleared -> Tokens: ['cleared']
Verb: came -> Tokens: ['came']
Verb: covered -> Tokens: ['covered']
Verb: danced -> Tokens: ['danced']
Verb: declared -> Tokens: ['declared']
Verb: destroyed -> Tokens: ['destroyed']
Verb: died -> Tokens: ['died']
Verb: emptied -> Tokens: ['emptied']
Verb: entered -> Tokens: ['entered']
Verb: fell -> Tokens: ['fell']
Verb: filled -> Tokens: ['filled']
Verb: fluttered -> Tokens: ['fluttered']
Verb: froze -> Tokens: ['froze']
Verb: went -> Tokens: ['went']
Verb: hopped -> Tokens: ['hopped']
Verb: hit -> Tokens: ['hit']
Verb: increased -> Tokens: ['increased']
Verb: jogged -> Tokens: ['jogged']
Verb: jumped -> Tokens: ['jumped']
Verb: killed -> Tokens: ['killed']


### Identifying word senses

In [7]:
s = 'break.v.1'
print(wn.synset(s).definition())
print(wn.synset(s).examples())

terminate
['She interrupted her pregnancy', 'break a lucky streak', 'break the cycle of poverty']


In [79]:
import nltk
#nltk.download('wordnet')
from nltk.corpus import wordnet as wn

verb = wn.synsets('yell', pos=wn.VERB)  # Get all verb senses of 'break'
for sense in verb:
    #if 'break' in sense.name():
     print(f"{sense.name()}: {sense.definition()}")
     print(f"Examples: {sense.examples()}\n")


shout.v.02: utter a sudden loud cry
Examples: ['she cried with pain when the doctor inserted the needle', "I yelled to her from the window but she couldn't hear me"]

yell.v.02: utter or declare in a very loud voice
Examples: ["You don't have to yell--I can hear you just fine"]



In [26]:
eat = wn.lemma('eat.v.0.eat')
print(eat.key())
wn.lemma_from_key(eat.key())

eat%2:37:00::


Lemma('eat.v.04.eat')

In [3]:
import nltk
#nltk.download('wordnet')
from nltk.corpus import wordnet as wn

# selected senses
target_senses = ['break.v.02', 'break.v.03', 'break.v.04', 'break.v.05', 'break.v.01', 'break.v.15']

# Retrieve all verb senses for "break"
verb_senses = wn.synsets('break', pos=wn.VERB)

# Iterate through the senses and match with the target senses
for sense in verb_senses:
    if sense.name() in target_senses:
        print(f"Sense: {sense.name()} - {sense.definition()}")
        print("Examples:")
        for example in sense.examples():
            print(f"  - {example}")
        print("\n")


Sense: break.v.02 - become separated into pieces or fragments
Examples:
  - The figurine broke
  - The freshly baked loaf fell apart


Sense: break.v.03 - render inoperable or ineffective
Examples:
  - You broke the alarm clock when you took it apart!


Sense: break.v.04 - ruin completely
Examples:
  - He busted my radio!


Sense: break.v.05 - destroy the integrity of; usually by force; cause to separate into pieces or fragments
Examples:
  - He broke the glass plate
  - She broke the match




### Create mini-corpus

In [124]:
import re

def find_sentences_with_sense(file_path, target_word, target_wnsns, pos_tag='VB'):
    """
    Search a SemCor file for sentences containing a specific word, part of speech, and WordNet sense.

    Args:
        file_path (str): Path to the SemCor file.
        target_word (str): The token to search for (e.g., "broke").
        pos_tag (str): The part of speech tag (e.g., "VB" for verbs).
        target_wnsns (list): The target WordNet senses number (e.g., "1" for sense 01).
    
    Returns:
        list: A list of sentences (str) containing the target word with the specified POS and sense.
    """
    sentences = []
    with open(file_path, 'r', encoding='utf-8') as file:
        current_sentence = ""
        inside_sentence = False
        for line in file:
            # Check if this is the start of a sentence
            if re.match(r"<s\s+snum=\d+>", line):
                inside_sentence = True
                current_sentence = ""

            # Check if this is the end of a sentence
            elif "</s>" in line:
                inside_sentence = False
                if current_sentence:
                    sentences.append(current_sentence.strip())
                current_sentence = ""

            # If inside a sentence, add the line to the current sentence
            elif inside_sentence:
                current_sentence += line.strip() + " "

    # Filter sentences containing the target word, pos, and sense
    matching_sentences = []
    for sentence in sentences:
        # Use a regex to find words matching the target criteria
        senses_pattern = "|".join(map(re.escape, target_wnsns))
        pattern = re.compile(
            rf'<wf[^>]*pos={pos_tag}[^>]*lemma={target_word}[^>]*wnsn=({senses_pattern})[^>]*>[^<]+</wf>'
        )
        if pattern.search(sentence):
            # Remove XML tags to extract the plain text sentence
            plain_sentence = re.sub(r"<[^>]+>", "", sentence)
            plain_sentence = plain_sentence.replace("_", " ")
            matching_sentences.append(plain_sentence.strip())
    
    return matching_sentences

# Example usage
file_path = "../data/corpus/semcor3.0/brown1/tagfiles/br-a01"
target_word = "say"
pos_tag = "VB"
target_wnsns = ['1','2','3']

sentences = find_sentences_with_sense(file_path, target_word, target_wnsns, pos_tag)
for i, sent in enumerate(sentences):
    print(f"Sentence {i + 1}: {sent}")


Sentence 1: The Fulton County Grand Jury said Friday an investigation of Atlanta 's recent primary election produced `` no evidence '' that any irregularities took place .
Sentence 2: The jury further said in term end presentments that the City Executive Committee , which had over-all charge of the election , `` deserves the praise and thanks of the City of Atlanta '' for the manner in which the election was conducted .
Sentence 3: `` Only a relative handful of such reports was received '' , the jury said , `` considering the widespread interest in the election , the number of voters and the size of this city '' .
Sentence 4: The jury said it did find that many of Georgia 's registration and election laws `` are outmoded or inadequate and often ambiguous '' .
Sentence 5: The grand jury commented on a number of other topics , among them the Atlanta and Fulton County purchasing departments which it said `` are well operated and follow generally accepted practices which inure to the best 

In [128]:
import os

def gather_sentences_with_sense(base_dir, target_word, target_wnsns,  pos_tag='VB'):
    """
    Gathers all sentences containing a specific word with a specific WordNet sense
    across all files in the semcor3.0 directory.

    Args:
        base_dir (str): Path to the semcor3.0 directory.
        pos_tag (str): Part-of-speech tag (e.g., "VB").
        target_wnsn (str): Target WordNet sense number (e.g., "1").
        target_word (str): The exact token to search for (e.g., "broke").

    Returns:
        List[str]: A list of all sentences that match the criteria.
    """
    all_sentences = []

    # Traverse the subdirectories (brown1, brown2, brownv)
    for folder in ["brown1", "brown2", "brownv"]:
        tagfiles_dir = os.path.join(base_dir, folder, "tagfiles")

        # Ensure the tagfiles directory exists
        if not os.path.exists(tagfiles_dir):
            print(f"Directory not found: {tagfiles_dir}")
            continue

        # Iterate through all files in the tagfiles directory
        for filename in os.listdir(tagfiles_dir):
            file_path = os.path.normpath(os.path.join(tagfiles_dir, filename))

            # Skip if not a file
            if not os.path.isfile(file_path):
                continue

            # Call the find_sentences_with_sense function and extend the results
            try:
                sentences = find_sentences_with_sense(file_path, target_word, target_wnsns, pos_tag)
                all_sentences.extend(sentences)
            except Exception as e:
                print(f"Error processing file {file_path}: {e}")

    return all_sentences


# Base directory containing the semcor3.0 corpus
base_dir = "../data/corpus/semcor3.0"

# Parameters for the search
pos_tag = "VB"        # Part-of-speech tag (e.g., verb)
target_wnsns = ['1','2','3']     # WordNet sense number
target_word = "break" # The exact word token to match

# Call the function to gather all matching sentences
#find_sentences_with_sense(file_path, target_word, pos_tag, target_wnsn)
sentences = gather_sentences_with_sense(base_dir, target_word, target_wnsns)

# Output the number of sentences found and some examples
print(f"Found {len(sentences)} sentences.")
for sentence in sentences[:5]:
    print(sentence)


Found 113 sentences.
If the Orioles are to break their losing streak within the next two days , it will have to be at the expense of the American League champion New York Yankees , who come in here tomorrow for a night game and a single test Sunday afternoon .
He hopes to melt off an additional eight pounds before the Flock breaks camp three weeks hence .
The crowd at the twenty-first annual K. of C. Games , final indoor meet of the season , got a thrill a few minutes earlier when a slender , bespectacled woman broke the one week old world record in the half-mile run .
He broke that boy ( Air Force fullback Nick Arshinkoff ) in two and knocked him loose from the football '' .
`` He timed it just right and broke through there before the boy ( halfback Terry Isaacson ) had time to turn around .


In [137]:
import pandas as pd
def get_examples_from_file(filepath):

    df_senses = pd.read_csv(filepath, dtype='str')
    exs_from_verb = {}
    
    for i, row in df_senses.iterrows():
        verb = row['verb']
        senses = list(row[['sense_1', 'sense_2', 'sense_3', 'sense_4', 'sense_5']].dropna())
        sentences = gather_sentences_with_sense(base_dir="../data/corpus/semcor3.0", target_word=verb, target_wnsns=senses)
        exs_from_verb[verb] = sentences

    
    df_examples = pd.DataFrame.from_dict(exs_from_verb, orient='index')
    
    max_columns = df_examples.shape[1]
    column_names = ["example_" + str(i) for i in range(max_columns)]

    df_examples
    df_examples.fillna("", inplace=True)

    
    df_examples.columns = column_names

    df_examples.to_csv("../data/diagnostics/example_usage/manres_examples.csv", index=True)

    return df_examples


df_examples = get_examples_from_file("../data/manres_senses.csv")
    

In [142]:
df_examples.iloc[:, 1:].apply(lambda row: row.astype(bool).sum(), axis=1)

eat           73
admit         55
approach      44
arrive        70
bellow         4
break         67
clean         34
crawl         25
clear         28
come         642
cover         56
dance         28
declare       61
destroy       67
die          124
empty          9
enter        145
fall         106
fill          77
flutter        0
freeze        14
go           627
hop          100
hit           43
increase     135
jog            0
jump          23
kill          81
laugh         53
melt           5
near          10
open          82
pour          25
proclaim      24
propose       47
remove        95
rise          72
roll          22
rub           15
run          175
say         1888
scream        19
shatter       10
scrub          4
shake         65
shout         48
spin          13
sweep         20
swim          12
walk         180
whisper       18
wipe          20
yell          16
dtype: int64

In [148]:
df = pd.read_csv('../data/corpus/desert_island_discs_corpus/empty_concordance_desert_island_discs_20250128190117.csv')

def combine_text_columns(df):
    """
    Combine the 'Left', 'KWIC', and 'Right' columns of a DataFrame into a new column 'Full_Sentence'.

    Args:
        df (pd.DataFrame): The DataFrame with 'Left', 'KWIC', and 'Right' columns.

    Returns:
        pd.DataFrame: The original DataFrame with a new column 'Full_Sentence' added.
    """
    # Combine the columns with a space in between
    df['Example'] = df['Left'].str.strip() + " " + df['KWIC'].str.strip() + " " + df['Right'].str.strip()
    return df

# Combine text into the new column
#df = combine_text_columns(df)

# Display the DataFrame
print(df)


    Reference                                               Left      KWIC  \
0     text#86  , and the highlight of our act was a handbag c...     empty   
1    text#141  are very bad, and it was considered more appro...  emptying   
2    text#223  a certain difficulty, yes. In fact, I''m... of...  emptying   
3    text#496  . Oh, yes, there was this guy from the Salvati...     empty   
4    text#508  in the left eye. It''s very complicated. I mea...     empty   
5    text#534  merchant one of those small miniature bottles ...   emptied   
6    text#566  , the lot. So why did you tell them to... beca...   emptied   
7    text#566  to... because I was really excited about havin...   emptied   
8    text#573  rich girls. And they''re there on the weekends...  emptying   
9    text#584  aeroplane out of graphene composites... and th...     empty   
10   text#630  an enemy plane once, didn''t you? I pursued an...   emptied   
11   text#682  president. I take the number six bus to the Le...