#### Advanced NLP Assignment 1 - Feature Extraction Code

In [1]:
# Imports
import spacy, benepar
import pandas as pd
from nltk.corpus import brown

In [2]:
# Load the small English spaCy pipeline
# NOTE(review): presumably the 'benepar_en3' model and the NLTK Brown corpus must be
# downloaded beforehand for this cell to run on a fresh machine — confirm setup steps
nlp = spacy.load('en_core_web_sm')
# Append the benepar component (model 'benepar_en3') to the spaCy pipeline
nlp.add_pipe('benepar', config={'model': 'benepar_en3'})

# Get the sentences of Brown Corpus file 'ca01'; these are the input to the feature extraction
testing_text = brown.sents(fileids=['ca01'])

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [3]:
# Code block written by Furong

# Extract the following features for every token:
# 1. Token
# 2. Lemma
# 3. PoS tag
# 4. Dependency relation to head
# 5. Head
# 6. Dependents
# 7. Number of dependents

# One feature dictionary per token; each dictionary becomes one row of the DataFrame
features_list = []

# Iterate over all sentences and perform feature extraction
for sentence in testing_text:
    # Combine the tokens into a single string so that spaCy can process the sentence
    sentence_text = ' '.join(sentence)

    # Run the spaCy pipeline on the sentence
    doc = nlp(sentence_text)

    # Build one feature dictionary per spaCy token
    for token in doc:
        # token.children is a generator, so materialise it once and reuse the list
        # for both the dependents string and the dependents count
        children = list(token.children)

        # Dependents as a comma-separated string (empty string when there are none)
        dependents = ', '.join(child.text for child in children)
        num_dependents = len(children)

        # Features of the current token, keyed by the DataFrame column name
        token_features = {
            'Token': token.text,
            'Lemma': token.lemma_,
            'POS': token.pos_,
            'Dependency Relation': token.dep_,
            'Head_Text': token.head.text,
            'Dependents': dependents,
            'Num_Dependents': num_dependents
        }

        # Append the token features to the list
        features_list.append(token_features)

        # Optional extra features that are currently not extracted:
        #   - lemma of the head word: token.head.lemma_
        #   - POS of the head word:   token.head.pos_

# Create a DataFrame from the list of dictionaries
df = pd.DataFrame(features_list)


# Display the DataFrame
display(df)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Unnamed: 0,Token,Lemma,POS,Dependency Relation,Head_Text,Dependents,Num_Dependents
0,The,the,DET,det,Jury,,0
1,Fulton,Fulton,PROPN,compound,County,,0
2,County,County,PROPN,compound,Jury,Fulton,1
3,Grand,Grand,PROPN,compound,Jury,,0
4,Jury,Jury,PROPN,nsubj,said,"The, County, Grand",3
...,...,...,...,...,...,...,...
2321,bit,bit,NOUN,attr,was,"a, of",2
2322,of,of,ADP,prep,bit,trouble,1
2323,trouble,trouble,NOUN,pobj,of,,0
2324,'','',PUNCT,punct,was,,0


In [4]:
# Sanity check for the feature extraction above

from spacy import displacy

# Draw the dependency tree to double-check that the dependents listed in the feature table are correct

# Process the first sentence with spaCy
sentence_text = ' '.join(testing_text[0])
doc = nlp(sentence_text)

# Visualize the dependency tree using displacy
# NOTE(review): inside Jupyter, displacy.render appears to display the figure itself,
# in which case html_code may be None rather than an HTML string — confirm before reusing it
html_code = displacy.render(doc, style='dep', options={'distance': 100})



In [5]:
# Code block written by Nur

def generate_ngrams(df):
    """Add token and POS n-gram columns to a token-level feature DataFrame.

    Rows are treated as one continuous sequence in their current order: adjacent rows are
    combined regardless of any sentence boundaries, because the function has no sentence
    information. Rows are addressed by position, so the index of ``df`` may be arbitrary.

    The following columns are added (or overwritten), each holding a tuple or None:
      - Forward_Bigram / Forward_Trigram: n-gram of 'Token' values starting at the row
      - Backward_Bigram / Backward_Trigram: n-gram of 'Token' values ending at the row
      - Forward_POS_Trigram / Backward_POS_Trigram: the same trigrams over 'POS' values
    A cell is None when the n-gram would run past the first or last row; this also makes
    frames with fewer rows than the n-gram size (including empty frames) work.

    :param df: DataFrame with a 'Token' and a 'POS' column; it is modified in place
    :return: the same DataFrame object, with the n-gram columns added
    """
    # Pull the columns out once as plain lists; the n-grams are then built positionally
    tokens = df['Token'].tolist()
    pos_tags = df['POS'].tolist()
    n_rows = len(tokens)

    def forward(items, size):
        """N-gram starting at each row; None when it would run past the last row."""
        return [tuple(items[i:i + size]) if i + size <= n_rows else None
                for i in range(n_rows)]

    def backward(items, size):
        """N-gram ending at each row; None when it would start before the first row."""
        return [tuple(items[i - size + 1:i + 1]) if i + 1 >= size else None
                for i in range(n_rows)]

    # Every generated list has exactly one entry per row, so assignment never mismatches
    df['Forward_Bigram'] = forward(tokens, 2)
    df['Backward_Bigram'] = backward(tokens, 2)
    df['Forward_Trigram'] = forward(tokens, 3)
    df['Backward_Trigram'] = backward(tokens, 3)
    df['Forward_POS_Trigram'] = forward(pos_tags, 3)
    df['Backward_POS_Trigram'] = backward(pos_tags, 3)

    return df

# Add the n-gram columns to the feature table; generate_ngrams writes the new columns
# into df itself and returns that same frame
df = generate_ngrams(df)
display(df)

Unnamed: 0,Token,Lemma,POS,Dependency Relation,Head_Text,Dependents,Num_Dependents,Forward_Bigram,Backward_Bigram,Forward_Trigram,Backward_Trigram,Forward_POS_Trigram,Backward_POS_Trigram
0,The,the,DET,det,Jury,,0,"(The, Fulton)",,"(The, Fulton, County)",,"(DET, PROPN, PROPN)",
1,Fulton,Fulton,PROPN,compound,County,,0,"(Fulton, County)","(The, Fulton)","(Fulton, County, Grand)",,"(PROPN, PROPN, PROPN)",
2,County,County,PROPN,compound,Jury,Fulton,1,"(County, Grand)","(Fulton, County)","(County, Grand, Jury)","(The, Fulton, County)","(PROPN, PROPN, PROPN)","(DET, PROPN, PROPN)"
3,Grand,Grand,PROPN,compound,Jury,,0,"(Grand, Jury)","(County, Grand)","(Grand, Jury, said)","(Fulton, County, Grand)","(PROPN, PROPN, VERB)","(PROPN, PROPN, PROPN)"
4,Jury,Jury,PROPN,nsubj,said,"The, County, Grand",3,"(Jury, said)","(Grand, Jury)","(Jury, said, Friday)","(County, Grand, Jury)","(PROPN, VERB, PROPN)","(PROPN, PROPN, PROPN)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2321,bit,bit,NOUN,attr,was,"a, of",2,"(bit, of)","(a, bit)","(bit, of, trouble)","(n't, a, bit)","(NOUN, ADP, NOUN)","(PART, DET, NOUN)"
2322,of,of,ADP,prep,bit,trouble,1,"(of, trouble)","(bit, of)","(of, trouble, '')","(a, bit, of)","(ADP, NOUN, PUNCT)","(DET, NOUN, ADP)"
2323,trouble,trouble,NOUN,pobj,of,,0,"(trouble, '')","(of, trouble)","(trouble, '', .)","(bit, of, trouble)","(NOUN, PUNCT, PUNCT)","(NOUN, ADP, NOUN)"
2324,'','',PUNCT,punct,was,,0,"('', .)","(trouble, '')",,"(of, trouble, '')",,"(ADP, NOUN, PUNCT)"


In [6]:
# Code block written by Martin

def sbar(head):
    """Find a node possibly marked as SBAR.

    NOTE: unfinished stub. The loop over ``head.children`` does nothing yet, so the
    function currently always returns None.

    :param head: node whose ``children`` are inspected (presumably a spaCy Token — TODO confirm)
    :return: always None at present
    """
    # NOTE(review): if head is a spaCy Token, ``children`` is a generator and therefore
    # always truthy, so this guard would never trigger — confirm
    if not head.children:
        return None
    for node in head.children:
        # TODO: decide whether node is marked as SBAR and return it
        pass

In [7]:
def get_full_constituent(doc, token):
    """Collect the words and POS-tags of the subtree rooted at a token's head.

    The token is looked up in every sentence of ``doc``. For each sentence containing it,
    every member of the subtree of the token's head is recorded, in subtree order.
    NOTE(review): despite the name, the members come from ``head`` / ``subtree`` and not
    from a constituency-specific attribute — confirm this is the intended notion of constituent.

    :param doc: the spaCy Doc object
    :param token: the token (Word) object in the spaCy Doc
    :return: two lists, one with the lower-cased text of every member (str), and one with
        the POS-tag of every member (str); both are empty when the token is not found
    """
    member_texts, member_tags = [], []

    for sentence in doc.sents:
        # First word of this sentence that equals the requested token, if there is one
        match = next((word for word in sentence if word == token), None)
        if match is None:
            continue

        # Everything below the head of the matched word belongs to the result
        members = list(match.head.subtree)
        member_texts.extend(member.text.lower() for member in members)
        member_tags.extend(member.pos_ for member in members)

    return member_texts, member_tags

In [8]:
# Testing get_full_constituent on a small example (sbar is not exercised here)

text = "The quick brown fox jumps over the lazy dog."

# Process the text with spaCy
doc = nlp(text)

# Choose a token (word) from the processed document
token = doc[0]

# Call the get_full_constituent function
constituent_tokens, constituent_pos = get_full_constituent(doc, token)

# Print the extracted tokens and POS-tags
print("Tokens in the constituent:", constituent_tokens)
print("POS-tags in the constituent:", constituent_pos)

Tokens in the constituent: ['the', 'quick', 'brown', 'fox']
POS-tags in the constituent: ['DET', 'ADJ', 'ADJ', 'NOUN']


