In [1]:
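# Working version: writes one token per line with these tab-separated features:
# 1. token
# 2. lemma
# 3. POS tag
# 4. dependency relation to head
# 5. head
#    - lemma of head (optional; placed right after the head column)
#    - POS tag of head (optional; placed after the head columns)
# 6. dependents
# 7. number of dependents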


import spacy
from nltk.corpus import brown


# English spaCy pipeline; supplies the lemma, POS tag and dependency parse
# written to the feature file
nlp = spacy.load("en_core_web_sm")

# Sentences of Brown Corpus file 'ca01', each one a sequence of token strings
testing_text = brown.sents(fileids=["ca01"])

# Destination of the tab-separated CoNLL-style feature file
output_file_path = "7featues.conll"



# Optional head-word columns. With both flags off the file has the 7-feature
# layout; turning them on reproduces the 8- and 9-column variants that used to
# live in commented-out write() calls.
INCLUDE_HEAD_LEMMA = False
INCLUDE_HEAD_POS = False


def token_features(token, include_head_lemma=False, include_head_pos=False):
    """Return the feature columns for one parsed token.

    Column order: token text, lemma, POS tag, dependency relation to the head,
    head text, [head lemma], [head POS], dependents, number of dependents.
    The bracketed columns are only emitted when the matching flag is true.

    Args:
        token: object exposing ``text``, ``lemma_``, ``pos_``, ``dep_``,
            ``head`` and ``children`` (a spaCy token in this notebook).
        include_head_lemma: if true, add the head's lemma after the head text.
        include_head_pos: if true, add the head's POS tag after the head
            text (and after the head lemma, when that is also enabled).

    Returns:
        list[str]: column values in output order, ready to be tab-joined.
    """
    # Walk the children a single time and reuse the result for both the
    # comma-separated dependents string and the dependents count.
    child_texts = [child.text for child in token.children]

    columns = [token.text, token.lemma_, token.pos_, token.dep_, token.head.text]
    if include_head_lemma:
        columns.append(token.head.lemma_)
    if include_head_pos:
        columns.append(token.head.pos_)

    # NOTE(review): a token without children yields an empty dependents column;
    # readers that split on arbitrary whitespace instead of tabs would see the
    # columns shift. Kept as-is so existing output does not change.
    columns.append(', '.join(child_texts))
    columns.append(str(len(child_texts)))
    return columns


# Write one tab-separated row per token and a blank line after every sentence
with open(output_file_path, 'w', encoding='utf-8') as output_file:
    for sentence in testing_text:
        # Each Brown sentence is a sequence of tokens; join it into one string
        # for spaCy.
        # NOTE(review): spaCy tokenizes this string again, so rows may not line
        # up one-to-one with the original Brown tokens -- confirm this is intended.
        sentence_text = ' '.join(sentence)
        doc = nlp(sentence_text)

        for token in doc:
            row = token_features(
                token,
                include_head_lemma=INCLUDE_HEAD_LEMMA,
                include_head_pos=INCLUDE_HEAD_POS,
            )
            output_file.write('\t'.join(row) + '\n')

        # Blank line marks the sentence boundary
        output_file.write('\n')
        




In [2]:
# import spacy
# from nltk.corpus import brown
from spacy import displacy

# # Load spaCy English language model
# nlp = spacy.load('en_core_web_sm')

# # Get the text from the Brown Corpus
# testing_text = brown.sents(fileids=['ca01'])





# Print the dependency tree to double-check that the dependents in our output CoNLL file are correct

# Parse the first Brown sentence again so its dependency tree can be inspected
first_sentence_tokens = testing_text[0]
sentence_text = ' '.join(first_sentence_tokens)
doc = nlp(sentence_text)

# Draw the dependency arcs with displaCy; the options dict is passed through
# unchanged.
# NOTE(review): when run inside a notebook displaCy presumably displays the
# figure itself, in which case `html_code` may be None rather than an HTML
# string -- confirm, and pass jupyter=False if the markup itself is needed.
render_options = {'distance': 100}
html_code = displacy.render(doc, style='dep', options=render_options)
