<img align="left" src="https://upload.wikimedia.org/wikipedia/commons/e/e1/CC_BY_icon.svg"><br />

Created by Lorenzo Babini and under [Creative Commons CC BY License](https://creativecommons.org/licenses/by/4.0/)<br />
For questions/comments/improvements, email lorenzo.babini@unicatt.it.<br />

### Intro

##### Opening and printing a text file

In [None]:
# Location of the text file to analyze; replace the placeholder with your own file.
text_name = "/text.txt" # insert the path of the text file

In [None]:
# Load the whole file as UTF-8 and collapse each pair of consecutive
# newlines into a single one before showing the result.
with open(text_name, mode='r', encoding='utf-8') as source_file:
    text = source_file.read().replace('\n\n', '\n')

print(text)

# spaCy

### Lemmatizer & POS Tagger

##### Importing and setting up spaCy

spaCy models (like "it_core_news_sm") have to be downloaded beforehand, following these instructions: https://spacy.io/usage/models.

You can also find and download other models from Hugging Face. In this case you have to indicate the path of the downloaded model.

In [None]:
import spacy
# Load the "it_core_news_sm" pipeline; the model must already be installed
# (see the download instructions above), otherwise loading will fail.
nlp = spacy.load('it_core_news_sm')

In [None]:
# Run the loaded spaCy pipeline over the whole text; the resulting document
# is iterated token by token in the following cells.
my_doc = nlp(text)

##### Printing token, lemma and POS tag in two different ways

In [None]:
# Header row, followed by one "token | lemma | POS" row per token.
print(f"{'TOKEN':<16} LEMMA {'POS TAG':>15}")
for tok in my_doc:
    surface = str(tok)
    # Tokens that are just a line break carry no information: skip them.
    if surface == '\n':
        continue
    lemma_col = str(tok.lemma_).rjust(10)
    pos_col = str(tok.pos_).rjust(10)
    print(f"{surface:<10} | {lemma_col} | {pos_col}")

In [None]:
# Same information as the table, as tab-separated labelled fields.
for tok in my_doc:
    # Tokens that are just a line break are not printed.
    if str(tok) == '\n':
        continue
    fields = (
        'word: ' + str(tok).ljust(10),
        'lemma: ' + str(tok.lemma_.rjust(10)),
        'pos tag: ' + str(tok.pos_),
    )
    print('\t'.join(fields))

##### Creating and saving a CSV with structured data (token, lemma, POS)

In [None]:
import csv

# Export one CSV row per alphabetic token: surface form, lemma, POS tag.
# The `with` block guarantees the file is flushed and closed when the cell
# finishes; a bare open() leaves the handle open, so rows can stay buffered
# and the file on disk may be incomplete.
# UTF-8 is set explicitly so the output matches the encoding used to read
# the input text, regardless of the platform default.
with open("/spacy_data.csv", "w", newline="", encoding="utf-8") as csv_file:  # insert the path of the CSV file to save
    writer = csv.writer(csv_file)
    # Keep only purely alphabetic tokens (drops punctuation, numbers, whitespace).
    writer.writerows(
        [token.text, token.lemma_, token.pos_]
        for token in my_doc
        if token.text.isalpha()
    )

### NER

##### Importing and setting up spaCy (see above)

In [None]:
# Using spacy.load().
# The empty string is a placeholder: fill in the name of an installed model
# or the path of a downloaded one before running this cell.
import spacy
nlp_800 = spacy.load("") ## insert model name or path

In [None]:
# Process the same text with the second pipeline; its entities are read
# from `my_other_doc.ents` in the following cells.
my_other_doc = nlp_800(text)

##### Printing entities

In [None]:
# One line per recognized entity: its text and its label, separated by " | ".
for entity in my_other_doc.ents:
    print(f'{entity.text} | {entity.label_}')

##### Creating and saving a CSV with entities (entity, entity label)

In [None]:
import csv

# Export one CSV row per entity: entity text and entity label.
# The `with` block guarantees the file is flushed and closed when the cell
# finishes; a bare open() leaves the handle open, so rows can stay buffered
# and the file on disk may be incomplete.
# UTF-8 is set explicitly so the output matches the encoding used to read
# the input text, regardless of the platform default.
with open("/spacy_data_ent.csv", "w", newline="", encoding="utf-8") as csv_file:  # insert the path of the CSV file to save
    writer = csv.writer(csv_file)
    writer.writerows([ent.text, ent.label_] for ent in my_other_doc.ents)

# Stanza

For more details, see: https://stanfordnlp.github.io/stanza/usage.html

##### Importing and setting up Stanza

In [None]:
import stanza

In [None]:
# Build a Stanza pipeline for Italian (lang='it') with the processors listed
# in the string: tokenize, mwt, pos, lemma and ner.
# NOTE(review): the Italian models presumably need to be downloaded first — confirm.
nlp_stanza = stanza.Pipeline(lang='it', processors='tokenize,mwt,pos,lemma,ner')

In [None]:
# Annotate the text with the Stanza pipeline; sentences, words and entities
# are read from this document in the following cells.
my_second_doc = nlp_stanza(text)

##### Printing token, lemma, POS tag and grammar features

In [None]:
# Collect one tab-separated line per word (text, lemma, UPOS, features),
# then print them all at once, one per line.
word_lines = []
for sentence in my_second_doc.sentences:
    for word in sentence.words:
        # Words without morphological features are shown with "_".
        features = word.feats or "_"
        word_lines.append(
            f'word: {word.text.ljust(10)}\tlemma: {word.lemma.rjust(10)}\tpos: {word.upos}\tfeats: {features}'
        )
print('\n'.join(word_lines))

##### Creating and saving a CSV with structured data (token, lemma, POS, grammar features)

In [None]:
import csv

# Export one CSV row per word: text, lemma, UPOS tag and grammar features.
# The `with` block guarantees the file is flushed and closed when the cell
# finishes; a bare open() leaves the handle open, so rows can stay buffered
# and the file on disk may be incomplete.
# UTF-8 is set explicitly so the output matches the encoding used to read
# the input text, regardless of the platform default.
with open("/stanza_data.csv", "w", newline="", encoding="utf-8") as csv_file:  # insert the path of the CSV file to save
    writer = csv.writer(csv_file)
    for sent in my_second_doc.sentences:
        for word in sent.words:
            # Words without morphological features get the "_" placeholder.
            writer.writerow([word.text, word.lemma, word.upos, word.feats or "_"])

##### Printing entities and type

In [None]:
# Collect one line per entity (padded text and type), then print them all
# at once, one per line.
entity_lines = []
for entity in my_second_doc.ents:
    padded_text = entity.text.ljust(9)
    entity_lines.append(f'entity: {padded_text}\ttype: {entity.type}')
print('\n'.join(entity_lines))

##### Creating and saving a CSV with entities (entity, entity type)

In [None]:
import csv

# Export one CSV row per entity: entity text and entity type.
# The `with` block guarantees the file is flushed and closed when the cell
# finishes; a bare open() leaves the handle open, so rows can stay buffered
# and the file on disk may be incomplete.
# UTF-8 is set explicitly so the output matches the encoding used to read
# the input text, regardless of the platform default.
with open("/stanza_data_ent.csv", "w", newline="", encoding="utf-8") as csv_file:  # insert the path of the CSV file to save
    writer = csv.writer(csv_file)
    writer.writerows([ent.text, ent.type] for ent in my_second_doc.ents)