## PART OF SPEECH TAGGING

In [1]:
# spaCy supplies the NLP pipeline used by the cells below.
import spacy

# Load the "en_core_web_sm" pipeline by name.
nlp = spacy.load("en_core_web_sm")

# Sample input made of three short sentences.
text = "NLP is fun. I'm studying NLP. Help me on this."

# Run the pipeline over the text; `doc` is the annotated result the following cells inspect.
doc = nlp(text)

In [2]:
# Walk over the sentence spans found in `doc` and print one per line.
for sentence in doc.sents:
    print(sentence)

NLP is fun.
I'm studying NLP.
Help me on this.


In [3]:
# One line per token: surface text, character offset in `text`, and lemma (tab-separated).
for token in doc:
    print(token.text, token.idx, token.lemma_, sep="\t")

NLP	0	NLP
is	4	be
fun	7	fun
.	10	.
I	12	I
'm	13	be
studying	16	study
NLP	25	NLP
.	28	.
Help	30	help
me	35	I
on	38	on
this	41	this
.	45	.


In [4]:
# POS tagging
# One line per token: text, offset, lemma, coarse POS tag and the tag's description.

for token in doc:
    print(
        token.text,
        token.idx,
        token.lemma_,
        token.pos_,
        spacy.explain(token.pos_),
        sep="\t\t",
    )

NLP		0		NLP		PROPN		proper noun
is		4		be		AUX		auxiliary
fun		7		fun		ADJ		adjective
.		10		.		PUNCT		punctuation
I		12		I		PRON		pronoun
'm		13		be		AUX		auxiliary
studying		16		study		VERB		verb
NLP		25		NLP		PROPN		proper noun
.		28		.		PUNCT		punctuation
Help		30		help		VERB		verb
me		35		I		PRON		pronoun
on		38		on		ADP		adposition
this		41		this		PRON		pronoun
.		45		.		PUNCT		punctuation


In [5]:
# Show every label the pipeline's "tagger" component can assign.
print(nlp.get_pipe("tagger").labels)

('$', "''", ',', '-LRB-', '-RRB-', '.', ':', 'ADD', 'AFX', 'CC', 'CD', 'DT', 'EX', 'FW', 'HYPH', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NFP', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', 'XX', '_SP', '``')


In [6]:
# Look up the human-readable description of one of the tagger labels.
spacy.explain("-LRB-")

'left round bracket'

In [7]:
# Morphological analysis
# Same columns as the POS table, plus each token's morphological features.
for token in doc:
    print(
        token.text,
        token.idx,
        token.lemma_,
        token.pos_,
        spacy.explain(token.pos_),
        token.morph,
        sep="\t\t",
    )

NLP		0		NLP		PROPN		proper noun		Number=Sing
is		4		be		AUX		auxiliary		Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin
fun		7		fun		ADJ		adjective		Degree=Pos
.		10		.		PUNCT		punctuation		PunctType=Peri
I		12		I		PRON		pronoun		Case=Nom|Number=Sing|Person=1|PronType=Prs
'm		13		be		AUX		auxiliary		Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin
studying		16		study		VERB		verb		Aspect=Prog|Tense=Pres|VerbForm=Part
NLP		25		NLP		PROPN		proper noun		Number=Sing
.		28		.		PUNCT		punctuation		PunctType=Peri
Help		30		help		VERB		verb		VerbForm=Inf
me		35		I		PRON		pronoun		Case=Acc|Number=Sing|Person=1|PronType=Prs
on		38		on		ADP		adposition		
this		41		this		PRON		pronoun		Number=Sing|PronType=Dem
.		45		.		PUNCT		punctuation		PunctType=Peri


## NAMED ENTITY RECOGNITION

In [9]:
import spacy

# Load the pipeline and annotate the sample text for the entity examples.
nlp = spacy.load("en_core_web_sm")

text = "NLP is fun. I'm studying NLP. Help me on this."

doc = nlp(text)

# One line per named entity: its text and its label, separated by a tab.
for ent in doc.ents:
    print(ent.text, ent.label_, sep="\t")

NLP	ORG
NLP	ORG


In [14]:
# Look up what the "ORG" entity label stands for.
spacy.explain("ORG")

'Companies, agencies, institutions, etc.'

In [15]:
# Visualizing the recognised entities inline in the notebook.

from spacy import displacy

# "distance" is an option of the dependency ("dep") visualizer and has no effect on the
# entity ("ent") visualizer, so no options are passed here.
displacy.render(doc, style='ent', jupyter=True)

An example: check whether the date and the place mentioned in a given text are in the expected format.

In [16]:
# desired output : 

# text = "I will visit you on 2024 June at your office."
# output = date and place are in the right format. / date might be in the wrong format.

In [36]:
import spacy

# Load the pipeline and annotate a sentence that mentions a date and a place.
nlp = spacy.load("en_core_web_sm")

text = "I will visit you on 2024-June at Teknopark."

doc = nlp(text)

# One line per named entity: its text and its label, separated by a tab.
for ent in doc.ents:
    print(ent.text, ent.label_, sep="\t")

2024-June	DATE
Teknopark	GPE


In [37]:
# One line per token: text, character offset and coarse POS tag.
for token in doc:
    print(token.text, token.idx, token.pos_, sep="\t\t")

I		0		PRON
will		2		AUX
visit		7		VERB
you		13		PRON
on		17		ADP
2024		20		NUM
-		24		SYM
June		25		PROPN
at		30		ADP
Teknopark		33		PROPN
.		42		PUNCT


In [38]:
# Plain split on single spaces, for comparison with the spaCy tokens: punctuation
# stays attached to the neighbouring word (e.g. the final "Teknopark.").
liste = text.split(" ")
liste

['I', 'will', 'visit', 'you', 'on', '2024-June', 'at', 'Teknopark.']

In [54]:
import re

# Sample string mixing several date spellings.
# NOTE(review): this rebinds the notebook-level `text` used by earlier cells.
text = "June-1974 19.06.2023 monday"
# Replace every character that is neither a word character nor whitespace with a
# space, so "19.06.2023" and "June-1974" fall apart into separate words.
remove_puncts = re.sub(r"[^\w\s]", " ", text)
print(remove_puncts.lower())

june 1974 19 06 2023 monday


In [60]:
# Checkers for every accepted spelling: "mon" counts as "monday", but "mo" does not.

def all_days(word, min_prefix_len=3):
    """Return True if ``word`` looks like a day: a day-of-month number or a weekday name.

    Args:
        word: A single token, e.g. "12", "Monday" or "mon". Matching is
            case-insensitive.
        min_prefix_len: Minimum number of characters a weekday abbreviation
            must have. With the default of 3, "mon" is accepted for "monday"
            while "mo", "m" and the empty string are rejected.

    Returns:
        bool: True for a decimal number between 1 and 31 (inclusive), or for a
        prefix of an English weekday name that is at least ``min_prefix_len``
        characters long; False otherwise.
    """
    word = word.lower()
    days = ("monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday")

    # isdecimal() (unlike isdigit()) only accepts characters int() can parse, so
    # inputs such as a superscript "2" no longer raise ValueError.
    if word.isdecimal():
        return 1 <= int(word) <= 31

    # Very short prefixes (and "") are a prefix of almost anything; reject them.
    if len(word) < min_prefix_len:
        return False

    return any(day.startswith(word) for day in days)

def all_months(word, min_prefix_len=3):
    """Return True if ``word`` is an English month name or an abbreviation of one.

    Args:
        word: A single token, e.g. "June" or "jun". Matching is case-insensitive.
        min_prefix_len: Minimum number of characters an abbreviation must have.
            With the default of 3, "jun" is accepted for "june" while "ju",
            "j" and the empty string are rejected.

    Returns:
        bool: True if ``word`` is a prefix of a month name and at least
        ``min_prefix_len`` characters long; False otherwise.
    """
    word = word.lower()
    months = (
        "january", "february", "march", "april", "may", "june",
        "july", "august", "september", "october", "november", "december",
    )

    # Very short prefixes (and "") are a prefix of almost anything; reject them.
    if len(word) < min_prefix_len:
        return False

    return any(month.startswith(word) for month in months)


In [62]:
import re

def check_date_format(text):
    """Report whether the first DATE entity in ``text`` names both a day and a month.

    The text is parsed with the notebook-level ``nlp`` pipeline. Only the first
    entity labelled "DATE" is examined: its punctuation is replaced by spaces,
    it is lower-cased and split into words, and each word is tested with
    ``all_days`` and ``all_months``.

    Args:
        text: The sentence to inspect.

    Returns:
        str: A message saying the date is in the right format (a day word and a
        month word were both found), that it might be in the wrong format, or
        "No date found." when there is no DATE entity.
    """
    for entity in nlp(text).ents:
        if entity.label_ != "DATE":
            continue

        words = re.sub(r"[^\w\s]", " ", entity.text).lower().split()

        # Evaluate both checks for every word, in order, before combining them.
        checks = [(all_days(word), all_months(word)) for word in words]
        has_day = any(is_day for is_day, _ in checks)
        has_month = any(is_month for _, is_month in checks)

        if has_day and has_month:
            return f"'{entity.text}' is in the right format."
        return f"'{entity.text}' might be in the wrong format."

    return "No date found."

def check_place(text, valid_places=("teknopark", "mugla", "msku")):
    """Report whether the first GPE entity in ``text`` is one of the accepted places.

    The text is parsed with the notebook-level ``nlp`` pipeline. Only the first
    entity labelled "GPE" is examined; the comparison is case-insensitive.

    Args:
        text: The sentence to inspect.
        valid_places: Iterable of accepted place names. Defaults to the three
            places this notebook was written for.

    Returns:
        str: A message saying the place is valid, that it might not be valid,
        or "No place found." when there is no GPE entity.
    """
    # Normalise the accepted names once instead of rebuilding the list per entity.
    accepted = {place.lower() for place in valid_places}

    for ent in nlp(text).ents:
        if ent.label_ != "GPE":
            continue
        if ent.text.lower() in accepted:
            return f"'{ent.text}' is a valid place."
        return f"'{ent.text}' might not be a valid place."

    return "No place found."

# Demo sentence containing a date and two place names.
text = "I will visit Mugla on June 12, 2023, and then travel to Teknopark."

# Each checker parses the text on its own and reports on the first matching entity only.
print(check_date_format(text))
print(check_place(text))

'June 12, 2023' is in the right format.
'Teknopark' is a valid place.
