## POS and NER

In [6]:
import spacy

In [24]:
# Load English model
nlp = spacy.load("en_core_web_sm")

In [35]:

# Your input sentence
sentence = "Barack Obama was born in Hawaii and served as the 44th President of the United States."

# Process the sentence
doc = nlp(sentence)

# POS tagging
print("Part-of-Speech Tags:")
for token in doc:
    print(f"{token.text} -> {token.pos_} ({token.tag_})")

# Named Entity Recognition
print("\nNamed Entities:")
for ent in doc.ents:
    print(f"{ent.text} -> {ent.label_}")


Part-of-Speech Tags:
Barack -> PROPN (NNP)
Obama -> PROPN (NNP)
was -> AUX (VBD)
born -> VERB (VBN)
in -> ADP (IN)
Hawaii -> PROPN (NNP)
and -> CCONJ (CC)
served -> VERB (VBD)
as -> ADP (IN)
the -> DET (DT)
44th -> ADJ (JJ)
President -> PROPN (NNP)
of -> ADP (IN)
the -> DET (DT)
United -> PROPN (NNP)
States -> PROPN (NNP)
. -> PUNCT (.)

Named Entities:
Barack Obama -> PERSON
Hawaii -> GPE
44th -> ORDINAL
the United States -> GPE


## Topic Segmentation

In [31]:
import nltk
from nltk.tokenize import TextTilingTokenizer

# 🔧 Tell NLTK where to find your offline corpora
nltk.data.path.insert(0, "/home/rohank__iitp/nltk_data")

# 🧾 Sample long text (needs enough content for topic segmentation)
text = """
Artificial Intelligence is transforming industries. It’s used in healthcare for diagnostics and treatment suggestions.
It’s also being used in agriculture to optimize crop yield and monitor soil conditions.

Meanwhile, climate change is becoming more severe. Governments are pushing green policies to curb emissions.
Technological innovations are focusing on renewable energy and carbon capture solutions.

In sports, the Olympics will feature new events like skateboarding and surfing.
Athletes are using AI-based tools to analyze performance and prevent injuries.

Space exploration is also advancing. Private companies are launching missions to the Moon and Mars.
There’s a renewed interest in deep space communication and building space habitats.
"""

# text = """
# I am looking for a comprehensive policy for my Tesla car with roadside assistance.

# This policy should also cover long-distance driving and emergency towing.

# Electric vehicle insurance should include roadside assistance and battery failure response.
# """

# 🧠 Create the tokenizer — with relaxed settings to handle short input
tt = TextTilingTokenizer(w=20, k=5, smoothing_width=1)

# 🔍 Tokenize into topic segments
segments = tt.tokenize(text)

# 🖨️ Print each segment
for i, segment in enumerate(segments, 1):
    print(f"\n--- Segment {i} ---\n{segment}")



--- Segment 1 ---

Artificial Intelligence is transforming industries. It’s used in healthcare for diagnostics and treatment suggestions.
It’s also being used in agriculture to optimize crop yield and monitor soil conditions.

--- Segment 2 ---


Meanwhile, climate change is becoming more severe. Governments are pushing green policies to curb emissions.
Technological innovations are focusing on renewable energy and carbon capture solutions.

In sports, the Olympics will feature new events like skateboarding and surfing.
Athletes are using AI-based tools to analyze performance and prevent injuries.

Space exploration is also advancing. Private companies are launching missions to the Moon and Mars.
There’s a renewed interest in deep space communication and building space habitats.



## Language Detection

In [48]:
from langdetect import detect, detect_langs
text="Hi, I am a human."
text = "Hi, I am a human. I live on Earth and speak English fluently."
text="Okay, that sounds pretty good. What would the premium be for the comprehensive policy?"
# text = "Ceci est un exemple de texte en français."
print("Detected language:", detect(text))        # e.g., 'fr'
print("Language probabilities:", detect_langs(text))  # e.g., [fr:0.9999969999949514]


Detected language: en
Language probabilities: [en:0.999997542940732]


## Dependency Parsing

In [41]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The quick brown fox jumps over the lazy dog.")

def print_dependencies(token, level=0):
    print("  " * level + f"{token.text} ({token.dep_})")
    for child in token.children:
        print_dependencies(child, level + 1)

# Start from the root (main verb)
for token in doc:
    if token.head == token:  # root
        print_dependencies(token)


jumps (ROOT)
  fox (nsubj)
    The (det)
    quick (amod)
    brown (amod)
  over (prep)
    dog (pobj)
      the (det)
      lazy (amod)
  . (punct)


In [33]:


import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The quick brown fox jumps over the lazy dog.")

# Print dependency relationships
for token in doc:
    print(f"{token.text:10} -> {token.dep_:10} -> {token.head.text}")

# Visualize in browser
displacy.serve(doc, style="dep")


The        -> det        -> fox
quick      -> amod       -> fox
brown      -> amod       -> fox
fox        -> nsubj      -> jumps
jumps      -> ROOT       -> jumps
over       -> prep       -> jumps
the        -> det        -> dog
lazy       -> amod       -> dog
dog        -> pobj       -> over
.          -> punct      -> jumps


ImportError: cannot import name 'display' from 'IPython.core.display' (/home/rohank__iitp/.conda/envs/rohan12/lib/python3.12/site-packages/IPython/core/display.py)

## Relation extraction

In [44]:
import spacy
from spacy.tokens import Token

nlp = spacy.load("en_core_web_sm")

def get_SVO(doc):
    """
    Yield (subject, verb, object) triples from a spaCy Doc.
    """
    for token in doc:
        # skip non‑verbs
        if token.pos_ != "VERB":
            continue

        # subjects: nominal (nsubj) or passive (nsubjpass)
        subjects = [w for w in token.lefts if w.dep_ in ("nsubj", "nsubjpass")]
        if not subjects:
            continue  # no subject, skip

        # objects can be:
        #   • direct object  (dobj)
        #   • object of prep (pobj) reached via a preposition
        #   • attribute or complement (attr) e.g. "is president"
        objects = [w for w in token.rights if w.dep_ == "dobj"]

        # look for prep → pobj
        for prep in (w for w in token.rights if w.dep_ == "prep"):
            objects.extend([w for w in prep.rights if w.dep_ == "pobj"])

        # attribute/complement ("is president")
        objects.extend([w for w in token.rights if w.dep_ == "attr"])

        if subjects and objects:
            for s in subjects:
                for o in objects:
                    yield (s.text, token.lemma_, o.text)

# text = "Barack Obama was born in Hawaii and served as the president of the United States."
text="I am looking for a comprehensive policy for my Tesla car with roadside assistance."

# text="Okay, that sounds good. What about roadside assistance?"
text="Hi, I am interested in getting motor insurance for my bike. I just bought a new 2024 Royal Enfield Classic 350."
doc = nlp(text)

for triple in get_SVO(doc):
    print("Relation:", triple)


Relation: ('I', 'buy', 'Classic')
