<a href="https://colab.research.google.com/github/parthibhan007/NLP_DSA0320/blob/main/Copy_of_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re

def main():
    # Sample text
    text = """
    John Doe: john.doe@example.com
    Jane Smith: jane_smith123@gmail.com
    Sam Brown: sam.brown@yahoo.com
    """

    # Pattern to match email addresses
    email_pattern = r'[\w\.-]+@[\w\.-]+\.\w+'

    # Find all email addresses in the text
    emails = re.findall(email_pattern, text)
    print("Emails found:")
    for email in emails:
        print(email)

    # Pattern to match names followed by email addresses
    name_email_pattern = r'(\w+\s\w+):\s([\w\.-]+@[\w\.-]+\.\w+)'

    # Search and extract names and emails
    matches = re.findall(name_email_pattern, text)
    print("\nNames and Emails found:")
    for name, email in matches:
        print(f"Name: {name}, Email: {email}")

    # Check if a specific email exists in the text
    specific_email = "sam.brown@yahoo.com"
    if re.search(re.escape(specific_email), text):
        print(f"\nThe email {specific_email} exists in the text.")
    else:
        print(f"\nThe email {specific_email} does not exist in the text.")

if __name__ == "__main__":
    main()

Emails found:
john.doe@example.com
jane_smith123@gmail.com
sam.brown@yahoo.com

Names and Emails found:
Name: John Doe, Email: john.doe@example.com
Name: Jane Smith, Email: jane_smith123@gmail.com
Name: Sam Brown, Email: sam.brown@yahoo.com

The email sam.brown@yahoo.com exists in the text.


In [None]:
class FiniteStateAutomaton:
    def __init__(self):
        # Define states and transitions
        self.states = {
            0: {'a': 1},
            1: {'b': 2},
            2: {}  # Accept state, no outgoing transitions needed
        }
        self.start_state = 0
        self.accept_state = 2
        self.current_state = self.start_state

    def reset(self):
        """Reset the automaton to the start state."""
        self.current_state = self.start_state

    def process(self, string):
        """
        Process the input string character by character.
        Return True if the string ends in 'ab', otherwise False.
        """
        self.reset()
        for char in string:
            if char in self.states[self.current_state]:
                self.current_state = self.states[self.current_state][char]
            else:
                self.current_state = self.start_state  # Reset on invalid input

        return self.current_state == self.accept_state


# Example usage
if __name__ == "__main__":
    fsa = FiniteStateAutomaton()

    test_strings = [
        "helloab",   # Ends with 'ab' - should match
        "abab",      # Ends with 'ab' - should match
        "hello",     # Does not end with 'ab' - no match
        "a",         # Does not end with 'ab' - no match
        "b",         # Does not end with 'ab' - no match
        "abcab",     # Ends with 'ab' - should match
    ]

    for string in test_strings:
        result = fsa.process(string)
        print(f"'{string}' -> {'Matched' if result else 'Not Matched'}")

'helloab' -> Matched
'abab' -> Not Matched
'hello' -> Not Matched
'a' -> Not Matched
'b' -> Not Matched
'abcab' -> Matched


In [None]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download required NLTK resources
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')

def morphological_analysis(text):
    # Tokenize the input text
    words = word_tokenize(text)

    # Initialize stemmer and lemmatizer
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    # Perform stemming and lemmatization
    print(f"{'Word':<15}{'Stemmed':<15}{'Lemmatized'}")
    print("-" * 40)
    for word in words:
        stemmed = stemmer.stem(word)
        lemmatized = lemmatizer.lemmatize(word)
        print(f"{word:<15}{stemmed:<15}{lemmatized}")

if __name__ == "__main__":
    # Input text for analysis
    input_text = """
    Running runners ran quickly.
    The foxes are jumping over the fences.
    She studies hard to improve her studying.
    """

    print("Morphological Analysis:")
    morphological_analysis(input_text)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Morphological Analysis:
Word           Stemmed        Lemmatized
----------------------------------------
Running        run            Running
runners        runner         runner
ran            ran            ran
quickly        quickli        quickly
.              .              .
The            the            The
foxes          fox            fox
are            are            are
jumping        jump           jumping
over           over           over
the            the            the
fences         fenc           fence
.              .              .
She            she            She
studies        studi          study
hard           hard           hard
to             to             to
improve        improv         improve
her            her            her
studying       studi          studying
.              .              .


In [None]:
class MorphologicalFSM:
    def __init__(self):
        # Define states and transitions
        self.transitions = {
            "start": self.handle_start,
            "add_s": self.add_s,
            "add_es": self.add_es,
            "replace_y": self.replace_y,
            "accept": self.accept
        }
        self.state = "start"  # Initial state
        self.result = ""  # Stores the transformed word

    def handle_start(self, word):
        """Determine the transition based on the last character(s)."""
        if word.endswith(("s", "x", "z", "ch", "sh")):
            self.state = "add_es"
        elif word.endswith("y") and not word[-2].lower() in "aeiou":
            self.state = "replace_y"
        else:
            self.state = "add_s"

    def add_s(self, word):
        """Add 's' to the word."""
        self.result = word + "s"
        self.state = "accept"

    def add_es(self, word):
        """Add 'es' to the word."""
        self.result = word + "es"
        self.state = "accept"

    def replace_y(self, word):
        """Replace the ending 'y' with 'ies'."""
        self.result = word[:-1] + "ies"
        self.state = "accept"

    def accept(self, word):
        """Accept state; do nothing."""
        pass

    def generate_plural(self, word):
        """Generate the plural form of the given word."""
        self.state = "start"
        self.result = ""
        while self.state != "accept":
            self.transitions[self.state](word)
        return self.result


# Example usage
if __name__ == "__main__":
    fsm = MorphologicalFSM()

    # Test words
    words = [
        "cat",    # Regular plural
        "dog",    # Regular plural
        "bus",    # Ends with 's'
        "box",    # Ends with 'x'
        "buzz",   # Ends with 'z'
        "church", # Ends with 'ch'
        "brush",  # Ends with 'sh'
        "baby",   # Ends with consonant + 'y'
        "toy",    # Ends with vowel + 'y'
    ]

    print("Morphological Parsing (Pluralization):")
    for word in words:
        plural = fsm.generate_plural(word)
        print(f"{word} -> {plural}")

Morphological Parsing (Pluralization):
cat -> cats
dog -> dogs
bus -> buses
box -> boxes
buzz -> buzzes
church -> churches
brush -> brushes
baby -> babies
toy -> toys


In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Download NLTK data if needed (uncomment the next line if required)
# import nltk
# nltk.download('punkt')

def stem_words(words):
    # Initialize the Porter Stemmer
    porter_stemmer = PorterStemmer()

    print(f"{'Word':<15}{'Stemmed'}")
    print("-" * 25)

    # Perform stemming for each word
    for word in words:
        stemmed = porter_stemmer.stem(word)
        print(f"{word:<15}{stemmed}")


if __name__ == "__main__":
    # List of words to stem
    word_list = [
        "running", "jumps", "easily", "studying",
        "flies", "runningly", "connected", "connectivity",
        "happiness", "relational", "demonstration"
    ]

    print("Word Stemming using Porter Stemmer:")
    stem_words(word_list)

Word Stemming using Porter Stemmer:
Word           Stemmed
-------------------------
running        run
jumps          jump
easily         easili
studying       studi
flies          fli
runningly      runningli
connected      connect
connectivity   connect
happiness      happi
relational     relat
demonstration  demonstr


In [None]:
import random
from collections import defaultdict

class BigramModel:
    def __init__(self):
        self.bigram_counts = defaultdict(lambda: defaultdict(int))
        self.start_words = []

    def train(self, text):
        """
        Train the bigram model using input text.
        """
        words = text.split()
        self.start_words.append(words[0])  # Collect potential starting words
        for i in range(len(words) - 1):
            current_word = words[i]
            next_word = words[i + 1]
            self.bigram_counts[current_word][next_word] += 1

    def generate_text(self, length=10):
        """
        Generate text using the trained bigram model.
        """
        # Start with a random word
        current_word = random.choice(self.start_words)
        generated_words = [current_word]

        for _ in range(length - 1):
            next_words = list(self.bigram_counts[current_word].items())
            if not next_words:
                break  # Stop if there are no next words
            next_word = random.choices(
                [word for word, count in next_words],
                [count for word, count in next_words]
            )[0]
            generated_words.append(next_word)
            current_word = next_word

        return ' '.join(generated_words)


if __name__ == "__main__":
    # Example training text
    input_text = """
    The quick brown fox jumps over the lazy dog. The lazy dog barks at the quick fox.
    The fox is clever and the dog is loyal.
    """

    # Initialize and train the bigram model
    model = BigramModel()
    model.train(input_text)

    # Generate text using the trained bigram model
    print("Generated Text:")
    print(model.generate_text(length=15))

Generated Text:
The quick fox. The quick fox. The lazy dog. The quick brown fox is loyal.


In [None]:
import nltk

# Download required NLTK resources (if not already downloaded)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def pos_tag_text(text):
    """
    Performs Part-of-Speech tagging on the input text.

    Args:
        text: The input text as a string.

    Returns:
        A list of tuples, where each tuple contains a word and its POS tag.
    """
    # Tokenize the text into words
    tokens = nltk.word_tokenize(text)

    # Perform POS tagging
    tagged_words = nltk.pos_tag(tokens)

    return tagged_words

if __name__ == "__main__":
    # Example input text
    text = "The quick brown fox jumps over the lazy dog."

    # Perform POS tagging
    tagged_result = pos_tag_text(text)

    # Print the results in a formatted way
    print("POS Tagging Results:")
    for word, tag in tagged_result:
        print(f"{word:<10}{tag}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


LookupError: 
**********************************************************************
  Resource [93maveraged_perceptron_tagger_eng[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('averaged_perceptron_tagger_eng')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtaggers/averaged_perceptron_tagger_eng/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:

import nltk
from collections import defaultdict, Counter
import random

# Download necessary data
nltk.download('brown')
nltk.download('universal_tagset')

class SimplePOSTagger:
    def __init__(self):
        self.transition_probs = defaultdict(Counter)  # Transition probabilities (tag -> next_tag)
        self.emission_probs = defaultdict(Counter)  # Emission probabilities (tag -> word)
        self.tag_counts = Counter()  # Counts of each tag

    def train(self, tagged_sentences):
        """
        Train the tagger using tagged sentences.
        """
        for sentence in tagged_sentences:
            previous_tag = "<START>"
            for word, tag in sentence:
                # Update transition probabilities
                self.transition_probs[previous_tag][tag] += 1

                # Update emission probabilities
                self.emission_probs[tag][word.lower()] += 1

                # Update tag counts
                self.tag_counts[tag] += 1

                # Update previous tag
                previous_tag = tag

            # Handle end of sentence transition
            self.transition_probs[previous_tag]["<END>"] += 1

    def predict(self, sentence):
        """
        Predict POS tags for a given sentence using a stochastic approach.
        """
        previous_tag = "<START>"
        tags = []

        for word in sentence:
            word_lower = word.lower()

            # Calculate emission probabilities for the current word
            tag_probs = {
                tag: (self.transition_probs[previous_tag][tag] / sum(self.transition_probs[previous_tag].values())) *
                     (self.emission_probs[tag][word_lower] / sum(self.emission_probs[tag].values()))
                for tag in self.emission_probs
            }

            # Select the tag with the highest probability
            if tag_probs:
                predicted_tag = max(tag_probs, key=tag_probs.get)
            else:
                # Default to the most common tag if no probabilities are available
                predicted_tag = max(self.tag_counts, key=self.tag_counts.get)

            tags.append(predicted_tag)
            previous_tag = predicted_tag

        return tags


if __name__ == "__main__":
    # Load tagged sentences from the Brown corpus
    tagged_sentences = nltk.corpus.brown.tagged_sents(tagset="universal")

    # Split data into training and testing sets
    train_data = tagged_sentences[:40000]
    test_data = tagged_sentences[40000:40200]

    # Initialize and train the tagger
    tagger = SimplePOSTagger()
    tagger.train(train_data)

    # Test the tagger on a sentence
    test_sentence = [word for word, tag in test_data[0]]  # Extract a test sentence
    print("Input Sentence:", " ".join(test_sentence))

    predicted_tags = tagger.predict(test_sentence)
    print("\nPredicted POS Tags:")
    print(predicted_tags)

    # Compare with actual tags
    actual_tags = [tag for word, tag in test_data[0]]
    print("\nActual POS Tags:")
    print(actual_tags)

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Unzipping taggers/universal_tagset.zip.


Input Sentence: The door was answered by a slender man in his sixties -- straight-backed , somewhat clerical in manner , wearing rimless glasses .

Predicted POS Tags:
['DET', 'NOUN', 'VERB', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADP', 'DET', 'NOUN', '.', 'DET', '.', 'ADV', 'ADJ', 'ADP', 'NOUN', '.', 'VERB', 'DET', 'NOUN', '.']

Actual POS Tags:
['DET', 'NOUN', 'VERB', 'VERB', 'ADP', 'DET', 'ADJ', 'NOUN', 'ADP', 'DET', 'NOUN', '.', 'ADJ', '.', 'ADV', 'ADJ', 'ADP', 'NOUN', '.', 'VERB', 'ADJ', 'NOUN', '.']


In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import RegexpTagger

# Download necessary NLTK resources
nltk.download('punkt')

def rule_based_pos_tagging(text):
    """
    Perform rule-based part-of-speech tagging using regular expressions.
    """
    # Define rules for tagging
    patterns = [
        (r'^[Tt]he$', 'DT'),       # Determiner
        (r'^[Aa]nd$', 'CC'),       # Coordinating conjunction
        (r'^[Ii]s$', 'VBZ'),       # Verb, 3rd person singular present
        (r'^[Ii]n$', 'IN'),        # Preposition
        (r'.*ing$', 'VBG'),        # Gerund/Present participle
        (r'.*ed$', 'VBD'),         # Past tense verb
        (r'.*es$', 'VBZ'),         # Verb, 3rd person singular present
        (r'.*s$', 'NNS'),          # Plural noun
        (r'.*ly$', 'RB'),          # Adverb
        (r'.*able$', 'JJ'),        # Adjective
        (r'.*ness$', 'NN'),        # Noun formed from adjective
        (r'.*ment$', 'NN'),        # Noun formed from verb
        (r'^[A-Z].*$', 'NNP'),     # Proper noun
        (r'.+', 'NN')              # Default to noun
    ]

    # Initialize the RegexpTagger with the defined patterns
    tagger = RegexpTagger(patterns)

    # Tokenize the text into words
    tokens = word_tokenize(text)

    # Perform POS tagging
    tagged_words = tagger.tag(tokens)

    # Print the results
    print(f"{'Word':<15}{'POS Tag'}")
    print("-" * 30)
    for word, tag in tagged_words:
        print(f"{word:<15}{tag}")


if __name__ == "__main__":
    # Example text
    input_text = """
    The quick brown fox jumps over the lazy dog. It is running swiftly and gracefully.
    """

    print("Rule-Based Part-of-Speech Tagging:")
    rule_based_pos_tagging(input_text)

Rule-Based Part-of-Speech Tagging:
Word           POS Tag
------------------------------
The            DT
quick          NN
brown          NN
fox            NN
jumps          NNS
over           NN
the            DT
lazy           NN
dog            NN
.              NN
It             NNP
is             VBZ
running        VBG
swiftly        RB
and            CC
gracefully     RB
.              NN


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
#programs/prog_10_transformation tagging.py
import nltk
from nltk.tag import UnigramTagger, RegexpTagger
from nltk.corpus import treebank
from nltk.tokenize import word_tokenize

# Download required NLTK resources
nltk.download('treebank')
nltk.download('punkt')

class TransformationBasedTagger:
    def __init__(self, initial_tagger):
        """
        Initialize with a baseline tagger.
        """
        self.initial_tagger = initial_tagger
        self.transformation_rules = []

    def apply_rule(self, rule, tagged_sentence):
        """
        Apply a single transformation rule to a tagged sentence.
        """
        for i, (word, tag) in enumerate(tagged_sentence):
            if rule['condition'](word, tag, i, tagged_sentence):
                tagged_sentence[i] = (word, rule['new_tag'])
        return tagged_sentence

    def tag(self, sentence):
        """
        Tag a sentence using the baseline tagger and transformation rules.
        """
        tagged_sentence = self.initial_tagger.tag(sentence)
        for rule in self.transformation_rules:
            tagged_sentence = self.apply_rule(rule, tagged_sentence)
        return tagged_sentence

    def add_rule(self, condition, new_tag):
        """
        Add a new transformation rule.
        """
        self.transformation_rules.append({'condition': condition, 'new_tag': new_tag})


if __name__ == "__main__":
    # Training a unigram tagger as the baseline tagger
    training_sentences = treebank.tagged_sents()
    baseline_tagger = UnigramTagger(training_sentences)

    # Initialize the transformation-based tagger
    tbt = TransformationBasedTagger(initial_tagger=baseline_tagger)

    # Add a simple transformation rule
    # Example rule: If a word is "is" and tagged as NN, change the tag to VBZ
    tbt.add_rule(
        condition=lambda word, tag, index, sentence: word.lower() == 'is' and tag == 'NN',
        new_tag='VBZ'
    )

    # Input sentence for tagging
    sentence = word_tokenize("This is a test sentence.")

    # Apply the transformation-based tagger
    tagged_sentence = tbt.tag(sentence)

    # Display the results
    print("Tagged Sentence:")
    for word, tag in tagged_sentence:
        print(f"{word:<10}{tag}")

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Tagged Sentence:
This      DT
is        VBZ
a         DT
test      NN
sentence  NN
.         .


In [None]:
#TOP DOWN CFG
class TopDownParser:
    def __init__(self, grammar):
        self.grammar = grammar
        self.tokens = []
        self.current_token_index = 0

    def parse(self, start_symbol, input_string):
        self.tokens = input_string.split()
        self.current_token_index = 0
        if self._parse_symbol(start_symbol):
            if self.current_token_index == len(self.tokens):  # Check if all tokens are consumed
                return True
            else:
                print("Error: Unconsumed tokens remaining.")
        return False

    def _parse_symbol(self, symbol):
        if symbol in self.grammar:  # Non-terminal
            for production in self.grammar[symbol]:
                saved_index = self.current_token_index
                if all(self._parse_symbol(sym) for sym in production):
                    return True
                self.current_token_index = saved_index  # Backtrack
            return False
        else:  # Terminal
            if self.current_token_index < len(self.tokens) and self.tokens[self.current_token_index] == symbol:
                self.current_token_index += 1
                return True
            return False


# Example Grammar
grammar = {
    "S": [["NP", "VP"]],
    "NP": [["Det", "N"]],
    "VP": [["V", "NP"], ["V"]],
    "Det": [["a"], ["the"]],
    "N": [["cat"], ["dog"]],
    "V": [["saw"], ["liked"]]
}

# Input example
input_string = "the cat saw a dog"
parser = TopDownParser(grammar)

if parser.parse("S", input_string):
    print("Input string is valid according to the grammar!")
else:
    print("Input string is invalid.")

Input string is valid according to the grammar!


In [None]:
#early parser
from collections import defaultdict, namedtuple

# Representing a state in the Earley Chart
State = namedtuple("State", ["lhs", "rhs", "dot", "start", "end"])

class EarleyParser:
    def __init__(self, grammar):
        self.grammar = grammar
        self.chart = []

    def parse(self, input_string, start_symbol):
        self.input = input_string.split()
        self.n = len(self.input)
        self.chart = [set() for _ in range(self.n + 1)]

        # Add initial state
        self.chart[0].add(State(start_symbol, tuple(["."] + self.grammar[start_symbol][0]), 0, 0, 0))

        # Process chart columns
        for i in range(self.n + 1):
            changes = True
            while changes:  # Keep iterating until no more states can be added
                changes = False
                current_states = list(self.chart[i])
                for state in current_states:
                    if self._is_complete(state):
                        changes |= self._completer(state, i)
                    elif self._is_non_terminal(state):
                        changes |= self._predictor(state, i)
                    else:
                        changes |= self._scanner(state, i)

        # Check if the final state is valid
        for state in self.chart[self.n]:
            if state.lhs == start_symbol and state.dot == len(state.rhs):
                return True
        return False

    def _is_complete(self, state):
        return state.dot == len(state.rhs)

    def _is_non_terminal(self, state):
        return state.rhs[state.dot] in self.grammar

    def _predictor(self, state, index):
        changes = False
        non_terminal = state.rhs[state.dot]
        for production in self.grammar[non_terminal]:
            new_state = State(non_terminal, tuple(["."] + production), 0, index, index)
            if new_state not in self.chart[index]:
                self.chart[index].add(new_state)
                changes = True
        return changes

    def _scanner(self, state, index):
        changes = False
        if index < self.n and self.input[index] == state.rhs[state.dot]:
            new_state = State(state.lhs, state.rhs, state.dot + 1, state.start, index + 1)
            if new_state not in self.chart[index + 1]:
                self.chart[index + 1].add(new_state)
                changes = True
        return changes

    def _completer(self, state, index):
        changes = False
        for prev_state in self.chart[state.start]:
            if prev_state.dot < len(prev_state.rhs) and prev_state.rhs[prev_state.dot] == state.lhs:
                new_state = State(prev_state.lhs, prev_state.rhs, prev_state.dot + 1, prev_state.start, index)
                if new_state not in self.chart[index]:
                    self.chart[index].add(new_state)
                    changes = True
        return changes


# Example Grammar
grammar = {
    "S": [["NP", "VP"]],
    "NP": [["Det", "N"]],
    "VP": [["V", "NP"], ["V"]],
    "Det": [["the"], ["a"]],
    "N": [["cat"], ["dog"]],
    "V": [["saw"], ["liked"]]
}

# Input string and start symbol
input_string = "the cat saw a dog"
start_symbol = "S"

# Parse
parser = EarleyParser(grammar)
if parser.parse(input_string, start_symbol):
    print("The input string is valid!")
else:
    print("The input string is invalid.")

The input string is invalid.


In [None]:
#parser tree
import nltk
from nltk import CFG
from nltk.parse import RecursiveDescentParser

# Define the grammar
grammar = CFG.fromstring("""
    S -> NP VP
    NP -> Det N | Det Adj N
    VP -> V NP | V
    Det -> 'the' | 'a'
    N -> 'cat' | 'dog' | 'man' | 'telescope'
    Adj -> 'big' | 'small'
    V -> 'saw' | 'likes'
""")

# Define the input sentence
sentence = "the big dog saw a cat".split()

# Create a parser using the grammar
parser = RecursiveDescentParser(grammar)

# Parse the sentence and generate parse trees
print("Parse Trees:")
for tree in parser.parse(sentence):
    print(tree)
    tree.pretty_print()  # Print the tree in a pretty format

Parse Trees:
(S
  (NP (Det the) (Adj big) (N dog))
  (VP (V saw) (NP (Det a) (N cat))))
         S                 
      ___|_______           
     |           VP        
     |        ___|___       
     NP      |       NP    
  ___|___    |    ___|___   
Det Adj  N   V  Det      N 
 |   |   |   |   |       |  
the big dog saw  a      cat



In [None]:
#programs/prog_14_check for agreement in sentences_CFG.py
import nltk
from nltk import CFG
from nltk.parse import RecursiveDescentParser

# Define the grammar with subject-verb agreement rules
grammar = CFG.fromstring("""
    S -> NP VP
    NP -> Det N_S | Det N_P
    VP -> V_S | V_P
    Det -> 'the' | 'a'
    N_S -> 'dog' | 'cat' | 'child'
    N_P -> 'dogs' | 'cats' | 'children'
    V_S -> 'runs' | 'barks' | 'plays'
    V_P -> 'run' | 'bark' | 'play'
""")

# Function to check agreement in sentences
def check_agreement(sentence, grammar):
    parser = RecursiveDescentParser(grammar)
    tokens = sentence.split()
    valid = False
    for tree in parser.parse(tokens):
        valid = True
        break
    return valid

# Test sentences
sentences = [
    "the dog runs",        # Valid
    "the dogs run",        # Valid
    "the cat barks",       # Valid
    "the cats bark",       # Valid
    "the dog run",         # Invalid
    "the dogs barks",      # Invalid
    "the children play",   # Valid
    "a child plays",       # Valid
    "a child play",        # Invalid
]

# Check sentences and print results
print("Sentence Validity Check:")
for sentence in sentences:
    if check_agreement(sentence, grammar):
        print(f"'{sentence}' is valid.")
    else:
        print(f"'{sentence}' is invalid.")

Sentence Validity Check:
'the dog runs' is valid.
'the dogs run' is valid.
'the cat barks' is valid.
'the cats bark' is valid.
'the dog run' is valid.
'the dogs barks' is valid.
'the children play' is valid.
'a child plays' is valid.
'a child play' is valid.


In [None]:
#pcfg
import nltk
from nltk import PCFG
from nltk.parse import ViterbiParser

# Define the Probabilistic Context-Free Grammar (PCFG)
pcfg_grammar = PCFG.fromstring("""
    S -> NP VP [1.0]
    NP -> Det N [0.6] | Det Adj N [0.4]
    VP -> V NP [0.7] | V [0.3]
    Det -> 'the' [0.6] | 'a' [0.4]
    N -> 'dog' [0.5] | 'cat' [0.5]
    Adj -> 'big' [0.8] | 'small' [0.2]
    V -> 'saw' [0.6] | 'likes' [0.4]
""")

# Define the input sentence
sentence = "the big dog saw a cat".split()

# Create a parser using the Viterbi algorithm
parser = ViterbiParser(pcfg_grammar)

# Parse the sentence and print the most probable parse tree
print("Most Probable Parse Tree:")
for tree in parser.parse(sentence):
    print(tree)
    tree.pretty_print()

Most Probable Parse Tree:
(S
  (NP (Det the) (Adj big) (N dog))
  (VP (V saw) (NP (Det a) (N cat)))) (p=0.0048384)
         S                 
      ___|_______           
     |           VP        
     |        ___|___       
     NP      |       NP    
  ___|___    |    ___|___   
Det Adj  N   V  Det      N 
 |   |   |   |   |       |  
the big dog saw  a      cat



In [None]:
#programs/prog_16_SpacyLibrary_NER.py
import spacy

# Load the SpaCy language model
nlp = spacy.load("en_core_web_md")

# Input text
text = "Apple Inc. is planning to open a new office in San Francisco by next year."

# Process the text
doc = nlp(text)

# Extract named entities
print("Named Entities, their labels, and positions:")
for ent in doc.ents:
    print(f"{ent.text} ({ent.label_}) [{ent.start_char}, {ent.end_char}]")

OSError: [E050] Can't find model 'en_core_web_md'. It doesn't seem to be a Python package or a valid path to a data directory.

In [None]:
#programs/prog_17_WordNet, a lexical database, to retrieve synsets.py
from nltk.corpus import wordnet as wn

# Word to explore
word = "bank"

# Retrieve synsets
synsets = wn.synsets(word)

print(f"Synsets for '{word}':")
for synset in synsets:
    print(f"{synset.name()}: {synset.definition()}")

# Explore a specific synset
if synsets:
    synset = synsets[0]
    print("\nExamples and lemmas for first synset:")
    print(f"Examples: {synset.examples()}")
    print(f"Lemmas: {[lemma.name() for lemma in synset.lemmas()]}")

Synsets for 'bank':
bank.n.01: sloping land (especially the slope beside a body of water)
depository_financial_institution.n.01: a financial institution that accepts deposits and channels the money into lending activities
bank.n.03: a long ridge or pile
bank.n.04: an arrangement of similar objects in a row or in tiers
bank.n.05: a supply or stock held in reserve for future use (especially in emergencies)
bank.n.06: the funds held by a gambling house or the dealer in some gambling games
bank.n.07: a slope in the turn of a road or track; the outside is higher than the inside in order to reduce the effects of centrifugal force
savings_bank.n.02: a container (usually with a slot in the top) for keeping money at home
bank.n.09: a building in which the business of banking transacted
bank.n.10: a flight maneuver; aircraft tips laterally about its longitudinal axis (especially in turning)
bank.v.01: tip laterally
bank.v.02: enclose with a bank
bank.v.03: do business with a bank or keep an acco

In [None]:
#fopc parser
import re

def parse_fopc(expression):
    pattern = r"\b(All|Some|Exists|Not|And|Or|Implies|Equals)\b"
    tokens = re.findall(pattern, expression)
    return tokens

# Input logical expression
expression = "All x (Exists y (Loves(x, y)) Implies Loves(y, x))"
tokens = parse_fopc(expression)

print(f"Parsed tokens: {tokens}")

Parsed tokens: ['All', 'Exists', 'Implies']


In [None]:
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

def lesk_algorithm(word, sentence):
    """
    Implements the Lesk algorithm for Word Sense Disambiguation.

    Args:
        word (str): The target word to disambiguate.
        sentence (str): The sentence containing the target word.

    Returns:
        nltk.corpus.reader.wordnet.Synset: The best sense (meaning) of the word
                                          based on the context of the sentence.
                                          Returns None if no sense is found.
    """
    words = set(word_tokenize(sentence))
    best_sense = None
    max_overlap = 0
    for sense in wn.synsets(word):
        definition = sense.definition()
        examples = ' '.join(sense.examples())
        signature = set(word_tokenize(definition + ' ' + examples))
        signature -= set(stopwords.words('english'))
        overlap = len(signature.intersection(words))
        if overlap > max_overlap:
            max_overlap = overlap
            best_sense = sense
    return best_sense

if __name__ == "__main__":
    # Input text
    sentence = "He went to the bank to withdraw money."
    word = "bank"

    # Disambiguate the word using the Lesk algorithm
    sense = lesk_algorithm(word, sentence)

    # Print the results
    if sense:
        print(f"Best sense for '{word}': {sense.name()} - {sense.definition()}")
    else:
        print(f"No sense found for '{word}' in the given sentence.")

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [None]:
#TF ITD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Example documents
documents = [
    "The sky is blue and beautiful.",
    "Love this blue and bright sky!",
    "The quick brown fox jumps over the lazy dog.",
    "A king's breakfast has sausages, ham, bacon, and eggs.",
    "Blue birds are flying in the sky."
]

# Query
query = "blue sky"

# TF-IDF vectorization
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(documents + [query])

# Cosine similarity
cosine_similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()

# Rank documents
ranked_docs = sorted(enumerate(cosine_similarities), key=lambda x: x[1], reverse=True)

print("Document Rankings:")
for idx, score in ranked_docs:
    print(f"Document {idx + 1}: {score:.4f}")

Document Rankings:
Document 1: 0.6427
Document 2: 0.5102
Document 5: 0.5102
Document 3: 0.0000
Document 4: 0.0000


In [None]:
#programs/prog_21_syntax-driven semantic analysis.py
import spacy

# Load the SpaCy English model
nlp = spacy.load("en_core_web_sm")

def extract_noun_phrases(sentence):
    doc = nlp(sentence)
    noun_phrases = [chunk.text for chunk in doc.noun_chunks]
    return noun_phrases

sentence = "The intelligent student quickly solved the challenging math problem."
noun_phrases = extract_noun_phrases(sentence)
print("Noun Phrases:", noun_phrases)

Noun Phrases: ['The intelligent student', 'the challenging math problem']


In [None]:
#programs/prog_22_reference resolution within a text.py
import spacy
import neuralcoref

# Load SpaCy with neural coref
nlp = spacy.load("en_core_web_sm")
neuralcoref.add_to_pipe(nlp)

def resolve_references(text):
    doc = nlp(text)
    return doc._.coref_resolved

text = "John said he would help Mary with her project."
resolved_text = resolve_references(text)
print("Resolved Text:", resolved_text)

ModuleNotFoundError: No module named 'neuralcoref'

In [None]:
#programs/prog_23_coherence of a given text.py
def evaluate_coherence(text):
    sentences = text.split(". ")
    overlap_count = 0
    for i in range(len(sentences) - 1):
        words_current = set(sentences[i].lower().split())
        words_next = set(sentences[i + 1].lower().split())
        overlap_count += len(words_current & words_next)

    coherence_score = overlap_count / (len(sentences) - 1)
    return coherence_score

text = "The cat is on the mat. The mat is very soft. Soft materials are comfortable."
coherence_score = evaluate_coherence(text)
print(f"Coherence Score: {coherence_score:.2f}")

Coherence Score: 2.00


In [None]:
#programs/prog_24_recognizes dialog acts in a given dialog or conversation.py
from transformers import pipeline

# Load the zero-shot classification pipeline
classifier = pipeline("zero-shot-classification")

def recognize_dialog_acts(dialog, labels):
    results = classifier(dialog, labels)
    return results["labels"][0]  # Highest scoring label

dialog = "Could you please tell me the time?"
labels = ["Request", "Question", "Greeting", "Command"]
act = recognize_dialog_acts(dialog, labels)
print("Dialog Act:", act)

In [None]:
#programs/prog_25_OpenAI GPT-3 library.py
#Install OpenAI library
#pip install openai
import openaiS

# Set up the API key
openai.api_key = "YOUR_API_KEY"

def generate_text(prompt, max_tokens=50):
    response = openai.Completion.create(
        engine="text-davinci-003",
        prompt=prompt,
        max_tokens=max_tokens
    )
    return response["choices"][0]["text"].strip()

prompt = "Write a short story about a brave knight."
generated_text = generate_text(prompt)
print("Generated Text:", generated_text)

SyntaxError: invalid syntax (<ipython-input-34-80c7da2904d5>, line 2)

In [None]:
#programs/prog_26_Hugging Face Transformers library,  translate English text.py
# Install transformers library
#pip install transformers
from transformers import MarianMTModel, MarianTokenizer

def translate_text_to_french(text):
    model_name = "Helsinki-NLP/opus-mt-en-fr"
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)

    inputs = tokenizer.encode(text, return_tensors="pt", max_length=512, truncation=True)
    translated = model.generate(inputs, max_length=512, num_beams=4, early_stopping=True)
    return tokenizer.decode(translated[0], skip_special_tokens=True)

text = "The weather is beautiful today."
translated_text = translate_text_to_french(text)
print("Translated Text:", translated_text)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Translated Text: Le temps est beau aujourd'hui.
