# Chunking in NLP

In [2]:
import nltk
from nltk import pos_tag, word_tokenize, RegexpParser

# Download necessary NLTK data files (only need to do this once)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Sample text
text = "The quick brown fox jumps over the lazy dog."

# Tokenize the text
tokens = word_tokenize(text)

# Perform part-of-speech tagging
tagged_tokens = pos_tag(tokens)

# Define a chunk grammar
chunk_grammar = r"""
  NP: {<DT>?<JJ>*<NN>}   # Noun Phrase
  VP: {<VB.*><NP|PP>*}    # Verb Phrase
  PP: {<IN><NP>}          # Prepositional Phrase
"""

# Create a chunk parser
chunk_parser = RegexpParser(chunk_grammar)

# Parse the tagged tokens
chunked = chunk_parser.parse(tagged_tokens)

# Print the chunked output
print(chunked)

# Optionally, you can visualize the chunks
chunked.draw()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


(S
  (NP The/DT quick/JJ brown/NN)
  (NP fox/NN)
  (VP jumps/VBZ)
  (PP over/IN (NP the/DT lazy/JJ dog/NN))
  ./.)


In [3]:
pip install transformers

Collecting transformers
  Downloading transformers-4.55.2-py3-none-any.whl.metadata (41 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.4-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.6.2-cp38-abi3-win_amd64.whl.metadata (4.1 kB)
Downloading transformers-4.55.2-py3-none-any.whl (11.3 MB)
   ---------------------------------------- 0.0/11.3 MB ? eta -:--:--
   ---- ----------------------------------- 1.3/11.3 MB 6.7 MB/s eta 0:00:02
   ---------- ----------------------------- 2.9/11.3 MB 7.3 MB/s eta 0:00:02
   ------------- -------------------------- 3.9/11.3 MB 6.9 MB/s eta 0:00:02
   ------------------- -------------------- 5.5/11.3 MB 6.8 MB/s eta 0:00:01
   ------------------------ --------------- 6.8/11.3 MB 6.8 MB/s eta 0:00:01
   --------

### Pre-trained model (GPT2)

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load a pre-trained model and tokenizer
model_name = "gpt2"  # You can replace with any other LLM
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

def chunk_text(text, max_length=512):
    """Chunk text into smaller pieces."""
    tokens = tokenizer.encode(text, return_tensors='pt')[0]
    chunks = []
    
    for i in range(0, len(tokens), max_length):
        chunk = tokens[i:i + max_length]
        chunks.append(chunk)

    return chunks

def generate_responses(chunks):
    """Generate responses for each chunk using the LLM."""
    responses = []
    for chunk in chunks:
        input_ids = chunk.unsqueeze(0)  # Add batch dimension
        output = model.generate(input_ids, max_length=100)  # Generate response
        responses.append(tokenizer.decode(output[0], skip_special_tokens=True))
    
    return responses

# Example long text
long_text = "Your long text goes here. " * 50  # Repeat to simulate long text

# Chunk the text
chunks = chunk_text(long_text)

# Generate responses for each chunk
responses = generate_responses(chunks)

# Print the responses
for i, response in enumerate(responses):
    print(f"Response for chunk {i+1}:\n{response}\n")


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

ImportError: 
AutoModelForCausalLM requires the PyTorch library but it was not found in your environment.
However, we were able to find a TensorFlow installation. TensorFlow classes begin
with "TF", but are otherwise identically named to our PyTorch classes. This
means that the TF equivalent of the class you tried to import would be "TFAutoModelForCausalLM".
If you want to use TensorFlow, please use TF classes instead!

If you really do want to use PyTorch please go to
https://pytorch.org/get-started/locally/ and follow the instructions that
match your environment.
