In [52]:
# Cell 1 - Package Installation (GOOD)
# Install required packages for parse trees
!pip install nltk
!pip install stanfordnlp
!pip install conllu

print("All packages installed successfully!")

All packages installed successfully!


In [53]:
# Cell 2 - Imports and Setup (GOOD)
import nltk
import os
import random
from nltk import Tree
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk import word_tokenize, pos_tag
from collections import Counter
import matplotlib.pyplot as plt

# Fetch each NLTK data package requested by this notebook, one after another
print("Downloading NLTK resources...")
for resource_name in ('punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'):
    nltk.download(resource_name)
print("NLTK setup complete!")

Downloading NLTK resources...
NLTK setup complete!


[nltk_data] Downloading package punkt to C:\Users\Mitansh
[nltk_data]     Kanani\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Mitansh Kanani\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to C:\Users\Mitansh
[nltk_data]     Kanani\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to C:\Users\Mitansh
[nltk_data]     Kanani\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [54]:
# Cell 3 - Core Functions (NEEDS IMPROVEMENT)
def create_enhanced_gujarati_grammar():
    """
    Build the small hand-written context-free grammar for Gujarati sentences.

    The rule text below declares the phrase-structure rules (S, NP, VP, PP)
    followed by the lexical rules (Det, N, V, Adj, Adv, P).

    Returns:
        The grammar object produced by ``nltk.CFG.fromstring`` for that text.
    """
    cfg_rules = """
    S -> NP VP | VP | NP
    NP -> N | Det N | Adj N | NP PP | N NP
    VP -> V | V NP | V PP | Adv VP | V Adv
    PP -> P NP
    Det -> 'આ' | 'એ' | 'તે' | 'કોઈ' | 'દરેક' | 'બધા'
    N -> 'વનસ્પતિ' | 'પ્રાણી' | 'સજીવ' | 'કોષ' | 'જીવ' | 'પર્યાવરણ' | 'પ્રજનન' | 'પાણી' | 'હવા' | 'જમીન' | 'ઊર્જા' | 'ખોરાક'
    V -> 'છે' | 'કરે' | 'થાય' | 'રહે' | 'વધે' | 'જીવે' | 'ખાય' | 'પીએ' | 'જન્મે' | 'મરે'
    Adj -> 'નાનો' | 'મોટો' | 'લીલો' | 'સજીવ' | 'નિર્જીવ' | 'તાજું' | 'પ્રાથમિક'
    Adv -> 'ધીમે' | 'ઝડપથી' | 'સરળતાથી' | 'આસાનીથી'
    P -> 'માં' | 'પર' | 'થી' | 'સાથે' | 'માંથી' | 'ને' | 'ના'
    """
    gujarati_cfg = nltk.CFG.fromstring(cfg_rules)
    return gujarati_cfg

def parse_gujarati_sentence(sentence, grammar, max_tokens=10):
    """
    Parse a whitespace-tokenized Gujarati sentence with an NLTK chart parser.

    Args:
        sentence: Sentence text; tokens are obtained with ``str.split()``.
        grammar: Grammar object handed to ``nltk.ChartParser``.
        max_tokens: Sentences with more tokens than this are skipped.
            Defaults to 10, the limit that was previously hard-coded.

    Returns:
        The first parse tree yielded by the parser, or None when the sentence
        is empty, longer than ``max_tokens``, contains a word the grammar
        does not cover, or simply has no parse.
    """
    tokens = sentence.split()

    # Nothing to parse, or too long to be worth attempting
    if not tokens or len(tokens) > max_tokens:
        return None

    parser = nltk.ChartParser(grammar)
    try:
        # Only the first tree is needed, so do not materialize every parse
        return next(iter(parser.parse(tokens)), None)
    except ValueError:
        # The parser's coverage check raises ValueError for tokens that have
        # no lexical rule in the grammar; callers treat that as "no parse".
        return None

def visualize_parse_tree(tree, title="Parse Tree"):
    """
    Print a parse tree to stdout under a short heading.

    Args:
        tree: Object exposing ``pretty_print()``; a falsy value means the
            sentence could not be parsed.
        title: Label shown in the heading line.

    Returns:
        The same ``tree`` object when it was printed, otherwise None.
    """
    # Guard clause: nothing to draw for a missing/falsy tree
    if not tree:
        print("   Could not parse the sentence")
        return None

    print(f"   Tree for: {title}")
    tree.pretty_print()
    print()  # blank line after the drawing
    return tree

In [55]:
# Cell 4 - Test with Sample Sentences (FIXED GRAMMAR)
# Enhanced Gujarati sentences from NCERT biology text.
# These are the shared sample inputs for the parsing, POS and export cells.
# Each sentence is tokenized on whitespace, so a word with an attached
# postposition (e.g. 'પર્યાવરણમાં') is treated as a single token.
gujarati_sentences = [
    "સજીવ વનસ્પતિ છે",
    "પ્રાણી પર્યાવરણમાં જીવે છે", 
    "નાનો કોષ વધે છે",
    "સજીવ પ્રજનન કરે છે",
    "વનસ્પતિ પાણી છે",
    "કોષ મોટો છે",
    "પ્રાણી ખાય છે",
    "વનસ્પતિ હવા છે"
]

def create_comprehensive_gujarati_grammar():
    """
    Define a comprehensive context-free grammar for Gujarati sentences.

    Compared with the smaller grammar in this notebook, this one adds an
    ``Aux`` category, extra sentence/phrase patterns, and lexical entries for
    inflected tokens such as 'પર્યાવરણમાં' that appear in the sample sentences.
    Each production is listed once: the ``VP -> V NP PP`` alternative used to
    appear on both VP lines.

    Returns:
        The grammar object produced by ``nltk.CFG.fromstring``.
    """
    # Lines starting with '#' inside the grammar text are comments for the
    # reader; they are part of the string passed to the NLTK grammar reader.
    grammar = """
    # Sentence patterns
    S -> NP VP | NP PP VP | VP | NP

    # Noun Phrases
    NP -> N | Det N | Adj N | NP PP | N NP | Adj NP | Det Adj N

    # Verb Phrases
    VP -> V | V NP | V PP | V Adv | V NP PP | V NP Adv | Aux V | V Aux | V NP Aux
    VP -> V Adv NP | V NP NP

    # Prepositional Phrases
    PP -> P NP | P N | P Adj N

    # Words
    Det -> 'આ' | 'એ' | 'તે' | 'કોઈ' | 'દરેક' | 'બધા' | 'કેટલાક'
    N -> 'વનસ્પતિ' | 'પ્રાણી' | 'સજીવ' | 'કોષ' | 'જીવ' | 'પર્યાવરણ' | 'પ્રજનન' | 'પાણી' | 'હવા' | 'જમીન' | 'ઊર્જા' | 'ખોરાક' | 'પર્યાવરણમાં' | 'પાણીમાં'
    V -> 'છે' | 'કરે' | 'થાય' | 'રહે' | 'વધે' | 'જીવે' | 'ખાય' | 'પીએ' | 'જન્મે' | 'મરે' | 'જાય' | 'આવે'
    Adj -> 'નાનો' | 'મોટો' | 'લીલો' | 'સજીવ' | 'નિર્જીવ' | 'તાજું' | 'પ્રાથમિક' | 'મોટા' | 'નાના'
    Adv -> 'ધીમે' | 'ઝડપથી' | 'સરળતાથી' | 'આસાનીથી' | 'સહેલાઈથી'
    P -> 'માં' | 'પર' | 'થી' | 'સાથે' | 'માંથી' | 'ને' | 'ના' | 'ની'
    Aux -> 'છે' | 'હતું' | 'હતા'
    """
    return nltk.CFG.fromstring(grammar)

def debug_parse_attempt(sentence, grammar):
    """
    Re-try a parse while printing diagnostics about why it might fail.

    Prints the token list, reports every token that does not occur on the
    right-hand side of any grammar production, then attempts a chart parse.

    Args:
        sentence: Sentence text, tokenized with ``str.split()``.
        grammar: Grammar object exposing ``productions()``; also passed to
            ``nltk.ChartParser``.

    Returns:
        The first parse tree, or None when there is no parse or the parser
        raised an exception (the exception message is printed).
    """
    print(f"   Debugging: '{sentence}'")
    tokens = sentence.split()
    print(f"   Words: {tokens}")

    def appears_in_grammar(token):
        # True when the token occurs in the RHS of at least one production
        return any(token in rule.rhs() for rule in grammar.productions())

    # Keeps sentence order (and repeats) exactly as the tokens appear
    unknown_tokens = [token for token in tokens if not appears_in_grammar(token)]

    if unknown_tokens:
        print(f"   ❌ Words not in grammar: {unknown_tokens}")
    else:
        print(f"   ✅ All words found in grammar")

    # Attempt the parse and surface the parser's own error message, if any
    try:
        candidates = list(nltk.ChartParser(grammar).parse(tokens))
    except Exception as e:
        print(f"   ❌ Parse error: {e}")
        return None

    if not candidates:
        print(f"   ❌ No valid parse trees found")
        return None
    return candidates[0]

# Create comprehensive grammar and test parsing
grammar = create_comprehensive_gujarati_grammar()

print("Testing Comprehensive Parse Trees for Gujarati Sentences:")
print("=" * 65)

success_count = 0
for i, sentence in enumerate(gujarati_sentences, 1):
    print(f"\n{i}. Sentence: '{sentence}'")

    # First try normal parse
    tree = parse_gujarati_sentence(sentence, grammar)
    if tree:
        success_count += 1
        print("   ✓ Parse successful!")
        visualize_parse_tree(tree, f"'{sentence}'")
    else:
        print("   ✗ Parse failed - trying debug mode...")
        # Retry with diagnostics so the reason for the failure is visible
        tree = debug_parse_attempt(sentence, grammar)
        if tree:
            success_count += 1
            print("   ✓ Parse successful after debug!")
            visualize_parse_tree(tree, f"'{sentence}'")

# Guard the ratio so an empty sentence list does not raise ZeroDivisionError
total_sentences = len(gujarati_sentences)
success_ratio = success_count / total_sentences if total_sentences else 0.0
print(f"\nSummary: {success_count}/{total_sentences} sentences parsed successfully ({success_ratio:.1%})")

# Show grammar coverage
print("\nGrammar Coverage Analysis:")
print("=" * 40)
all_words = set()
for sentence in gujarati_sentences:
    all_words.update(sentence.split())

# Terminals are the plain-string items on a production's right-hand side
# (non-terminals are not str instances), so no case-based filter is needed.
grammar_words = {
    item
    for production in grammar.productions()
    for item in production.rhs()
    if isinstance(item, str)
}

# Report how many of the *sentence* words the grammar knows, not the size of
# the grammar's whole vocabulary.
covered_words = all_words & grammar_words
missing_words = all_words - grammar_words
print(f"Total unique words in sentences: {len(all_words)}")
print(f"Words covered by grammar: {len(covered_words)}")
print(f"Missing words: {missing_words if missing_words else 'None!'}")

Testing Comprehensive Parse Trees for Gujarati Sentences:

1. Sentence: 'સજીવ વનસ્પતિ છે'
   ✓ Parse successful!
   Tree for: 'સજીવ વનસ્પતિ છે'
            S       
       _____|_____   
      NP          VP
  ____|_____      |  
Adj         N     V 
 |          |     |  
સજીવ     વનસ્પતિ  છે



2. Sentence: 'પ્રાણી પર્યાવરણમાં જીવે છે'
   ✓ Parse successful!
   Tree for: 'પ્રાણી પર્યાવરણમાં જીવે છે'
                        S              
         _______________|________       
        NP                       |     
   _____|_______                 |      
  |             NP               VP    
  |             |            ____|___   
  N             N           V       Aux
  |             |           |        |  
પ્રાણી     પર્યાવરણમાં     જીવે      છે



3. Sentence: 'નાનો કોષ વધે છે'
   ✓ Parse successful!
   Tree for: 'નાનો કોષ વધે છે'
              S             
       _______|_______       
      NP              VP    
  ____|___         ___|___   
Adj       N       V      A

In [56]:
# Cell 5 - POS Analysis (GOOD)
def analyze_gujarati_pos_patterns(sentences):
    """
    Tag each word of each sentence using fixed word lists (simulated POS).

    Words are looked up in hand-written lists for V, N, Adj, P and Adv, in
    that priority order; a word found in no list is tagged '?'. Every
    sentence and its tag list is printed as it is processed.

    Args:
        sentences: Iterable of sentence strings, tokenized with ``str.split()``.

    Returns:
        A list with one entry per sentence; each entry is a list of
        ``(tag, word)`` tuples in word order.
    """
    print("Analyzing Gujarati Sentence Patterns:")
    print("=" * 40)

    # (tag, vocabulary) pairs, highest priority first
    tagged_vocabularies = (
        ('V', ['છે', 'કરે', 'થાય', 'રહે', 'વધે', 'જીવે', 'ખાય', 'પીએ', 'જન્મે', 'મરે']),
        ('N', ['વનસ્પતિ', 'પ્રાણી', 'સજીવ', 'કોષ', 'જીવ', 'પર્યાવરણ', 'પ્રજનન', 'પાણી', 'હવા', 'જમીન', 'ઊર્જા', 'ખોરાક']),
        ('Adj', ['નાનો', 'મોટો', 'લીલો', 'સજીવ', 'નિર્જીવ', 'તાજું', 'પ્રાથમિક']),
        ('P', ['માં', 'પર', 'થી', 'સાથે', 'માંથી', 'ને', 'ના']),
        ('Adv', ['ધીમે', 'ઝડપથી', 'સરળતાથી', 'આસાનીથી']),
    )

    # setdefault keeps the first (highest-priority) tag for a word that is
    # listed under more than one category.
    tag_for_word = {}
    for tag, vocabulary in tagged_vocabularies:
        for entry in vocabulary:
            tag_for_word.setdefault(entry, tag)

    pos_patterns = []
    for sentence in sentences:
        simulated_tags = [
            (tag_for_word.get(token, '?'), token) for token in sentence.split()
        ]
        pos_patterns.append(simulated_tags)
        print(f"\nSentence: {sentence}")
        print(f"Simulated POS: {simulated_tags}")

    return pos_patterns

# Analyze patterns in our sentences; the returned per-sentence tag lists are
# kept in pos_analysis.
pos_analysis = analyze_gujarati_pos_patterns(gujarati_sentences)

Analyzing Gujarati Sentence Patterns:

Sentence: સજીવ વનસ્પતિ છે
Simulated POS: [('N', 'સજીવ'), ('N', 'વનસ્પતિ'), ('V', 'છે')]

Sentence: પ્રાણી પર્યાવરણમાં જીવે છે
Simulated POS: [('N', 'પ્રાણી'), ('?', 'પર્યાવરણમાં'), ('V', 'જીવે'), ('V', 'છે')]

Sentence: નાનો કોષ વધે છે
Simulated POS: [('Adj', 'નાનો'), ('N', 'કોષ'), ('V', 'વધે'), ('V', 'છે')]

Sentence: સજીવ પ્રજનન કરે છે
Simulated POS: [('N', 'સજીવ'), ('N', 'પ્રજનન'), ('V', 'કરે'), ('V', 'છે')]

Sentence: વનસ્પતિ પાણી છે
Simulated POS: [('N', 'વનસ્પતિ'), ('N', 'પાણી'), ('V', 'છે')]

Sentence: કોષ મોટો છે
Simulated POS: [('N', 'કોષ'), ('Adj', 'મોટો'), ('V', 'છે')]

Sentence: પ્રાણી ખાય છે
Simulated POS: [('N', 'પ્રાણી'), ('V', 'ખાય'), ('V', 'છે')]

Sentence: વનસ્પતિ હવા છે
Simulated POS: [('N', 'વનસ્પતિ'), ('N', 'હવા'), ('V', 'છે')]


In [57]:
# Cell 6 - Corpus Analysis (FIXED PATHS)
import os
import random

def check_data_directory(base_dir='..', max_depth=2, max_files=10):
    """
    Print a shallow listing of a directory tree to show which files exist.

    Args:
        base_dir: Directory to inspect. Defaults to '..' (one level above the
            current working directory), the location used by this notebook.
        max_depth: Deepest directory level to list; the base directory is
            level 0. Deeper directories are not visited.
        max_files: Maximum number of file names printed per directory; the
            remainder is summarized as a count.

    Returns:
        None. All information is printed.
    """
    absolute_base_path = os.path.abspath(base_dir)
    print(f"Checking directory: {absolute_base_path}")

    if not os.path.isdir(absolute_base_path):
        print(f"❌ Base data directory not found: {absolute_base_path}")
        return

    print("\nAvailable files in project directory:")
    for root, dirs, files in os.walk(absolute_base_path):
        relative_root = os.path.relpath(root, absolute_base_path)
        level = 0 if relative_root == os.curdir else relative_root.count(os.sep) + 1

        # Sort in place for a stable listing order, and prune in place so the
        # walk does not descend past max_depth. Pruning (rather than breaking
        # out of the loop) keeps later sibling directories in the listing.
        dirs.sort()
        if level >= max_depth:
            dirs[:] = []

        indent = ' ' * 2 * level
        print(f'{indent}{os.path.basename(root)}/')
        subindent = ' ' * 2 * (level + 1)
        ordered_files = sorted(files)
        for file in ordered_files[:max_files]:
            print(f'{subindent}{file}')
        if len(ordered_files) > max_files:
            print(f'{subindent}... and {len(ordered_files) - max_files} more files')

def generate_parse_trees_from_corpus(file_path, num_samples=5, grammar=None, seed=None):
    """
    Parse a random sample of sentences read from a text corpus file.

    The file is read as UTF-8 with one sentence per non-blank line. A sample
    of sentences is drawn, each is parsed with ``parse_gujarati_sentence``,
    and the outcome is printed; trees are drawn only for sentences of at most
    six words.

    Args:
        file_path: Path to the corpus file (made absolute before use).
        num_samples: Number of sentences to sample; capped at the number of
            sentences available.
        grammar: Grammar to parse with. When None, the grammar from
            ``create_enhanced_gujarati_grammar()`` is used.
        seed: Optional seed for the sampling so a run can be reproduced.
            When None, the module-level ``random`` state is used.

    Returns:
        Fraction of sampled sentences that parsed, as a float in [0, 1].
        0.0 is returned when the file is missing, unreadable or empty.
    """
    file_path = os.path.abspath(file_path)
    print(f"Looking for file: {file_path}")

    # Check if file exists
    if not os.path.exists(file_path):
        print(f"❌ File not found: {file_path}")
        check_data_directory()
        return 0.0

    # Only the read itself is guarded, so a failure later in the analysis is
    # not misreported as a file-reading error.
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            sentences = [line.strip() for line in f if line.strip()]
    except (OSError, UnicodeError) as e:
        print(f"❌ Error reading file: {e}")
        return 0.0

    print(f"Loaded {len(sentences)} sentences")

    if not sentences:
        print("❌ File is empty")
        return 0.0

    # Select random samples for analysis (never a negative sample size)
    sample_size = max(0, min(num_samples, len(sentences)))
    sampler = random if seed is None else random.Random(seed)
    sample_sentences = sampler.sample(sentences, sample_size)

    if grammar is None:
        grammar = create_enhanced_gujarati_grammar()

    print(f"\nGenerating Parse Trees for {len(sample_sentences)} sample sentences:")
    print("=" * 60)

    successful_parses = 0

    for i, sentence in enumerate(sample_sentences, 1):
        print(f"\n{i}. Sentence: '{sentence}'")

        # Simple sentence complexity check
        word_count = len(sentence.split())
        print(f"   Words: {word_count}")

        # Try to parse
        tree = parse_gujarati_sentence(sentence, grammar)
        if tree:
            successful_parses += 1
            print("   ✓ Parse successful")
            # Longer sentences are reported without drawing the tree
            if word_count <= 6:
                visualize_parse_tree(tree, f"Sample {i}")
            else:
                print("   (Sentence too long for detailed visualization)")
        else:
            print("   ✗ Parse failed")

    success_rate = successful_parses / len(sample_sentences) if sample_sentences else 0.0
    print(f"\nSummary: {successful_parses}/{len(sample_sentences)} sentences parsed successfully ({success_rate:.1%})")
    return success_rate

# First check what files we have
check_data_directory()

print("\n" + "="*50)
print("ATTEMPTING TO LOAD CORPUS FILES:")
print("="*50)

# Candidate locations, tried in order; the first one that exists is used.
# Paths are relative to the current working directory.
possible_paths = [
    '../data/next/class11_biology_prepro.txt',  # Go up to main dir, then into data/next
    './data/next/class11_biology_prepro.txt',   # data/next under the current directory
    'class11_biology_prepro.txt',               # Direct file in the current directory
    '../../data/next/class11_biology_prepro.txt' # Two levels up
]

corpus_file = None
for path in possible_paths:
    absolute_path = os.path.abspath(path)
    if os.path.exists(absolute_path):
        corpus_file = path
        print(f"✅ Found file: {absolute_path}")
        break

if corpus_file:
    print(f"\n📁 Using corpus file: {corpus_file}")
    success_rate = generate_parse_trees_from_corpus(corpus_file, num_samples=3)
    print(f"\nOverall success rate: {success_rate:.1%}")

    # If success rate is 0%, it might be due to grammar complexity, not file reading
    if success_rate == 0:
        print("\n💡 Note: 0% success rate may be due to:")
        print("   - Complex sentence structures in the corpus")
        print("   - Limited grammar rules in our CFG")
        print("   - Vocabulary not covered in our grammar")
        print("\nLet's test with a simple sentence from the file to verify reading works...")

        # Preview the first few sentences to see what we're working with
        try:
            with open(corpus_file, 'r', encoding='utf-8') as f:
                first_few = [line.strip() for line in f if line.strip()][:3]
            print("\nFirst 3 sentences from file:")
            for i, sent in enumerate(first_few, 1):
                print(f"  {i}. '{sent}' (words: {len(sent.split())})")
        except Exception as e:
            print(f"Error reading file for preview: {e}")

else:
    print("❌ No corpus files found. Using sample sentences instead.")
    # Use our sample sentences as fallback. Dedicated names are used here so
    # the `grammar` and `success_count` globals set by the earlier parsing
    # cell are not silently overwritten.
    fallback_grammar = create_enhanced_gujarati_grammar()
    fallback_sentences = gujarati_sentences[:3]
    print("\nUsing built-in sample sentences:")
    fallback_successes = 0
    for i, sentence in enumerate(fallback_sentences, 1):
        print(f"\n{i}. '{sentence}'")
        tree = parse_gujarati_sentence(sentence, fallback_grammar)
        if tree:
            fallback_successes += 1
            visualize_parse_tree(tree, f"Sample {i}")
    # Denominator follows the actual number of sentences tried
    fallback_total = len(fallback_sentences)
    fallback_rate = fallback_successes / fallback_total if fallback_total else 0.0
    print(f"\nSample sentences success: {fallback_successes}/{fallback_total} ({fallback_rate:.1%})")

Checking directory: c:\Users\Mitansh Kanani\Desktop\college\sem 7\NLP\NLP_LAB_GYANGUJ

Available files in project directory:
NLP_LAB_GYANGUJ/
  project_plan.md
  README.md
  requirements.txt
  .git/
    COMMIT_EDITMSG
    config
    description
    FETCH_HEAD
    HEAD
    index
    ORIG_HEAD
    hooks/
      applypatch-msg.sample
      commit-msg.sample
      fsmonitor-watchman.sample
      post-update.sample
      pre-applypatch.sample
      pre-commit.sample
      pre-merge-commit.sample
      pre-push.sample
      pre-rebase.sample
      pre-receive.sample
      ... and 4 more files
    info/
      exclude
    logs/
      HEAD

ATTEMPTING TO LOAD CORPUS FILES:
✅ Found file: c:\Users\Mitansh Kanani\Desktop\college\sem 7\NLP\NLP_LAB_GYANGUJ\data\next\class11_biology_prepro.txt

📁 Using corpus file: ../data/next/class11_biology_prepro.txt
Looking for file: c:\Users\Mitansh Kanani\Desktop\college\sem 7\NLP\NLP_LAB_GYANGUJ\data\next\class11_biology_prepro.txt
Loaded 23 sentences

Generat

In [58]:
# Cell 7 - Challenges Analysis (GOOD)
def analyze_indian_language_challenges():
    """
    Print a fixed overview of parsing challenges for Indian languages.

    The report lists four challenge categories with three bullet points each,
    followed by five numbered recommendations. Nothing is computed and
    nothing is returned; the text is written to stdout.
    """
    rule = "=" * 60

    # (heading, bullet points) in the order they are printed
    challenge_sections = (
        ("Morphological Richness", (
            "Gujarati has rich morphology with complex verb conjugations",
            "Nouns have gender (masculine, feminine, neuter) and case markers",
            "Agglutinative nature leads to long compound words",
        )),
        ("Word Order Flexibility", (
            "Relatively free word order compared to English",
            "SOV (Subject-Object-Verb) is common but not fixed",
            "Postpositions instead of prepositions",
        )),
        ("Resource Scarcity", (
            "Limited annotated corpora for training parsers",
            "Few pre-trained models for Gujarati syntax analysis",
            "Lack of comprehensive grammar formalisms",
        )),
        ("Technical Challenges", (
            "Tokenization issues with compound words",
            "POS tagging ambiguity",
            "Handling of vibhakti (case markers) and samasa (compounds)",
        )),
    )

    recommendation_lines = (
        "1. Develop language-specific grammar rules",
        "2. Create annotated corpora for Gujarati",
        "3. Use rule-based approaches combined with statistical methods",
        "4. Focus on chunking before full parsing",
        "5. Collaborate with linguists for grammar formalisms",
    )

    print("ANALYSIS OF PARSING CHALLENGES FOR INDIAN LANGUAGES")
    print(rule)

    for heading, bullet_points in challenge_sections:
        print(f"\n{heading}:")
        for point in bullet_points:
            print(f"  • {point}")

    print("\n" + rule)
    print("RECOMMENDATIONS:")
    for recommendation in recommendation_lines:
        print(recommendation)

# Run the challenges analysis; the call prints its report and its result is not used
analyze_indian_language_challenges()

ANALYSIS OF PARSING CHALLENGES FOR INDIAN LANGUAGES

Morphological Richness:
  • Gujarati has rich morphology with complex verb conjugations
  • Nouns have gender (masculine, feminine, neuter) and case markers
  • Agglutinative nature leads to long compound words

Word Order Flexibility:
  • Relatively free word order compared to English
  • SOV (Subject-Object-Verb) is common but not fixed
  • Postpositions instead of prepositions

Resource Scarcity:
  • Limited annotated corpora for training parsers
  • Few pre-trained models for Gujarati syntax analysis
  • Lack of comprehensive grammar formalisms

Technical Challenges:
  • Tokenization issues with compound words
  • POS tagging ambiguity
  • Handling of vibhakti (case markers) and samasa (compounds)

RECOMMENDATIONS:
1. Develop language-specific grammar rules
2. Create annotated corpora for Gujarati
3. Use rule-based approaches combined with statistical methods
4. Focus on chunking before full parsing
5. Collaborate with linguists 

In [59]:
# Cell 8 - Export Results (NEEDS FIX)
def export_parse_tree_results(sentences, output_file, grammar=None):
    """
    Parse each sentence and write a plain-text summary report.

    Args:
        sentences: Iterable of sentence strings.
        output_file: Path of the report to write (UTF-8). Its parent
            directory is created when the path has one.
        grammar: Grammar to parse with. When None, the grammar from
            ``create_enhanced_gujarati_grammar()`` is used.

    Returns:
        A list with one dict per sentence, holding the keys 'sentence',
        'word_count', 'parse_success' and 'sentence_complexity'. The list is
        returned even when writing the report fails.
    """
    # Create output directory if needed; a bare filename has no directory
    # part, and os.makedirs('') would raise.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    if grammar is None:
        grammar = create_enhanced_gujarati_grammar()

    results = []
    for sentence in sentences:
        word_count = len(sentence.split())
        tree = parse_gujarati_sentence(sentence, grammar)
        results.append({
            'sentence': sentence,
            'word_count': word_count,
            'parse_success': tree is not None,
            # Sentences of up to five words are labelled 'Simple'
            'sentence_complexity': 'Simple' if word_count <= 5 else 'Complex'
        })

    success_count = sum(1 for r in results if r['parse_success'])
    # Guard the ratio so an empty input still produces a report
    success_rate = success_count / len(results) if results else 0.0

    report_lines = [
        "PARSE TREE ANALYSIS RESULTS",
        "=" * 50,
        "",
        f"Total sentences analyzed: {len(results)}",
        f"Successfully parsed: {success_count}",
        f"Success rate: {success_rate:.2%}",
        "",
        "DETAILED ANALYSIS:",
        "-" * 50,
    ]
    for i, result in enumerate(results, 1):
        status = "✓" if result['parse_success'] else "✗"
        report_lines.append(f"{i}. {status} {result['sentence_complexity']:>8}: {result['sentence']}")

    # Write results with real newlines ("\n"), not the two-character
    # sequence backslash + n that "\\n" produces.
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write("\n".join(report_lines) + "\n")
    except OSError as e:
        print(f"❌ Error exporting results: {e}")
        return results

    print(f"✅ Results exported to: {output_file}")
    return results

# Export results using our sample sentences
output_results_file = './parse_tree_analysis.txt'
results = export_parse_tree_results(gujarati_sentences, output_results_file)

# "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n
print("\nFinal Summary:")
print(f"Analyzed {len(results)} sentences")
success_count = sum(1 for r in results if r['parse_success'])
# Guard the ratio so an empty result list does not raise ZeroDivisionError
final_success_rate = success_count / len(results) if results else 0.0
print(f"Success rate: {final_success_rate:.1%}")

✅ Results exported to: ./parse_tree_analysis.txt
\nFinal Summary:
Analyzed 8 sentences
Success rate: 37.5%
