In [4]:
#STEP-1
# First, let's handle the numpy version conflict
!pip install numpy==1.24.3  # Compatible version

# Install packages in specific order to avoid conflicts
!pip install --upgrade pip

# Document processing (install PyMuPDF with correct name)
!pip install PyPDF2 python-docx pytesseract pillow
!pip install PyMuPDF  # This is the correct package name, not 'fitz' or 'pymupdf'

# NLP and ML packages
!pip install spacy transformers sentence-transformers
!pip install nltk textstat

# Download spaCy model
!python -m spacy download en_core_web_sm

# Web framework
!pip install streamlit

# Data handling
!pip install pandas

# OCR support (if needed)
# Note: You'll need to install tesseract separately on your system
# For Ubuntu/Debian: sudo apt-get install tesseract-ocr
# For macOS: brew install tesseract
# For Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki

Collecting numpy==1.24.3
  Using cached numpy-1.24.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (5.6 kB)
Using cached numpy-1.24.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (14.0 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.2.6
    Uninstalling numpy-2.2.6:
      Successfully uninstalled numpy-2.2.6
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you have numpy 1.24.3 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-1.24.3
Collecting numpy>=1.19.0 (from spacy)
  Using cached numpy-2.2.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (63 kB)
Using cached numpy-2.2.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (14.3 MB)
Installing collected packag

In [5]:
#STEP-2
import os
import json
import re
import time
from pathlib import Path
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Data handling
import pandas as pd
import numpy as np

# Document processing imports with error handling.
# Every optional dependency is imported in its own try/except so that one
# missing package is reported without aborting the rest of this setup cell.
try:
    import PyPDF2
    print("✓ PyPDF2 imported successfully")
except ImportError as e:
    print(f"✗ PyPDF2 import failed: {e}")

try:
    from docx import Document
    print("✓ python-docx imported successfully")
except ImportError as e:
    print(f"✗ python-docx import failed: {e}")

try:
    import fitz  # PyMuPDF
    print("✓ PyMuPDF imported successfully")
except ImportError as e:
    print(f"✗ PyMuPDF import failed: {e}")
    print("Try: pip install PyMuPDF")

try:
    from PIL import Image
    print("✓ PIL imported successfully")
except ImportError as e:
    print(f"✗ PIL import failed: {e}")

# `pytesseract` is rebound to None whenever OCR cannot be used (wrapper missing
# or Tesseract binary missing); later cells test `pytesseract is None`.
try:
    import pytesseract
    print("✓ pytesseract imported successfully")
    # Test if tesseract is installed
    try:
        pytesseract.get_tesseract_version()
        print("✓ Tesseract OCR is available")
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt
        # and SystemExit are not swallowed while probing the binary.
        print("⚠ Tesseract OCR not found - OCR features will be disabled")
        pytesseract = None
except ImportError as e:
    print(f"✗ pytesseract import failed: {e}")
    pytesseract = None

# NLP imports with error handling.
# On failure `nlp` stays undefined; later cells check `'nlp' in globals()`.
try:
    import spacy
    nlp = spacy.load("en_core_web_sm")
    print("✓ spaCy imported and model loaded successfully")
except ImportError as e:
    print(f"✗ spaCy import failed: {e}")
except OSError as e:
    print(f"✗ spaCy model not found: {e}")
    print("Run: python -m spacy download en_core_web_sm")

try:
    from transformers import pipeline
    print("✓ Transformers imported successfully")
except ImportError as e:
    print(f"✗ Transformers import failed: {e}")

try:
    from sentence_transformers import SentenceTransformer
    print("✓ Sentence Transformers imported successfully")
except ImportError as e:
    print(f"✗ Sentence Transformers import failed: {e}")

try:
    import nltk
    from textstat import flesch_reading_ease, automated_readability_index
    print("✓ NLTK and textstat imported successfully")
except ImportError as e:
    print(f"✗ NLTK/textstat import failed: {e}")

print("\n" + "="*50)
print("Setup complete! Check above for any failed imports.")
print("="*50)

✓ PyPDF2 imported successfully
✓ python-docx imported successfully
✓ PyMuPDF imported successfully
✓ PIL imported successfully
✓ pytesseract imported successfully
✓ Tesseract OCR is available
✓ spaCy imported and model loaded successfully
✓ Transformers imported successfully
✓ Sentence Transformers imported successfully
✓ NLTK and textstat imported successfully

Setup complete! Check above for any failed imports.


In [6]:
#STEP-3
def extract_pdf_text(file_path, min_chars=100):
    """Extract text from a PDF, falling back to OCR for image-based files.

    The embedded text layer is read with PyPDF2 first. OCR (via
    ``extract_pdf_with_ocr``) is attempted when that layer holds fewer than
    ``min_chars`` non-blank characters or when reading it raised.

    Args:
        file_path: Path of the PDF file.
        min_chars: Minimum stripped length of the text layer below which OCR
            is tried. Defaults to 100, the threshold previously hard-coded.

    Returns:
        The extracted text. When OCR is attempted, the longer of the OCR
        result and the text-layer result is returned, so a short but genuine
        text layer is no longer thrown away when OCR is unavailable.
    """
    page_texts = []
    extraction_failed = False
    try:
        # Try standard PDF text extraction
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page in pdf_reader.pages:
                # `or ""` guards against a page yielding None instead of a string
                page_texts.append(page.extract_text() or "")
    except Exception as e:
        print(f"Error extracting PDF: {e}")
        extraction_failed = True

    # Join pages with a newline so the last word of one page does not fuse
    # with the first word of the next.
    text = "\n".join(page_texts)
    if not extraction_failed and len(text.strip()) >= min_chars:
        return text

    # Text layer missing, minimal, or unreadable: try OCR, keep the better result
    ocr_text = extract_pdf_with_ocr(file_path)
    if len(ocr_text.strip()) > len(text.strip()):
        return ocr_text
    return text

In [7]:
#STEP-4
def extract_docx_text(file_path):
    """Return the paragraph text of a DOCX file, one paragraph per line.

    Each paragraph's text is followed by a newline. Any failure while opening
    or reading the document is printed and an empty string is returned.
    """
    try:
        paragraphs = Document(file_path).paragraphs
        return "".join(para.text + "\n" for para in paragraphs)
    except Exception as err:
        print(f"Error extracting DOCX: {err}")
        return ""

In [8]:
#STEP-5
def extract_pdf_with_ocr(file_path):
    """Extract text using OCR for image-based PDFs.

    Each page is rendered to a pixmap with PyMuPDF (``fitz``), converted to a
    PIL image and passed to ``pytesseract.image_to_string``.

    Args:
        file_path: Path of the PDF file.

    Returns:
        The recognised text, one page per line block. An empty string is
        returned when pytesseract or PyMuPDF is unavailable. If an error occurs
        part-way through, the text recognised so far is returned.
    """
    if pytesseract is None:
        print("OCR not available - pytesseract not installed")
        return ""

    if 'fitz' not in globals():
        print("PyMuPDF not available - OCR disabled")
        return ""

    page_texts = []
    try:
        # Use PyMuPDF for OCR
        pdf_document = fitz.open(file_path)
        try:
            for page_num in range(pdf_document.page_count):
                page = pdf_document[page_num]
                pix = page.get_pixmap()
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                page_texts.append(pytesseract.image_to_string(img) + "\n")
        finally:
            # Always release the document, even when rendering or OCR raises
            pdf_document.close()
    except Exception as e:
        print(f"OCR extraction failed: {e}")
    return "".join(page_texts)

def check_tesseract_installation():
    """Report whether the Tesseract OCR engine can be used.

    Prints the detected Tesseract version and returns True on success. Returns
    False when ``pytesseract`` is None, or when querying the version raises, in
    which case per-platform installation hints are printed.
    """
    try:
        if pytesseract is None:
            print("Pytesseract not available")
            return False
        version = pytesseract.get_tesseract_version()
        print(f"Tesseract version: {version}")
        return True
    except Exception as e:
        print(f"Tesseract not properly installed: {e}")
        install_hints = (
            "Please install Tesseract OCR:",
            "- Ubuntu/Debian: sudo apt-get install tesseract-ocr",
            "- macOS: brew install tesseract",
            "- Windows: Download from https://github.com/UB-Mannheim/tesseract/wiki",
        )
        for hint in install_hints:
            print(hint)
        return False

# Check OCR availability
ocr_available = check_tesseract_installation()

Tesseract version: 5.3.4


In [9]:
#STEP-6
# Initialize models with error handling
def initialize_models():
    """Build the dictionary of NLP models used by the analysis functions.

    Keys are 'nlp', 'sentence_model', 'summarizer' and 'classifier'. A model
    that cannot be created is stored as None and the failure is printed, so
    callers can fall back to simpler methods.
    """
    models = {}

    def _load(key, factory, label, success_message):
        # Create one model; on any error record None and report the reason
        try:
            models[key] = factory()
            print(success_message)
        except Exception as e:
            print(f"✗ {label} failed: {e}")
            models[key] = None

    # spaCy: reuse the pipeline loaded by the import cell, if there is one
    try:
        spacy_ready = 'nlp' in globals()
        models['nlp'] = nlp if spacy_ready else None
        print("✓ spaCy model initialized" if spacy_ready else "✗ spaCy not available")
    except Exception as e:
        print(f"✗ spaCy initialization failed: {e}")
        models['nlp'] = None

    _load(
        'sentence_model',
        lambda: SentenceTransformer('all-MiniLM-L6-v2'),
        "Sentence Transformer",
        "✓ Sentence Transformer model loaded",
    )
    _load(
        'summarizer',
        lambda: pipeline("summarization", model="facebook/bart-large-cnn"),
        "Summarization model",
        "✓ Summarization model loaded",
    )
    _load(
        'classifier',
        lambda: pipeline("text-classification",
                         model="distilbert-base-uncased-finetuned-sst-2-english"),
        "Classification model",
        "✓ Classification model loaded",
    )

    return models

# Initialize all models
models = initialize_models()

✓ spaCy model initialized
✓ Sentence Transformer model loaded


Device set to use cpu


✓ Summarization model loaded


Device set to use cpu


✓ Classification model loaded


In [10]:
#STEP-7
def preprocess_text(text):
    """Clean and normalise extracted text.

    Drops characters outside a whitelist of word characters, whitespace and
    common punctuation, then collapses every whitespace run to a single space
    and trims the ends.

    The whitelist keeps ``%``, currency signs, ``/`` and apostrophes in
    addition to basic punctuation: the date and percentage patterns used later
    in this notebook need those characters to survive preprocessing.

    Args:
        text: Raw extracted text; ``None`` or an empty string is accepted.

    Returns:
        The cleaned text, or "" for empty/None input.
    """
    if not text:
        return ""

    # Remove unwanted symbols first so that a removed character standing
    # between two spaces cannot leave a double space behind.
    text = re.sub(r"[^\w\s.,!?;:\-%$€£/'’\"()&@+]", '', text)

    # Remove extra whitespace and normalize
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [11]:
#STEP-8
def extract_key_information(text):
    """Extract named entities and key phrases from text.

    Uses the spaCy pipeline stored in ``models['nlp']`` when it is available.
    Otherwise (model is None, or ``models`` has not been created yet) a
    regex-based fallback is used.

    Args:
        text: Document text.

    Returns:
        ``(entities, key_phrases)`` where ``entities`` maps the labels PERSON,
        ORG, GPE, DATE, MONEY and PERCENT to lists of unique strings in order
        of first appearance, and ``key_phrases`` is a list of unique phrases.
        Both are empty for empty input or when extraction fails.
    """
    entities = {
        'PERSON': [],
        'ORG': [],
        'GPE': [],  # Geopolitical entities
        'DATE': [],
        'MONEY': [],
        'PERCENT': []
    }
    key_phrases = []

    if not text:
        return entities, key_phrases

    def _unique(items):
        # De-duplicate while keeping first-appearance order (deterministic,
        # unlike slicing a set)
        return list(dict.fromkeys(items))

    try:
        nlp_model = models.get('nlp')
    except NameError:
        # `models` not initialised yet: behave as if no model were loaded
        nlp_model = None

    try:
        if nlp_model is not None:
            doc = nlp_model(text)

            # Extract named entities
            for ent in doc.ents:
                if ent.label_ in entities:
                    entities[ent.label_].append(ent.text)

            # Extract key phrases using noun chunks
            key_phrases = [chunk.text for chunk in doc.noun_chunks
                           if len(chunk.text.split()) > 1]
        else:
            # Fallback: simple regex-based extraction
            print("Using fallback entity extraction")

            # Potential names: runs of capitalised words (first 10 unique)
            names = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)
            entities['PERSON'] = _unique(names)[:10]

            # Dates: d/m/y style or a bare four-digit year (first 5 unique)
            dates = re.findall(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}\b', text)
            entities['DATE'] = _unique(dates)[:5]

            # Percentages. No trailing \b: '%' followed by a space or the end
            # of the text is not a word boundary, so "%\b" missed "45% ".
            percentages = re.findall(r'\b\d+(?:\.\d+)?%', text)
            entities['PERCENT'] = _unique(percentages)

            # Simple key phrase extraction: the first 10 unique word bigrams
            words = text.split()
            bigrams = [f"{words[i]} {words[i+1]}" for i in range(len(words)-1)]
            key_phrases = _unique(bigrams)[:10]

    except Exception as e:
        print(f"Entity extraction failed: {e}")

    # A name mentioned many times should appear once in the metadata
    entities = {label: _unique(values) for label, values in entities.items()}
    key_phrases = _unique(key_phrases)

    return entities, key_phrases

In [12]:
#STEP-9
def classify_document_type(text):
    """Classify a document by the keywords that occur in it.

    Rules are checked in order and the first category with a matching keyword
    wins. Keywords are matched case-insensitively as whole words (an optional
    trailing "s" is allowed), so "explanation" no longer counts as "plan" and
    "illegal" no longer counts as "legal".

    Args:
        text: Document text; ``None`` or "" yields 'General Document'.

    Returns:
        One of 'Legal Document', 'Report', 'Manual/Guide', 'Proposal/Plan' or
        'General Document'.
    """
    # Simple rule-based classification; order defines priority
    rules = (
        ('Legal Document', ('contract', 'agreement', 'terms', 'legal')),
        ('Report', ('report', 'analysis', 'findings', 'conclusion')),
        ('Manual/Guide', ('manual', 'guide', 'instructions', 'how to')),
        ('Proposal/Plan', ('proposal', 'project', 'plan', 'strategy')),
    )
    text_lower = (text or "").lower()

    for label, keywords in rules:
        alternatives = '|'.join(re.escape(keyword) for keyword in keywords)
        if re.search(rf'\b(?:{alternatives})s?\b', text_lower):
            return label
    return 'General Document'

In [13]:
#STEP-10
def generate_summary(text, max_length=150):
    """Generate a short summary of a document.

    The summarisation pipeline in ``models['summarizer']`` is used when it is
    available. If it is missing, fails, or produces nothing, an extractive
    summary made of the leading sentences is returned instead.

    Args:
        text: Document text; ``None`` or blank text yields "".
        max_length: Passed to the model as ``max_length`` for texts that fit
            in one chunk, and used as the character budget of the extractive
            fallback.

    Returns:
        The summary string ("" for empty input).
    """
    text = (text or "").strip()
    if not text:
        return ""

    try:
        summarizer = models.get('summarizer')
    except NameError:
        # `models` not initialised yet: use the extractive fallback
        summarizer = None

    if summarizer is not None:
        try:
            summary = _summarize_with_model(summarizer, text, max_length)
            if summary:
                return summary
        except Exception as e:
            print(f"Summarization failed: {e}")

    return _extractive_summary(text, max_length)


def _summarize_with_model(summarizer, text, max_length, chunk_chars=1024, max_chunks=3):
    """Summarise ``text`` with ``summarizer``, chunking texts longer than ``chunk_chars``."""
    import textwrap

    if len(text) <= chunk_chars:
        if len(text) <= 50:
            # Too short to be worth summarising: return it (truncated if needed)
            return text[:max_length] + "..." if len(text) > max_length else text
        result = summarizer(text, max_length=max_length,
                            min_length=min(30, max_length), do_sample=False)
        return result[0]['summary_text']

    # Split on whitespace so that chunks never cut a word in half
    chunks = textwrap.wrap(text, width=chunk_chars)
    partial_summaries = []
    for chunk in chunks[:max_chunks]:  # Process only the leading chunks
        if len(chunk.strip()) > 50:  # Only process substantial chunks
            result = summarizer(chunk, max_length=50, min_length=10, do_sample=False)
            partial_summaries.append(result[0]['summary_text'])
    return ' '.join(partial_summaries)


def _extractive_summary(text, max_length, max_sentences=3):
    """Return up to ``max_sentences`` leading sentences within ``max_length`` characters."""
    # Split after sentence-ending punctuation followed by whitespace; this
    # keeps the punctuation and does not break decimals such as "3.5".
    sentences = [s for s in re.split(r'(?<=[.!?])\s+', text) if s]
    picked = []
    used = 0
    for sentence in sentences:
        if len(picked) == max_sentences or used + len(sentence) >= max_length:
            break
        picked.append(sentence)
        used += len(sentence)

    if not picked:
        # The first sentence alone exceeds the budget: hard-truncate
        return text[:max_length]
    return ' '.join(picked)

In [14]:
#STEP-11
def extract_topics(text, num_topics=5):
    """Return the ``num_topics`` most frequent topic words of a document.

    With a spaCy model in ``models['nlp']``, candidates are the lower-cased
    lemmas of nouns and adjectives longer than three characters. Without one,
    candidates are alphabetic words of four or more letters minus a small
    stop-word list. If anything fails, words longer than four characters from
    a plain whitespace split are counted instead.
    """
    try:
        spacy_model = models['nlp']
        if spacy_model is None:
            # Fallback: simple word frequency with common words filtered out
            ignored = {'this', 'that', 'with', 'have', 'will', 'from', 'they', 'been',
                       'were', 'said', 'each', 'which', 'their', 'time', 'but'}
            candidates = [
                word
                for word in re.findall(r'\b[a-zA-Z]{4,}\b', text.lower())
                if word not in ignored
            ]
        else:
            # Important tokens: nouns and adjectives
            candidates = [
                tok.lemma_.lower()
                for tok in spacy_model(text)
                if tok.pos_ in ['NOUN', 'ADJ'] and len(tok.text) > 3
            ]

        ranked = Counter(candidates).most_common(num_topics)
        return [term for term, _ in ranked]

    except Exception as e:
        print(f"Topic extraction failed: {e}")
        # Simple fallback
        long_words = (w for w in text.lower().split() if len(w) > 4)
        return [term for term, _ in Counter(long_words).most_common(num_topics)]

In [3]:
#STEP-12
def create_metadata_schema():
    """Return a fresh, empty metadata record.

    The record has four sections ('basic_info', 'content_analysis',
    'semantic_data', 'technical_metadata'), each pre-filled with default
    values. A new dictionary is built on every call.
    """
    basic_info = dict(
        filename='',
        file_type='',
        file_size=0,
        creation_date='',
        processing_date='',
    )
    content_analysis = dict(
        document_type='',
        language='en',
        word_count=0,
        character_count=0,
        readability_score=0,
        sentiment='',
    )
    semantic_data = dict(
        summary='',
        key_topics=[],
        entities={},
        key_phrases=[],
    )
    technical_metadata = dict(
        extraction_method='',
        confidence_score=0.0,
        processing_time=0.0,
    )
    return dict(
        basic_info=basic_info,
        content_analysis=content_analysis,
        semantic_data=semantic_data,
        technical_metadata=technical_metadata,
    )

In [15]:
#STEP-13
def generate_metadata(file_path):
    """Generate the complete metadata record for one document.

    Args:
        file_path: Path of the document (``str`` or ``pathlib.Path``). Files
            ending in .pdf or .docx (any letter case) go through the matching
            extractor; everything else is read as UTF-8 text.

    Returns:
        A dictionary shaped like ``create_metadata_schema()`` with the basic,
        content, semantic and technical sections filled in. The 'sentiment'
        field is left at its schema default.

    Raises:
        OSError: If the file cannot be stat'ed or opened.
    """
    start_time = time.time()

    # Initialize metadata structure
    metadata = create_metadata_schema()

    # Extract basic file information
    file_info = Path(file_path)
    path_str = str(file_info)
    file_stat = file_info.stat()
    metadata['basic_info']['filename'] = file_info.name
    metadata['basic_info']['file_type'] = file_info.suffix
    metadata['basic_info']['file_size'] = file_stat.st_size
    # Same source as generate_basic_metadata: st_ctime rendered with time.ctime
    metadata['basic_info']['creation_date'] = time.ctime(file_stat.st_ctime)
    metadata['basic_info']['processing_date'] = time.strftime('%Y-%m-%d %H:%M:%S')

    # Extract text content; compare the suffix case-insensitively so that
    # "REPORT.PDF" is not read as plain text
    suffix = file_info.suffix.lower()
    if suffix == '.pdf':
        text = extract_pdf_text(path_str)
        extraction_method = 'PDF extraction'
    elif suffix == '.docx':
        text = extract_docx_text(path_str)
        extraction_method = 'DOCX extraction'
    else:
        # errors='ignore' matches generate_basic_metadata and keeps a stray
        # non-UTF-8 byte from aborting the whole document
        with open(path_str, 'r', encoding='utf-8', errors='ignore') as f:
            text = f.read()
        extraction_method = 'Direct text reading'

    # Preprocess text
    text = preprocess_text(text)

    # Content analysis
    metadata['content_analysis']['word_count'] = len(text.split())
    metadata['content_analysis']['character_count'] = len(text)
    metadata['content_analysis']['document_type'] = classify_document_type(text)
    try:
        metadata['content_analysis']['readability_score'] = (
            flesch_reading_ease(text) if text else 0
        )
    except Exception as e:
        # Keeps the schema default when textstat is unavailable or fails
        print(f"Readability scoring failed: {e}")

    # Semantic analysis
    metadata['semantic_data']['summary'] = generate_summary(text)
    metadata['semantic_data']['key_topics'] = extract_topics(text)
    entities, key_phrases = extract_key_information(text)
    metadata['semantic_data']['entities'] = entities
    metadata['semantic_data']['key_phrases'] = key_phrases[:10]  # Top 10 phrases

    # Technical metadata
    metadata['technical_metadata']['extraction_method'] = extraction_method
    # calculate_confidence_score lives in a later cell of this notebook; that
    # cell must have been executed before this function is called.
    metadata['technical_metadata']['confidence_score'] = calculate_confidence_score(text)
    # Measured last so that the confidence computation is included
    metadata['technical_metadata']['processing_time'] = time.time() - start_time

    return metadata

In [16]:
#STEP-14
import streamlit as st

def create_streamlit_interface():
    """Render the Streamlit page: upload a document and show its metadata.

    The uploaded bytes are written to a private temporary file (keeping only
    the original extension), processed with ``generate_metadata`` and shown
    with ``display_metadata``. The temporary file is always removed, including
    when processing raises.
    """
    import tempfile

    st.title("Automated Metadata Generation System")
    st.write("Upload documents to generate comprehensive metadata")

    # File upload
    uploaded_file = st.file_uploader(
        "Choose a file",
        type=['pdf', 'docx', 'txt'],
        help="Upload PDF, DOCX, or TXT files"
    )

    if uploaded_file is None:
        return

    # Save uploaded file temporarily. The client-supplied name is used only for
    # its extension; it never becomes part of a path that gets opened.
    suffix = Path(uploaded_file.name).suffix
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
    try:
        with temp_file:
            temp_file.write(uploaded_file.getbuffer())

        # Generate metadata
        with st.spinner('Generating metadata...'):
            metadata = generate_metadata(temp_file.name)

        # Report the name the user uploaded, not the temporary file's name
        metadata['basic_info']['filename'] = uploaded_file.name

        # Display results
        display_metadata(metadata)
    finally:
        # Clean up, even if metadata generation or rendering failed
        Path(temp_file.name).unlink(missing_ok=True)

def display_metadata(metadata):
    """Show a metadata record in four Streamlit tabs.

    Basic info, content analysis and technical metadata are rendered as JSON.
    The semantic tab shows the summary and key topics as text and the entities
    as JSON.
    """
    basic_tab, content_tab, semantic_tab, technical_tab = st.tabs(
        ["Basic Info", "Content Analysis", "Semantic Data", "Technical Info"]
    )

    with basic_tab:
        st.json(metadata['basic_info'])

    with content_tab:
        st.json(metadata['content_analysis'])

    with semantic_tab:
        # (heading, renderer, field) in display order
        semantic_rows = (
            ("**Summary:**", st.write, 'summary'),
            ("**Key Topics:**", st.write, 'key_topics'),
            ("**Entities:**", st.json, 'entities'),
        )
        for heading, render, field in semantic_rows:
            st.write(heading)
            render(metadata['semantic_data'][field])

    with technical_tab:
        st.json(metadata['technical_metadata'])

In [17]:
#STEP-15
def process_multiple_documents(folder_path):
    """Generate metadata for every supported document directly inside a folder.

    Regular files whose extension is .pdf, .docx or .txt (any letter case) are
    processed in sorted path order, so results are reproducible between runs.
    Sub-directories are not descended into and are skipped even if their name
    ends in a supported extension.

    Args:
        folder_path: Directory to scan.

    Returns:
        A list with one metadata dictionary per successfully processed file.
        A file that fails is reported with a printed message and left out.
    """
    supported_suffixes = {'.pdf', '.docx', '.txt'}
    results = []

    for file_path in sorted(Path(folder_path).glob('*')):
        if not file_path.is_file():
            continue
        if file_path.suffix.lower() not in supported_suffixes:
            continue
        try:
            metadata = generate_metadata(str(file_path))
            results.append(metadata)
            print(f"Processed: {file_path.name}")
        except Exception as e:
            print(f"Error processing {file_path.name}: {e}")

    return results

In [18]:
#STEP-16
def save_metadata_to_json(metadata, output_path):
    """Write ``metadata`` to ``output_path`` as UTF-8 JSON.

    The output is indented by two spaces and non-ASCII characters are written
    as-is rather than escaped.
    """
    dump_options = {'indent': 2, 'ensure_ascii': False}
    with open(output_path, 'w', encoding='utf-8') as out_file:
        json.dump(metadata, out_file, **dump_options)

def export_to_csv(metadata_list, output_path):
    """Write a list of metadata records to a CSV file, one row per record.

    Nested sections are flattened one level: the value under
    ``metadata[category][key]`` lands in column ``"<category>_<key>"``. Every
    value is converted with ``str()``. No index column is written.
    """
    def _flatten(record):
        # Turn one nested metadata record into a flat {column: text} mapping
        row = {}
        for category, data in record.items():
            if not isinstance(data, dict):
                row[category] = str(data)
                continue
            row.update({f"{category}_{key}": str(value) for key, value in data.items()})
        return row

    rows = [_flatten(record) for record in metadata_list]
    pd.DataFrame(rows).to_csv(output_path, index=False)

In [None]:
#STEP-17
# Required imports for main execution
import os
import time
import json
from pathlib import Path
from collections import Counter

def main():
    """Interactive entry point of the full system.

    Checks which models are loaded (offering to load them if none are), then
    asks for an operation mode: process a single file, batch-process a folder,
    or render the Streamlit interface. Results can optionally be saved to
    JSON (single file) or CSV (batch).
    """
    # Without this declaration the assignment further down would create a
    # local variable and the freshly loaded models would be thrown away.
    global models

    print("Automated Metadata Generation System")
    print("=" * 40)

    # Check if models are already loaded
    print("Checking system readiness...")
    models_ready = check_models_status()

    if not models_ready:
        print("⚠ Some models not loaded. System will use fallback methods.")
        load_choice = input("Load ML models now? (y/n) [This may take a few minutes]: ")
        if load_choice.strip().lower() == 'y':
            print("Loading models... This may take a few minutes on first run.")
            models = initialize_models()

    # Choose mode
    print("\nSelect operation mode:")
    print("1: Process single file")
    print("2: Batch process folder")
    print("3: Start web interface")

    mode = input("Enter choice (1-3): ").strip()

    if mode == "1":
        # Tolerate surrounding whitespace and quotes, as quick_start() does
        file_path = input("Enter file path: ").strip().strip('"\'')
        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            return

        print("Processing file... Please wait.")
        start_time = time.time()
        metadata = generate_metadata(file_path)
        end_time = time.time()

        print(f"\n✓ Processing completed in {end_time - start_time:.2f} seconds")
        print(json.dumps(metadata, indent=2))

        # Save option
        save_option = input("\nSave to file? (y/n): ")
        if save_option.strip().lower() == 'y':
            output_path = input("Enter output path: ").strip().strip('"\'')
            save_metadata_to_json(metadata, output_path)
            print(f"Metadata saved to: {output_path}")

    elif mode == "2":
        folder_path = input("Enter folder path: ").strip().strip('"\'')
        if not os.path.exists(folder_path):
            print(f"Folder not found: {folder_path}")
            return

        print("Processing documents in batch...")
        results = process_multiple_documents(folder_path)
        print(f"✓ Processed {len(results)} documents")

        # Export option
        export_option = input("Export to CSV? (y/n): ")
        if export_option.strip().lower() == 'y':
            output_path = input("Enter CSV output path: ").strip().strip('"\'')
            export_to_csv(results, output_path)
            print(f"Results exported to: {output_path}")

    elif mode == "3":
        print("Starting web interface...")
        print("This will open in your browser. Press Ctrl+C to stop.")
        # NOTE(review): Streamlit apps are normally launched with
        # `streamlit run <script.py>`; calling the page function from a
        # notebook kernel presumably will not start a server - confirm.
        try:
            create_streamlit_interface()
        except Exception as e:
            print(f"Web interface failed: {e}")
            print("Make sure Streamlit is installed: pip install streamlit")

    else:
        print("Invalid choice. Please run again and select 1-3.")

def check_models_status():
    """Print how many of the tracked models are loaded.

    Looks at the 'nlp', 'summarizer' and 'sentence_model' entries of the
    notebook-level ``models`` dictionary. Returns True when at least one of
    them is not None, and False when none is loaded or ``models`` does not
    exist yet.
    """
    tracked = ('nlp', 'summarizer', 'sentence_model')
    try:
        status = {name: models.get(name) is not None for name in tracked}
    except NameError:
        print("Models not initialized yet.")
        return False

    loaded_count = sum(status.values())
    print(f"Models loaded: {loaded_count}/{len(status)}")
    return loaded_count > 0

# Fast startup version - minimal model loading
def quick_start():
    """Prompt for one file and print its basic (model-free) metadata.

    Asks for a path (surrounding whitespace and quotes are removed), runs
    ``generate_basic_metadata`` on it, prints the result as JSON and offers to
    save it to a ``.json`` file. Any error during processing is printed
    together with its traceback.
    """
    import traceback

    print("🚀 Quick Start Mode - Basic metadata extraction only")
    print("=" * 50)

    raw_path = input("Enter file path: ")
    file_path = raw_path.strip().strip('"\'')  # Remove quotes if present
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        print("Please check the file path and try again.")
        return

    print(f"Processing: {file_path}")

    # Basic metadata without heavy ML models
    try:
        metadata = generate_basic_metadata(file_path)
        banner = "=" * 50
        print("\n" + banner)
        print("METADATA GENERATED:")
        print(banner)
        print(json.dumps(metadata, indent=2))

        # Save option
        if input("\nSave to JSON file? (y/n): ").lower() != 'y':
            return
        output_path = input("Enter output filename (e.g., metadata.json): ")
        if not output_path.endswith('.json'):
            output_path = output_path + '.json'
        save_metadata_to_json(metadata, output_path)
        print(f"✓ Metadata saved to: {output_path}")

    except Exception as e:
        print(f"Error processing file: {e}")
        traceback.print_exc()

def generate_basic_metadata(file_path):
    """Generate basic metadata without ML models.

    Args:
        file_path: Path of the document (``str`` or ``pathlib.Path``). .pdf
            and .docx files (any letter case) use the matching extractor;
            everything else is read as UTF-8 text with undecodable bytes
            ignored.

    Returns:
        A dictionary shaped like ``create_metadata_schema()``. When text
        extraction fails, the document is treated as empty, the failure
        message is stored as the summary and the confidence score is 0.1.

    Raises:
        OSError: If the file cannot be stat'ed.
    """
    start_time = time.time()

    # Initialize basic metadata structure
    metadata = create_metadata_schema()

    # Extract basic file information
    file_info = Path(file_path)
    path_str = str(file_info)
    file_stat = file_info.stat()
    metadata['basic_info']['filename'] = file_info.name
    metadata['basic_info']['file_type'] = file_info.suffix
    metadata['basic_info']['file_size'] = file_stat.st_size
    metadata['basic_info']['creation_date'] = time.ctime(file_stat.st_ctime)
    metadata['basic_info']['processing_date'] = time.strftime('%Y-%m-%d %H:%M:%S')

    # Extract text content
    text = ""
    extraction_method = ""
    extraction_error = None
    suffix = file_info.suffix.lower()

    try:
        if suffix == '.pdf':
            text = extract_pdf_text(path_str)
            extraction_method = 'PDF extraction'
        elif suffix == '.docx':
            text = extract_docx_text(path_str)
            extraction_method = 'DOCX extraction'
        else:
            # Known text types and unknown types are both read as text; only
            # the recorded method differs
            with open(path_str, 'r', encoding='utf-8', errors='ignore') as f:
                text = f.read()
            if suffix in ('.txt', '.md'):
                extraction_method = 'Direct text reading'
            else:
                extraction_method = 'Generic text reading'
    except Exception as e:
        print(f"Warning: Text extraction failed: {e}")
        # Keep the message out of `text`: it is not document content and must
        # not be counted, classified or scored as such
        extraction_error = f"[Text extraction failed: {str(e)}]"
        text = ""
        extraction_method = 'Failed extraction'

    # Preprocess text
    text = preprocess_text(text) if text else ""

    # Basic content analysis
    words = text.split() if text else []
    metadata['content_analysis']['word_count'] = len(words)
    metadata['content_analysis']['character_count'] = len(text)
    metadata['content_analysis']['document_type'] = classify_document_type(text)

    # Try to calculate readability score
    try:
        from textstat import flesch_reading_ease
        metadata['content_analysis']['readability_score'] = flesch_reading_ease(text) if text else 0
    except Exception:
        # textstat missing or unable to score this text
        metadata['content_analysis']['readability_score'] = 0

    # Simple summary: first two sentences, capped at 200 characters
    if text:
        sentences = [s.strip() for s in text.split('.') if s.strip()]
        summary_text = '. '.join(sentences[:2]) + '.' if sentences else text[:200]
        if len(summary_text) > 200:
            summary_text = summary_text[:200] + "..."
        metadata['semantic_data']['summary'] = summary_text
    elif extraction_error:
        metadata['semantic_data']['summary'] = extraction_error
    else:
        metadata['semantic_data']['summary'] = "No text content extracted"

    # Basic topics (most frequent meaningful words)
    if text:
        # Simple stop words to filter out
        stop_words = {'the', 'and', 'are', 'for', 'with', 'this', 'that', 'from', 'they', 'have', 'been', 'will', 'said', 'each', 'which', 'their', 'time', 'but', 'all', 'can', 'may', 'was', 'were', 'not', 'you', 'your'}
        # Strip punctuation BEFORE the length and stop-word tests so that
        # tokens such as "with," or "that." are filtered correctly
        stripped_words = (word.lower().strip('.,!?;:"()[]') for word in words)
        words_clean = [word for word in stripped_words
                       if len(word) > 3 and word not in stop_words]
        word_freq = Counter(words_clean)
        metadata['semantic_data']['key_topics'] = [word for word, count in word_freq.most_common(5)]
    else:
        metadata['semantic_data']['key_topics'] = []

    # Basic entities (simple pattern matching). Results are de-duplicated in
    # first-appearance order so repeated runs give the same output.
    entities = {'PERSON': [], 'ORG': [], 'DATE': [], 'PERCENT': []}
    if text:
        # Find capitalized words (potential names/organizations)
        capitalized = re.findall(r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b', text)
        entities['PERSON'] = list(dict.fromkeys(capitalized))[:5]  # First 5 unique

        # Find dates
        dates = re.findall(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b|\b\d{4}\b', text)
        entities['DATE'] = list(dict.fromkeys(dates))[:3]

        # Find percentages. No trailing \b: '%' followed by a space or the end
        # of the text is not a word boundary.
        percentages = re.findall(r'\b\d+(?:\.\d+)?%', text)
        entities['PERCENT'] = list(dict.fromkeys(percentages))

    metadata['semantic_data']['entities'] = entities

    # Technical metadata
    metadata['technical_metadata']['extraction_method'] = extraction_method
    metadata['technical_metadata']['processing_time'] = time.time() - start_time
    metadata['technical_metadata']['confidence_score'] = 0.8 if text else 0.1

    return metadata

# Run the system
if __name__ == "__main__":
    # Ask which entry point to run; any answer other than "2" starts the full system
    print("Automated Metadata Generation System")
    print("=" * 50)
    startup_mode = input(
        "Choose startup mode:\n"
        "1: Full system (may take time to load)\n"
        "2: Quick start (basic features only)\n"
        "Enter choice (1 or 2): "
    )

    (quick_start if startup_mode == "2" else main)()

Automated Metadata Generation System


Choose startup mode:
1: Full system (may take time to load)
2: Quick start (basic features only)
Enter choice (1 or 2):  1


Automated Metadata Generation System
Checking system readiness...
Models loaded: 3/3

Select operation mode:
1: Process single file
2: Batch process folder
3: Start web interface


Enter choice (1-3):  2
Enter folder path:  /home/navya/Downloads/test_metadata/


Processing documents in batch...


In [21]:
def calculate_confidence_score(text):
    """Calculate a heuristic confidence score for metadata quality.

    The score is the sum of four independent factors, capped at 1.0:

    * +0.3 if the text is longer than 100 characters
    * +0.3 if the spaCy pipeline ``nlp`` finds at least one entity
    * +0.2 if splitting the text on '.' yields more than 3 parts
    * +0.2 if the Flesch reading-ease score is above 30

    A factor whose dependency is unavailable (spaCy model not loaded, textstat
    not imported) or fails is skipped instead of raising.

    Args:
        text: Preprocessed document text.

    Returns:
        A float between 0.0 and 1.0; 0.0 for empty, blank or None text.
    """
    if not text or not text.strip():
        return 0.0

    score = 0.0

    # Text length factor
    if len(text) > 100:
        score += 0.3

    # Entity detection factor
    try:
        spacy_model = nlp
    except NameError:
        # The import cell leaves `nlp` undefined when spaCy failed to load
        spacy_model = None
    if spacy_model is not None:
        try:
            if len(spacy_model(text).ents) > 0:
                score += 0.3
        except Exception as e:
            print(f"Entity check skipped: {e}")

    # Structure factor (presence of sentences)
    sentences = text.split('.')
    if len(sentences) > 3:
        score += 0.2

    # Readability factor
    try:
        if flesch_reading_ease(text) > 30:  # Readable text
            score += 0.2
    except Exception as e:
        print(f"Readability check skipped: {e}")

    return min(score, 1.0)

In [22]:
import logging

def setup_logging():
    """Configure root logging and return this module's logger.

    Records at INFO level and above are sent both to the file
    'metadata_generation.log' and to a stream handler, formatted as
    "<time> - <level> - <message>".
    """
    log_handlers = [
        logging.FileHandler('metadata_generation.log'),
        logging.StreamHandler(),
    ]
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=log_handlers,
    )
    return logging.getLogger(__name__)

logger = setup_logging()