In [2]:
!pip install flask flask-cors pyngrok transformers scikit-learn pdfplumber python-docx pytesseract langdetect spacy
!python -m spacy download en_core_web_sm
!apt-get install -y tesseract-ocr



Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m120.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [3]:
# Replace with your own ngrok auth token
!ngrok config add-authtoken 2yrAm1yrFsl2KmvLeumqKoLxZaU_6HGgQnvaRfMHNv1X6gE7s




Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [4]:
import os

os.makedirs("templates", exist_ok=True)
os.makedirs("static", exist_ok=True)

# index.html
with open("templates/index.html", "w") as f:
    f.write("""
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <title>📄 Metadata Generator</title>
  <link rel="stylesheet" href="{{ url_for('static', filename='style.css') }}">
</head>
<body>
  <div class="container">
    <h1>📁 Upload a Document</h1>
    <form method="POST" enctype="multipart/form-data">
      <input type="file" name="file" required><br>
      <button type="submit">Generate Metadata</button>
    </form>

    {% if metadata %}
    <div class="result">
      <h2>📌 Title</h2>
      <p>{{ metadata.title }}</p>

      <h2>📊 Word Count</h2>
      <p>{{ metadata.word_count }}</p>

      <h2>🌐 Language</h2>
      <p>{{ metadata.language }}</p>

      <h2>👀 Preview</h2>
      <pre>{{ metadata.preview }}</pre>

      <h2>🔍 Summary</h2>
      <textarea readonly>{{ metadata.summary }}</textarea>

      <h2>📝 Keywords</h2>
      <ul>
        {% for keyword in metadata.keywords %}
          <li>{{ keyword }}</li>
        {% endfor %}
      </ul>

      <h2>🏷️ Named Entities</h2>
      <ul>
        {% for entity in metadata.named_entities %}
          <li>{{ entity }}</li>
        {% endfor %}
      </ul>

      <h2>📥 Download</h2>
      <form method="POST" action="/download">
        <input type="hidden" name="data" value='{{ metadata | tojson }}'>
        <button type="submit">⬇️ Download Metadata (JSON)</button>
      </form>
    </div>
    {% endif %}
  </div>
</body>
</html>
""")

# style.css
with open("static/style.css", "w") as f:
    f.write("""
body {
  font-family: Arial, sans-serif;
  background-color: #f1f1f1;
  padding: 30px;
}
.container {
  max-width: 700px;
  margin: auto;
  background-color: white;
  padding: 30px;
  border-radius: 10px;
  box-shadow: 0 0 10px rgba(0,0,0,0.1);
}
textarea {
  width: 100%;
  height: 150px;
  padding: 10px;
  margin-top: 10px;
  resize: vertical;
}
button {
  background-color: #4CAF50;
  color: white;
  border: none;
  padding: 10px 20px;
  margin-top: 10px;
  font-size: 16px;
  cursor: pointer;
}
.result {
  margin-top: 30px;
}
""")


In [5]:
from flask import Flask, request, render_template, send_file
from pyngrok import ngrok
import os, json
import pytesseract, pdfplumber
from docx import Document
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from langdetect import detect
import spacy

# Load NLP models
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
nlp = spacy.load("en_core_web_sm")

app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = 'uploads'
os.makedirs('uploads', exist_ok=True)

# --- Utility Functions ---
def extract_text(path):
    try:
        ext = os.path.splitext(path)[1].lower()
        if ext == '.pdf':
            with pdfplumber.open(path) as pdf:
                return '\n'.join(p.extract_text() for p in pdf.pages if p.extract_text())
        elif ext == '.docx':
            return '\n'.join(p.text for p in Document(path).paragraphs)
        elif ext == '.txt':
            return open(path, 'r', encoding='utf-8').read()
        elif ext in ['.jpg', '.jpeg', '.png']:
            return pytesseract.image_to_string(path)
        else:
            return ""
    except Exception as e:
        print(f"[ERROR] extract_text: {e}")
        return ""

def generate_summary(text):
    try:
        input_text = text[:1000]
        return summarizer(input_text, max_length=100, min_length=30, do_sample=False)[0]['summary_text']
    except Exception as e:
        print(f"[ERROR] summary: {e}")
        return "Summary could not be generated."

def extract_keywords(text):
    try:
        vectorizer = TfidfVectorizer(stop_words='english', max_features=10)
        vectorizer.fit_transform([text])
        return vectorizer.get_feature_names_out().tolist()
    except Exception as e:
        print(f"[ERROR] keywords: {e}")
        return []

def detect_language(text):
    try:
        return detect(text)
    except Exception as e:
        print(f"[ERROR] langdetect: {e}")
        return "unknown"

def extract_named_entities(text):
    try:
        doc = nlp(text)
        return list(set(ent.text for ent in doc.ents if ent.label_ in ['PERSON', 'ORG', 'GPE']))
    except Exception as e:
        print(f"[ERROR] NER: {e}")
        return []

# --- Routes ---
@app.route("/", methods=["GET", "POST"])
def index():
    metadata = None
    if request.method == "POST":
        file = request.files.get("file")
        if file:
            try:
                path = os.path.join(app.config['UPLOAD_FOLDER'], file.filename)
                file.save(path)
                print(f"[INFO] File saved to {path}")

                text = extract_text(path)
                print(f"[INFO] Extracted text length: {len(text)}")

                if text.strip():
                    metadata = {
                        "title": file.filename,
                        "word_count": len(text.split()),
                        "language": detect_language(text),
                        "preview": '\n'.join(text.strip().splitlines()[:3]),
                        "summary": generate_summary(text),
                        "keywords": extract_keywords(text),
                        "named_entities": extract_named_entities(text)
                    }
                    print("[INFO] Metadata generated successfully.")
                else:
                    print("[WARN] Empty text extracted.")
            except Exception as e:
                print(f"[ERROR] During processing: {e}")
    return render_template("index.html", metadata=metadata)

@app.route("/download", methods=["POST"])
def download_metadata():
    try:
        data = json.loads(request.form['data'])
        with open("metadata.json", "w") as f:
            json.dump(data, f, indent=2)
        return send_file("metadata.json", as_attachment=True)
    except Exception as e:
        print(f"[ERROR] download: {e}")
        return "Failed to generate metadata file", 500


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cpu


In [None]:
public_url = ngrok.connect(5000)
print("🌐 App is running at:", public_url)
app.run()


🌐 App is running at: NgrokTunnel: "https://d505-34-16-172-32.ngrok-free.app" -> "http://localhost:5000"
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [25/Jun/2025 03:56:46] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [25/Jun/2025 03:56:47] "GET /static/style.css HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [25/Jun/2025 03:56:48] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -


[INFO] File saved to uploads/5-mb-example-file.pdf
[INFO] Extracted text length: 2895623


INFO:werkzeug:127.0.0.1 - - [25/Jun/2025 03:59:26] "POST / HTTP/1.1" 200 -


[ERROR] NER: [E088] Text of length 2895623 exceeds maximum of 1000000. The parser and NER models require roughly 1GB of temporary memory per 100,000 characters in the input. This means long texts may cause memory allocation errors. If you're not using the parser or NER, it's probably safe to increase the `nlp.max_length` limit. The limit is in number of characters, so you can check whether your inputs are too long by checking `len(text)`.
[INFO] Metadata generated successfully.


INFO:werkzeug:127.0.0.1 - - [25/Jun/2025 03:59:27] "[36mGET /static/style.css HTTP/1.1[0m" 304 -
INFO:werkzeug:127.0.0.1 - - [25/Jun/2025 04:00:37] "[31m[1mPOST / HTTP/1.1[0m" 400 -
INFO:werkzeug:127.0.0.1 - - [25/Jun/2025 04:00:37] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [25/Jun/2025 04:00:38] "[36mGET /static/style.css HTTP/1.1[0m" 304 -
INFO:werkzeug:127.0.0.1 - - [25/Jun/2025 04:01:20] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [25/Jun/2025 04:01:21] "[36mGET /static/style.css HTTP/1.1[0m" 304 -
INFO:werkzeug:127.0.0.1 - - [25/Jun/2025 04:02:23] "[31m[1mPOST / HTTP/1.1[0m" 400 -
INFO:werkzeug:127.0.0.1 - - [25/Jun/2025 04:03:00] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [25/Jun/2025 04:03:00] "[36mGET /static/style.css HTTP/1.1[0m" 304 -


[INFO] File saved to uploads/5-mb-example-file.pdf
[INFO] Extracted text length: 2895623


INFO:werkzeug:127.0.0.1 - - [25/Jun/2025 04:05:38] "POST / HTTP/1.1" 200 -


[ERROR] NER: [E088] Text of length 2895623 exceeds maximum of 1000000. The parser and NER models require roughly 1GB of temporary memory per 100,000 characters in the input. This means long texts may cause memory allocation errors. If you're not using the parser or NER, it's probably safe to increase the `nlp.max_length` limit. The limit is in number of characters, so you can check whether your inputs are too long by checking `len(text)`.
[INFO] Metadata generated successfully.


INFO:werkzeug:127.0.0.1 - - [25/Jun/2025 04:05:39] "[36mGET /static/style.css HTTP/1.1[0m" 304 -
INFO:werkzeug:127.0.0.1 - - [25/Jun/2025 04:06:11] "POST /download HTTP/1.1" 200 -
