In [2]:
from google.colab import drive

In [3]:
!pip install pymupdf transformers requests

Collecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.3


In [5]:

import json
import os
import time

import fitz  # PyMuPDF
import requests
from transformers import AutoTokenizer

# Mount Google Drive so files under /content/drive/MyDrive are reachable.
drive.mount('/content/drive')

# Lightweight tokenizer, used only for counting tokens when chunking.
tokenizer_dir = "/content/drive/MyDrive/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)

# Step 1: Extract text
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text()`` output, with no
        separator inserted between pages.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        # join() builds the result once instead of growing a string per page.
        return "".join(page.get_text() for page in doc)

# Step 2: Chunk text into pieces of ~1024 tokens (the default max_tokens)
def chunk_text(text, max_tokens=1024, count_tokens=None):
    """Split ``text`` into chunks of whole words that fit a token budget.

    Words (whitespace-separated) are never split. A chunk is closed *before*
    adding a word that would push it over ``max_tokens``, so a chunk only
    exceeds the budget when a single word is itself larger than the budget.

    Args:
        text: The text to split.
        max_tokens: Token budget per chunk.
        count_tokens: Optional callable taking a word and returning its token
            count. When omitted, ``len(tokenizer.tokenize(word))`` is used with
            the module-level ``tokenizer``.

    Returns:
        A list of chunk strings, each made of words re-joined with single
        spaces. Empty or whitespace-only ``text`` yields an empty list.
    """
    if count_tokens is None:
        # Counting word by word is an approximation: the tokenizer may produce
        # a slightly different count when it sees the whole chunk at once.
        def counter(word):
            return len(tokenizer.tokenize(word))
    else:
        counter = count_tokens

    chunks = []
    current_words = []
    current_tokens = 0

    for word in text.split():
        word_tokens = counter(word)
        # Close the running chunk first if this word would overflow it.
        if current_words and current_tokens + word_tokens > max_tokens:
            chunks.append(" ".join(current_words))
            current_words = []
            current_tokens = 0
        current_words.append(word)
        current_tokens += word_tokens

    if current_words:
        chunks.append(" ".join(current_words))
    return chunks

# Step 3: Prepare prompts
def prepare_prompts(chunks):
    """Wrap each chunk in a summarization record whose output is still empty.

    Args:
        chunks: Iterable of text chunks.

    Returns:
        A list with one dict per chunk, holding the keys ``instruction``,
        ``input`` (the chunk) and ``output`` (an empty string).
    """
    records = []
    for piece in chunks:
        record = {}
        record["instruction"] = "Summarize the following text."
        record["input"] = piece
        record["output"] = ""
        records.append(record)
    return records

# Step 4: Call Mixtral via Together API
def call_mixtral_together(prompt_input, api_key=None):
    """Send one user message to Mixtral-8x7B-Instruct via the Together API.

    Args:
        prompt_input: Text sent as the single ``user`` message.
        api_key: Together API key. When omitted, the key is read from the
            ``TOGETHER_API_KEY`` environment variable.

    Returns:
        The ``content`` string of the first choice in the response.

    Raises:
        RuntimeError: If no API key is configured, or if the response is not
            valid JSON, reports an error, or contains no choices. The raw
            response body is included in the message for the latter cases.
        requests.RequestException: Network failures and timeouts from
            ``requests.post`` propagate unchanged.
    """
    # Never keep the key in the notebook: a key committed to source must be
    # treated as leaked and rotated.
    api_key = api_key or os.environ.get("TOGETHER_API_KEY")
    if not api_key:
        raise RuntimeError(
            "No Together API key configured: pass api_key=... or set the "
            "TOGETHER_API_KEY environment variable."
        )

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }
    data = {
        "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
        "messages": [
            {"role": "user", "content": prompt_input}
        ],
        "max_tokens": 128,
        "temperature": 0.7
    }
    # Without a timeout a stalled connection would hang the whole run.
    response = requests.post(
        "https://api.together.xyz/v1/chat/completions",
        headers=headers,
        json=data,
        timeout=60,
    )
    time.sleep(1.5)  # rate limiting
    try:
        # Decoding happens inside the try so that a non-JSON body (for example
        # an HTML error page) is reported together with the raw response.
        response_json = response.json()
        if "choices" in response_json and response_json["choices"]:
            return response_json["choices"][0]["message"]["content"]
        elif "error" in response_json:
            raise ValueError(f"API Error: {response_json['error']}")
        else:
            raise ValueError(f"Unexpected response format: {response_json}")
    except Exception as e:
        raise RuntimeError(f"Failed to parse response: {e}\nRaw response: {response.text}") from e

# Step 5: Run everything
pdf_path = "/content/drive/MyDrive/real_estate_sector.pdf"
raw_text = extract_text_from_pdf(pdf_path)

chunks = chunk_text(raw_text)
dataset = prepare_prompts(chunks)

# Optional smoke test of the API call before labeling the whole dataset:
# test_prompt = "Summarize the following text:\nArtificial Intelligence is transforming many industries..."
# print(call_mixtral_together(test_prompt))

# Generate pseudo-labels: each unlabeled chunk gets a model-written summary as
# its "output", turning the unlabeled records into labeled ones. Records are
# saved as JSON Lines (one JSON object per line).
failed_indices = []
with open("/content/drive/MyDrive/pseudo_labeled.jsonl", "w", encoding="utf-8") as f_out:
    for index, entry in enumerate(dataset):
        try:
            entry["output"] = call_mixtral_together(f"{entry['instruction']}\n{entry['input']}")
            f_out.write(json.dumps(entry) + "\n")
            # Flush per record so finished entries are not left sitting in
            # Python's write buffer if the run is interrupted.
            f_out.flush()
        except Exception as e:
            # Best-effort: skip this entry, but record which one failed so it
            # can be retried later.
            failed_indices.append(index)
            print(f"Failed on entry {index}: {e}")

if failed_indices:
    print(f"{len(failed_indices)} of {len(dataset)} entries failed: {failed_indices}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
