In [1]:
!pip install -U langchain-community
!!pip install faiss-cpu
!pip install transformers accelerate torch docx python-docx
!pip install -U bitsandbytes

Collecting bitsandbytes
  Using cached bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.5


In [2]:
import os
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from langchain_community.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.schema import Document
from docx import Document as DocxDocument
from huggingface_hub import login

# --- Load environment variables ---
# The Hugging Face access token is read from the HF_TOKEN environment variable
# so that no credential is ever stored in the notebook source or its outputs.
# If the variable is not set, prompt for it interactively (input is not echoed).
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    from getpass import getpass
    HF_TOKEN = getpass("Hugging Face access token: ")

# --- Model and tokenizer setup ---
LLM_MODEL = "mistralai/Mistral-7B-Instruct-v0.3"
# Load weights in 4-bit with double quantisation; computation runs in float16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16
)

# Authenticate with the Hub, then fetch tokenizer and quantised model.
login(token=HF_TOKEN)
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL,
    token=HF_TOKEN,
    quantization_config=quantization_config,
    device_map="auto"  # let accelerate decide device placement
)

# Text-generation pipeline used by every note-generation step in this notebook.
llm_pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=2024,  # NOTE(review): possibly intended to be 2048 — confirm
    do_sample=True,  # temperature is only honoured when sampling is enabled
    temperature=0.3,
    return_full_text=False  # return only the newly generated text, not the prompt
)

# --- Input paths ---
TRANSCRIPT_PATH = "/content/Lora&Qlora_transcript.txt"
OUTPUT_DOCX = "lecture_notes.docx"

# --- Cleaning function for transcript ---
def clean_transcript(text):
    """Normalise a raw transcript into a single line of cleaned text.

    Steps, in order:
      1. Strip every line and drop empty ones.
      2. Drop repeated lines, keeping the first occurrence and original order.
      3. Join the remaining lines with single spaces.
      4. Remove the filler words "uh", "um" (with repeated last letter),
         "like" and "you know" (case-insensitive, whole words only), together
         with the whitespace before them and a comma directly after them.
      5. Collapse any run of whitespace to one space and trim the ends, so
         that removed fillers do not leave double spaces behind.

    Args:
        text (str): Raw transcript text, possibly multi-line.

    Returns:
        str: The cleaned transcript on a single line ("" for empty input).
    """
    import re

    stripped = (line.strip() for line in text.splitlines())
    # dict.fromkeys de-duplicates while preserving first-seen order.
    unique_lines = dict.fromkeys(line for line in stripped if line)
    joined = " ".join(unique_lines)

    # Consume the leading whitespace and an attached trailing comma so that
    # "so, um, we" becomes "so, we" rather than "so, , we".
    without_fillers = re.sub(
        r"\s*\b(?:uh+|um+|like|you know)\b,?",
        "",
        joined,
        flags=re.IGNORECASE,
    )
    return re.sub(r"\s+", " ", without_fillers).strip()

# --- Load and clean transcript ---
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding (the later cell reads the same file the same way).
with open(TRANSCRIPT_PATH, "r", encoding="utf-8") as f:
    raw_transcript = f.read()
cleaned_transcript = clean_transcript(raw_transcript)

# --- RAG input corpus: only transcript ---
# NOTE(review): the entire cleaned transcript is indexed as ONE document, so
# the vector store holds a single entry and the retriever below can return at
# most that one document regardless of k.
documents = [Document(page_content=cleaned_transcript, metadata={"source": "transcript"})]

# --- Embedding and Vectorstore ---
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vectorstore = FAISS.from_documents(documents, embedding_model)
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# --- Prompt formatting ---
def build_prompt(context):
    """Assemble the note-writing prompt around a piece of transcript text.

    The prompt consists of a fixed instruction preamble, the transcript text
    under a "Transcript Content:" label, and a closing request line.

    Args:
        context: Transcript text to embed in the prompt.

    Returns:
        str: The complete prompt, ending with a newline.
    """
    preamble = (
        "You are a helpful assistant who writes detailed and organized lecture notes from the given transcript.\n"
        "Organize notes into sections with headings and bullet points. Use paragraphs for explanations.\n\n"
    )
    transcript_section = f"Transcript Content:\n{context}\n\n"
    closing_request = "Write structured, cleaned, and comprehensive lecture notes:\n"
    return preamble + transcript_section + closing_request

# --- QA Chain setup ---
def generate_notes():
    """Generate lecture notes from the indexed transcript with one LLM call.

    Queries the module-level ``retriever`` with a fixed instruction, joins the
    text of the returned documents with blank lines, wraps it with
    ``build_prompt`` and runs the result through ``llm_pipe``.

    Returns:
        The ``generated_text`` field of the first pipeline result.
    """
    query = "Create detailed lecture notes from the transcript."
    # `invoke` is the supported retriever entry point; the older
    # `get_relevant_documents` is deprecated and emits a warning on every call.
    docs = retriever.invoke(query)
    combined_context = "\n\n".join(doc.page_content for doc in docs)
    final_prompt = build_prompt(combined_context)
    return llm_pipe(final_prompt)[0]['generated_text']

# --- Generate notes and save ---
# Run retrieval + generation once; generate_notes() returns the generated text.
generated_notes = generate_notes()
# Build a new Word document: a level-0 heading followed by all generated
# notes in a single paragraph.
doc = DocxDocument()
doc.add_heading("Lecture Notes", 0)
doc.add_paragraph(generated_notes)
# OUTPUT_DOCX is a relative file name, so the file is written relative to the
# kernel's current working directory.
doc.save(OUTPUT_DOCX)

print(f"✅ Lecture notes generated and saved to {OUTPUT_DOCX}")

tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Device set to use cuda:0
  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  docs = retriever.get_relevant_documents(query)
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


✅ Lecture notes generated and saved to lecture_notes.docx


In [3]:
# --- Input Transcript Path ---
# Re-assigned with the same values as in the previous cell. Because
# OUTPUT_DOCX is unchanged, the save at the end of this cell writes to the
# same file name as the previous cell's save.
TRANSCRIPT_PATH = "/content/Lora&Qlora_transcript.txt"
OUTPUT_DOCX = "lecture_notes.docx"

# --- Clean transcript ---
def clean_transcript(text):
    """Normalise a raw transcript into a single line of cleaned text.

    Steps, in order:
      1. Strip every line and drop empty ones.
      2. Drop repeated lines, keeping the first occurrence and original order.
      3. Join the remaining lines with single spaces.
      4. Remove the filler words "uh", "um" (with repeated last letter),
         "like" and "you know" (case-insensitive, whole words only), together
         with the whitespace before them and a comma directly after them.
      5. Collapse any run of whitespace to one space and trim the ends, so
         that removed fillers do not leave double spaces behind.

    Args:
        text (str): Raw transcript text, possibly multi-line.

    Returns:
        str: The cleaned transcript on a single line ("" for empty input).
    """
    import re

    stripped = (line.strip() for line in text.splitlines())
    # dict.fromkeys de-duplicates while preserving first-seen order.
    unique_lines = dict.fromkeys(line for line in stripped if line)
    joined = " ".join(unique_lines)

    # Consume the leading whitespace and an attached trailing comma so that
    # "so, um, we" becomes "so, we" rather than "so, , we".
    without_fillers = re.sub(
        r"\s*\b(?:uh+|um+|like|you know)\b,?",
        "",
        joined,
        flags=re.IGNORECASE,
    )
    return re.sub(r"\s+", " ", without_fillers).strip()

# --- Load and preprocess transcript ---
# Read the whole transcript file as UTF-8 text, then pass it through
# clean_transcript() to obtain the text that is chunked below.
with open(TRANSCRIPT_PATH, "r", encoding="utf-8") as f:
    raw_transcript = f.read()
cleaned_transcript = clean_transcript(raw_transcript)

# --- Split transcript into chunks ---
def split_into_chunks(text, chunk_size=1000, overlap=200):
    """Split text into overlapping windows of whitespace-separated words.

    Consecutive windows start ``chunk_size - overlap`` words apart, so each
    window repeats the last ``overlap`` words of the previous one. Splitting
    stops as soon as a window reaches the end of the text, so no trailing
    chunk made up purely of already-covered words is produced.

    Args:
        text (str): Text to split; words are obtained with ``str.split()``.
        chunk_size (int): Maximum number of words per chunk. Must be positive.
        overlap (int): Number of words shared by consecutive chunks. Must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        list[str]: Chunks with words re-joined by single spaces; ``[]`` when
        the text contains no words.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # A non-positive step would either fail inside range() or silently
        # yield no chunks at all; a negative overlap would skip words.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        if end >= len(words):
            # This window already covers the tail; a further window would
            # contain only words that were emitted above.
            break
    return chunks

# Split with the function's defaults (chunk_size=1000 words, overlap=200 words).
chunks = split_into_chunks(cleaned_transcript)

# --- Prompt builder (neutral) ---
def build_prompt(context):
    """Build the per-segment note-writing prompt.

    Re-binds the ``build_prompt`` name defined in the earlier cell with a
    prompt intended for a single transcript segment.

    Args:
        context: The transcript segment to embed in the prompt.

    Returns:
        str: Instructions, the segment text, and a trailing "Lecture Notes:"
        cue with no final newline.
    """
    instructions = "".join([
        "Below is a segment of a technical lecture transcript.\n",
        "Write detailed and structured notes covering all key points in this segment.\n",
        "Use section headings, bullet points, and clear explanations.\n\n",
    ])
    return f"{instructions}{context}\n\nLecture Notes:"

# --- Generate notes from each chunk ---
def generate_notes_per_chunk(chunks):
    """Run the LLM over each transcript chunk and collect the resulting notes.

    A progress line is printed before each chunk. If generation (or handling
    of its output) raises, the error is printed and that chunk is skipped;
    the remaining chunks are still processed.

    Args:
        chunks: Sequence of transcript segments.

    Returns:
        list[tuple[str, str]]: ``("Part <n>", notes)`` pairs for every chunk
        that succeeded, where ``n`` is the chunk's 1-based position.
    """
    collected = []
    for part_number, segment in enumerate(chunks, start=1):
        print(f"📄 Generating notes for chunk {part_number}/{len(chunks)}...")
        segment_prompt = build_prompt(segment)
        try:
            generated = llm_pipe(segment_prompt)[0]['generated_text']
            collected.append((f"Part {part_number}", generated.strip()))
        except Exception as e:
            # Best effort: report the failure and move on to the next chunk.
            print(f"⚠️ Error on chunk {part_number}: {e}")
    return collected

# --- Generate notes ---
# One LLM call per chunk; chunks that raised during generation are absent
# from the returned list.
all_notes = generate_notes_per_chunk(chunks)

# --- Save all notes to a .docx file ---
# Start a new document with a level-0 heading.
doc = DocxDocument()
doc.add_heading("Lecture Notes", 0)

# Each (title, content) pair becomes a level-1 heading followed by one
# paragraph holding that chunk's notes.
for title, content in all_notes:
    doc.add_heading(title, level=1)
    doc.add_paragraph(content)

# The document is saved even when all_notes is empty (heading only).
doc.save(OUTPUT_DOCX)
print(f"✅ Full lecture notes saved to: {OUTPUT_DOCX}")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


📄 Generating notes for chunk 1/25...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


📄 Generating notes for chunk 2/25...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


📄 Generating notes for chunk 3/25...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


📄 Generating notes for chunk 4/25...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


📄 Generating notes for chunk 5/25...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


📄 Generating notes for chunk 6/25...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


📄 Generating notes for chunk 7/25...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


📄 Generating notes for chunk 8/25...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


📄 Generating notes for chunk 9/25...


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


📄 Generating notes for chunk 10/25...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


📄 Generating notes for chunk 11/25...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


📄 Generating notes for chunk 12/25...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


📄 Generating notes for chunk 13/25...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


📄 Generating notes for chunk 14/25...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


📄 Generating notes for chunk 15/25...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


📄 Generating notes for chunk 16/25...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


📄 Generating notes for chunk 17/25...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


📄 Generating notes for chunk 18/25...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


📄 Generating notes for chunk 19/25...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


📄 Generating notes for chunk 20/25...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


📄 Generating notes for chunk 21/25...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


📄 Generating notes for chunk 22/25...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


📄 Generating notes for chunk 23/25...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


📄 Generating notes for chunk 24/25...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


📄 Generating notes for chunk 25/25...
✅ Full lecture notes saved to: lecture_notes.docx
