In [None]:
from google.colab import userdata
# Read the Gemini API key from Colab's user secrets (entry name 'gemini_api_key')
# so the key itself never appears in the notebook source.
api_key = userdata.get('gemini_api_key')

In [None]:
import google.generativeai as genai
from tqdm import tqdm

# Register the API key with the google.generativeai client.
genai.configure(api_key=api_key)
# temperature=0 asks for the least random sampling, so repeated runs over the
# same slide text stay as stable as the API allows.
generation_config = genai.GenerationConfig(
        temperature=0,
)
# Shared model handle: every request issued through `model` uses generation_config.
model = genai.GenerativeModel("gemini-2.0-flash-001", generation_config=generation_config)

In [None]:
from google.api_core import retry
from google.api_core import exceptions

# Catch transient Gemini errors.
def is_retryable(e) -> bool:
    """Decide whether a failed Gemini request is worth retrying.

    Args:
        e: The exception raised by the request.

    Returns:
        True when ``retry.if_transient_error`` accepts the exception, or when
        it is a ``TooManyRequests`` (HTTP 429) or ``ServiceUnavailable``
        (HTTP 503) error; False for everything else.
    """
    # Let google.api_core's stock transient-error predicate decide first.
    if retry.if_transient_error(e):
        return True
    # Then check the rate-limit (429) and overload (503) classes explicitly.
    return isinstance(e, (exceptions.TooManyRequests, exceptions.ServiceUnavailable))

@retry.Retry(predicate=is_retryable)
def generate_content_with_rate_limit(prompt):
    """Send ``prompt`` to the shared Gemini ``model`` and return the reply text.

    The function is wrapped in ``retry.Retry`` so that exceptions accepted by
    ``is_retryable`` trigger another attempt instead of surfacing immediately.

    Args:
        prompt: Passed unchanged to ``model.generate_content``.

    Returns:
        The ``.text`` of the response.

    Raises:
        Any exception from ``model.generate_content`` or from the ``.text``
        accessor that ``is_retryable`` rejects. The accessor raises when the
        response carries no valid part (for example a recitation finish
        reason), so callers should be prepared for that.
    """
    response = model.generate_content(prompt)
    return response.text

In [None]:
def remove_markdown_tag(text):
    """Strip Markdown code-fence markers from a model reply.

    Args:
        text: Reply text, typically wrapped in a ```markdown ... ``` fence.

    Returns:
        ``text`` with every "```markdown" and every remaining "```" removed.
        Surrounding newlines are kept. Fences of code blocks inside the slide
        body are removed as well, because all occurrences are replaced.
    """
    # Order matters: the longer opener must be removed first, otherwise
    # stripping the bare fence would leave a stray "markdown" word behind.
    for fence in ("```markdown", "```"):
        text = text.replace(fence, "")
    return text

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; force_remount=True requests a fresh mount
# even if Drive is already mounted in this session.
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [None]:
# Copy the dataset archive from the mounted Drive into the current working directory.
!cp "/content/drive/MyDrive/dataset-pdf-zip/dataset-pdf-20250430T213758Z-001.zip" .

In [None]:
# Delete any previously extracted dataset-pdf directory so extraction starts clean.
!rm -rf dataset-pdf

In [None]:
# Extract the archive; its entries are rooted at dataset-pdf/ (see the inflated paths in the output).
# NOTE(review): consider `unzip -q`; the per-file log is long enough that Colab truncates it.
!unzip "dataset-pdf-20250430T213758Z-001.zip"

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: dataset-pdf/287/images/Figure_2.png  
  inflating: dataset-pdf/1041/images/Figure_3.png  
  inflating: dataset-pdf/1041/images/Figure_11.png  
  inflating: dataset-pdf/1041/images/Figure_6.png  
  inflating: dataset-pdf/1041/images/Figure_5.png  
  inflating: dataset-pdf/1041/images/Figure_2.png  
  inflating: dataset-pdf/222/images/Figure_5.png  
  inflating: dataset-pdf/1041/images/Figure_8.png  
  inflating: dataset-pdf/1041/images/Figure_9.png  
  inflating: dataset-pdf/106/paper.pdf  
  inflating: dataset-pdf/1041/images/Figure_7.png  
  inflating: dataset-pdf/156/images/Figure_7.png  
  inflating: dataset-pdf/1041/images/Figure_10.png  
  inflating: dataset-pdf/106/images/Figure_5.png  
  inflating: dataset-pdf/106/images/Figure_6.png  
  inflating: dataset-pdf/106/images/Figure_4.png  
  inflating: dataset-pdf/156/images/Figure_9.png  
  inflating: dataset-pdf/1041/images/Figure_4.png  
  inflating: da

In [None]:
from pathlib import Path
# Entries directly under the dataset root are named by integer id; order them
# numerically (the sort key parses each name with int).
dataset_root = Path("/content/dataset-pdf")
dataset_dir = sorted(dataset_root.glob("*"), key=lambda entry: int(entry.name))

# Positional split: the first 1001 entries form the train set, the remainder the test set.
train_dataset, test_dataset = dataset_dir[:1001], dataset_dir[1001:]

# Sanity checks: expected split sizes, and no id on the wrong side of the boundary.
assert len(train_dataset) == 1001
assert len(test_dataset) == 100
for train_path in train_dataset:
    assert int(train_path.name) < 1001, f"Expect train dataset to be 0-1000 but got {train_path.name}"
for test_path in test_dataset:
    assert int(test_path.name) >= 1001, f"Expect test dataset to be 1001-1100 but got {test_path.name}"

In [None]:
def get_clean_slide(slide_md_text):
    """Ask the model to clean up and de-duplicate one slide deck.

    The prompt is the literal header ``FROM:``, then the slide text, then a
    fixed block of clean-up rules. The rules ask for merged incremental
    slides, concise non-repeated titles, Slidev-compatible Markdown, and a
    reply wrapped in a ```markdown fence.

    Args:
        slide_md_text: Raw slide Markdown to clean; interpolated into the
            prompt as-is.

    Returns:
        Whatever ``generate_content_with_rate_limit`` returns for the prompt,
        i.e. the raw reply text, fence markers included.
    """
    # The wording below is sent to the model verbatim; editing it changes the output.
    cleanup_rules = """You will have to cleanup slide text above to easier to understand. If you saw duplicate slide you will have to remove it too.Each slide should cover a complete topic or subtopic, not incremental additions.
For example, if a topic like "Introduction" has several bullet points, combine them into one slide titled "Introduction", rather than creating separate slides for each point.
Keep slide titles concise and avoid repeating titles across multiple slides. You will have to output slide text with ```markdown ``` format. do not include unnecessary text such as Okay. you will have to adjust slide text According to slidev or https://github.com/slidevjs/slidev"""
    prompt = f"FROM:\n{slide_md_text}\n{cleanup_rules}"
    return generate_content_with_rate_limit(prompt)

In [None]:
from tqdm import tqdm
import time
# Index into train_dataset where this run starts. Presumably the first 105
# entries were already cleaned in an earlier session; confirm before re-running,
# because every slide.md is overwritten in place and a second pass would feed
# already-cleaned text back to the model.
RESUME_FROM = 105
# Pause after every request, successful or not (presumably to respect the API rate limit).
SECONDS_BETWEEN_REQUESTS = 4

# Directories whose slide.md could not be cleaned, kept so they can be inspected or retried.
failed_slide_dirs = []

for path in tqdm(train_dataset[RESUME_FROM:]):
    unclean_slide_markdown_path = path / "slide.md"
    with open(unclean_slide_markdown_path, "r") as f:
        slide_md_text = f.read()
    try:
        clean_slide_md_text = remove_markdown_tag(get_clean_slide(slide_md_text))
        # Never replace the source slide with an empty reply; treat it as a failure.
        if not clean_slide_md_text.strip():
            raise ValueError("model returned no slide text")
        with open(unclean_slide_markdown_path, "w") as f:
            f.write(clean_slide_md_text)
    except Exception as e:
        # Best effort: a single bad reply (e.g. a recitation block) must not stop
        # the batch, but record which file it was so the failure is traceable.
        failed_slide_dirs.append(path)
        print(f"{unclean_slide_markdown_path}: {e}")
    time.sleep(SECONDS_BETWEEN_REQUESTS)

if failed_slide_dirs:
    print(f"{len(failed_slide_dirs)} slide file(s) left uncleaned: {[p.name for p in failed_slide_dirs]}")



  0%|          | 0/896 [00:00<?, ?it/s]

Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 4. Meaning that the model was reciting from copyrighted material.


 62%|██████▏   | 553/896 [2:14:06<1:13:08, 12.79s/it]

Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 4. Meaning that the model was reciting from copyrighted material.


 76%|███████▌  | 678/896 [2:43:44<47:10, 12.98s/it]

Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 4. Meaning that the model was reciting from copyrighted material.


100%|██████████| 896/896 [3:34:50<00:00, 14.39s/it]


In [None]:
# Trim whitespace from both ends of every train slide file. str.strip() removes
# leading whitespace as well as trailing spaces/newlines.
for path in tqdm(train_dataset):
    unclean_slide_markdown_path = path / "slide.md"
    slide_md_text = unclean_slide_markdown_path.read_text()
    clean_slide_md_text = slide_md_text.strip()
    unclean_slide_markdown_path.write_text(clean_slide_md_text)

100%|██████████| 1001/1001 [00:00<00:00, 5353.43it/s]


In [None]:
# Re-pack the whole dataset-pdf directory recursively into a new archive.
# NOTE(review): `zip -q` would avoid the per-file log that Colab truncates.
!zip -r dataset-pdf-clean.zip dataset-pdf

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  adding: dataset-pdf/896/paper.pdf (deflated 32%)
  adding: dataset-pdf/896/slide.xml (deflated 75%)
  adding: dataset-pdf/896/slide.md (deflated 66%)
  adding: dataset-pdf/896/figures_and_tables.txt (deflated 52%)
  adding: dataset-pdf/896/input_prompt.txt (deflated 67%)
  adding: dataset-pdf/896/1084.windowed_summarunner_scores.txt (deflated 89%)
  adding: dataset-pdf/896/extracted_text_with_tag.txt (deflated 67%)
  adding: dataset-pdf/994/ (stored 0%)
  adding: dataset-pdf/994/images/ (stored 0%)
  adding: dataset-pdf/994/images/Figure_1.png (deflated 4%)
  adding: dataset-pdf/994/slide.pdf (deflated 8%)
  adding: dataset-pdf/994/paper.tei.xml (deflated 75%)
  adding: dataset-pdf/994/extracted_text.txt (deflated 63%)
  adding: dataset-pdf/994/paper.pdf (deflated 4%)
  adding: dataset-pdf/994/slide.xml (deflated 58%)
  adding: dataset-pdf/994/slide.md (deflated 55%)
  adding: dataset-pdf/994/figures_and_tables.txt (def

In [None]:
# Copy the new archive to Drive. The destination path differs from the path the
# input archive was read from, so that input file is not overwritten.
!cp dataset-pdf-clean.zip "/content/drive/MyDrive/nlp/dataset-pdf-zip.zip"

In [None]:
# Spot-check: print the raw model reply (fence markers included) for one slide.
# Goes through get_clean_slide so the preview always sends exactly the prompt
# used by the batch run instead of a hand-copied duplicate.
# NOTE: slide_md_text is whatever the most recently executed cell left behind;
# assign it explicitly before running this cell on a fresh kernel.
print(get_clean_slide(slide_md_text))

```markdown
---
theme: default
title: Matrix-Vector Multiplication in Sub-Quadratic Time (Some Preprocessing Required)
class: text-center
---

# Matrix-Vector Multiplication in Sub-Quadratic Time
Ryan Williams

---

# Introduction

Matrix-vector multiplication is a fundamental operation in scientific computing. The question is: How fast can an n x n matrix-vector multiplication be performed? Note that it takes O(n²) steps just to read the matrix!

---

# Main Result

If we allow O(n²⁺ε) preprocessing, then matrix-vector multiplication over any finite semiring can be done in O(n²/(log n)²).

---

# Better Algorithms for Matrix Multiplication

Three major developments in matrix multiplication algorithms:

*   Arlazarov et al. (Four Russians, 1960s): O(n³/log n) operations. Uses table lookups, good for hardware with short vector operations as primitives.
*   Strassen (1969): O(n^log₂7) = O(n².⁸¹) operations. Asymptotically fast, but overhead exists. Experiments are inconclusive about Stra