<a href="https://colab.research.google.com/github/mioyn/AdvDataProg/blob/main/LiteratureReviewAssistantv4(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Literature Review Assistant

**Abstract → Components → Search → Select → Related Work**

Run cells in order.


Licensed under the MIT License.

In [None]:
# @title 1. Setup
# GPU_TYPE decides which prebuilt llama-cpp-python wheel is downloaded further down in this cell.
GPU_TYPE = "A100"  # @param ["A100", "T4"]

from IPython.display import HTML, display

# CSS hack for word wrap
def set_css(*_event_args, **_event_kwargs):
    """Inject a CSS rule so long lines in ``<pre>`` output wrap.

    Registered below as an IPython ``pre_run_cell`` callback. IPython documents
    that event as handing an ``info`` object to its callbacks, so any arguments
    are accepted and ignored; calling ``set_css()`` with no arguments still works.
    """
    display(HTML('<style>pre { white-space: pre-wrap; }</style>'))
get_ipython().events.register('pre_run_cell', set_css)

def cprint(text, color='white'):
    """Print ``text`` wrapped in an ANSI colour escape sequence.

    ``color`` may be red, green, yellow, blue, pink, teal, grey or white.
    Any other value falls back to the white code (97).
    """
    ansi_codes = dict(
        red='91', green='92', yellow='93', blue='94',
        pink='95', teal='96', grey='90', white='97',
    )
    code = ansi_codes[color] if color in ansi_codes else "97"
    print(f'\033[{code}m{text}\x1b[0m')

# Print the name of the attached GPU so it can be compared with GPU_TYPE.
!nvidia-smi --query-gpu=name --format=csv,noheader

cprint("Installing dependencies...", 'blue')

# Each branch downloads a different wheel, but both save it under the same
# local filename, so the pip install step below is shared.
if GPU_TYPE == "A100":
    !wget https://antidote.cloud/f/29294c604b024f2eb1ff/?dl=1 -O llama_cpp_python-0.3.16-cp312-cp312-linux_x86_64.whl -q
else:
    !wget https://antidote.cloud/f/ae5312aa983845c7abf1/?dl=1 -O llama_cpp_python-0.3.16-cp312-cp312-linux_x86_64.whl -q

!pip install ./llama_cpp_python-0.3.16-cp312-cp312-linux_x86_64.whl -q
!pip install huggingface_hub pandas --quiet

cprint("Downloading model...", 'blue')
# Download the Q5_K_M GGUF file into the current directory; the next cell
# loads it from ./Mistral-Nemo-Instruct-2407-Q5_K_M.gguf.
!huggingface-cli download bartowski/Mistral-Nemo-Instruct-2407-GGUF \
    Mistral-Nemo-Instruct-2407-Q5_K_M.gguf \
    --local-dir . --local-dir-use-symlinks False

cprint("Setup complete", 'green')

NVIDIA A100-SXM4-40GB
[94mInstalling dependencies...[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h[94mDownloading model...[0m
Downloading 'Mistral-Nemo-Instruct-2407-Q5_K_M.gguf' to '.cache/huggingface/download/nPs0nIBuHPYaf12n7QiCpTH-YPM=.f2118506b57d31403cba021b476ceb95600cc278fbc2feeaaf884b6e64fa6ee5.incomplete'
Mistral-Nemo-Instruct-2407-Q5_K_M.gguf: 100% 8.73G/8.73G [00:22<00:00, 387MB/s]
Download complete. Moving file to Mistral-Nemo-Instruct-2407-Q5_K_M.gguf
Mistral-Nemo-Instruct-2407-Q5_K_M.gguf
[92mSetup complete[0m


In [None]:
# @title 2. Load Model and Utilities
from llama_cpp import Llama
import json, os, re, requests, time

# One constructor call: the settings common to both GPUs are spelled out once,
# and the GPU-specific ones are merged in from the matching dict.
llm = Llama(
    model_path="./Mistral-Nemo-Instruct-2407-Q5_K_M.gguf",
    n_gpu_layers=-1,
    use_mmap=True,
    verbose=False,
    seed=42,
    **(
        dict(n_ctx=65536, n_batch=2048, n_ubatch=512, flash_attn=True, use_mlock=True)
        if GPU_TYPE == "A100"
        else dict(n_ctx=32768, n_batch=1024)
    ),
)
cprint(f"Model loaded ({GPU_TYPE})", 'green')

# Notebook-wide state shared between cells
STATE = dict([('abstract', ''), ('components', []), ('paper_components', [])])

def generate_bibtex_key(authors, year, title):
    """Generate a BibTeX key of the form ``<surname><year><titleword>``.

    Args:
        authors: list of author display names; only the first one is used.
        year: publication year (any value ``str()`` accepts); falsy -> '0000'.
        title: paper title; falsy -> the word 'paper' is used.

    Returns:
        Lower-case ASCII key, e.g. ``ettman2020prevalen``. The surname part is
        'unknown' when there is no usable first author.
    """
    import unicodedata  # local import keeps this helper self-contained

    first_author = ''
    # ``authors[0]`` may be '' or None when the data source has no display name.
    name_parts = (authors[0] or '').split() if authors else []
    if name_parts:
        # Transliterate accents (López -> Lopez) instead of dropping the letter.
        surname = unicodedata.normalize('NFKD', name_parts[-1])
        surname = surname.encode('ascii', 'ignore').decode('ascii').lower()
        first_author = re.sub(r'[^a-z]', '', surname)
    if not first_author:
        # No author, blank name, or a surname with no Latin letters at all.
        first_author = 'unknown'

    year_str = str(year) if year else '0000'

    stop_words = {'a', 'an', 'the', 'of', 'for', 'and', 'in', 'on', 'to', 'with'}
    # First non-stop-word of the title, truncated to 8 characters.
    title_word = next(
        (w.lower()[:8]
         for w in re.findall(r'\b[A-Za-z]+\b', title or 'paper')
         if w.lower() not in stop_words),
        '',
    )
    return f"{first_author}{year_str}{title_word}"

def format_author_citation(authors):
    """Format an in-text author label: 'Smith' or 'Smith et al.'.

    Args:
        authors: list of author display names.

    Returns:
        The last whitespace-separated token of the first author's name,
        followed by ' et al.' when there is more than one author.
        'Unknown' is used when the list is empty or the first name is blank/None.
    """
    if not authors:
        return "Unknown"
    # Guard against '' / None display names, which would otherwise crash.
    name_parts = (authors[0] or '').split()
    last_name = name_parts[-1] if name_parts else "Unknown"
    return f"{last_name} et al." if len(authors) > 1 else last_name

def generate_bibtex_entry(paper):
    """Generate a full ``@article`` BibTeX entry from a paper dict.

    Args:
        paper: dict with optional keys 'bibtex_key', 'title', 'authors',
            'year', 'venue' and 'doi'.

    Returns:
        The entry as a multi-line string. Missing, empty or None fields fall
        back to 'unknown' (key), 'Unknown' (title/author/journal) or an empty
        year; the doi line is omitted when there is no DOI.
    """
    key = paper.get('bibtex_key') or 'unknown'
    # Drop blank/None names so ' and '.join never sees a non-string.
    authors = [a for a in (paper.get('authors') or []) if a] or ['Unknown']
    author_str = ' and '.join(authors)
    title = paper.get('title') or 'Unknown'
    year = paper.get('year') or ''
    venue = paper.get('venue') or 'Unknown'
    # The doi field should hold the bare identifier, not the resolver URL.
    doi = re.sub(r'^https?://(dx\.)?doi\.org/', '',
                 (paper.get('doi') or '').strip(), flags=re.IGNORECASE)

    lines = [
        f"@article{{{key},",
        f"  title = {{{title}}},",
        f"  author = {{{author_str}}},",
        f"  year = {{{year}}},",
        f"  journal = {{{venue}}},",
    ]
    if doi:
        lines.append(f"  doi = {{{doi}}},")
    lines.append("}")
    return "\n".join(lines)

cprint("Utilities loaded", 'green')

llama_context: n_ctx_per_seq (65536) < n_ctx_train (1024000) -- the full capacity of the model will not be utilized


[92mModel loaded (A100)[0m
[92mUtilities loaded[0m


In [None]:
# @title 3. Enter Abstract

# The abstract typed into the form below is the starting point for the component
# extraction cell and is also inserted into the review-generation prompts.
ABSTRACT = """The rapid classification of social media content during humanitarian crises is essential for effective disaster response. This paper presents a training-free, multimodal classification framework using zero-shot learning with vision-language models for crisis-related social media analysis."""  # @param {type:"string"}

# Later cells read the abstract from STATE rather than from ABSTRACT directly.
STATE['abstract'] = ABSTRACT
cprint("Abstract saved", 'green')
print(f"\n{ABSTRACT}")

[92mAbstract saved[0m

The rapid classification of social media content during humanitarian crises is essential for effective disaster response. This paper presents a training-free, multimodal classification framework using zero-shot learning with vision-language models for crisis-related social media analysis.


In [None]:
# @title 4. Identify Components

# Instruction prompt wrapped in [INST] ... [/INST] tags; the saved abstract is
# interpolated and the model is asked for a plain numbered list.
prompt = f"""[INST] Identify 4-6 key research components from this abstract that would make good literature review subsection topics.

Abstract: {STATE['abstract']}

Rules:
- Each component: 2-5 words
- Focus on methods, domains, key concepts
- These will become subsection headings (e.g., "2.1 Social Media in Disaster Response")
- Be specific enough to find relevant papers

Return ONLY a numbered list:
1. First component
2. Second component
[/INST]"""

# Single non-streaming completion (at most 300 tokens, temperature 0.3);
# the generated text is taken from the first choice.
output = llm(prompt, max_tokens=300, temperature=0.3)
response = output["choices"][0]["text"]

# Parse the numbered list returned by the model into clean component names.
components = []
for line in response.strip().splitlines():
    # Accept both "1. Topic" and "1) Topic".
    match = re.match(r'^\d+[.)]\s*(.+)$', line.strip())
    if not match:
        continue
    # The model often wraps items in markdown emphasis (**Topic**) or quotes.
    # Strip that, plus a trailing full stop, so the names are usable as
    # search queries and section headings.
    label = re.sub(r'^[\s*_`"\']+|[\s*_`"\'.]+$', '', match.group(1))
    if label:
        components.append(label)

STATE['components'] = components

cprint("\nIdentified components:", 'green')
for i, c in enumerate(components, 1):
    print(f"  {i}. {c}")

[92m
Identified components:[0m
  1. **Social Media in Disaster Response**
  2. **Multimodal Classification Framework**
  3. **Zero-Shot Learning**
  4. **Vision-Language Models**
  5. **Training-Free Approach**


In [None]:
# @title 5. Edit Components

# Work on a shallow copy so STATE is only updated once editing is finished.
components = STATE['components'][:]

def show():
    """Print the working component list, numbered from 1."""
    cprint("\nCurrent components:", 'blue')
    numbered = [f"  {position}. {name}" for position, name in enumerate(components, 1)]
    if numbered:
        print("\n".join(numbered))

def menu():
    """Print the one-line list of editing actions, preceded by a blank line."""
    options_line = "\n[1] View  [2] Remove  [3] Move  [4] Add  [5] Done"
    print(options_line)

# Interactive editor: loop until the user picks [5] Done.
while True:
    show()
    menu()
    try:
        choice = input("Choice: ").strip()
        if choice == '1':
            # The list is re-printed at the top of every iteration anyway.
            continue
        elif choice == '2':
            idx = int(input("Remove #: ")) - 1
            if 0 <= idx < len(components):
                removed = components.pop(idx)
                cprint(f"Removed: {removed}", 'yellow')
            else:
                cprint(f"No component #{idx + 1}", 'red')
        elif choice == '3':
            src = int(input("Move #: ")) - 1
            dst = int(input("To #: ")) - 1
            if 0 <= src < len(components) and 0 <= dst < len(components):
                item = components.pop(src)
                components.insert(dst, item)
                cprint(f"Moved: {item}", 'green')
            else:
                cprint(f"Positions must be between 1 and {len(components)}", 'red')
        elif choice == '4':
            new = input("New component: ").strip()
            if new:
                components.append(new)
                cprint(f"Added: {new}", 'green')
        elif choice == '5':
            break
        else:
            cprint("Unknown choice - enter a number from 1 to 5", 'red')
    except (ValueError, IndexError):
        # int() failed on a non-numeric answer
        cprint("Invalid input", 'red')

STATE['components'] = components
cprint("\nFinal components:", 'green')
# Print the list directly: show() would add a second, misleading
# "Current components" header under "Final components".
for i, c in enumerate(components, 1):
    print(f"  {i}. {c}")

# Persist the list so the review cell can reload it in a later session.
with open('components.json', 'w', encoding='utf-8') as fh:
    json.dump(components, fh, indent=2)

[94m
Current components:[0m
  1. **Social Media in Disaster Response**
  2. **Multimodal Classification Framework**
  3. **Zero-Shot Learning**
  4. **Vision-Language Models**
  5. **Training-Free Approach**

[1] View  [2] Remove  [3] Move  [4] Add  [5] Done
Choice: 5
[92m
Final components:[0m
[94m
Current components:[0m
  1. **Social Media in Disaster Response**
  2. **Multimodal Classification Framework**
  3. **Zero-Shot Learning**
  4. **Vision-Language Models**
  5. **Training-Free Approach**


In [None]:
# @title 6. Search Settings
# Number of results requested per component query.
RESULTS_PER_QUERY = 10  # @param {type:"number"}
# Earliest publication year; used to build the from_publication_date filter.
START_YEAR = 2020  # @param {type:"number"}
# Sent along with every OpenAlex request made by search_openalex.
YOUR_EMAIL = "student@university.edu"  # @param {type:"string"}

cprint(f"Settings: {len(STATE['components'])} components, {RESULTS_PER_QUERY} results each, from {START_YEAR}+", 'green')

[92mSettings: 5 components, 10 results each, from 2020+[0m


In [None]:
# @title 7. Search and Select Papers
# @markdown [Y] keep, [n] skip, [s] skip rest of query

def _abstract_from_inverted_index(inverted_index):
    """Rebuild abstract text from a mapping of word -> list of positions."""
    slots = {}
    for word, positions in (inverted_index or {}).items():
        for pos in positions or []:
            slots[pos] = word
    if not slots:
        return ""
    # Positions with no word stay empty strings, as in a pre-sized list.
    return ' '.join(slots.get(i, '') for i in range(max(slots) + 1))


def search_openalex(query, limit=10, year=2020):
    """Search OpenAlex for articles matching ``query``, most-cited first.

    Args:
        query: free-text search string.
        limit: number of results to request (``per_page``).
        year: earliest publication year; results are filtered to
            ``{year}-01-01`` or later.

    Returns:
        ``(papers, total)`` where ``papers`` is a list of dicts with the keys
        title, year, authors, abstract, citations, doi, venue, bibtex_key,
        cite_command and author_citation, and ``total`` is the match count
        reported by the API. On any error or non-200 response a message is
        printed and ``([], 0)`` is returned.
    """
    try:
        params = {
            'search': query,
            'per_page': limit,
            'filter': f'from_publication_date:{year}-01-01,type:article',
            'select': 'id,doi,title,authorships,publication_year,cited_by_count,abstract_inverted_index,primary_location',
            'sort': 'cited_by_count:desc',
        }
        # OpenAlex reads the contact address from the `mailto` query parameter
        # (or the User-Agent); a custom "mailto" HTTP header is ignored.
        if YOUR_EMAIL:
            params['mailto'] = YOUR_EMAIL

        r = requests.get('https://api.openalex.org/works', params=params, timeout=30)
        if r.status_code != 200:
            cprint(f"OpenAlex returned HTTP {r.status_code}", 'red')
            return [], 0

        data = r.json()
        papers = []
        for w in data.get('results') or []:
            abstract = _abstract_from_inverted_index(w.get('abstract_inverted_index'))

            # primary_location, its source and the display name can each be null.
            source = (w.get('primary_location') or {}).get('source') or {}
            venue = source.get('display_name') or ''

            # Keep only authorships that actually carry a display name.
            authors = []
            for authorship in (w.get('authorships') or [])[:10]:
                name = (authorship.get('author') or {}).get('display_name')
                if name:
                    authors.append(name)

            year_val = w.get('publication_year')
            title = w.get('title') or 'Unknown'
            bibtex_key = generate_bibtex_key(authors, year_val, title)

            papers.append({
                'title': title,
                'year': year_val,
                'authors': authors,
                'abstract': abstract,
                'citations': w.get('cited_by_count') or 0,
                'doi': w.get('doi') or '',
                'venue': venue,
                'bibtex_key': bibtex_key,
                'cite_command': f"\\cite{{{bibtex_key}}}",
                'author_citation': format_author_citation(authors),
            })
        return papers, (data.get('meta') or {}).get('count', 0)
    except Exception as e:
        # Deliberately best-effort: a failed query must not abort the session.
        cprint(f"Error: {e}", 'red')
        return [], 0

def highlight(text, query):
    """Wrap every whole-word, case-insensitive match of a query word in bold red.

    Only query words longer than two characters are highlighted. Returns
    ``text`` unchanged (or '' if it is falsy) when either argument is empty.
    """
    if not (query and text):
        return text or ""
    highlighted = text
    terms = [candidate for candidate in re.findall(r'\b\w+\b', query) if len(candidate) > 2]
    # Apply the substitutions one term at a time, in query order.
    for word in terms:
        pattern = re.compile(fr'\b({word})\b', flags=re.IGNORECASE)
        highlighted = pattern.sub('\x1b[1;31m\\1\x1b[0m', highlighted)
    return highlighted

# Remove results of a previous run so stale selections never mix with new ones.
for stale_path in ['selected_papers.json', 'bib.bib']:
    if os.path.exists(stale_path):
        os.remove(stale_path)

paper_components = []
all_bibtex = []
bibtex_owner = {}  # BibTeX key -> identity (DOI or title) of the kept paper that owns it


def _resolve_bibtex_key(paper):
    """Return ``(key, identity)`` where ``key`` is not used by a *different* kept paper.

    The same paper found again under another component gets its existing key
    back; a different paper whose key collides gets ``<key>-2``, ``<key>-3``, ...
    """
    identity = paper.get('doi') or paper.get('title')
    base_key = paper['bibtex_key']
    candidate, attempt = base_key, 1
    while bibtex_owner.get(candidate, identity) != identity:
        attempt += 1
        candidate = f"{base_key}-{attempt}"
    return candidate, identity


print("="*60)
cprint("PAPER SEARCH AND SELECTION", 'blue')
print("="*60)

for ci, comp in enumerate(STATE['components']):
    print(f"\n--- Component {ci+1}/{len(STATE['components'])}: {comp} ---")
    papers, total = search_openalex(comp, RESULTS_PER_QUERY, START_YEAR)

    if not papers:
        cprint("No results", 'yellow')
        # Keep one (possibly empty) list per component so the two lists stay aligned.
        paper_components.append([])
        continue

    cprint(f"Found {total}, showing {len(papers)} (by citations)", 'green')

    component_papers = []

    for i, p in enumerate(papers):
        # Settle the citation key before it is shown, so the \cite command the
        # user sees is the one that ends up in bib.bib.
        key, identity = _resolve_bibtex_key(p)
        if key != p['bibtex_key']:
            p['bibtex_key'] = key
            p['cite_command'] = f"\\cite{{{key}}}"

        print(f"\n{'─'*50}")
        cprint(f"Result {i+1}: {p['title']}", 'pink')
        print(f"{p['author_citation']} ({p['year']}) | {p['citations']} cites | {p['cite_command']}")
        if p['venue']:
            print(f"Venue: {p['venue']}")
        if p['abstract']:
            abs_display = p['abstract'][:400] + "..." if len(p['abstract']) > 400 else p['abstract']
            print(f"\n{highlight(abs_display, comp)}")

        choice = input("\nKeep? [Y/n/s]: ").strip().lower()
        if choice == 's':
            cprint("Skipping rest of query", 'yellow')
            break
        elif choice in ('', 'y'):
            component_papers.append(p)
            # Write each paper to the bibliography once, even if it is kept
            # under several components.
            if key not in bibtex_owner:
                bibtex_owner[key] = identity
                all_bibtex.append(generate_bibtex_entry(p))
            cprint(f"Added ({len(component_papers)} for this component)", 'green')

    paper_components.append(component_papers)

STATE['paper_components'] = paper_components

with open('selected_papers.json', 'w', encoding='utf-8') as fh:
    json.dump(paper_components, fh, indent=2)

# Author names and titles may contain non-ASCII characters.
with open('bib.bib', 'w', encoding='utf-8') as fh:
    fh.write('\n\n'.join(all_bibtex))

total_papers = sum(len(pc) for pc in paper_components)
print(f"\n{'='*60}")
cprint(f"DONE: {total_papers} papers across {len(paper_components)} components", 'green')
print("="*60)

[94mPAPER SEARCH AND SELECTION[0m

--- Component 1/5: **Social Media in Disaster Response** ---
[92mFound 61687, showing 10 (by citations)[0m

──────────────────────────────────────────────────
[95mResult 1: Prevalence of Depression Symptoms in US Adults Before and During the COVID-19 Pandemic[0m
Ettman et al. (2020) | 2170 cites | \cite{ettman2020prevalen}
Venue: JAMA Network Open

These findings suggest that prevalence of depression symptoms in the US was more than 3-fold higher during COVID-19 compared with before the COVID-19 pandemic. Individuals with lower [1;31msocial[0m resources, lower economic resources, and greater exposure to stressors (eg, job loss) reported a greater burden of depression symptoms. Post-COVID-19 plans should account for the probable increase in mental ill...

Keep? [Y/n/s]: 
[92mAdded (1 for this component)[0m

──────────────────────────────────────────────────
[95mResult 2: Online University Teaching During and After the Covid-19 Crisis: Refocu

In [None]:
# @title 8. Generate Literature Review
# @markdown Generates 3 paragraphs per component with LaTeX citations.

if not STATE['paper_components']:
    # Nothing selected in this session: fall back to the files saved by earlier cells.
    if os.path.exists('selected_papers.json'):
        with open('selected_papers.json', 'r', encoding='utf-8') as fh:
            STATE['paper_components'] = json.load(fh)
    if os.path.exists('components.json'):
        with open('components.json', 'r', encoding='utf-8') as fh:
            STATE['components'] = json.load(fh)

paper_components = STATE['paper_components']
components = STATE['components']
abstract = STATE['abstract']


def _generate(prompt_text, max_tokens):
    """Stream a temperature-0 completion from ``llm`` and return the raw concatenated text."""
    return "".join(
        chunk['choices'][0]['text']
        for chunk in llm(prompt_text, max_tokens=max_tokens, temperature=0, stream=True)
    )


if not paper_components or not any(paper_components):
    cprint("No papers found. Run search first.", 'red')
else:
    if len(components) != len(paper_components):
        # zip() below silently stops at the shorter list.
        cprint(f"Warning: {len(components)} components but {len(paper_components)} paper lists - "
               "re-run the search cell if components were edited after searching.", 'yellow')

    literature_review = []

    print("="*60)
    cprint("GENERATING LITERATURE REVIEW", 'blue')
    print("="*60)

    for comp, papers in zip(components, paper_components):
        if not papers:
            cprint(f"\nSkipping '{comp}' - no papers", 'yellow')
            continue

        cprint(f"\nProcessing: {comp} ({len(papers)} papers)...", 'blue')

        # Build abstracts text (each abstract capped at 800 characters)
        abstracts_txt = ""
        for p in papers:
            if p.get('abstract'):
                abstracts_txt += f"ABSTRACT {p['cite_command']}: {p['abstract'][:800]}\n\n"

        # Paragraph 1: Background
        prompt1 = f"""[INST][ABSTRACTS]
{abstracts_txt}
[/ABSTRACTS]

[RESEARCH OUTLINE]
{abstract}
[/RESEARCH OUTLINE]

INSTRUCTIONS:
1. Focus on: "{comp}"
2. Write a paragraph defining '{comp}' and explaining why it matters for research.
3. Use information from the abstracts.
4. End claims with citation commands exactly as given (e.g., \\cite{{smith2023deep}}).

Example style:
Social media analysis in disaster response is crucial for rapid situational awareness \\cite{{gupta2023handcrafted}}. This research addresses the challenge of filtering irrelevant content \\cite{{gupta2023handcrafted, ponce2022social}}.

Write the paragraph:
[/INST]"""

        para1 = _generate(prompt1, 600)

        # Build summaries for paragraph 2 (each abstract capped at 300 characters)
        summaries_txt = ""
        for p in papers:
            abs_short = (p.get('abstract', '') or '')[:300]
            summaries_txt += f"PAPER by {p['author_citation']} {p['cite_command']}: {abs_short}\n\n"

        # Paragraph 2: What researchers did
        prompt2 = f"""[INST]
{summaries_txt}

INSTRUCTIONS:
1. Focus on: "{comp}"
2. Describe what each researcher contributed.
3. Format: "Author et al. \\cite{{key}} did X. Author \\cite{{key}} developed Y."

Example:
Gupta et al. \\cite{{gupta2023handcrafted}} developed a framework to filter irrelevant images. Ponce-López et al. \\cite{{ponce2022social}} employed binary classification for severity evaluation.

Write the paragraph:
[/INST]"""

        para2 = _generate(prompt2, 600)

        # Paragraph 3: Summary of the two (still unstripped) paragraphs above
        prompt3 = f"""[INST]
TEXT:
{para1}
{para2}

Write a 2-3 sentence summary starting with 'To sum up' or 'To summarize'.
Do NOT use \\cite commands.
Identify any research gap if relevant.

Example: To sum up, scholars have studied disaster response through social media from multiple perspectives. However, few studies have focused on zero-shot multimodal approaches.
[/INST]"""

        para3 = _generate(prompt3, 200)

        para1 = para1.strip()
        para2 = para2.strip()
        para3 = para3.strip()

        section = f"\n{para1}\n    {para2}\n    {para3}\n"
        literature_review.append((comp, section))

        # Number by sections actually written, so the preview matches the
        # saved file even when earlier components were skipped.
        print(f"\n### 2.{len(literature_review)} {comp}")
        print(section)

    if not literature_review:
        cprint("No sections were generated - nothing saved.", 'red')
    else:
        # Build final document; the intro names only the topics that have a section.
        covered = [comp for comp, _ in literature_review]
        if len(covered) == 1:
            topics = covered[0]
        elif len(covered) == 2:
            topics = f"{covered[0]} and {covered[1]}"
        else:
            topics = ", ".join(covered[:-1]) + f", and {covered[-1]}"
        intro = f"Our research builds on earlier work on {topics}."
        full_review = "# 2. Literature Review\n\n" + intro + "\n"

        for idx, (comp, section) in enumerate(literature_review):
            full_review += f"\n## 2.{idx+1} {comp}\n{section}"

        with open('literature_review.txt', 'w', encoding='utf-8') as fh:
            fh.write(full_review)

        print("\n" + "="*60)
        cprint("COMPLETE - Saved to literature_review.txt", 'green')
        print("="*60)

[94mGENERATING LITERATURE REVIEW[0m
[94m
Processing: **Social Media in Disaster Response** (10 papers)...[0m

### 2.1 **Social Media in Disaster Response**

Social media in disaster response refers to the use of social media platforms and data for real-time information gathering, situation awareness, and decision-making during crises. It matters for research because it enables swift understanding of affected populations' needs, sentiments, and behaviors, facilitating targeted aid and intervention \cite{li2020impact, ikuta2022global}. However, the vast amount of user-generated content can be overwhelming and unreliable, necessitating robust classification methods to extract valuable insights \cite{rasheed2020digital, zhang2020impact}.
    Ettman et al. \cite{ettman2020prevalen} found that the prevalence of depression symptoms in the U.S. increased significantly during the COVID-19 pandemic, with individuals having lower social and economic resources, and greater exposure to stressor

In [None]:
# @title 9. Export Files

# Files produced by the earlier cells; only those that exist are listed and offered.
export_paths = [path for path in
                ['literature_review.txt', 'bib.bib', 'selected_papers.json', 'components.json']
                if os.path.exists(path)]

cprint("\nFiles:", 'blue')
for path in export_paths:
    cprint(f"  {path} ({os.path.getsize(path)} bytes)", 'green')

try:
    from google.colab import files
except ImportError:
    # Not running on Colab: there is nothing to download, the files are already local.
    cprint("google.colab not available - files are in the working directory.", 'yellow')
else:
    for path in export_paths:
        try:
            files.download(path)
        except Exception as e:
            # Best-effort: report the failure and continue with the remaining files.
            cprint(f"Could not download {path}: {e}", 'red')

---
# Chat with Documents

In [None]:
# @title 10. Upload PDF
!pip install pypdfium2 -q
import pypdfium2 as pdfium

try:
    from google.colab import files
    uploaded = files.upload()
    pdf_file = [f for f in uploaded.keys() if f.endswith('.pdf')][0]
except:
    pdf_file = input("PDF filename: ").strip()

if os.path.exists(pdf_file):
    pdf = pdfium.PdfDocument(pdf_file)
    text = ""
    for i in range(min(len(pdf), 15)):
        text += pdf[i].get_textpage().get_text_range()
    if len(text) > 40000:
        text = text[:40000]
    STATE['paper_text'] = text
    cprint(f"Loaded {pdf_file} ({len(text)} chars)", 'green')
else:
    cprint(f"Not found: {pdf_file}", 'red')

In [None]:
# @title 11. Chat

if 'paper_text' not in STATE:
    cprint("Upload a PDF first", 'red')
else:
    history = ""
    print("Chat with paper. Type 'exit' to stop.\n")

    while True:
        try:
            q = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Interrupting the input prompt ends the chat cleanly.
            print()
            break
        if q.lower() == 'exit':
            break
        if not q:
            # Do not spend a model call on an empty question.
            continue

        # Each turn re-sends the first 25000 characters of the paper plus the
        # last 2000 characters of the running Q/A history.
        prompt = f"""[INST] Answer based on this paper.

Paper:
{STATE['paper_text'][:25000]}

History:
{history[-2000:]}

Question: {q}
[/INST]"""

        print("Assistant: ", end="")
        response = ""
        for chunk in llm(prompt, max_tokens=800, temperature=0, stream=True):
            t = chunk["choices"][0]["text"]
            print(t, end="", flush=True)  # echo tokens as they arrive
            response += t
        print("\n")
        history += f"Q: {q}\nA: {response}\n"

---
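**Notes:**
- The generated review is a draft; rewrite it in your own voice.
- Verify that every `\cite{...}` key in the review has a matching entry in `bib.bib`.
- Check each claim against the cited paper, because the model can hallucinate.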