<a href="https://colab.research.google.com/github/midnightoatmeal/research-gpt/blob/main/research-gpt_phase1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# checking GPU availability and specifications

!nvidia-smi
import torch
print(f"nGPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
  print(f"GPU Model: {torch.cuda.get_device_name(0)}")

Sat Feb  8 18:27:23 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   32C    P0             45W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [4]:

import sys
print(f"Python version: {sys.version}")
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

!pip install -q transformers==4.36.2  # Latest stable version with good Mistral support
!pip install -q datasets==2.16.1      # For handling our research paper dataset
!pip install -q peft==0.7.1           # For parameter-efficient fine-tuning
!pip install -q accelerate==0.25.0    # For optimized training
!pip install -q arxiv==2.0.0          # For fetching papers from arXiv
!pip install -q evaluate==0.4.1       # For model evaluation
!pip install -q wandb==0.16.2         # For experiment tracking
!pip install -q rouge-score==0.1.2    # For evaluating text generation

!pip list | grep -E "transformers|datasets|peft|accelerate|arxiv|evaluate|wandb|rouge-score"

Python version: 3.11.11 (main, Dec  4 2024, 08:55:07) [GCC 11.4.0]
PyTorch version: 2.5.1+cu124
CUDA available: True
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.8/126.8 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.2/8.2 MB[0m [31m117.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m100.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sentence-transformers 3.4.1 requires transformers<5.0.0,>=4.41.0, but you have transformers 4.36.2 which is incompatible.[0m[31m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m

In [5]:
# setting up the project structure

import os

def create_directory_structure():
  """Creates a structured directory system for my research project."""
  directories = [
      'data/raw',
      'data/processed',
      'models/checkpoints',
      'experiments/logs',
      'experiments/results',
      'src'
  ]

  for directory in directories:
    os.makedirs(directory, exist_ok=True)
    print(f"Created directory: {directory}")

create_directory_structure()

Created directory: data/raw
Created directory: data/processed
Created directory: models/checkpoints
Created directory: experiments/logs
Created directory: experiments/results
Created directory: src


In [6]:
import os
import wandb

# Set your API key (choose one method)
os.environ["WANDB_API_KEY"] = "3b19bd5a40bd69e53ac39df2cc708f51e07394f5"  # Method 1


# Initialize a new wandb run
# Think of a 'run' as one complete training session of your model
wandb.init(
    # Project name - this will group all your runs under this project
    project="researchgpt",

    # Configuration dictionary - these are the hyperparameters and settings
    # you want to track for this training run
    config={
        "model_name": "mistralai/Mistral-7B-v0.1",  # The model you're using
        "learning_rate": 2e-4,                       # Learning rate for training
        "batch_size": 4,                             # Number of samples processed at once
        "max_length": 512                            # Maximum sequence length
    }
)


[34m[1mwandb[0m: Currently logged in as: [33mlionelrozario98[0m ([33mlionelrozario98-personal-project[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [7]:
def test_environment():
    """
    Verifies that all components of our research environment are working correctly.
    """
    checks = {
        "GPU Available": torch.cuda.is_available(),
        "Project Structure": os.path.exists("data/raw"),
        "Git Repository": os.path.exists(".git"),
        "Python Version": sys.version
    }

    for check, status in checks.items():
        print(f"{check}: {'✅' if status else '❌'}")

test_environment()

GPU Available: ✅
Project Structure: ✅
Git Repository: ❌
Python Version: ✅


In [20]:
import arxiv
import pandas as pd
import numpy as np
from datetime import datetime
import logging
from pathlib import Path
import json
from typing import List, Dict, Optional, Union
import time
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

class ResearchPaperPipeline:
    def __init__(self,
                 model_name: str = "facebook/opt-125m",
                 cache_dir: str = "data",
                 max_length: int = 512):
        """
        Initialize the research pipeline with configuration parameters.
        We start with a smaller model for development, then scale up to Mistral.
        """
        self.cache_dir = Path(cache_dir)
        self.max_length = max_length
        self.model_name = model_name

        # Set up logging with timestamp
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        log_file = self.cache_dir / f'pipeline_{timestamp}.log'

        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            handlers=[
                logging.FileHandler(log_file),
                logging.StreamHandler()
            ]
        )
        self.logger = logging.getLogger(__name__)

        # Initialize tokenizer with error handling
        try:
            self.logger.info(f"Loading tokenizer for {model_name}")
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.logger.info("Tokenizer loaded successfully")
        except Exception as e:
            self.logger.error(f"Error loading tokenizer: {str(e)}")
            raise

        # Create necessary directories
        for subdir in ['raw', 'processed', 'logs', 'models']:
            (self.cache_dir / subdir).mkdir(parents=True, exist_ok=True)

        self.logger.info("Pipeline initialized successfully")

    def fetch_papers(self,
                    categories: List[str] = ['cs.AI', 'cs.LG'],
                    max_results: int = 100,
                    save_raw: bool = True,
                    start_date: str = None) -> pd.DataFrame:
        """
        Fetch papers from arXiv with improved error handling and date filtering.

        Args:
            categories: List of arXiv categories to fetch
            max_results: Maximum number of papers to fetch
            save_raw: Whether to save raw data
            start_date: Optional start date in YYYY-MM-DD format
        """
        self.logger.info(f"Fetching {max_results} papers from categories: {categories}")

        papers = []
        search_query = ' OR '.join(f'cat:{cat}' for cat in categories)

        # Add date filter if specified
        if start_date:
            # Convert date to arXiv format (YYYYMMDDHHMMSS)
            arxiv_date = f"{start_date.replace('-', '')}000000"
            search_query += f' AND submittedDate:[{arxiv_date} TO 999999999999]'

        client = arxiv.Client(page_size=100, delay_seconds=3.0)
        search = arxiv.Search(
            query=search_query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.SubmittedDate
        )

        try:
            with tqdm(total=max_results, desc="Fetching papers") as pbar:
                for paper in client.results(search):
                    try:
                        papers.append({
                            'id': paper.entry_id,
                            'title': paper.title,
                            'abstract': paper.summary,
                            'authors': [author.name for author in paper.authors],
                            'categories': paper.categories,
                            'published': paper.published,
                            'updated': paper.updated,
                            'doi': paper.doi,
                            'primary_category': paper.primary_category
                        })
                        pbar.update(1)

                        if len(papers) >= max_results:
                            break

                    except Exception as e:
                        self.logger.warning(f"Error processing paper {paper.entry_id}: {str(e)}")
                        continue

        except Exception as e:
            self.logger.error(f"Error in paper fetch process: {str(e)}")
            if papers:  # Save what we have if there's an error
                self.logger.info("Saving partial results...")
            else:
                raise

        df = pd.DataFrame(papers)

        # Save raw data if requested
        if save_raw and not df.empty:
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            save_path = self.cache_dir / 'raw' / f'papers_{timestamp}.parquet'
            df.to_parquet(save_path)
            self.logger.info(f"Saved raw data to {save_path}")

        self.logger.info(f"Successfully fetched {len(df)} papers")
        return df

    def prepare_training_data(self,
                            df: pd.DataFrame,
                            template_style: str = 'comprehensive') -> Dict:
        """
        Prepare paper data for model training with enhanced error checking and validation.
        Returns both the dataset and preparation metrics.
        """
        self.logger.info("Preparing training data")

        templates = {
            'comprehensive': [
                "Title: {title}\nAbstract: {abstract}\n\nSummarize the key findings of this research paper.",
                "Based on this abstract, what are the main contributions of the paper titled '{title}'?",
                "Analyze the research methodology described in: {title}\nAbstract: {abstract}"
            ],
            'simple': [
                "Summarize this paper: {title}\n{abstract}",
                "What is the main point of: {title}?"
            ]
        }

        selected_templates = templates.get(template_style, templates['simple'])

        training_data = []
        skipped_papers = 0

        for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing papers"):
            try:
                for template in selected_templates:
                    # Validate input data
                    if not isinstance(row['title'], str) or not isinstance(row['abstract'], str):
                        skipped_papers += 1
                        continue

                    prompt = template.format(
                        title=row['title'].strip(),
                        abstract=row['abstract'].strip()
                    )

                    # Create completion with citation
                    completion = f"Based on the paper (arXiv:{row['id']}), {row['abstract']}"

                    # Validate token length
                    total_tokens = len(self.tokenizer.encode(prompt + completion))
                    if total_tokens > self.max_length:
                        continue

                    training_data.append({
                        'prompt': prompt,
                        'completion': completion,
                        'paper_id': row['id'],
                        'token_count': total_tokens
                    })

            except Exception as e:
                self.logger.warning(f"Error processing paper {row.get('id', 'unknown')}: {str(e)}")
                skipped_papers += 1
                continue

        # Convert to DataFrame for easier analysis
        train_df = pd.DataFrame(training_data)

        # Save processed data
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        save_path = self.cache_dir / 'processed' / f'training_data_{timestamp}.parquet'
        train_df.to_parquet(save_path)

        # Prepare metrics
        metrics = {
            'total_examples': len(train_df),
            'unique_papers': len(train_df['paper_id'].unique()),
            'skipped_papers': skipped_papers,
            'avg_token_count': train_df['token_count'].mean(),
            'max_token_count': train_df['token_count'].max(),
            'preparation_timestamp': timestamp
        }

        self.logger.info(f"Created {len(train_df)} training examples from {metrics['unique_papers']} papers")
        return {'data': train_df, 'metrics': metrics}


In [21]:
# Step 1: Expanded Data Collection
categories = [
    'cs.AI',     # Artificial Intelligence
    'cs.LG',     # Machine Learning
    'cs.CL',     # Computation and Language
    'stat.ML'    # Statistics - Machine Learning
]

# Fetch papers
papers_df = pipeline.fetch_papers(
    categories=categories,
    max_results=100,
    save_raw=True
)

print(f"Fetched {len(papers_df)} papers")
print("\nDistribution across categories:")
print(papers_df['primary_category'].value_counts())

Fetching papers: 100%|██████████| 100/100 [00:02<00:00, 45.05it/s]

Fetched 100 papers

Distribution across categories:
primary_category
cs.LG              44
cs.CL              23
cs.CV               8
cs.AI               4
stat.ML             4
cs.RO               2
quant-ph            2
cs.SD               2
cs.SE               2
cs.HC               2
physics.chem-ph     1
math.NA             1
cs.CR               1
math.ST             1
stat.AP             1
eess.AS             1
eess.IV             1
Name: count, dtype: int64





In [22]:
# Step 2: Quality Analysis
def analyze_paper_quality(df):
    analysis = {
        "total_papers": len(df),
        "avg_abstract_length": df['abstract'].str.len().mean(),
        "papers_per_category": df['primary_category'].value_counts().to_dict(),
        "time_span": {
            "earliest": df['published'].min(),
            "latest": df['published'].max()
        }
    }
    return analysis

paper_analysis = analyze_paper_quality(papers_df)
print("\nPaper Collection Analysis:")
print(json.dumps(paper_analysis, indent=2, default=str))


Paper Collection Analysis:
{
  "total_papers": 100,
  "avg_abstract_length": 1217.15,
  "papers_per_category": {
    "cs.LG": 44,
    "cs.CL": 23,
    "cs.CV": 8,
    "cs.AI": 4,
    "stat.ML": 4,
    "cs.RO": 2,
    "quant-ph": 2,
    "cs.SD": 2,
    "cs.SE": 2,
    "cs.HC": 2,
    "physics.chem-ph": 1,
    "math.NA": 1,
    "cs.CR": 1,
    "math.ST": 1,
    "stat.AP": 1,
    "eess.AS": 1,
    "eess.IV": 1
  },
  "time_span": {
    "earliest": "2025-02-06 12:24:30+00:00",
    "latest": "2025-02-06 18:59:55+00:00"
  }
}


In [23]:
result = pipeline.prepare_training_data(
    papers_df,
    template_style='comprehensive'
)

print("\nTraining Data Metrics: ")
print(json.dumps(result['metrics'], indent=2))

print("\nSample Training Examples (3 random examples):")
sample_data = result['data'].sample(3)
for _, row in sample_data.iterrows():
  print("\n--------------------")
  print("PROMPT:")
  print(row['prompt'])
  print("\nCOMPLETION (first 200 chars):")
  print(row['completion'][:200] + "...")


Processing papers: 100%|██████████| 100/100 [00:00<00:00, 274.27it/s]


Training Data Metrics: 
{
  "total_examples": 171,
  "unique_papers": 99,
  "skipped_papers": 0,
  "avg_token_count": 357.906432748538,
  "max_token_count": 509,
  "preparation_timestamp": "20250208_185647"
}

Sample Training Examples (3 random examples):

--------------------
PROMPT:
Based on this abstract, what are the main contributions of the paper titled 'Realistic Image-to-Image Machine Unlearning via Decoupling and Knowledge Retention'?

COMPLETION (first 200 chars):
Based on the paper (arXiv:http://arxiv.org/abs/2502.04260v1), Machine Unlearning allows participants to remove their data from a trained
machine learning model in order to preserve their privacy, and ...

--------------------
PROMPT:
Based on this abstract, what are the main contributions of the paper titled 'Evaluating Inter-Column Logical Relationships in Synthetic Tabular Data Generation'?

COMPLETION (first 200 chars):
Based on the paper (arXiv:http://arxiv.org/abs/2502.04055v1), Current evaluations of syntheti




In [24]:
def analyze_paper_quality(df):
  analysis = {
      "total_papers": len(df),
      "avg_abstract_lenght": df['abstract'].str.len().mean(),
      "papers_per_category": df['primary_category'].value_counts().to_dict(),
      "time_span":{
          "earliest": df['published'].min(),
          "latest": df['published'].max()
      },
      # adding more detailed analysis
      "authors_per_paper": df['authors'].apply(len).mean(),
      "papers_with_doi": df['doi'].notna().sum()

    }
  return analysis

paper_analysis = analyze_paper_quality(papers_df)
print("\nPaper Collection Analysis: ")
print(json.dumps(paper_analysis, indent=2, default=str))


Paper Collection Analysis: 
{
  "total_papers": 100,
  "avg_abstract_lenght": 1217.15,
  "papers_per_category": {
    "cs.LG": 44,
    "cs.CL": 23,
    "cs.CV": 8,
    "cs.AI": 4,
    "stat.ML": 4,
    "cs.RO": 2,
    "quant-ph": 2,
    "cs.SD": 2,
    "cs.SE": 2,
    "cs.HC": 2,
    "physics.chem-ph": 1,
    "math.NA": 1,
    "cs.CR": 1,
    "math.ST": 1,
    "stat.AP": 1,
    "eess.AS": 1,
    "eess.IV": 1
  },
  "time_span": {
    "earliest": "2025-02-06 12:24:30+00:00",
    "latest": "2025-02-06 18:59:55+00:00"
  },
  "authors_per_paper": 4.9,
  "papers_with_doi": "2"
}


In [28]:
# Test the updated pipeline
pipeline = ResearchPaperPipeline()

# Define categories
categories = [
    'cs.AI',     # Core AI
    'cs.LG',     # Machine Learning
    'cs.CL',     # NLP
    'stat.ML',   # Statistical ML
    'cs.CV'      # Computer Vision
]

# Fetch papers with date filter
papers_df = pipeline.fetch_papers(
    categories=categories,
    max_results=200,
    start_date="2024-01-01"
)

# Print results
print(f"\nFetched {len(papers_df)} papers")
print("\nDistribution across categories:")
print(papers_df['primary_category'].value_counts())


Fetching papers: 100%|██████████| 200/200 [00:06<00:00, 29.88it/s]


Fetched 200 papers

Distribution across categories:
primary_category
cs.CV             153
eess.IV            15
cs.LG              12
cs.RO               9
cs.MM               2
cs.CL               2
cs.CR               1
cs.CY               1
cs.HC               1
cs.GR               1
cs.AI               1
physics.med-ph      1
hep-ex              1
Name: count, dtype: int64





In [31]:
def fetch_balanced_papers_v2(pipeline, categories_dict, papers_per_category=40):
    """
    Enhanced version with stricter category filtering and error handling
    """
    all_papers = []

    for category in categories_dict:
        print(f"\nFetching papers for {category}...")
        # Add primary category filter to query
        category_papers = pipeline.fetch_papers(
            categories=[category],
            max_results=papers_per_category * 2,  # Fetch more to filter
            start_date="2024-01-01"
        )

        # Filter to keep only papers where this is the primary category
        category_papers = category_papers[
            category_papers['primary_category'] == category
        ].head(papers_per_category)

        all_papers.append(category_papers)

    combined_df = pd.concat(all_papers, ignore_index=True)
    return combined_df

# Define core categories with strict filtering
core_categories = [
    'cs.AI',    # Artificial Intelligence
    'cs.LG',    # Machine Learning
    'cs.CL',    # Computational Linguistics
    'stat.ML',  # Statistical Machine Learning
    'cs.CV'     # Computer Vision
]

# Fetch balanced dataset
balanced_papers_v2 = fetch_balanced_papers_v2(pipeline, core_categories, papers_per_category=40)

print("\nNew Distribution across categories:")
print(balanced_papers_v2['primary_category'].value_counts())


Fetching papers for cs.AI...


Fetching papers: 100%|██████████| 80/80 [00:05<00:00, 14.46it/s]



Fetching papers for cs.LG...


Fetching papers: 100%|██████████| 80/80 [00:05<00:00, 13.87it/s]



Fetching papers for cs.CL...


Fetching papers: 100%|██████████| 80/80 [00:06<00:00, 13.14it/s]



Fetching papers for stat.ML...


Fetching papers: 100%|██████████| 80/80 [00:13<00:00,  5.82it/s]



Fetching papers for cs.CV...


Fetching papers: 100%|██████████| 80/80 [00:03<00:00, 26.57it/s]


New Distribution across categories:
primary_category
cs.LG      40
cs.CV      40
cs.CL      40
stat.ML    34
cs.AI       5
Name: count, dtype: int64



