In [26]:
import arxivscraper
import anthropic
import pandas as pd
import os
from anthropic.types import TextBlock
from transformers import AutoTokenizer, AutoModel
import torch
import requests
import PyPDF2
from io import BytesIO
import sqlite3
from sqlalchemy import create_engine, types

In [2]:
# set os env
os.environ['ANTHROPIC_API_KEY'] = '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'
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

In [3]:
categories = [
    'cs',
    'econ',
    'eess',
    'math',
    'physics',
    'q-bio',
    'q-fin',
    'stat'
]

In [4]:
def rewrite_abstract(abstract: str, prompt: str) -> str:
    """
    Rewrites an academic abstract to be more investor-friendly using Claude AI.
    
    Args:
        abstract (str): The academic abstract to rewrite
        
    Returns:
        str: The investor-friendly version of the abstract
    """
    
    # Initialize Anthropic client
    client = anthropic.Anthropic(
        api_key=os.environ.get("ANTHROPIC_API_KEY")
    )
    
    # Generate response
    message = client.messages.create(
        model="claude-3-haiku-20240307",
        max_tokens=1024,
        system="You are a creative writing assistant.",
        messages=[
            {"role": "user", "content": f"Hello Claude. {prompt}"},
            {"role": "user", "content": abstract},
            {"role": "assistant", "content": "Here is a paraphrased version of the abstract:"}
        ]
    )
    
    return message.content

In [5]:
scraper = arxivscraper.Scraper(category='physics:cond-mat', date_from='2017-05-30',date_until='2017-06-01')
output = scraper.scrape()

df = pd.DataFrame(output)
# limit df to 5 rows
df['url'] = df['url'].replace('abs', 'pdf', regex=True)
df = df.head(1)

fetching up to  1000 records...
fetching is completed in 4.0 seconds.
Total number of records 215


In [6]:
df.head()

Unnamed: 0,title,id,abstract,categories,doi,created,updated,authors,affiliation,url
0,general relations for quantum gases in two and...,1210.1784,we derive exact general relations between vari...,cond-mat.quant-gas,10.1103/physreva.86.053633,2012-10-05,2017-05-30,"[félix werner, yvan castin]","[lkb, lkb]",https://arxiv.org/pdf/1210.1784


In [7]:
prompt_for_investors = "Please rewrite this abstract in a way that highlights the potential business opportunities and market impact of the described approach."
prompt_for_business = "Please rewrite this abstract to emphasize the practical applications, product development potential, and competitive advantages of the described technical approach."

df["investors"] = df.apply(lambda x: rewrite_abstract(x['abstract'], prompt_for_investors)[0].text, axis=1)
df["business"] = df.apply(lambda x: rewrite_abstract(x['abstract'], prompt_for_business)[0].text, axis=1)

df.head()


Unnamed: 0,title,id,abstract,categories,doi,created,updated,authors,affiliation,url,investors,business
0,general relations for quantum gases in two and...,1210.1784,we derive exact general relations between vari...,cond-mat.quant-gas,10.1103/physreva.86.053633,2012-10-05,2017-05-30,"[félix werner, yvan castin]","[lkb, lkb]",https://arxiv.org/pdf/1210.1784,\n\nThis research establishes a set of precis...,\n\nThis work derives precise mathematical re...


In [12]:
# Wczytaj model SciBERT
model_name = "allenai/scibert_scivocab_uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def extract_features(text):

    segment_length = 512
    segments = []
    for i in range(0, len(text), segment_length):
        segment = text[i:i+segment_length]
        if len(segment.split()) <= 512:
            segments.append(segment)
        else:
            # Podziel dłuższy segment na mniejsze
            subsegments = [" ".join(segment.split()[j:j+512]) for j in range(0, len(segment.split()), 512)]
            segments.extend(subsegments)

    if len(segments) > 2:
        segments = segments[:5]

    # Przetwarzaj segmenty i łącz wyniki
    outputs = []
    for segment in segments:
        if len(segment) < 10 or len(segment) > 512:
            continue
        input_ids = tokenizer.encode(segment, return_tensors="pt")
        output = model(input_ids)[0]
        outputs.append(output)

    # Połącz wyniki segmentów
    combined_output = torch.cat(outputs, dim=1)
    return combined_output




In [9]:
df["abstract_embedding"] = df["abstract"].apply(extract_features)
df.head()

Unnamed: 0,title,id,abstract,categories,doi,created,updated,authors,affiliation,url,investors,business,abstract_embedding
0,general relations for quantum gases in two and...,1210.1784,we derive exact general relations between vari...,cond-mat.quant-gas,10.1103/physreva.86.053633,2012-10-05,2017-05-30,"[félix werner, yvan castin]","[lkb, lkb]",https://arxiv.org/pdf/1210.1784,\n\nThis research establishes a set of precis...,\n\nThis work derives precise mathematical re...,"[[[tensor(0.4915, grad_fn=<UnbindBackward0>), ..."


In [13]:


def pdf_from_url_to_text(pdf_url):
    # Pobierz plik PDF z podanego URL
    response = requests.get(pdf_url)

    if response.status_code != 200:
        return None

    # Przekształć zawartość w strumień plikowy
    pdf_content = BytesIO(response.content)

    # Stwórz PdfReader obiekt na podstawie strumienia
    pdf_reader = PyPDF2.PdfReader(pdf_content)

    # Inicjalizuj pusty string do przechowywania tekstu
    text = ''

    for page_num in range(len(pdf_reader.pages)):
        page = pdf_reader.pages[page_num]
        text += page.extract_text()

    return text

# Zastosowanie z DataFrame
df["pdf_text"] = df["url"].apply(pdf_from_url_to_text)

df["pdf_text_embedding"] = df["pdf_text"].apply(extract_features)

In [14]:
df["pdf_text_embedding"][0]

tensor([[[ 0.4339,  0.7756, -0.1786,  ...,  0.1467, -0.8273, -1.9667],
         [ 0.7558,  0.8733, -1.2131,  ..., -0.5607,  0.3810, -0.5354],
         [ 0.3161,  0.8345, -0.7894,  ...,  1.1827,  0.3855, -1.0444],
         ...,
         [ 1.4476,  0.7881,  0.2619,  ..., -0.8661, -0.6159, -0.2972],
         [ 0.9158,  1.0434, -0.4043,  ..., -0.5693, -0.6252, -0.5861],
         [ 0.7784,  0.4416, -0.9861,  ..., -0.6237, -0.8712, -0.0601]]],
       grad_fn=<CatBackward0>)

In [None]:
# Wybierz kolumny
df = df[['title', 'abstract', 'investors', 'business', 'pdf_text', 'abstract_embedding', 'pdf_text_embedding']]

# Konwertuj typy danych
df = df.astype({
    'title': 'string',
    'abstract': 'string',
    'investors': 'string',
    'business': 'string',
    'pdf_text': 'string'
})

df['abstract_embedding'] = df['abstract_embedding'].apply(lambda x: x.detach().cpu().numpy().tobytes())
df['pdf_text_embedding'] = df['pdf_text_embedding'].apply(lambda x: x.detach().cpu().numpy().tobytes())

# Utwórz połączenie z bazą danych SQLite
conn = sqlite3.connect('example.db')

# Użyj SQLAlchemy do zapisu ramki danych
engine = create_engine('sqlite:///example.db')

# Zdefiniuj typy danych dla kolumn
dtype = {
    'title': types.String,
    'abstract': types.String,
    'investors': types.String,
    'business': types.String,
    'pdf_text': types.Text,
    'abstract_embedding': types.LargeBinary,
    'pdf_text_embedding': types.LargeBinary
}

# Zapisz ramkę danych do bazy SQLite
df.to_sql('my_table', engine, if_exists='replace', index=False, dtype=dtype)

# Zamknij połączenie
conn.close()

In [30]:
df

Unnamed: 0,title,abstract,investors,business,pdf_text,abstract_embedding,pdf_text_embedding
0,general relations for quantum gases in two and...,we derive exact general relations between vari...,"This research establishes a set of precise,...",This work derives precise mathematical rela...,arXiv:1210.1784v3 [cond-mat.quant-gas] 30 Ma...,b'w\xaa\xfb>\xae`K\xbeu\x8cQ\xbd\x02\x91\xb6\x...,"b'\xca""\xde>#\x8fF?@\xe76\xbe\x10AS?^h\x11\xbf..."
