In [1]:
%pip install datasets pandas --verbose

Using pip 23.1.2 from /Users/mechaneyes/Documents/ Projects/Third Bridge Creative/Mechaneyes RAG Bridge/langchain-retrieval-augmentation/.venv/lib/python3.11/site-packages/pip (python 3.11)
Note: you may need to restart the kernel to use updated packages.


In [1]:
import sqlite3
from contextlib import closing

import pandas as pd

# Prepare the Pitchfork SQLite database: round-trip the `content` and
# `reviews` tables through pandas and materialise a `combined` table that
# joins each review's text with its metadata.
#
# `closing` guarantees the connection is released even if a query or write
# raises; a bare `con.close()` at the end of the cell is skipped on error.
with closing(sqlite3.connect('./pitchforkSqliteDB.sqlite')) as con:
    # NOTE(review): each table is read and then written back over itself with
    # if_exists='replace', which recreates the table from the DataFrame --
    # confirm this round-trip is intentional.
    df_content = pd.read_sql_query("SELECT * from content", con)
    df_content.to_sql('content', con, if_exists='replace', index=False)

    df_reviews = pd.read_sql_query("SELECT * from reviews", con)
    df_reviews.to_sql('reviews', con, if_exists='replace', index=False)

    # Join 'content' and 'reviews' on reviewid. The reviews-side id is aliased
    # to reviewid2 so the result does not carry two columns named reviewid.
    query = '''
SELECT content.*, reviews.reviewid AS reviewid2, reviews.title, reviews.artist, reviews.url, reviews.score, reviews.author, reviews.pub_date
FROM content
JOIN reviews
ON content.reviewid = reviews.reviewid
'''
    df_combined = pd.read_sql_query(query, con)
    df_combined.to_sql('combined', con, if_exists='replace', index=False)

# Connection string for the same database file, used by later cells.
uri = "sqlite:///pitchforkSqliteDB.sqlite"


In [2]:
from datasets import Dataset

# Load each SQLite table into its own dataset through the `uri` connection
# string defined in the previous cell.
ds = Dataset.from_sql(sql="content", con=uri)
ds_reviewws = Dataset.from_sql(sql="reviews", con=uri)
ds_combined = Dataset.from_sql(sql="combined", con=uri)

# Row count of the joined table. Not rendered, because only the last
# expression of a cell is displayed.
len(ds_combined)

# Preview a single joined record.
ds_combined[9]


  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset sql (/Users/mechaneyes/.cache/huggingface/datasets/sql/default-ff2a0be9cb3a021a/0.0.0)
Found cached dataset sql (/Users/mechaneyes/.cache/huggingface/datasets/sql/default-240b28a6ad1fb140/0.0.0)
Found cached dataset sql (/Users/mechaneyes/.cache/huggingface/datasets/sql/default-1591c25f6a3c7476/0.0.0)


{'reviewid': 22724,
 'content': "There were innumerable cameos at the Bad Boy Family Reunion Tour, but as is often the case with nostalgia packages, “the inexorable march of time” stole the show. Shyne lip-synced “Bad Boyz” in exile from Belize. Lil’ Kim was as magnetic as ever, but tragically so, going blank during large portions of her past hits. While DMX and Ruff Ryders’ constant shirtlessness and bloody-knuckled Casio beats were a\xa0corrective to hip-hop’s sample-happy Shiny Suit era, with enough distance, they could all be lumped together as “late ’90s NYC rap.” And most bizarre of all were the once-estranged Lox\xa0screaming “if you glad that L-O-X\xa0is Ruff Ryders now!” during “Wild Out,” their first single after a nasty, public and possibly violent extrication from Bad Boy—referred to as “Rape'n U Records” on the subsequent\xa0We Are the Streets. Now signed to Roc Nation, the Lox are once again close to the locus of money, power and respect, affiliated with their third megas

In [3]:
%pip install -qU \
  langchain==0.0.162 \
  openai==0.27.7 \
  tiktoken==0.4.0 \
  "pinecone-client[grpc]"==2.2.1

Note: you may need to restart the kernel to use updated packages.


## tokenizer

A token is typically the size of a word or sub-word and varies by LLM. The tokens themselves are built using a tokenizer. We will be using gpt-3.5-turbo as our completion model. We will initialize the tokenizer, then create tokens from plain text and count the number of tokens. We will wrap this into a function called `tiktoken_len`.

In [4]:
import tiktoken

# Look up the encoding tiktoken associates with the completion model; the
# cell output reports 'cl100k_base'.
tiktoken.encoding_for_model('gpt-3.5-turbo')

<Encoding 'cl100k_base'>

In [5]:
import tiktoken  # !pip install tiktoken

# Shared tokenizer instance used by `tiktoken_len`.
tokenizer = tiktoken.get_encoding('cl100k_base')


def tiktoken_len(text):
    """Return the number of tokens `text` encodes to with `tokenizer`.

    `disallowed_special=()` is passed so that no special-token string is
    treated as disallowed during encoding.
    """
    return len(tokenizer.encode(text, disallowed_special=()))


# Quick demonstration on a short sentence.
tiktoken_len(
    "hello I am a chunk of text and using the tiktoken_len function "
    "we can find the length of this chunk of text in tokens"
)

26

<br /><br />

## text_splitter

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter whose chunk size and overlap are measured with `tiktoken_len`
# rather than the default character count. Separators are listed from
# coarsest (blank line) to finest (empty string).
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=tiktoken_len,
    chunk_size=400,
    chunk_overlap=20,
)

In [7]:
# Split the first combined record's review text and keep at most the first
# three chunks for inspection.
first_review_text = ds_combined[0]['content']
chunks = text_splitter.split_text(first_review_text)[:3]
chunks

['“Trip-hop” eventually became a ’90s punchline, a music-press shorthand for “overhyped hotel lounge music.” But today, the much-maligned subgenre almost feels like a secret precedent. Listen to any of the canonical Bristol-scene albums of the mid-late ’90s, when the genre was starting to chafe against its boundaries, and you’d think the claustrophobic, anxious 21st century started a few years ahead of schedule. Looked at from the right angle, trip-hop\xa0is part of an unbroken chain that runs from the abrasion of ’80s post-punk to the ruminative pop-R&B-dance fusion of the moment.\xa0The best of it has aged far more gracefully (and forcefully) than anything recorded in the waning days of the record industry’s pre-filesharing monomania has any right to. Tricky rebelled against being attached at the hip to a scene he was already looking to shed and decamped for Jamaica to record a more aggressive, bristling-energy mutation of his style in ’96; the name\xa0Pre-Millennium',
 "his style in

In [8]:
# Token count of every sampled chunk. Iterating over `chunks` (instead of
# indexing chunks[0], chunks[1], chunks[2]) avoids an IndexError when the
# review splits into fewer than three chunks.
tuple(tiktoken_len(chunk) for chunk in chunks)

(231, 216, 223)

<br /><br />

# creating embeddings

In [None]:
%pip install python-dotenv

In [9]:
import os
from dotenv import load_dotenv
from langchain.embeddings.openai import OpenAIEmbeddings

# Load variables from a local .env file (python-dotenv) before reading them.
load_dotenv()

# The OpenAI key is read from the OPENAI_API_KEY_RWW environment variable;
# this is None when the variable is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY_RWW")

model_name = 'text-embedding-ada-002'

# Embedding client used for both documents and queries in later cells.
embed = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, model=model_name)

In [10]:
# Two throwaway sentences to sanity-check the embedding call.
texts = [
    'this is the first chunk of text',
    'then another second chunk of text is here',
]

res = embed.embed_documents(texts)

# (number of embeddings returned, dimensionality of the first one)
(len(res), len(res[0]))

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: The server had an error while processing your request. Sorry about that!.
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: The server had an error while processing your request. Sorry about that!.


(2, 1536)

<br /><br />

# vector database

https://docs.pinecone.io/docs/langchain-retrieval-augmentation#vector-database

In [11]:
import pinecone

# Credentials come from the environment; the literal placeholder strings are
# used only when a variable is unset or empty. The real API key and
# environment (cloud region) are shown in the console at app.pinecone.io.
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY') or 'PINECONE_API_KEY'
PINECONE_ENVIRONMENT = os.environ.get('PINECONE_ENVIRONMENT') or 'PINECONE_ENVIRONMENT'

index_name = 'pitchfork-rag'
pinecone.init(environment=PINECONE_ENVIRONMENT, api_key=PINECONE_API_KEY)

# Create the index only when it does not exist yet. Its dimension is taken
# from the sample embedding in `res` (1536 for text-embedding-ada-002).
existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
    pinecone.create_index(
        name=index_name,
        dimension=len(res[0]),
        metric='cosine',
    )

In [13]:
# Open a gRPC handle to the index named by `index_name`.
index = pinecone.GRPCIndex(index_name)

# Show the index statistics (dimension, fullness, namespaces, vector count).
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

<br /><br />

## indexing

https://docs.pinecone.io/docs/langchain-retrieval-augmentation#indexing

We can do the indexing using the LangChain vector store object, but for now it is much faster to do it via the Pinecone Python client directly. We'll do this in batches of 100 or more.

In [18]:
from tqdm.auto import tqdm
from uuid import uuid4
import time

# Flush to Pinecone once at least this many chunks have been buffered.
batch_limit = 100


def _upsert_batch(batch_texts, batch_metadatas):
    """Embed `batch_texts` and upsert the vectors into `index`.

    Each vector gets a fresh random UUID4 string as its id and is paired
    positionally with the matching entry of `batch_metadatas`. Because the
    ids are random, running this again adds new vectors rather than
    overwriting earlier ones.
    """
    ids = [str(uuid4()) for _ in batch_texts]
    embeds = embed.embed_documents(batch_texts)
    index.upsert(vectors=list(zip(ids, embeds, batch_metadatas)))


texts = []
metadatas = []

for record in tqdm(ds_combined):
    # Metadata shared by every chunk of this review. 'source' duplicates
    # 'url' -- presumably so the sources-aware QA chain can cite it; confirm.
    metadata = {
        'reviewid': record['reviewid'],
        'source': record['url'],
        'title': record['title'],
        'artist': record['artist'],
        'url': record['url'],
        'score': record['score'],
        'author': record['author'],
        'pub_date': record['pub_date']
    }

    # Split the review text into token-bounded chunks.
    record_texts = text_splitter.split_text(record['content'])

    # One metadata dict per chunk: its position, its raw text, and the shared
    # review-level fields.
    record_metadatas = [
        {"chunk": j, "content": text, **metadata}
        for j, text in enumerate(record_texts)
    ]

    # Buffer the chunks; a batch may exceed batch_limit because whole reviews
    # are appended before the size check.
    texts.extend(record_texts)
    metadatas.extend(record_metadatas)

    if len(texts) >= batch_limit:
        _upsert_batch(texts, metadatas)
        texts = []
        metadatas = []
        # Short pause between batches (presumably to ease API rate limits).
        time.sleep(1)

# Flush whatever is left over after the last full batch.
if texts:
    _upsert_batch(texts, metadatas)

100%|██████████| 18401/18401 [26:39<00:00, 11.50it/s] 


In [19]:
# Re-check the index statistics now that the indexing cell has run; the
# output reports the total number of vectors stored.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.2,
 'namespaces': {'': {'vector_count': 82408}},
 'total_vector_count': 82408}

<br /><br />

## creating a vector store + querying

Now that we've built our index, we can switch back over to LangChain. We start by initializing a vector store using the same index we just built.

In [20]:
from langchain.vectorstores import Pinecone

# Metadata key that holds each chunk's raw text (the indexing cell stores it
# under "content").
text_field = "content"

# switch back to normal index for langchain
index = pinecone.Index(index_name)

# Wrap the index as a LangChain vector store, passing `embed.embed_query` as
# the function used to embed queries.
vectorstore = Pinecone(
    index, embed.embed_query, text_field
)

### testing

In [21]:
# Free-text question used to probe the vector store (and reused by the QA
# chain cell further down).
query = "what is massive attack's music like?"

vectorstore.similarity_search(
    query,  # our search query
    k=3  # return 3 most relevant docs
)

[Document(page_content='For their first three albums, you could count on Massive Attack to make music that was as intense as it was graceful. As the moods of their albums gradually transitioned from refined soul to grimy abrasion on Blue Lines, Protection, and Mezzanine, they used that balance to toy with the emotional structure of their sound. The result was some of the decade\'s most haunting, forward-thinking music. Depending on how and when you listen, the same Massive Attack song can creep you out, fill you with sorrow, or send you into a deep reverie. The best ones do it all at once.Many fans consider what little music Massive Attack released since Mezzanine to be a retreat of sorts, and it\'s true that they may have lost something with each original member that split off-- namely the hip-hop sensibility of Andrew "Mushroom" Vowles and the frigid snarl of Grant "Daddy G"\xa0Marshall. Their next release, 2003\'s 100th Window, seemed like a creative holding pattern brought on by th

<br /><br />

# Generative Question-Answering

https://docs.pinecone.io/docs/langchain-retrieval-augmentation#generative-question-answering

In GQA we take the query as a question to be answered by an LLM, but the LLM must base its answer on the information returned from the vectorstore.

We also include the sources of information that the LLM uses to answer the question. We do this using a slightly different version of `RetrievalQA`: `RetrievalQAWithSourcesChain`.

In [24]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain

# Chat completion model; temperature 0.0 is requested for the answers.
llm = ChatOpenAI(
    temperature=0.0,
    model_name='gpt-3.5-turbo',
    openai_api_key=OPENAI_API_KEY,
)

# Retrieval QA chain that also reports sources, built with the "stuff" chain
# type over the vector store's retriever.
qa_with_sources = RetrievalQAWithSourcesChain.from_chain_type(
    retriever=vectorstore.as_retriever(),
    chain_type="stuff",
    llm=llm,
)

# Ask the question held in `query`; the result is a dict with 'question',
# 'answer' and 'sources' keys.
qa_with_sources(query)

{'question': "what is massive attack's music like?",
 'answer': "Massive Attack's music is intense, haunting, forward-thinking, and can evoke a range of emotions. Some fans consider their music since Mezzanine to be a retreat, and their albums are infrequent enough to be sold as events. Their debut album, Blue Lines, is still relevant and influential today. Their more recent releases have been criticized for being mediocre and falling flat. The Danny the Dog soundtrack features plodding downtempo and leftfield horse tranquilizers. \n",
 'sources': 'http://pitchfork.com/reviews/albums/13864-heligoland/, http://pitchfork.com/reviews/albums/14881-atlas-air-ep/, http://pitchfork.com/reviews/albums/17384-blue-lines-remastered-box-set/, http://pitchfork.com/reviews/albums/5136-danny-the-dog-ost/'}