In [1]:
import pandas as pd

# Load the quotes dataset from its hf:// path into a DataFrame;
# lines=True makes pandas parse the file as JSON Lines (one record per line).
df = pd.read_json("hf://datasets/Abirate/english_quotes/quotes.jsonl", lines=True)

In [None]:
# Check keys and structure: column names, dtypes and non-null counts,
# then a small rich-rendered sample instead of a plain-text dump of the frame
df.info()
df.head()

In [None]:
# Remove rows with missing values. Rebinding (instead of inplace=True) keeps the
# cell safe to re-run, and the fresh 0..n-1 index keeps row labels equal to the
# row positions that the embedding/FAISS step further down relies on.
df = df.dropna().reset_index(drop=True)

# Strip unwanted characters and normalize. The curly quotes are removed *before*
# stripping so whitespace that sat just inside them is trimmed as well.
df['quote'] = (
    df['quote']
    .str.lower()
    .str.replace("“|”", "", regex=True)
    .str.strip()
)
# The comma is a plain literal, so no regex is needed; strip last for the same
# reason as above (e.g. "name ," must not keep its trailing space).
df['author'] = (
    df['author']
    .str.lower()
    .str.replace(",", "", regex=False)
    .str.strip()
)


def _flatten_tags(tags):
    """Join a list of tags into "a, b, c"; pass an already-joined string through."""
    if isinstance(tags, list):
        return ", ".join(tags)
    # Previously every non-list value became "", so re-running this cell
    # silently wiped all tags. Strings are now kept as they are.
    return tags if isinstance(tags, str) else ""


# (Optional) Flatten tags into a string for search purposes
df['tags'] = df['tags'].apply(_flatten_tags)

# Confirm changes
df.head()

In [None]:
!pip install sentence-transformers datasets

In [5]:
from sentence_transformers import InputExample

# Build the training pairs: each tag of a quote is used as a pseudo-query
# ("quotes about <tag>") and paired with that quote's text.
examples = [
    InputExample(texts=[f"quotes about {label}", quote_text])
    for tag_string, quote_text in zip(df['tags'], df['quote'])
    for label in (piece.strip() for piece in tag_string.split(','))
    if label  # Skip empty strings
]

In [6]:
print(examples[0].texts)
# Output: ['quotes about be-yourself', 'be yourself; everyone else is already taken.']

['quotes about be-yourself', 'be yourself; everyone else is already taken.']


In [7]:
import os
# Set WANDB_DISABLED before the training cell runs; presumably this keeps the
# Weights & Biases integration from logging or asking for a login — TODO confirm
os.environ["WANDB_DISABLED"] = "true"

In [None]:
from sentence_transformers import SentenceTransformer, losses, SentencesDataset, models
from sentence_transformers.datasets import NoDuplicatesDataLoader
from torch.utils.data import DataLoader

# Use a pre-trained base model
model = SentenceTransformer('all-MiniLM-L6-v2')

# DataLoader
# MultipleNegativesRankingLoss treats every other pair in a batch as a negative.
# In this data many quotes share a tag (identical "quotes about <tag>" query) and
# each quote appears once per tag, so a plain shuffled DataLoader regularly puts
# true matches into the same batch and trains on them as negatives.
# NoDuplicatesDataLoader also shuffles, but never places the same text twice in
# one batch, which removes those direct false negatives.
train_dataloader = NoDuplicatesDataLoader(examples, batch_size=16)
train_loss = losses.MultipleNegativesRankingLoss(model)

# Fine-tune
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)

# Save the model
model.save("quote-retriever-model")

In [None]:
!pip install faiss-cpu

In [None]:
import faiss
import numpy as np

# Embed every cleaned quote with the fine-tuned model
quote_texts = df['quote'].tolist()
quote_embeddings = model.encode(quote_texts, show_progress_bar=True)

# Build a flat L2 index whose width matches the embedding size,
# then add the vectors in DataFrame row order
dimension = quote_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(quote_embeddings))

# Persist the index next to the notebook
faiss.write_index(index, "quotes.index")

In [None]:
import openai

# Keep the real key out of the notebook: read it from the environment and,
# if it is not set, ask for it interactively without echoing it.
import os
from getpass import getpass

openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

def generate_answer(context, query):
    """Ask the chat model to answer ``query`` given ``context``.

    Args:
        context: Text inserted after ``Context:`` in the prompt.
        query: The question inserted after ``Question:`` in the prompt.

    Returns:
        The first choice's message text with surrounding whitespace removed,
        or an empty string when the response carries no text.
    """
    prompt = f"""
Context: {context}
Question: {query}
Answer:
"""
    messages = [{"role": "user", "content": prompt}]

    # openai>=1.0 removed ``openai.ChatCompletion``; its replacement is the
    # module-level ``openai.chat.completions`` client, which still honours
    # ``openai.api_key``. Older installs only offer the legacy entry point.
    if hasattr(openai, "OpenAI"):
        create = openai.chat.completions.create
    else:
        create = openai.ChatCompletion.create

    response = create(model="gpt-4", messages=messages)

    # ``content`` can be None, so guard before calling ``strip``.
    content = response.choices[0].message.content
    return (content or "").strip()

In [None]:
!pip install streamlit

In [13]:
%%writefile quote_app.py
import streamlit as st
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# Load model and data
@st.cache_resource
def load_model_and_data():
    """Load the fine-tuned encoder, the quotes table and the FAISS index.

    Decorated with ``st.cache_resource`` so the objects are created once and
    reused instead of being rebuilt on every script rerun.

    Returns:
        tuple: ``(model, df, index)`` — the SentenceTransformer, the quotes
        DataFrame and the FAISS index read from disk.
    """
    model = SentenceTransformer('/content/quote-retriever-model')
    # The index was built in the notebook from the dataset *after* rows with
    # missing values had been dropped, and a search returns row positions.
    # Drop the same rows here so position i in this frame is the quote behind
    # vector i in the index.
    df = (
        pd.read_json("hf://datasets/Abirate/english_quotes/quotes.jsonl", lines=True)
        .dropna()
        .reset_index(drop=True)
    )
    index = faiss.read_index("/content/quotes.index")  # Proper FAISS load
    return model, df, index

# The loader's third return value is the FAISS index (not embeddings); use the
# cached object directly instead of reading the index from disk again on every rerun.
model, df, index = load_model_and_data()

st.title("📚 Quote Retrieval App (RAG-style)")
query = st.text_input("Ask something like: quotes about courage by Oscar Wilde")

if query:
    query_embedding = model.encode([query])
    # D holds the distances, I the row positions of the nearest quotes.
    D, I = index.search(query_embedding, k=5)

    # FAISS fills unused result slots with -1; iloc would read -1 as "last row",
    # so drop those slots before the lookup.
    hit_positions = [int(pos) for pos in I[0] if pos >= 0]
    results = df.iloc[hit_positions]
    for _, row in results.iterrows():
        st.write(f"**Quote:** {row['quote']}")
        st.write(f"_Author:_ {row['author']}")
        st.write(f"_Tags:_ {row['tags']}")
        st.markdown("---")

Writing quote_app.py


In [14]:
# Install the localtunnel CLI globally with npm (it is run via npx in a later cell)
!npm install -g localtunnel

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K
added 22 packages in 5s
[1G[0K⠸[1G[0K
[1G[0K⠸[1G[0K3 packages are looking for funding
[1G[0K⠸[1G[0K  run `npm fund` for details
[1G[0K⠸[1G[0K

In [15]:
# Start Streamlit in the background
# (stdout and stderr are redirected to /content/logs.txt; the trailing & returns
# control to the notebook instead of blocking on the server)
!streamlit run quote_app.py &>/content/logs.txt &
# Fetch loca.lt's "mytunnelpassword" endpoint and print the result (an IP address
# in the recorded output) — presumably the password the tunnel page asks for
!curl https://loca.lt/mytunnelpassword
# Wait briefly before tunneling
import time
time.sleep(5)

# Expose port 8501 using localtunnel
# NOTE(review): 8501 is assumed to be the port the Streamlit app listens on — confirm
!npx localtunnel --port 8501

34.80.88.171[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0Kyour url is: https://little-melons-grin.loca.lt
^C


In [16]:
# Same request as in the tunnel cell, run on its own to print the value again
!curl https://loca.lt/mytunnelpassword

34.80.88.171

In [18]:
import shutil

# Pack the saved model directory into a zip archive next to it
# (replace both paths with your own folder if it lives elsewhere)
shutil.make_archive(
    base_name='/content/quote-retriever-model',
    format='zip',
    root_dir='/content/quote-retriever-model',
)

# Hand the archive to the browser via Colab's download helper
from google.colab import files
files.download('/content/quote-retriever-model.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>