In [1]:
!pip install -q langchain==0.0.150 pypdf pandas matplotlib tiktoken textract transformers openai faiss-cpu

You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m


In [11]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'

In [12]:
!nvidia-smi

Sun Jun 23 10:17:26 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla V100-SXM2-32GB           On  |   00000000:06:00.0 Off |                    0 |
| N/A   46C    P0            116W /  300W |   32365MiB /  32768MiB |     39%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla V100-SXM2-32GB           On  |   00

In [31]:
import torch

os.environ['CUDA_VISIBLE_DEVICES'] = '5'
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cpu"

In [19]:
import os
import re
import pandas as pd
import matplotlib.pyplot as plt
from transformers import GPT2TokenizerFast
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain

In [17]:
df = pd.read_csv("SPOTIFY_REVIEWS.csv")

In [20]:
def preprocess_text(text):
    if pd.isna(text):
        return ""
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text.lower()

df['cleaned_review_text'] = df['review_text'].astype(str).apply(preprocess_text)

In [34]:
df = df[:1000]
df

Unnamed: 0.1,Unnamed: 0,review_id,pseudo_author_id,author_name,review_text,review_rating,review_likes,author_app_version,review_timestamp,cleaned_review_text
0,0,14a011a8-7544-47b4-8480-c502af0ac26f,152618553977019693742,A Google user,Use it every day,5,1,1.1.0.91,2014-05-27 14:21:48,use it every day
1,1,bfa8876b-470e-4640-83a7-77427f7f37e8,234382942865437071667,A Google user,"I enjoy the awesome UI of this app, and it has...",5,4,1.1.0.91,2014-05-27 14:36:02,i enjoy the awesome ui of this app and it has ...
2,2,70e8252f-058a-47d9-b066-df9e1571c970,174473604608358796368,A Google user,Love it! Especially the new design!,5,2,1.1.0.91,2014-05-27 14:40:01,love it especially the new design
3,3,672a155a-e81e-4d28-bdeb-a74c031bc072,286593453219054880269,A Google user,"Awesome UI, best music app out there!",5,1,1.1.0.91,2014-05-27 15:17:20,awesome ui best music app out there
4,4,bbc1bf95-ed36-41a1-8b98-0f2e314caea5,167276875678680630145,A Google user,As a professional Android developer I'm glad t...,5,10,1.1.0.91,2014-05-27 15:26:48,as a professional android developer im glad to...
...,...,...,...,...,...,...,...,...,...,...
995,995,12c0d80a-84ac-49d5-a143-bb6efcd1d139,996404509657845998069,A Google user,what a frickin joke. subscription isnt recogni...,1,0,1.1.0.113,2014-05-29 03:40:40,what a frickin joke subscription isnt recognis...
996,996,564b8c9d-1ff6-4849-8817-c38f8e22a941,271634911259753744305,A Google user,In radio mode sometimes just shows a gray scre...,3,0,1.1.0.113,2014-05-29 03:41:45,in radio mode sometimes just shows a gray scre...
997,997,4f355cb6-40a8-483b-921b-afd1727e6464,507422578498859942392,A Google user,Plz add genre section in your music as well pl...,4,0,1.1.0.113,2014-05-29 03:43:43,plz add genre section in your music as well pl...
998,998,5f75446e-7831-4785-a79a-e7ad120aed6c,226983507757949968391,A Google user,I Loveeeee Ittt !!!!!!! ♥♥♥♡♡♡♡,5,0,1.1.0.113,2014-05-29 03:44:13,i loveeeee ittt


In [35]:
all_reviews = " ".join(df['cleaned_review_text'].dropna().tolist())

In [36]:
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

def count_tokens(text: str) -> int:
    return len(tokenizer.encode(text))

In [37]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 512,
    chunk_overlap = 24,
    length_function = count_tokens,
)

In [38]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)



In [62]:
from tqdm import tqdm

all_chunks = []

for review in tqdm(df['cleaned_review_text']):
    chunks = text_splitter.create_documents([review])
    all_chunks.extend(chunks)

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:02<00:00, 493.99it/s]


In [75]:
all_chunks = []
all_chunks.extend(text_splitter.create_documents([all_reviews]))

In [76]:
chunk_texts = [chunk.page_content for chunk in all_chunks]

embeddings = embedding_model.encode(chunk_texts, show_progress_bar=True)

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

In [77]:
import faiss
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

In [78]:
import numpy as np

In [79]:
# Save embeddings
np.save('embeddings_langchain.npy', embeddings)

# Save FAISS index
faiss.write_index(index, 'faiss_index_langchain.idx')

In [80]:
def similarity_search(query, k=5):
    query_embedding = embedding_model.encode([query])
    distances, indices = index.search(query_embedding, k)
    return [all_chunks[i] for i in indices[0]]

def answer_query(query):
    docs = similarity_search(query)
    doc_texts = [doc.page_content for doc in docs]
    query_embedding = embedding_model.encode(query)
    doc_embeddings = embedding_model.encode(doc_texts)
    hits = util.semantic_search(query_embedding, doc_embeddings, top_k=1)
    best_doc_idx = hits[0][0]['corpus_id']
    best_answer = doc_texts[best_doc_idx]
    return best_answer

In [81]:
from sentence_transformers import SentenceTransformer, util

In [82]:
query = "What do people like about Spotify?"
answer = answer_query(query)
print(answer)

i hate that it best music app by far yeah awsome it works i love it been using it since since it came out i absolutely love listening to spotify as a usa solider its great when back at the barricks love this stuff es la mejor app de musica en stream great app love it  lt3 it is great love it i think its not so difficult to develop a widget that could be resized its really annoying spotify was hit by a security breach so they had to change the app do people even read even though i have to redownload 1226 songs its still a great monthly purchase have to download playlist again bullshit love it everyone on the internet is a baby best music service app ive ever paid for the new updated app will not let me sign in using my prior info or sign in by fb login have wasted the last 20mins trying to get it to wk im not wasting anymore time on it deleting app which i really like prior love it  i use spotify on all my devices i like that playlists sync across all platforms but you cannot remove tra

In [83]:
docs = similarity_search(query)
docs

[Document(page_content='i hate that it best music app by far yeah awsome it works i love it been using it since since it came out i absolutely love listening to spotify as a usa solider its great when back at the barricks love this stuff es la mejor app de musica en stream great app love it  lt3 it is great love it i think its not so difficult to develop a widget that could be resized its really annoying spotify was hit by a security breach so they had to change the app do people even read even though i have to redownload 1226 songs its still a great monthly purchase have to download playlist again bullshit love it everyone on the internet is a baby best music service app ive ever paid for the new updated app will not let me sign in using my prior info or sign in by fb login have wasted the last 20mins trying to get it to wk im not wasting anymore time on it deleting app which i really like prior love it  i use spotify on all my devices i like that playlists sync across all platforms b