In [1]:
!pip install faiss-cpu
!pip install streamlit
!pip install pandas
!pip install re
!pip install transformers
!pip install sentence_transformers
!pip install numpy
!pip install nltk
!pip install tqdm

Collecting faiss-cpu
  Downloading faiss_cpu-1.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
     |████████████████████████████████| 27.0 MB 2.8 MB/s            
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.8.0
You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m
Collecting streamlit
  Downloading streamlit-1.36.0-py2.py3-none-any.whl (8.6 MB)
     |████████████████████████████████| 8.6 MB 3.1 MB/s            
[?25hCollecting pandas<3,>=1.3.0
  Downloading pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.4 MB)
     |████████████████████████████████| 12.4 MB 68.6 MB/s            
[?25hCollecting blinker<2,>=1.0.0
  Downloading blinker-1.8.2-py3-none-any.whl (9.5 kB)
Collecting pydeck<1,>=0.8.0b4
  Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
     |████████████████████████████████| 6.9 MB 59.6 MB/s            
[?25hCollecting altair<6,>=4.0
  Download

In [2]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'

In [3]:
!nvidia-smi

Sun Jun 23 09:04:07 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla V100-SXM2-32GB           On  |   00000000:06:00.0 Off |                    0 |
| N/A   45C    P0            115W /  300W |   31193MiB /  32768MiB |     38%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla V100-SXM2-32GB           On  |   00

In [4]:
import torch

os.environ['CUDA_VISIBLE_DEVICES'] = '7'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = "cpu"

In [5]:
import streamlit as st
import pandas as pd
import re
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [6]:
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

embedding_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)

data = pd.read_csv('SPOTIFY_REVIEWS.csv', index_col=0)

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [7]:
def preprocess_text(text):
    if pd.isna(text):
        return ""
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text.lower()

data['cleaned_review_text'] = data['review_text'].astype(str).apply(preprocess_text)

In [10]:
data = pd.read_pickle('data.pkl')

embeddings = np.load('embeddings.npy')

index = faiss.read_index('faiss_index.idx')

In [53]:
def generate_text(prompt, max_length=128):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(device)
    outputs = gpt_model.generate(
        inputs['input_ids'], 
        attention_mask=inputs['attention_mask'], 
        max_length=max_length, 
        pad_token_id=tokenizer.eos_token_id,
        num_return_sequences=1
    )
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return text

def generate_answer(question, context):
    prompt = f"Question: {question}\nContext: {context}\nAnswer:"
    answer = generate_text(prompt)
    return answer

def retrieve_reviews(query, top_k=5):
    query_embedding = embedding_model.encode([query])
    distances, indices = index.search(np.array(query_embedding), top_k)
    return data.iloc[indices[0]]

def get_relevant_reviews(query, top_k=5):
    query_embedding = embedding_model.encode([query], device=device)
    _, indices = index.search(query_embedding, top_k)
    relevant_reviews = data['cleaned_review_text'].iloc[indices[0]].tolist()
    return relevant_reviews

def score_answer(answer, relevant_reviews):
    length_score = len(answer)
    answer_embedding = embedding_model.encode([answer], device=device)[0]
    relevant_embeddings = embedding_model.encode(relevant_reviews, device=device)
    relevance_scores = cosine_similarity([answer_embedding], relevant_embeddings).flatten()
    relevance_score = relevance_scores.mean()
    sentiment_scores = [sid.polarity_scores(review)['compound'] for review in relevant_reviews]
    sentiment_score = np.mean(sentiment_scores)
    score = (length_score * 0.1) + (relevance_score * 0.6) + (sentiment_score * 0.3)
    return score

In [54]:
tokenizer.pad_token = tokenizer.eos_token

In [55]:
relevant_reviews = get_relevant_reviews("What are the specific features or aspects that users appreciate the most in our application?")
relevant_reviews

['nice app  features etc ',
 'best app features',
 'what can say i love this application',
 'completely personalized application with lots of features love it',
 'i have to explore the features of the app more']

In [56]:
context = " ".join(relevant_reviews)
context

'nice app  features etc  best app features what can say i love this application completely personalized application with lots of features love it i have to explore the features of the app more'

In [57]:
answer = generate_answer(query, context)
answer

'Question: What are the specific features or aspects that users appreciate the most in our application?\nContext: nice app  features etc  best app features what can say i love this application completely personalized application with lots of features love it i have to explore the features of the app more\nAnswer: What are the specific features or aspects that users appreciate the most in our application?\nContext: nice app  features etc  best app features what can say i love this application completely personalized application with lots of features love it i have to explore the features of the app more\nAnswer: What are the specific features or aspects that users appreciate the most'

In [17]:
query = "What are the specific features or aspects that users appreciate the most in our application?"

if query:
    relevant_reviews = get_relevant_reviews(query)
    if relevant_reviews:
        context = " ".join(relevant_reviews)
        answer = generate_answer(query, context)
        score = score_answer(answer, relevant_reviews)
        print("Answer: {answer}")
        print("Score: {score}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


KeyboardInterrupt: 

In [None]:
st.title("Spotify Reviews Q&A Chatbot")

query = st.text_input("Enter your question here:")

if st.button("Get Answer"):
    if query:
        relevant_reviews = get_relevant_reviews(query)
        if relevant_reviews:
            context = " ".join(relevant_reviews)
            answer = generate_answer(query, context)
            score = score_answer(answer, relevant_reviews)
            st.text_area("Answer", f"{answer} (Score: {score})")
        else:
            st.write("No relevant reviews found.")
    else:
        st.write("Please enter a question.")