In [8]:
#!pip uninstall typing-extensions
#!pip uninstall openai

!pip install typing-extensions --upgrade
!pip install openai --upgrade

You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m


In [9]:
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from tqdm import tqdm
import openai
import streamlit as st
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [10]:
data = pd.read_csv('SPOTIFY_REVIEWS.csv', index_col=0)

In [11]:
def preprocess_text(text):
    if pd.isna(text):
        return ""
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text.lower()

data['cleaned_review_text'] = (data['review_text'].astype(str)).apply(preprocess_text)

In [12]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'

In [13]:
!nvidia-smi

Sun Jun 23 04:43:34 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla V100-SXM2-32GB           On  |   00000000:06:00.0 Off |                    0 |
| N/A   44C    P0            128W /  300W |   31439MiB /  32768MiB |     47%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla V100-SXM2-32GB           On  |   00

In [14]:
import torch

os.environ['CUDA_VISIBLE_DEVICES'] = '7'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [15]:
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

In [16]:
def encode_in_batches(texts, batch_size=512):
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Encoding batches"):
        batch_texts = texts[i:i + batch_size]
        batch_embeddings = model.encode(batch_texts)
        embeddings.append(batch_embeddings)
    return np.vstack(embeddings)

In [17]:
embeddings = encode_in_batches(data['cleaned_review_text'].tolist(), batch_size=512)

Encoding batches: 100%|█████████████████████████████████████████████████████████████| 6597/6597 [28:54<00:00,  3.80it/s]


In [18]:
d = embeddings.shape[1]
index = faiss.IndexFlatL2(d)
index.add(np.array(embeddings))

In [21]:
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

def score_answer(answer, relevant_reviews):
    length_score = len(answer)
    
    answer_embedding = model.encode([answer], device=device)[0]
    relevant_embeddings = model.encode(relevant_reviews, device=device)
    relevance_scores = cosine_similarity([answer_embedding], relevant_embeddings).flatten()
    relevance_score = relevance_scores.mean()
    
    sentiment_scores = [sid.polarity_scores(review)['compound'] for review in relevant_reviews]
    sentiment_score = np.mean(sentiment_scores)
    
    score = (length_score * 0.1) + (relevance_score * 0.6) + (sentiment_score * 0.3)
    
    return score

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [22]:
np.save('embeddings.npy', embeddings)

faiss.write_index(index, 'faiss_index.idx')

data.to_pickle('data.pkl')

2024-06-23 05:14:25.285 
  command:

    streamlit run /usr/local/lib/python3.8/dist-packages/ipykernel_launcher.py [ARGUMENTS]


DeltaGenerator()

In [23]:
st.title("Spotify Reviews Q&A Chatbot")

query = st.text_input("Enter your question here:")

if st.button("Get Answer"):
    if query:
        relevant_reviews = retrieve_reviews(query)
        if not relevant_reviews.empty:
            context = " ".join(relevant_reviews['cleaned_review_text'].tolist())
            answer = generate_answer_openai(query, context)
            score = score_answer(answer, relevant_reviews['cleaned_review_text'].tolist())
            st.text_area("Answer", f"{answer} (Score: {score})")
        else:
            st.write("No relevant reviews found.")
    else:
        st.write("Please enter a question.")

In [81]:
import os
from openai import OpenAI

openai.api_key = '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'

client = OpenAI(
    api_key=openai.api_key,
)

In [82]:
def generate_answer_openai(question, context):
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": f"Context: {context}\n\nQuestion: {question}\nAnswer:"}
        ],
        max_tokens=150,
        temperature=0.7,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    return response['choices'][0]['message']['content'].strip()

def retrieve_reviews(query, top_k=5):
    query_embedding = model.encode([query], device='cuda')  # Ensure GPU usage
    distances, indices = index.search(np.array(query_embedding), top_k)
    return data.iloc[indices[0]]

def answer_query(query):
    try:
        relevant_reviews = retrieve_reviews(query)
        if relevant_reviews.empty:
            return "No relevant reviews found."
        context = " ".join(relevant_reviews['cleaned_review_text'].tolist())
        answer = generate_answer_openai(query, context)
        score = score_answer(answer, relevant_reviews['cleaned_review_text'].tolist())
        return f"{answer} (Score: {score})"
    except Exception as e:
        return f"Error: {str(e)}"

In [83]:
query = "What are the specific features or aspects that users appreciate the most in our application?"

if query:
    relevant_reviews = retrieve_reviews(query)
    if not relevant_reviews.empty:
        context = " ".join(relevant_reviews['cleaned_review_text'].tolist())
        answer = generate_answer_openai(query, context)
        score = score_answer(answer, relevant_reviews['cleaned_review_text'].tolist())
        print(answer)

AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-Skeju***************************************Bo0e. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}