In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sentiment-df/df_sentiment.csv


# 4. Topic Modeling

In [2]:
df = pd.read_csv("/kaggle/input/sentiment-df/df_sentiment.csv")
df.tail(3)

Unnamed: 0,review_cleaned_text,sentiment_label
10337,best food ever have been trying hard to find ...,positive
10338,awesome dive bar citywides here looooove love...,positive
10339,i had the crispy chicken wrap with dijon and f...,positive


**Preprocessing**

In [3]:
import pandas as pd
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load Spacy for lemmatization
nlp = spacy.load('en_core_web_sm')

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

# Load English stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Function for preprocessing the review text
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    
    # Remove any special characters or digits
    text = re.sub(r'[^a-z\s]', '', text)
    
    # Tokenize the text using spaCy
    doc = nlp(text)
    
    # Remove stopwords, short words, and lemmatize
    tokens = [token.lemma_ for token in doc if token.text not in stop_words and len(token.text) > 2]
    
    # Join the tokens back into a single string
    return ' '.join(tokens)

In [5]:
# Apply preprocessing to the review_cleaned_text column
df['review_preprocessed_text'] = df['review_cleaned_text'].apply(preprocess_text)

In [6]:
reviews = df.review_preprocessed_text

In [7]:
df.head(8)

Unnamed: 0,review_cleaned_text,sentiment_label,review_preprocessed_text
0,good ambience\n\ngood service\n\nordered \n\nf...,positive,good ambience good service order fish pepper c...
1,good ambience\n\ngood service\n\nordered \n\nf...,positive,good ambience good service order fish pepper c...
2,good ambience\n\ngood service\n\nordered \n\nf...,positive,good ambience good service order fish pepper c...
3,good ambience\n\ngood service\n\nordered \n\nf...,positive,good ambience good service order fish pepper c...
4,good ambience\n\ngood service\n\nordered \n\nf...,positive,good ambience good service order fish pepper c...
5,this place was good but was nothing amazing at...,positive,place good nothing amazing least order first v...
6,this place was good but was nothing amazing at...,positive,place good nothing amazing least order first v...
7,this place was good but was nothing amazing at...,positive,place good nothing amazing least order first v...


**Topic Modeling with LDA**

In [8]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# TF-IDF Vectorization of review texts
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X = vectorizer.fit_transform(reviews)

# Fit the LDA model
lda = LatentDirichletAllocation(n_components=3, random_state=42)  # n_components: number of topics
lda.fit(X)

# Display the top words for each topic
n_words = 10  # Top 10 words per topic
feature_names = np.array(vectorizer.get_feature_names_out())

for topic_idx, topic in enumerate(lda.components_):
    print(f"Topic {topic_idx + 1}:")
    print(" ".join([feature_names[i] for i in topic.argsort()[:-n_words - 1:-1]]))


Topic 1:
food good time order place come service try eat like
Topic 2:
great food place good drink service wait bar friendly love
Topic 3:
good great chicken oyster fry cheese order food like shrimp


**Topic Modeling with BERTopic**

In [9]:
pip install bertopic

Collecting bertopic
  Downloading bertopic-0.16.4-py3-none-any.whl.metadata (23 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.40-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn>=0.5.0->bertopic)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading bertopic-0.16.4-py3-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.7/143.7 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading hdbscan-0.8.40-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m54.2 MB/s[0m eta [36m0:

In [10]:
from bertopic import BERTopic
from umap import UMAP

# Set up UMAP with the n_neighbors parameter
umap_model = UMAP(n_neighbors=15)  # Adjust n_neighbors as needed

# Initialize the BERTopic model
topic_model = BERTopic(language="english", umap_model=umap_model, top_n_words=3)

# Fit the model
topics, probs = topic_model.fit_transform(reviews)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [11]:
# Get a list of valid topic indices (excluding the -1 label for "no topic")
valid_topics = [i for i in set(topics) if i != -1]

# Extract and display the top words for each valid topic
for i in valid_topics:
    # Get the words from the topic, ignoring the scores
    top_words = [word for word, _ in topic_model.get_topic(i)]
    print(f"Topic {i}: {', '.join(top_words)}")

Topic 0: creamy, hubby, breakfast
Topic 1: vegan, vegetarian, winner
Topic 2: game, marvel, tribe
Topic 3: manager, pay, iconic
Topic 4: tour, art, museum
Topic 5: casual, macadamia, pricing
Topic 6: scott, job, kebab
Topic 7: cream, ice, honeycomb
Topic 8: zoo, nonvegetarian, overachieve
Topic 9: runner, horrible, slider
Topic 10: caprese, bruschetta, ranch
Topic 11: sandwich, coworker, balt
Topic 12: parc, starr, bourguignon
Topic 13: actively, misunderstanding, disrespectful
Topic 14: untoasted, latte, muffin
Topic 15: tortillas, guacamole, habanerobase
Topic 16: spa, massage, celebration
Topic 17: prompter, slow, lounge
Topic 18: hotel, wallscould, hall
Topic 19: bib, easter, recommendedseriously
Topic 20: seasoning, soft, standard
Topic 21: tucson, polish, cottage
Topic 22: tampa, bay, donatello
Topic 23: poboy, filler, superb
Topic 24: yuppy, extraordinarily, hipster
Topic 25: ferg, dill, aisle
Topic 26: boliche, mentally, constant
Topic 27: voodoo, bone, vinegar
Topic 28: blane,

In [12]:
df['Topic'] = topics

# You can use the top words or manually map labels if needed
topic_labels = {}

# Extract the top words for each valid topic and create labels
for i in valid_topics:
    top_words = [word for word, _ in topic_model.get_topic(i)]
    topic_labels[i] = ', '.join(top_words)  # Assign the top words as the label

# Map the topic numbers to their labels and add them to the DataFrame
df['Topic Label'] = df['Topic'].map(topic_labels).fillna("No Topic")

In [13]:
df.drop('Topic',axis=1,inplace=True)

# 5. Emotion Detection 
Using Pretrained Huggingface model ---SamLowe/roberta-base-go_emotions

In [14]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, pipeline

# Load the pre-trained model and tokenizer from Hugging Face
tokenizer = RobertaTokenizer.from_pretrained('SamLowe/roberta-base-go_emotions')
model = RobertaForSequenceClassification.from_pretrained('SamLowe/roberta-base-go_emotions')

# Create a Hugging Face pipeline for emotion detection
emotion_model = pipeline('text-classification', model=model, tokenizer=tokenizer,device=0,truncation=True)


tokenizer_config.json:   0%|          | 0.00/380 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [15]:
# Function to detect emotion
def detect_emotion(review_text):
    result = emotion_model(review_text)
    predicted_class = result[0]['label']
    return predicted_class

In [16]:
# Apply the emotion detection to the 'review_cleaned_text' column
df['Emotion'] = df['review_cleaned_text'].apply(detect_emotion)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [17]:
# Display the updated DataFrame with emotion labels
df[['review_cleaned_text', 'Emotion']]

Unnamed: 0,review_cleaned_text,Emotion
0,good ambience\n\ngood service\n\nordered \n\nf...,admiration
1,good ambience\n\ngood service\n\nordered \n\nf...,admiration
2,good ambience\n\ngood service\n\nordered \n\nf...,admiration
3,good ambience\n\ngood service\n\nordered \n\nf...,admiration
4,good ambience\n\ngood service\n\nordered \n\nf...,admiration
...,...,...
10335,i ate here a few times before and was very ple...,admiration
10336,best food ever have been trying hard to find ...,admiration
10337,best food ever have been trying hard to find ...,admiration
10338,awesome dive bar citywides here looooove love...,love


In [18]:
df['Emotion'].value_counts()

Emotion
admiration        5681
disappointment     812
neutral            782
love               763
joy                621
approval           322
disapproval        243
annoyance          208
gratitude          175
disgust            106
amusement          103
desire              89
excitement          75
surprise            73
confusion           54
remorse             50
sadness             46
optimism            39
realization         30
fear                29
curiosity           23
caring               9
embarrassment        6
nervousness          1
Name: count, dtype: int64

# 6. Automatic Response Generation 

In [19]:
pip install openai==0.28

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
Successfully installed openai-0.28.0
Note: you may need to restart the kernel to use updated packages.


In [20]:
import openai

In [None]:
#set up the api key
openai.api_key="(Your Secret Key)"

In [22]:
def generate_response(review_text, sentiment, topic):
    # Define the conversation with system, user, and assistant roles
    messages = [
        {"role": "system", "content": "You are a helpful customer service assistant."},
        {"role": "user", "content": f"Review: {review_text}\nSentiment: {sentiment}\nTopic: {topic}"},
    ]
    
    # Make the API request using the chat model
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",  # Using the chat model
        messages=messages,  # Pass the conversation history
        max_tokens=100,  # Limit the response length
        temperature=0.7,  # Control creativity in response generation
        n=1  # Generate 1 response
    )
    
    return response.choices[0].message["content"].strip()

In [23]:
df

Unnamed: 0,review_cleaned_text,sentiment_label,review_preprocessed_text,Topic Label,Emotion
0,good ambience\n\ngood service\n\nordered \n\nf...,positive,good ambience good service order fish pepper c...,"basil, spicy, yom",admiration
1,good ambience\n\ngood service\n\nordered \n\nf...,positive,good ambience good service order fish pepper c...,"basil, spicy, yom",admiration
2,good ambience\n\ngood service\n\nordered \n\nf...,positive,good ambience good service order fish pepper c...,"basil, spicy, yom",admiration
3,good ambience\n\ngood service\n\nordered \n\nf...,positive,good ambience good service order fish pepper c...,"basil, spicy, yom",admiration
4,good ambience\n\ngood service\n\nordered \n\nf...,positive,good ambience good service order fish pepper c...,"basil, spicy, yom",admiration
...,...,...,...,...,...
10335,i ate here a few times before and was very ple...,positive,eat time pleased large selection entree servic...,"app, lakeside, cancel",admiration
10336,best food ever have been trying hard to find ...,positive,good food ever try hard find good mexican rest...,"eggbacon, mimosa, queso",admiration
10337,best food ever have been trying hard to find ...,positive,good food ever try hard find good mexican rest...,"eggbacon, mimosa, queso",admiration
10338,awesome dive bar citywides here looooove love...,positive,awesome dive bar citywide looooove love love l...,"bros, booze, cheap",love


In [24]:
response = generate_response(df['review_cleaned_text'][8], df['sentiment_label'][8], df['Topic Label'][8])

In [25]:
df['review_cleaned_text'][8]

'this place was good but was nothing amazing at least from what i ordered  it was my first visit this afternoon  i was here for restaurant week\n\nmy first course was the wonton soup  good but nothing special  i then received the chili aioli rock shrimp  this was a tasty piece and was presented nicely\n\nthe main entree was the cumin beef  i had no issue with the quality and it was very good but i felt the dish did not distinguish itself from what you could get in a chinese takeout spot  everything was cooked well and you could definitely taste the spice when you first sample it  it was a good dish and i finished it but it was nothing that i was really impressed with  i finished off with a tiramisu and this was also good but nothing fantastic  \n\nthe one nice thing with this place is they brew there own tea and so you wont be getting the lipton stuff when order one  \n\nmy rating does not reflect any negative experiences with this venue  i just was not too impressed with the flavor of

In [26]:
# Display generated responses
response

"It sounds like you had a decent dining experience at the restaurant, but you were not overly impressed with the dishes you ordered. It's great that you appreciated the presentation of the chili aioli rock shrimp and enjoyed the homemade tea. Since it was your first visit during restaurant week, perhaps trying different menu items in the future might lead to a more satisfying experience. Thank you for sharing your feedback, and I hope your next visit will be even more enjoyable! If you have any questions or need recommendations for"

**Templates for Automation**

In [27]:
response_templates = {
    "positive": {
        "food quality": "Thank you for your positive feedback about the food! We're thrilled you enjoyed it.",
        "service": "We're so happy to hear you had a great experience with our service!",
        "ambiance": "Thank you for your kind words about the ambiance. We're glad you felt comfortable here!"
    },
    "negative": {
        "food quality": "We're sorry to hear the food didn't meet your expectations. We will work on improving our menu.",
        "service": "We apologize for the slow service and will ensure better attention in the future.",
        "noise": "We're sorry to hear that the noise affected your experience. We'll consider making improvements to the atmosphere."
    }
}

In [28]:
def generate_template_response(review,sentiment, topic):
    # Select appropriate template based on sentiment and topic
    try:
        return response_templates[sentiment][topic]
    except KeyError:
        return generate_response(review, sentiment, topic)

In [29]:
# # Apply the template-based response function
# template_response = df.apply(
#     lambda row: generate_template_response(row['review_cleaned_text'],row['sentiment_label'], row['Topic Label']),
#     axis=1
# )


#### Because of the Openai API constraints i have not run the above code.