In [1]:
import pandas as pd
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kk196\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kk196\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
import pandas as pd
df = pd.read_csv("Kaggle-reviews.csv")

In [3]:
df

Unnamed: 0,Reviews,Timeframe
0,Taste of the food was average. Wash room and d...,]
1,Ordered online waste of money taste was so bad...,a week ago
2,Worst experience...not for safety\nin human li...,3 months ago
3,Ordered through online app. Food was average,4 months ago
4,One of the cheap and best hotel in coimbatore....,6 months ago
...,...,...
135,��😘 …,4 years ago
136,Not Statistfied,3 years ago
137,taste good,2 years ago
138,Good service,4 years ago


In [4]:
import emoji

# Function to convert emojis to words
def convert_emojis_to_text(text):
    return emoji.demojize(text, delimiters=("", ""))

df['Reviews'] = df['Reviews'].apply(convert_emojis_to_text)

In [5]:
df.head()

Unnamed: 0,Reviews,Timeframe
0,Taste of the food was average. Wash room and d...,]
1,Ordered online waste of money taste was so bad...,a week ago
2,Worst experience...not for safety\nin human li...,3 months ago
3,Ordered through online app. Food was average,4 months ago
4,One of the cheap and best hotel in coimbatore....,6 months ago


In [6]:
# Train BERTopic with a custom CountVectorizer
vectorizer_model = CountVectorizer(ngram_range=(1, 3), stop_words="english")

model = BERTopic(verbose=True,vectorizer_model=vectorizer_model,embedding_model='paraphrase-MiniLM-L3-v2', min_topic_size= 7)

In [7]:
headline_topics, probs = model.fit_transform(df.Reviews)

2024-03-28 14:27:57,073 - BERTopic - Embedding - Transforming documents to embeddings.
Batches:   0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
freq = model.get_topic_info()
print("Number of topics: {}".format( len(freq)))
freq.head()

Number of topics: 7


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,29,-1_chicken_good_food_rice,"[chicken, good, food, rice, biriyani, half, po...","[Cheating,i ordered for chicken biryani 2 nos ..."
1,0,31,0_good_food_price_reasonable,"[good, food, price, reasonable, non, tasty, af...",[Food are amazing with reasonable price. Chine...
2,1,25,1_bad_service_good_liked,"[bad, service, good, liked, services, better, ...","[Service for parcel to be taken care, Literall..."
3,2,23,2_worst_bad_food_taste,"[worst, bad, food, taste, service, worst food,...",[Worst hotel worst food taste and quality.... ...
4,3,13,3_hotel_chola_poor_best hotel,"[hotel, chola, poor, best hotel, don try, pls,...","[Before 5 years, Chola is the best hotel serve..."


In [None]:
a_topic = freq.iloc[1]["Topic"] # Select the 1st topic
model.get_topic(a_topic) # Show the words and their c-TF-IDF scores

[('good', 0.07578230098482884),
 ('food', 0.0711336409293955),
 ('price', 0.0492201900286622),
 ('reasonable', 0.04188933100049024),
 ('non', 0.03860932835427209),
 ('tasty', 0.037313493211245656),
 ('affordable', 0.03525913695627235),
 ('good food', 0.033511464800392195),
 ('veg', 0.030887462683417667),
 ('best', 0.028939046346786828)]

In [None]:
model.visualize_barchart(top_n_topics=6)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
model.visualize_topics()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
model.visualize_hierarchy(top_n_topics=30)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
model.visualize_distribution(probs, min_probability=0.015)

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:
reviews = df['Reviews']

# Concatenate all reviews into a single text
all_reviews = ' '.join(reviews).lower()

# Tokenize and remove stop words
tokens = word_tokenize(all_reviews)
filtered_words = [word for word in tokens if word not in stopwords.words('english') and word.isalpha()]

# Count and display most common words
word_counts = Counter(filtered_words)
most_common_words = word_counts.most_common(20)  

print(most_common_words)


[('food', 39), ('good', 34), ('hotel', 24), ('worst', 23), ('taste', 22), ('bad', 20), ('chicken', 17), ('service', 15), ('poor', 10), ('price', 10), ('best', 9), ('quality', 9), ('tasty', 8), ('place', 8), ('nice', 8), ('one', 7), ('rice', 7), ('non', 7), ('veg', 7), ('biriyani', 7)]


In [None]:
# Load model and tokenizer

roberta = "cardiffnlp/twitter-roberta-base-sentiment"
model = AutoModelForSequenceClassification.from_pretrained(roberta)
tokenizer = AutoTokenizer.from_pretrained(roberta)
labels = ['Negative', 'Neutral', 'Positive']


In [None]:
import numpy as np

def custom_softmax(x):
    exp_x = np.exp(x - np.max(x))  # Subtracting the maximum value for numerical stability
    return exp_x / exp_x.sum(axis=0)


In [None]:
import numpy as np

def sentiment(text, return_max_label=True):
    # Sentiment analysis
    encoded_text = tokenizer(text, return_tensors='pt')
    output = model(encoded_text['input_ids'], encoded_text['attention_mask'])
    scores = output[0][0].detach().numpy()
    scores = custom_softmax(scores)

    max_index = scores.argmax()
    max_label = labels[max_index]
    max_score = scores[max_index]

    if return_max_label:
        return max_label
    else:
        return max_score


In [None]:
df["Sentiment"] = df["Reviews"].apply(lambda x: sentiment(x, return_max_label=True))
df["Sentiment %"] = df["Reviews"].apply(lambda x: sentiment(x, return_max_label=False))

In [None]:
df

Unnamed: 0,Reviews,Time,Sentiment,Sentiment %
0,Taste of the food was average. Wash room and d...,a month ago,Negative,0.959171
1,Ordered online waste of money taste was so bad...,a week ago,Negative,0.980588
2,Worst experience...not for safety\nin human li...,3 months ago,Negative,0.960175
3,Ordered through online app. Food was average,4 months ago,Negative,0.713923
4,One of the cheap and best hotel in coimbatore....,6 months ago,Positive,0.978462
...,...,...,...,...
135,face_blowing_a_kissface_blowing_a_kiss …,4 years ago,Neutral,0.767300
136,Not Statistfied,3 years ago,Neutral,0.502342
137,taste good,2 years ago,Positive,0.870600
138,Good service,4 years ago,Positive,0.853517


In [None]:
df.to_csv("Review_sen.csv")

In [7]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from scipy.special import softmax

# Load model and tokenizer
roberta = "cardiffnlp/twitter-roberta-base-sentiment"
model = AutoModelForSequenceClassification.from_pretrained(roberta)
tokenizer = AutoTokenizer.from_pretrained(roberta)
labels = ['Negative', 'Neutral', 'Positive']

def sentiment(text, return_max_label=True):
    # Sentiment analysis
    encoded_text = tokenizer(text, return_tensors='pt')
    output = model(encoded_text['input_ids'], encoded_text['attention_mask'])
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)

    max_index = scores.argmax()
    max_label = labels[max_index]
    max_score = scores[max_index]

    if return_max_label:
        return max_label
    else:
        return max_score
    
df["Sentiment"] = df["Reviews"].apply(lambda x: sentiment(x, return_max_label=True))
df["Sentiment %"] = df["Reviews"].apply(lambda x: sentiment(x, return_max_label=False)).round(3)

In [8]:
df

Unnamed: 0,Reviews,Timeframe,Sentiment,Sentiment %
0,Taste of the food was average. Wash room and d...,],Negative,0.959
1,Ordered online waste of money taste was so bad...,a week ago,Negative,0.981
2,Worst experience...not for safety\nin human li...,3 months ago,Negative,0.960
3,Ordered through online app. Food was average,4 months ago,Negative,0.714
4,One of the cheap and best hotel in coimbatore....,6 months ago,Positive,0.978
...,...,...,...,...
135,��😘 …,4 years ago,Neutral,0.782
136,Not Statistfied,3 years ago,Neutral,0.502
137,taste good,2 years ago,Positive,0.871
138,Good service,4 years ago,Positive,0.854


In [5]:
df

Unnamed: 0,Reviews,Timeframe,Sentiment,Sentiment %
0,Taste of the food was average. Wash room and d...,],Negative,0.959
1,Ordered online waste of money taste was so bad...,a week ago,Negative,0.981
2,Worst experience...not for safety\nin human li...,3 months ago,Negative,0.960
3,Ordered through online app. Food was average,4 months ago,Negative,0.714
4,One of the cheap and best hotel in coimbatore....,6 months ago,Positive,0.978
...,...,...,...,...
135,��😘 …,4 years ago,Neutral,0.782
136,Not Statistfied,3 years ago,Neutral,0.502
137,taste good,2 years ago,Positive,0.871
138,Good service,4 years ago,Positive,0.854


In [12]:
hi= df["Sentiment"].value_counts()

In [13]:
hi.index

Index(['Positive', 'Negative', 'Neutral'], dtype='object')

In [14]:
hi.values

array([59, 57, 24], dtype=int64)

In [2]:
generation_config = {
  "candidate_count": 1,
  "max_output_tokens": 256,
  "temperature": 1.0,
  "top_p": 0.7,
}

safety_settings=[
  {
    "category": "HARM_CATEGORY_DANGEROUS",
    "threshold": "BLOCK_NONE",
  },
  {
    "category": "HARM_CATEGORY_HARASSMENT",
    "threshold": "BLOCK_NONE",
  },
  {
    "category": "HARM_CATEGORY_HATE_SPEECH",
    "threshold": "BLOCK_NONE",
  },
  {
    "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
    "threshold": "BLOCK_NONE",
  },
  {
    "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
    "threshold": "BLOCK_NONE",
  },
]

In [14]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_google_genai import GoogleGenerativeAI
import google.generativeai as genai


# llm = ChatGoogleGenerativeAI(model="models/gemini-pro", google_api_key="AIzaSyB-DjvJ98T5MW0l79IgYmCT1xZODu_2ATk",
#                  
#                temperature=1,convert_system_message_to_human=True)
genai.configure(api_key="AIzaSyB-DjvJ98T5MW0l79IgYmCT1xZODu_2ATk")
model = genai.GenerativeModel(
    model_name="gemini-pro",
    generation_config=generation_config,
    safety_settings=safety_settings,
)

In [15]:
jey = df[df['Reviews'].str.lower().str.contains("time")]
concatenated_reviews = ' '.join(jey.Reviews)

In [16]:
concatenated_reviews =concatenated_reviews

In [17]:
concatenated_reviews

"The potatoes were like rubber and you could tell they had been made up ahead of time being kept under a warmer. This place is not worth your time, let alone Vegas. Poor service, the waiter made me feel like I was stupid every time he came to the table. Always a great time at Dos Gringos! Update.....went back for a second time and it was still just as amazing 2 times - Very Bad Customer Service ! Today is the second time I've been to their lunch buffet and it was pretty good. Bland... Not a liking this place for a number of reasons and I don't want to waste time on bad reviewing.. I'll leave it at that... Definitely worth venturing off the strip for the pork belly, will return next time I'm in Vegas. I ordered the Voodoo pasta and it was the first time I'd had really excellent pasta since going gluten free several years ago. I've lived here since 1979 and this was the first (and last) time I've stepped foot into this place. For about 10 minutes, we we're waiting for her salad when we r

In [24]:
summarize=f"""I want you to write a sentence that can summarize meaningfully within 350 words. My first request is I need you to come up with a way to summarize  {concatenated_reviews}."""

In [27]:
hi=model.generate_content(summarize).text
hi

'**Summary:**\n\nReviews of Dos Gringos, a Las Vegas restaurant, vary widely, with some praising the excellent gluten-free pasta and pork belly while others criticize the poor service, bland food, and long wait times. The inconsistency in experiences suggests that the restaurant may have been inconsistent in its quality, leading to both positive and negative feedback.'

In [26]:
hi.text

'Despite some positive experiences, numerous patrons have expressed dissatisfaction with the poor service, bland food, and extended wait times at this establishment. Negative reviews highlight the rubbery potatoes, unattentive staff, and a general lack of value, while positive reviews praise the pork belly and gluten-free pasta. Overall, the consensus seems to be that this restaurant is not worth visiting, especially given the abundance of better options in the area.'