In [2]:
#preprocess task

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re

# DATA
data_path = r'C:\Users\nh013\Desktop\Restaurant Reviews\Restaurant reviews.csv'
df = pd.read_csv(data_path)

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')



# INITIALIZE STOPWORDS, STEMMER, LEMMATIZER
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()



# FUNCTION TO PREPROCESS..
def preprocess_text(text):
    # CHECK IF THE TXT IS A STRING 
    if isinstance(text, str):
        
        # ROVOME URLS
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

        #REMOVE SPECIAL CHARECTER
        text = re.sub(r'[^\w\s]', '', text)

        # CONVERT TEXT TO LOWER CASE
        text = text.lower()

        # TOKENIZE THE TEXT
        words = nltk.word_tokenize(text)

        # REMOVE STOP WORDS, PERFORM STEMMING AND LEMMATIZATION
        filtered_words = [stemmer.stem(lemmatizer.lemmatize(word)) for word in words if word not in stop_words]

        return ' '.join(filtered_words)
    else:
        return ''  # RETURN AN EAMTY STRING FOR NON STRING VALUES

# PERFORM PREPROCESSING FOR REVIEW COL...
df['Review'] = df['Review'].apply(preprocess_text)


# FIND MISSING VALUES
missing_values = df.isnull().sum()

print("Missing Values in Each Column:")
print(missing_values)

df.dropna(inplace=True)

# RESET THE INDEX OF THE DATAFRAME  AFTER DROPPING ROWS
df.reset_index(drop=True, inplace=True)

print("DataFrame after removing rows with missing values:")
print(df.head())






[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Missing Values in Each Column:
Restaurant       0
Reviewer        38
Review           0
Rating          38
Metadata        38
Time            38
Pictures         0
7514          9999
dtype: int64
DataFrame after removing rows with missing values:
        Restaurant           Reviewer  \
0  Beyond Flavours  Rusha Chakraborty   

                                              Review Rating  \
0  ambienc good food quit good saturday lunch cos...      5   

                 Metadata             Time  Pictures    7514  
0  1 Review , 2 Followers  5/25/2019 15:54         0  2447.0  


In [3]:
#perform vader.......

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer  

# DATA
data_path = r'C:\Users\nh013\Desktop\Restaurant Reviews\Restaurant reviews.csv'
df = pd.read_csv(data_path)

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')



# INITIALIZE STOPWORDS, STEMMER, LEMMATIZER
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()



# FUNCTION TO PREPROCESS..
def preprocess_text(text):
    # CHECK IF THE TXT IS A STRING 
    if isinstance(text, str):
        
        # ROVOME URLS
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

        #REMOVE SPECIAL CHARECTER
        text = re.sub(r'[^\w\s]', '', text)

        # CONVERT TEXT TO LOWER CASE
        text = text.lower()

        # TOKENIZE THE TEXT
        words = nltk.word_tokenize(text)

        # REMOVE STOP WORDS, PERFORM STEMMING AND LEMMATIZATION
        filtered_words = [stemmer.stem(lemmatizer.lemmatize(word)) for word in words if word not in stop_words]

        return ' '.join(filtered_words)
    else:
        return ''  # RETURN AN EAMTY STRING FOR NON STRING VALUES

# PERFORM PREPROCESSING FOR REVIEW COL...
df['Review'] = df['Review'].apply(preprocess_text)



# FUNCTION TO GET SENTIMENT ANALYSIS 
def get_sentiment_scores(text):
    sentiment = analyzer.polarity_scores(text)
    return sentiment

#APPLY VADER FOR REVIEW COL....
df['Sentiment_Scores'] = df['Review'].apply(get_sentiment_scores)

# EXTRACT COMPUND SENTIMENT SCORE
df['Compound_Score'] = df['Sentiment_Scores'].apply(lambda x: x['compound'])

# DEFINE THRESHOLD FOR CLASSIFY SENTIMENT 
threshold = 0.2
df['Sentiment'] = df['Compound_Score'].apply(lambda x: 'positive' if x > threshold else ('negative' if x < -threshold else 'neutral'))


print(df[['Review', 'Sentiment']])


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                                 Review Sentiment
0     ambienc good food quit good saturday lunch cos...  positive
1     ambienc good pleasant even servic prompt food ...  positive
2     must tri great food great ambienc thnx servic ...  positive
3     soumen da arun great guy behavior sincereti go...  positive
4     food goodw order kodi drumstick basket mutton ...  positive
...                                                 ...       ...
9995  madhumathi mahajan well start nice courteou se...  positive
9996  place never disappoint u food courteou staff s...  positive
9997  bad rate mainli chicken bone found veg food ac...  positive
9998  person love prefer chines food coupl time husb...  positive
9999  check tri delici chines food seen nonveg lunch...  positive

[10000 rows x 2 columns]


In [4]:
#perform deep learning model........

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from keras.callbacks import EarlyStopping

# DATA
data_path = r'C:\Users\nh013\Desktop\Restaurant Reviews\Restaurant reviews.csv'
df = pd.read_csv(data_path)

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')



# INITIALIZE STOPWORDS, STEMMER, LEMMATIZER
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()



# FUNCTION TO PREPROCESS..
def preprocess_text(text):
    # CHECK IF THE TXT IS A STRING 
    if isinstance(text, str):
        
        # ROVOME URLS
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

        #REMOVE SPECIAL CHARECTER
        text = re.sub(r'[^\w\s]', '', text)

        # CONVERT TEXT TO LOWER CASE
        text = text.lower()

        # TOKENIZE THE TEXT
        words = nltk.word_tokenize(text)

        # REMOVE STOP WORDS, PERFORM STEMMING AND LEMMATIZATION
        filtered_words = [stemmer.stem(lemmatizer.lemmatize(word)) for word in words if word not in stop_words]

        return ' '.join(filtered_words)
    else:
        return ''  # RETURN AN EAMTY STRING FOR NON STRING VALUES

# PERFORM PREPROCESSING FOR REVIEW COL...
df['Review'] = df['Review'].apply(preprocess_text)



# FUNCTION TO GET SENTIMENT ANALYSIS 
def get_sentiment_scores(text):
    sentiment = analyzer.polarity_scores(text)
    return sentiment

#APPLY VADER FOR REVIEW COL....
df['Sentiment_Scores'] = df['Review'].apply(get_sentiment_scores)

# EXTRACT COMPUND SENTIMENT SCORE
df['Compound_Score'] = df['Sentiment_Scores'].apply(lambda x: x['compound'])

# DEFINE THRESHOLD FOR CLASSIFY SENTIMENT 
threshold = 0.2
df['Sentiment'] = df['Compound_Score'].apply(lambda x: 'positive' if x > threshold else ('negative' if x < -threshold else 'neutral'))

# SPLIT DATA INTO TRAINING AND TESTING SET 
X_train, X_test, y_train, y_test = train_test_split(df['Review'], df['Sentiment'], test_size=0.2, random_state=42)

# TOKENIZATION AND SEQUANCE PADDING 
max_words = 10000  
tokenizer = Tokenizer(num_words=max_words, oov_token='OOV')
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

max_len = 100  # MAXIMUM LENGTH OF SEQUANCE
X_train_padded = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

# ENCODE THE LABEL
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# DEEP LEARNING MODEL 
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100))
model.add(Dense(3, activation='softmax'))

# COMPILE THE MODEL 
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# TRAIN MODEL 
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(X_train_padded, y_train_encoded, validation_split=0.2, epochs=10, batch_size=64, callbacks=[early_stopping])

# EVALUATE MODLE 
loss, accuracy = model.evaluate(X_test_padded, y_test_encoded)
print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.4271, Test Accuracy: 0.8215
