In [1]:
#Preprocess Step...........


import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

# LET'S DOWNLOAD NLTK RESOURCES
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# DATAPATH
data_path = r"C:\Users\nh013\Desktop\MC''donald\McDonald_s_Reviews.csv"
df = pd.read_csv(data_path, encoding='latin1')



# PREPROCESSING FUNCTION
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # TOKENIZE TEXT
    tokens = word_tokenize(text)
    
    #REMOVE PUNCTUATION AND CONVERT TO LOWER CASE
    tokens = [token.lower() for token in tokens if token.isalpha()]
    
    #REMOVE STOP WORDS
    tokens = [token for token in tokens if token not in stop_words]
    
    # LEMMATIZE TOKEN
    tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]
    
    # JOIN TOKEN BACK INTO A SINGLE STRING
    processed_text = ' '.join(tokens)
    
    return processed_text

def get_wordnet_pos(token):
    tag = nltk.pos_tag([token])[0][1][0].upper()
    tag_dict = {
        'J': wordnet.ADJ,
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'R': wordnet.ADV
    }
    return tag_dict.get(tag, wordnet.NOUN)

# PERFROM PREPROCESSING TO THE  'review' COLUMN
df['review'] = df['review'].apply(preprocess_text)


print(df.head())


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


   reviewer_id  store_name              category  \
0            1  McDonald's  Fast food restaurant   
1            2  McDonald's  Fast food restaurant   
2            3  McDonald's  Fast food restaurant   
3            4  McDonald's  Fast food restaurant   
4            5  McDonald's  Fast food restaurant   

                                       store_address  latitude   longitude  \
0  13749 US-183 Hwy, Austin, TX 78750, United States  30.460718 -97.792874   
1  13749 US-183 Hwy, Austin, TX 78750, United States  30.460718 -97.792874   
2  13749 US-183 Hwy, Austin, TX 78750, United States  30.460718 -97.792874   
3  13749 US-183 Hwy, Austin, TX 78750, United States  30.460718 -97.792874   
4  13749 US-183 Hwy, Austin, TX 78750, United States  30.460718 -97.792874   

  rating_count   review_time  \
0        1,240  3 months ago   
1        1,240    5 days ago   
2        1,240    5 days ago   
3        1,240   a month ago   
4        1,240  2 months ago   

                         

In [1]:
#To find positive and negative sentiment using the TextBlob library


import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from textblob import TextBlob

# LET'S DOWNLOAD NLTK RESOURCES
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

#DATAPATH
data_path = r"C:\Users\nh013\Desktop\MC''donald\McDonald_s_Reviews.csv"
df = pd.read_csv(data_path, encoding='latin1')

# FUNCTION FOR PREPROCESS
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [token.lower() for token in tokens if token.isalpha()]
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]
    processed_text = ' '.join(tokens)
    return processed_text

def get_wordnet_pos(token):
    tag = nltk.pos_tag([token])[0][1][0].upper()
    tag_dict = {
        'J': wordnet.ADJ,
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'R': wordnet.ADV
    }
    return tag_dict.get(tag, wordnet.NOUN)



#PREPROESSING TO THE REVIEW COLUMN
df['review'] = df['review'].apply(preprocess_text)



# SNTIMENT ANALYSIS USING TEXT-BLOB
def calculate_sentiment(text):
    blob = TextBlob(text)
    sentiment = blob.sentiment.polarity
    return sentiment

df['sentiment'] = df['review'].apply(calculate_sentiment)

# CLASSIFY SENTIMENT AS POSSITIVE OR NEGATIVE
df['sentiment_category'] = df['sentiment'].apply(lambda x: 'Positive' if x > 0 else 'Negative')


print(df.head())


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


   reviewer_id  store_name              category  \
0            1  McDonald's  Fast food restaurant   
1            2  McDonald's  Fast food restaurant   
2            3  McDonald's  Fast food restaurant   
3            4  McDonald's  Fast food restaurant   
4            5  McDonald's  Fast food restaurant   

                                       store_address  latitude   longitude  \
0  13749 US-183 Hwy, Austin, TX 78750, United States  30.460718 -97.792874   
1  13749 US-183 Hwy, Austin, TX 78750, United States  30.460718 -97.792874   
2  13749 US-183 Hwy, Austin, TX 78750, United States  30.460718 -97.792874   
3  13749 US-183 Hwy, Austin, TX 78750, United States  30.460718 -97.792874   
4  13749 US-183 Hwy, Austin, TX 78750, United States  30.460718 -97.792874   

  rating_count   review_time  \
0        1,240  3 months ago   
1        1,240    5 days ago   
2        1,240    5 days ago   
3        1,240   a month ago   
4        1,240  2 months ago   

                         

In [2]:

#Sentiment analysis using vader.

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.sentiment import SentimentIntensityAnalyzer

#  LET'S DOWNLOAD NLTK RESOURCES
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')



# DATASET
data_path = r"C:\Users\nh013\Desktop\MC''donald\McDonald_s_Reviews.csv"
df = pd.read_csv(data_path, encoding='latin1')


# FUNCTION FOR PREPROCESSING
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()



def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [token.lower() for token in tokens if token.isalpha()]
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]
    processed_text = ' '.join(tokens)
    return processed_text



def get_wordnet_pos(token):
    tag = nltk.pos_tag([token])[0][1][0].upper()
    tag_dict = {
        'J': wordnet.ADJ,
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'R': wordnet.ADV
    }
    return tag_dict.get(tag, wordnet.NOUN)



#PREPROCESSING TO THE REVIEW COLUMN
df['review'] = df['review'].apply(preprocess_text)


#SENTIMENT ANALYSIS USING VADER
def calculate_sentiment(text):
    sid = SentimentIntensityAnalyzer()
    sentiment = sid.polarity_scores(text)['compound']
    return sentiment

df['sentiment'] = df['review'].apply(calculate_sentiment)

# CLASSIFY SENTIMENT AS POSSITVE OR NEGATIVE
df['sentiment_category'] = df['sentiment'].apply(lambda x: 'Positive' if x > 0 else 'Negative')


print(df.head())


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


   reviewer_id  store_name              category  \
0            1  McDonald's  Fast food restaurant   
1            2  McDonald's  Fast food restaurant   
2            3  McDonald's  Fast food restaurant   
3            4  McDonald's  Fast food restaurant   
4            5  McDonald's  Fast food restaurant   

                                       store_address  latitude   longitude  \
0  13749 US-183 Hwy, Austin, TX 78750, United States  30.460718 -97.792874   
1  13749 US-183 Hwy, Austin, TX 78750, United States  30.460718 -97.792874   
2  13749 US-183 Hwy, Austin, TX 78750, United States  30.460718 -97.792874   
3  13749 US-183 Hwy, Austin, TX 78750, United States  30.460718 -97.792874   
4  13749 US-183 Hwy, Austin, TX 78750, United States  30.460718 -97.792874   

  rating_count   review_time  \
0        1,240  3 months ago   
1        1,240    5 days ago   
2        1,240    5 days ago   
3        1,240   a month ago   
4        1,240  2 months ago   

                         

In [2]:
# TRAIN  RNN'S MODEL TO Explore geographical patterns in reviews and ratings to identify high-performing or 
#underperforming regions.


#Location-based analysis

import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
import re
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle



# LET'S DOWNLOAD NLTK RESOURCES
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')



# DATASET
data_path = r"C:\Users\nh013\Desktop\MC''donald\McDonald_s_Reviews.csv"
df = pd.read_csv(data_path, encoding='latin1')



# FUNCTION FOR PREPROCESSING
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [token.lower() for token in tokens if token.isalpha()]
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]
    processed_text = ' '.join(tokens)
    return processed_text

def get_wordnet_pos(token):
    tag = nltk.pos_tag([token])[0][1][0].upper()
    tag_dict = {
        'J': wordnet.ADJ,
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'R': wordnet.ADV
    }
    return tag_dict.get(tag, wordnet.NOUN)



# PREPROCESS TO THE REVIEW COLUMN
df['review'] = df['review'].apply(preprocess_text)


# CLEAN THE RATING COLUMN
df['rating'] = df['rating'].apply(lambda x: re.sub(r'[^0-9.]', '', x))
df['rating'] = df['rating'].astype(float)


# SPLIT DATA INTO TRAINING AND TESTING SET
X = df['review']
y = df['rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



# TOKENIZE THE TEXT DATA AND CONVERT IT TO SEQUENCE
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)



# PAD THE SEQUENCE TO FIX LENGTH
max_sequence_length = 100
X_train_padded = tf.keras.preprocessing.sequence.pad_sequences(X_train_sequences, maxlen=max_sequence_length)
X_test_padded = tf.keras.preprocessing.sequence.pad_sequences(X_test_sequences, maxlen=max_sequence_length)

# TRAIN MODEL
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_sequence_length))
model.add(LSTM(128))
model.add(Dense(1, activation='linear'))


# COMPILE AND TRAIN THE MODEL
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(X_train_padded, y_train, validation_data=(X_test_padded, y_test), epochs=10, batch_size=32)



# PREDICT THE RATING USING THE TRAIN MOEL
predictions = model.predict(X_test_padded)


# FLATTEN THE PREDICTION ARRAY
predictions = predictions.flatten()


#ADD THE PREDICTION TO THE DATAFRAME
df.loc[X_test.index, 'predicted_rating'] = predictions



# EXPLORE GEOGRAPHICAL-PATTERN  BASED ON THE PREDICTED RATINGS
location_df = df[['store_address', 'latitude', 'longitude', 'predicted_rating']].copy()

# GROUP THE DATAFAME BY STORE ADDRESS AND CALCULATE AVERAGE PREDICTED RATINGS
average_predicted_rating_by_location = location_df.groupby('store_address')['predicted_rating'].mean().reset_index()


#SORT THE LOCATION BY AVERAGE PREDICTING RATING IN DESENDING ORDER
sorted_locations = average_predicted_rating_by_location.sort_values('predicted_rating', ascending=False)

#PRINT THE TOP 10 HIGH PERFORMGING REGION
print("Top 10 High-Performing Regions:")
print(sorted_locations.head(10))

# PRINT THE BOTTOM 10 UNDERPERFORMIMG REGION
print("\nBottom 10 Underperforming Regions:")
print(sorted_locations.tail(10))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nh013\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Top 10 High-Performing Regions:
                                        store_address  predicted_rating
28  5920 Balboa Ave, San Diego, CA 92111, United S...          3.829932
34  702-2 Haddonfield-Berlin Rd, Voorhees Township...          3.641321
26  555 13th St NW, Washington, DC 20004, United S...          3.638682
31  6875 Sand Lake Rd, Orlando, FL 32819, United S...          3.628165
22     429 7th Ave, New York, NY 10001, United States          3.597639
14  1698 US-209, Brodheadsville, PA 18322, United ...          3.512563
3   1100 N US Hwy 377, Roanoke, TX 76262, United S...          3.510272
4   111 Madison St, Oak Park, IL 60302, United States          3.468577
33  7010 Bradlick Shopping Center, Annandale, VA 2...          3.445307
9   1415 E State Rd, Fern Park, FL 32730, United S...          3.397718

Bottom 10 Underperforming Regions:
                             