In [1]:
import pandas as pd

In [2]:

# Read data
df = pd.read_csv('hotel_reviews.csv')

In [3]:

# Display information about the DataFrame
print("DataFrame Info:\n", df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7001 entries, 0 to 7000
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Index              7001 non-null   int64  
 1   Name               7001 non-null   object 
 2   Area               7001 non-null   object 
 3   Review_Date        7001 non-null   object 
 4   Rating_attribute   7001 non-null   object 
 5   Rating(Out of 10)  7001 non-null   float64
 6   Review_Text        6994 non-null   object 
dtypes: float64(1), int64(1), object(5)
memory usage: 383.0+ KB
DataFrame Info:
 None


In [4]:
df.describe()

Unnamed: 0,Index,Rating(Out of 10)
count,7001.0,7001.0
mean,3500.0,7.030981
std,2021.15895,2.882846
min,0.0,1.0
25%,1750.0,6.0
50%,3500.0,8.0
75%,5250.0,9.0
max,7000.0,10.0


In [5]:
# Drop rows with NaN values in Review_Text
df = df.dropna(subset=['Review_Text'])

In [6]:

# Function to clean text
def clean_text(text):
    if isinstance(text, float) and np.isnan(text):
        return ''
    text = text.lower()  # Lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = ''.join([i for i in text if not i.isdigit()])  # Remove numbers
    return text

In [7]:
import string
 # Importing the string module for string manipulation tasks (e.g., removing punctuation)
import numpy as np
 # Importing NumPy for numerical computations and efficient array operations
import nltk
 # Importing NLTK (Natural Language Toolkit) for natural language processing tasks
from nltk.tokenize import word_tokenize
 # Importing word_tokenize for tokenizing text into words
from nltk.corpus import stopwords
 # Importing stopwords for filtering out common words in text analysis


In [8]:
# Download NLTK data files
#NLTK is a powerful library for natural language processing (NLP) tasks in Python.
#It provides easy-to-use interfaces to numerous linguistic data resources and algorithms.
nltk.download('punkt')
#punkt: Provides tokenizers for breaking text into tokens (words and punctuation).

#NLTK relies on various datasets and models (corpora, models, etc.) to perform its functions.
#These downloads ensure that your local NLTK installation has the necessary data files required for specific tasks

nltk.download('stopwords')

#stopwords: Contains lists of common words (e.g., "the", "is", "and") that are typically removed from text data because they do not contribute much to the meaning of the text.
nltk.download('wordnet')

#wordnet: A lexical database of English that is used for tasks such as synonymy, hyponymy, and lemmatization (reducing words to their base or root form).

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [9]:
# Apply text cleaning
df['Cleaned_Review_Text'] = df['Review_Text'].apply(clean_text)

# Tokenization
df['Tokens'] = df['Cleaned_Review_Text'].apply(word_tokenize)

# Stop Words Removal
stop_words = set(stopwords.words('english'))
df['Tokens'] = df['Tokens'].apply(lambda x: [word for word in x])


In [10]:

# Before cleaning
print("Before Text Cleaning:")
print(df['Review_Text'])

# Apply text cleaning
df['Cleaned_Review_Text'] = df['Review_Text'].apply(clean_text)
print("After Text Cleaning:")
print(df['Cleaned_Review_Text'])


Before Text Cleaning:
0       Hotel the pearl is perfect place to stay in De...
1       Location of the hotel is perfect. The hotel is...
2                                  Location, Indian food.
3       The location and the hotel itself is great. Ne...
4       Friendly and smiling staffs.. The reception st...
                              ...                        
6996    The room was good, comfortable and aesthetic \...
6997                                           good hotel
6998    good experience for me about hotel \nvery good...
6999                                            well done
7000                                              Nothing
Name: Review_Text, Length: 6994, dtype: object
After Text Cleaning:
0       hotel the pearl is perfect place to stay in de...
1       location of the hotel is perfect the hotel is ...
2                                    location indian food
3       the location and the hotel itself is great nex...
4       friendly and smiling staffs the 

In [11]:
# Before tokenization
print("\nAfter Text Cleaning (before tokenization):")
print(df['Cleaned_Review_Text'])

# Tokenization
print("After tokenization")
df['Tokens'] = df['Cleaned_Review_Text'].apply(word_tokenize)
print(df['Tokens'])



After Text Cleaning (before tokenization):
0       hotel the pearl is perfect place to stay in de...
1       location of the hotel is perfect the hotel is ...
2                                    location indian food
3       the location and the hotel itself is great nex...
4       friendly and smiling staffs the reception staf...
                              ...                        
6996    the room was good comfortable and aesthetic \n...
6997                                           good hotel
6998    good experience for me about hotel \nvery good...
6999                                            well done
7000                                              nothing
Name: Cleaned_Review_Text, Length: 6994, dtype: object
After tokenization
0       [hotel, the, pearl, is, perfect, place, to, st...
1       [location, of, the, hotel, is, perfect, the, h...
2                                [location, indian, food]
3       [the, location, and, the, hotel, itself, is, g...
4       [fri

In [12]:
# Stop Words Removal
stop_words = set(stopwords.words('english'))
df['Tokens'] = df['Tokens'].apply(lambda x: [word for word in x if word.lower() not in stop_words])


# After stop words removal
print("\nAfter Stop Words Removal:")
print(df['Tokens'])


After Stop Words Removal:
0       [hotel, pearl, perfect, place, stay, delhi, pa...
1       [location, hotel, perfect, hotel, peaceful, ni...
2                                [location, indian, food]
3       [location, hotel, great, next, time, stay, nic...
4       [friendly, smiling, staffs, reception, staff, ...
                              ...                        
6996    [room, good, comfortable, aesthetic, hotel, go...
6997                                        [good, hotel]
6998    [good, experience, hotel, good, staff, well, d...
6999                                         [well, done]
7000                                            [nothing]
Name: Tokens, Length: 6994, dtype: object


In [13]:
# Display tokens before stop words removal
print("Before Stop Words Removal:")
for tokens in df['Tokens'].head():
    print(tokens)

Before Stop Words Removal:
['hotel', 'pearl', 'perfect', 'place', 'stay', 'delhi', 'paharganj', 'whole', 'staff', 'helpful', 'informative', 'rooms', 'clean', 'comfortable', 'hotels', 'location', 'convenient', 'truly', 'wonderful', 'stay', 'recommended']
['location', 'hotel', 'perfect', 'hotel', 'peaceful', 'nice', 'staff', 'kind', 'nice', 'rooms', 'recommend', 'hotel']
['location', 'indian', 'food']
['location', 'hotel', 'great', 'next', 'time', 'stay', 'nice', 'rooms', 'comfortable', 'beds', 'good', 'attitude', 'staff', 'helpful', 'explain', 'everything', 'ask']
['friendly', 'smiling', 'staffs', 'reception', 'staff', 'excellent', 'ready', 'help', 'time', 'location', 'fantastic', 'near', 'attractions', 'room', 'big', 'comfortable']


In [14]:
# Define stop words
stop_words = set(stopwords.words('english'))

# Remove stop words
df['Tokens'] = df['Tokens'].apply(lambda x: [word for word in x if word.lower() not in stop_words])

# Display tokens after stop words removal
print("\nAfter Stop Words Removal:")
for tokens in df['Tokens'].head():
    print(tokens)



After Stop Words Removal:
['hotel', 'pearl', 'perfect', 'place', 'stay', 'delhi', 'paharganj', 'whole', 'staff', 'helpful', 'informative', 'rooms', 'clean', 'comfortable', 'hotels', 'location', 'convenient', 'truly', 'wonderful', 'stay', 'recommended']
['location', 'hotel', 'perfect', 'hotel', 'peaceful', 'nice', 'staff', 'kind', 'nice', 'rooms', 'recommend', 'hotel']
['location', 'indian', 'food']
['location', 'hotel', 'great', 'next', 'time', 'stay', 'nice', 'rooms', 'comfortable', 'beds', 'good', 'attitude', 'staff', 'helpful', 'explain', 'everything', 'ask']
['friendly', 'smiling', 'staffs', 'reception', 'staff', 'excellent', 'ready', 'help', 'time', 'location', 'fantastic', 'near', 'attractions', 'room', 'big', 'comfortable']


In [15]:
from nltk.stem import WordNetLemmatizer

In [16]:
# Lemmatization

lemmatizer = WordNetLemmatizer()
df['Tokens'] = df['Tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

# Display the cleaned data
print(df[['Review_Text', 'Cleaned_Review_Text', 'Tokens']].head())

                                         Review_Text  \
0  Hotel the pearl is perfect place to stay in De...   
1  Location of the hotel is perfect. The hotel is...   
2                             Location, Indian food.   
3  The location and the hotel itself is great. Ne...   
4  Friendly and smiling staffs.. The reception st...   

                                 Cleaned_Review_Text  \
0  hotel the pearl is perfect place to stay in de...   
1  location of the hotel is perfect the hotel is ...   
2                               location indian food   
3  the location and the hotel itself is great nex...   
4  friendly and smiling staffs the reception staf...   

                                              Tokens  
0  [hotel, pearl, perfect, place, stay, delhi, pa...  
1  [location, hotel, perfect, hotel, peaceful, ni...  
2                           [location, indian, food]  
3  [location, hotel, great, next, time, stay, nic...  
4  [friendly, smiling, staff, reception, staff, e..