In [1]:
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [2]:
!pip install Unidecode



In [3]:
# Read the data from CSV files
temp = pd.read_csv('../For_preprocessing.csv')
temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4768 entries, 0 to 4767
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Review_Text  4763 non-null   object
 1   Sentiment    4766 non-null   object
dtypes: object(2)
memory usage: 74.6+ KB


In [4]:
temp=temp.dropna(subset=['Review_Text'])
temp=temp.dropna(subset=['Sentiment'])
temp = temp.reset_index(drop=True)

temp_val = temp['Sentiment'].tolist()
#len(temp_val)
ind_list = []

for i in range(len(temp['Sentiment'])):
    if temp_val[i] != '1' and temp_val[i] != '2' and temp_val[i] != '3':
        ind_list.append(i)
        
temp.reset_index(drop=True)
temp.drop(temp.index[ind_list], inplace=True)

In [5]:
temp = temp.reset_index(drop=True)
temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4699 entries, 0 to 4698
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Review_Text  4699 non-null   object
 1   Sentiment    4699 non-null   object
dtypes: object(2)
memory usage: 73.5+ KB


In [6]:
st = temp['Sentiment'].tolist()
st=set(st)
st

{'1', '2', '3'}

In [7]:
temp_val = temp['Sentiment'].tolist()
#len(temp_val)
ind_list = []

for i in range(len(temp['Sentiment'])):
    if temp_val[i] != '1' and temp_val[i] != '2' and temp_val[i] != '3':
        ind_list.append(i)
temp.reset_index(drop=True)
temp.drop(temp.index[ind_list], inplace=True)

In [8]:
scale_mapper3 = {"1":0, "2":1, "3":2}
temp['Sentiment'] = temp['Sentiment'].replace(scale_mapper3)

In [9]:
temp = temp.reset_index(drop=True)
temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4699 entries, 0 to 4698
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Review_Text  4699 non-null   object
 1   Sentiment    4699 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 73.5+ KB


In [10]:
temp

Unnamed: 0,Review_Text,Sentiment
0,Its really nice place to stay especially for b...,2
1,It seems that hotel does not check the basic a...,0
2,Worst hotel I have ever encountered. I will ne...,0
3,Had a good time in this hotel and the staff Ku...,2
4,good hotel and staff Veg food good non veg bre...,2
...,...,...
4694,My fifth stay at the hotel for business. Rooms...,2
4695,enjoyable,2
4696,Most impressive service by staff in all areas....,2
4697,"The linens were smelling bad, and the elevator...",0


In [11]:
def tokenize_reg():
    temp['Review_Text'] = temp['Review_Text'].replace(r'http\S+','',regex=True).replace(r'www\S+','',regex=True).replace(r'\d+','',regex=True)
    tokens = RegexpTokenizer(r'\w+')
    temp['Review_Text']=temp['Review_Text'].apply(lambda x:tokens.tokenize(x.lower()))
    stop_words = set(stopwords.words('english')) 
    stop_words = stop_words.union(",","(",")","[","]","{","}","#","@","!",":",";",".","?")
    temp['Review_Text'] = temp['Review_Text'].apply(lambda x: [item for item in x if item not in stop_words])
tokenize_reg()

In [12]:
lem = WordNetLemmatizer()
def lemm(text):
    sent=[]
    for word in text:
        sent.append(lem.lemmatize(word))
    return sent

ps=PorterStemmer()

def stemm(text):        
    sent = []
    for word in text:
        sent.append(ps.stem(word))
    return sent

temp['Review_Text'] =  temp.apply(lambda x: stemm(lemm(x['Review_Text'])), axis=1)

In [13]:
revs=temp['Review_Text'].tolist()
for i in range(len(revs)):
    revs[i]=' '.join(revs[i])
temp['Review_Text'] = revs
temp

Unnamed: 0,Review_Text,Sentiment
0,realli nice place stay especi busi tourist purpos,2
1,seem hotel check basic amen room hand room tra...,0
2,worst hotel ever encount never think stay thii...,0
3,good time hotel staff kumar aishwarya hous kee...,2
4,good hotel staff veg food good non veg breakfa...,2
...,...,...
4694,fifth stay hotel busi room great restaur excel...,2
4695,enjoy,2
4696,impress servic staff area good restaur fit cen...,2
4697,linen smell bad elev pungent odour housekeep p...,0


In [14]:
pd.to_numeric(temp['Sentiment'])
st = temp['Sentiment'].tolist()
st=set(st)

In [15]:
temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4699 entries, 0 to 4698
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Review_Text  4699 non-null   object
 1   Sentiment    4699 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 73.5+ KB


In [16]:
documents = temp['Review_Text'].values.astype("U")

tf_vectorizer = TfidfVectorizer(stop_words='english')
tf_features = tf_vectorizer.fit_transform(documents)

In [17]:
k = 3
tf_model = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
tf_model.fit(tf_features)

KMeans(max_iter=100, n_clusters=3, n_init=1)

In [18]:
temp['tf_cluster'] = tf_model.labels_
temp

Unnamed: 0,Review_Text,Sentiment,tf_cluster
0,realli nice place stay especi busi tourist purpos,2,0
1,seem hotel check basic amen room hand room tra...,0,2
2,worst hotel ever encount never think stay thii...,0,2
3,good time hotel staff kumar aishwarya hous kee...,2,0
4,good hotel staff veg food good non veg breakfa...,2,1
...,...,...,...
4694,fifth stay hotel busi room great restaur excel...,2,0
4695,enjoy,2,0
4696,impress servic staff area good restaur fit cen...,2,2
4697,linen smell bad elev pungent odour housekeep p...,0,2


In [19]:
clusters = temp.groupby('tf_cluster') 

In [20]:
print("Cluster centroids: \n")
order_centroids = tf_model.cluster_centers_.argsort()[:, ::-1]
terms = tf_vectorizer.get_feature_names()

for i in range(k):
    print("Cluster %d:" % i)
    for j in order_centroids[i, :10]: #print out 10 feature terms of each cluster
        print (' %s' % terms[j])
    print('------------')

Cluster centroids: 

Cluster 0:
 stay
 hotel
 great
 staff
 nice
 excel
 help
 place
 food
 good
------------
Cluster 1:
 good
 hotel
 room
 servic
 locat
 food
 stay
 nice
 staff
 overal
------------
Cluster 2:
 room
 hotel
 good
 servic
 breakfast
 locat
 food
 need
 stay
 clean
------------


In [21]:
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score

print("Detailed classification report:")
print()
print()
print(classification_report(temp['Sentiment'], temp['tf_cluster'], digits=4))
print()
print ("Confusion Matrix")
print(confusion_matrix(temp['Sentiment'], temp['tf_cluster']))
print("Accuracy Score")
print(accuracy_score(temp['Sentiment'], temp['tf_cluster']))

Detailed classification report:


              precision    recall  f1-score   support

           0     0.0117    0.0330    0.0173       485
           1     0.1193    0.1499    0.1329       827
           2     0.5375    0.3637    0.4339      3387

    accuracy                         0.2920      4699
   macro avg     0.2229    0.1822    0.1947      4699
weighted avg     0.4097    0.2920    0.3379      4699


Confusion Matrix
[[  16   28  441]
 [  84  124  619]
 [1268  887 1232]]
Accuracy Score
0.2919770163864652
