Satiscan:
A satisfaction analysis applied to the reviews of a hospital

In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import string
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
# Download the NLTK resources used in this notebook (stop-word list, tokenizer
# models, WordNet) if they are not already present locally.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases make word_tokenize load the 'punkt_tab' resource instead
# of 'punkt'; fetching both keeps the tokenization cell runnable on either version.
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Data Cleaning

In [3]:
# Load the raw review export into a DataFrame and list its columns.
# NOTE(review): the path looks like a Colab upload location - confirm it exists
# in the environment where this notebook is re-run.
csv_path = '/content/table (1).csv'
table = pd.read_csv(csv_path)
print(table.columns)

Index(['Author name', 'Reviews amount', 'Rating', 'Date',
       'Author profile image', 'Review content'],
      dtype='object')


In [4]:
# Dataset dimensions as a (rows, columns) tuple; rows are reported as authors.
table_shape = (len(table.index), len(table.columns))
print(f"\nThe dataset comprises {table_shape[0]} authors and {table_shape[1]} features.")


The dataset comprises 388 authors and 6 features.


In [5]:
# Keep only the review text, as an independent one-column frame named 'review'.
reviews = table[['Review content']].rename(columns={'Review content': 'review'})

In [6]:
# Exploring the length (in whitespace-separated words) of each review to get a
# valuable perspective on the data.
# A missing review (NaN) is treated as empty text and therefore has length 0,
# rather than being stringified to 'nan' and counted as a one-word review.
reviews['review_length'] = (
    reviews['review']
    .fillna('')
    .astype(str)
    .str.split()
    .str.len()
)

Short reviews may or may not carry useful information, so we inspect them before deciding what to do with them.

In [7]:
# Flag reviews shorter than five words, keep those rows, and count them.
is_short = reviews['review_length'] < 5
short_reviews = reviews[is_short]
num_short_reviews = int(is_short.sum())


In [8]:
# Report the short reviews only when at least one exists.
# pandas abbreviates the printed Series, so only its head and tail are shown.
if num_short_reviews > 0:
    print(
        f"\nNumber of reviews with less than 5 words: {num_short_reviews}",
        "\nSample of short reviews:",
        short_reviews['review'],
        sep="\n",
    )



Number of reviews with less than 5 words: 216

Sample of short reviews:
67                Good Healthcare service
77            Clean and great environment
86                              Very Good
92     Excellent medical service delivery
113       Great hospital and institution.
                      ...                
383                                   NaN
384                                   NaN
385                                   NaN
386                                   NaN
387                                   NaN
Name: review, Length: 216, dtype: object


In [9]:
# Count the reviews whose text is missing (NaN).
missing_mask = reviews['review'].isna()
nan_reviews = missing_mask.sum()
print(f"\nNumber of empty reviews: {nan_reviews}")


Number of empty reviews: 148


Checking longer reviews, which may be adverts or spam messages

In [10]:
# Treat reviews longer than the 75th percentile of word counts as "long"
# and preview the first few of them.
percentile_75 = reviews['review_length'].quantile(0.75)
exceeds_p75 = reviews['review_length'].gt(percentile_75)
long_reviews = reviews.loc[exceeds_p75]
print(long_reviews.loc[:, ['review', 'review_length']].head())

                                              review  review_length
0  Quite an expanse of facilities, tho mostly pri...             29
1  Jollof rice and chicken salad we have it for o...             27
2  Lagos University Teaching Hospital is viable a...             43
3  A very big hospital tipical of teaching hospit...             36
4  Lagos University Teaching Hospital has a majes...             33


The second row (index 1) is an advert rather than a genuine review.

In [11]:
# Dropping rows with no review text, then removing the helper length column.
# - subset=['review'] limits the NaN check to the review text itself.
# - errors='ignore' lets this cell be re-run without a KeyError once
#   'review_length' has already been removed.
# The original row labels are kept (no index reset) because a later cell
# looks a review up by its label.
reviews = (
    reviews
    .dropna(subset=['review'])
    .drop(columns=['review_length'], errors='ignore')
)

# Displaying the shape of the processed reviews
processed_reviews_shape = reviews.shape
print(f"\nLeft with {processed_reviews_shape[0]} reviews")


Left with 240 reviews


Text Preprocessing

In [12]:
# Remove punctuation
def remove_punctuation(text):
    """Return `text` with every character listed in `string.punctuation` deleted.

    Characters are dropped without inserting a space, so words joined by
    punctuation are merged (e.g. "well-trained" becomes "welltrained").
    """
    return ''.join(char for char in text if char not in string.punctuation)

# Strip punctuation from every review, overwriting the 'review' column.
reviews['review'] = reviews['review'].map(remove_punctuation)


In [13]:
# Shared resources for lemmatizing and tokenizing the reviews, built once and
# reused for every review: the English stop-word set and a WordNet lemmatizer.
stop_words = {*nltk.corpus.stopwords.words('english')}
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Tokenize `text`, drop stop words, lemmatize, and re-join with spaces.

    Each token from `word_tokenize` is lower-cased. Tokens found in the
    module-level `stop_words` set are skipped, and the remaining ones are
    passed through the module-level `lemmatizer`. Returns one space-separated
    string.
    """
    kept_lemmas = []
    for token in word_tokenize(text):
        lowered = token.lower()
        if lowered in stop_words:
            continue
        kept_lemmas.append(lemmatizer.lemmatize(lowered))
    return ' '.join(kept_lemmas)

# Store the cleaned text next to the punctuation-free review it came from.
reviews['processed_reviews'] = reviews['review'].map(preprocess_text)


In [14]:
# Show the first rows of the reviews beside their processed versions.
comparison_preview = reviews.head()[['review', 'processed_reviews']]
print("\nProcessed Reviews:")
print(comparison_preview)


Processed Reviews:
                                              review  \
0  Quite an expanse of facilities tho mostly priv...   
1  Jollof rice and chicken salad we have it for o...   
2  Lagos University Teaching Hospital is viable a...   
3  A very big hospital tipical of teaching hospit...   
4  Lagos University Teaching Hospital has a majes...   

                                   processed_reviews  
0  quite expanse facility tho mostly privatised t...  
1  jollof rice chicken salad 10000 people best fo...  
2  lagos university teaching hospital viable prod...  
3  big hospital tipical teaching hospital course ...  
4  lagos university teaching hospital majestic en...  


In [15]:
# Print one processed review, selected by its row label (50) rather than its position.
print("\nExample of pre-processed review:")
print(reviews.loc[50, 'processed_reviews'])


Example of pre-processed review:
flagship teaching hospital lagos nigeria always open 24 hour qualified doctor nurse also medical student ground practicing get degree medical issue lagos university teaching hospital popularly known luth place go


Topic Modelling

Due to the small size of the current data, we will check for just two topics

In [16]:
# Build the document-term matrix from the processed reviews.
# max_df=0.4 ignores terms found in more than 40% of the reviews, min_df=2
# ignores terms found in fewer than two reviews, and scikit-learn's built-in
# English stop-word list is applied on top of the NLTK filtering done earlier.
cv = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.4,
)
processed_corpus = reviews['processed_reviews']
dtm = cv.fit_transform(processed_corpus)

In [17]:
# Size of the document-term matrix as (number of reviews, vocabulary size).
n_documents, n_terms = dtm.shape
dtm_shape = (n_documents, n_terms)
print(f"\nShape of Document-Term Matrix: {dtm_shape}")


Shape of Document-Term Matrix: (240, 236)


In [18]:
# Fit a two-topic Latent Dirichlet Allocation (LDA) model on the document-term
# matrix. random_state is fixed so repeated runs produce the same topics.
# topic_results holds the transformed output for each review.
lda_settings = dict(n_components=2, random_state=42)
LDA = LatentDirichletAllocation(**lda_settings)
topic_results = LDA.fit_transform(dtm)


In [19]:
# Shape of the fitted LDA component matrix.
lda_components_shape = np.shape(LDA.components_)
print(f"Shape of LDA Components: {lda_components_shape}")

Shape of LDA Components: (2, 236)


In [20]:
# Exploring the top 10 words for each topic gives us a glimpse into the essence of these topics.
print("\nTop 10 Words for Each Topic:")
# Look the vocabulary up once instead of once per printed word.
feature_names = cv.get_feature_names_out()
for topic_idx, topic_weights in enumerate(LDA.components_):
    print(f'TOP 10 WORDS FOR TOPIC #{topic_idx}')
    # argsort is ascending, so the last ten indices are the ten largest weights;
    # the words are therefore listed from weakest to strongest.
    top_word_indices = topic_weights.argsort()[-10:]
    print([feature_names[word_idx] for word_idx in top_word_indices])
    print('\n\n')



Top 10 Words for Each Topic:
TOP 10 WORDS FOR TOPIC #0
['place', 'best', 'good', 'facility', 'nigeria', 'medical', 'doctor', 'teaching', 'lagos', 'hospital']



TOP 10 WORDS FOR TOPIC #1
['healthcare', 'care', 'staff', 'patient', 'time', 'health', 'nice', 'service', 'need', 'good']





The first topic (Topic #0) points towards the organisational structure, with words like 'hospital', 'place', 'facility' and 'doctor'.
The second topic (Topic #1) points towards the service, with words like 'care', 'time', 'health', 'need' and 'service'.