#### Research Problem
This study seeks to understand the factors that influence service satisfaction in healthcare facilities and how patient feedback can be leveraged to improve service quality.

#### Research Questions
1. What themes are prevalent in the positive feedback provided by the patients?
1. What common issues are mentioned in the suggestions for improvement?
1. How does the sentiment of the feedback correlate with the reported service satisfaction?
1. Can we predict the level of service satisfaction based on the feedback provided?

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import gensim
from gensim import corpora
from gensim.models import LdaModel

# Fetch the NLTK data this notebook relies on: tokenizer models for
# word_tokenize, the English stop-word list, and WordNet for the lemmatizer.
# 'punkt_tab' is the tokenizer resource that word_tokenize looks up on
# NLTK 3.9 and later; 'punkt' is kept for older installations.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

In [None]:
# Load the open-ended feedback CSV (path is relative to the working directory).
csv_path = 'data/clm_open_ended.csv'
df = pd.read_csv(csv_path)

In [None]:
# Lazily-built NLTK resources shared by every preprocess_text call.
_PREPROCESS_CACHE = {}


def _get_preprocess_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text):
    """Normalize one free-text response into a space-joined string of lemmas.

    The text is tokenized, lowercased, stripped of English stop words and of
    any token that is not purely alphabetic, and the remaining tokens are
    lemmatized with WordNet.

    Args:
        text: The raw response. ``None`` and float NaN are treated as a
            missing response; any other non-string value is converted with
            ``str()``.

    Returns:
        The cleaned tokens joined by single spaces, or ``''`` when the input
        is missing, blank, or contains no usable tokens.
    """
    # A float NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)
    if not text.strip():
        return ''

    # The stop-word list and lemmatizer are built once and reused, because
    # this function is applied to every cell of every text column.
    stop_words, lemmatizer = _get_preprocess_resources()

    lowered = (token.lower() for token in word_tokenize(text))
    kept = [token for token in lowered
            if token.isalpha() and token not in stop_words]
    return ' '.join(lemmatizer.lemmatize(token) for token in kept)

# Apply the preprocessing function to each open-ended text column.
text_columns = ['ServicesLiked', 'ServicesDisliked', 'ImprovementSuggestions', 'AccessImprovementSuggestions',
                'PositiveObservations', 'GeneralImprovementSuggestions', 'AdditionalComments', 'TopFacilityFeatures']
for column in text_columns:
    # Replace missing answers with '' BEFORE casting to str: astype(str) alone
    # turns NaN into the literal word 'nan', which passes the alphabetic and
    # stop-word filters and would be counted as a real token downstream.
    df[column] = df[column].fillna('').astype(str).apply(preprocess_text)

In [None]:
# Join every 'ServicesLiked' response into one string and build a word cloud
# from it on a white background.
liked_responses = df['ServicesLiked'].dropna()
text = ' '.join(liked_responses)
wordcloud = WordCloud(background_color='white')
wordcloud.generate(text)

# Render the cloud on a 10x5 inch figure with the axes hidden.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Choose the documents to model. A 'text_data' column is used when the
# DataFrame has one; otherwise each row's open-ended text columns are joined
# into a single document, since nothing in this notebook creates 'text_data'.
if 'text_data' in df.columns:
    documents = df['text_data']
else:
    documents = df[text_columns].fillna('').astype(str).apply(' '.join, axis=1)
# Treat missing responses as empty documents; word_tokenize cannot take NaN.
documents = documents.fillna('').astype(str)

tokenized_docs = [word_tokenize(doc) for doc in documents]

# Create a dictionary representation of the documents
dictionary = corpora.Dictionary(tokenized_docs)

# Filter out extremes to limit the number of features: drop tokens seen in
# fewer than 15 documents or in more than half of them, keep at most 100000.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# Convert dictionary to a bag of words corpus
corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

# Set parameters
num_topics = 5  # Adjust the number of topics
passes = 15     # Adjust the number of passes
iterations = 400  # Adjust the number of iterations

# Create an LDA model. A fixed random_state makes the topics reproducible
# between runs (same seed as the train/test split used in this notebook).
lda_model = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=passes,
                     iterations=iterations, random_state=42)

# Print the topics
topics = lda_model.print_topics(num_words=4)  # Adjust the number of words to represent each topic
for topic in topics:
    print(topic)

In [None]:
# Binary label: 1 when the response contains the phrase 'friendly staff'.
# NOTE(review): the label is derived from the same 'ServicesLiked' text that
# is vectorized as the features below - confirm this is the intended target.
df['FriendlyStaff'] = [int('friendly staff' in response) for response in df['ServicesLiked']]

# Represent each response by TF-IDF weights over at most 1000 terms.
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df['ServicesLiked'])
y = df['FriendlyStaff']

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a multinomial Naive Bayes classifier on the training split.
model = MultinomialNB()
model.fit(X_train, y_train)

# Score the held-out rows and print per-class precision/recall/F1.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)