In [46]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertModel
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import regex as re
import torch

In [47]:
data = pd.read_csv('data/clm_open_ended.csv')
data.head()

Unnamed: 0,Date of feedback collection,County,Positive feedback on services,Negative feedback on services,Suggestions for improving access to services,Positive aspects of facility to emulate,General suggestions for improvement,Top 1-3 Positive aspects of care and treatment,Top 1-3 Negative aspects of care and treatment
0,2024-04-17 20:55:46,Narok,Good,,To used hospital,Good services,Time,Awareness,
1,2024-04-17 19:49:25,Kisumu,Quick service delivery,,Additional of medics to speed service delivery,Medics talk well to clients,,Quick service delivery,
2,2024-04-17 19:44:28,Kisumu,Got drugs and happy,,,Good treatment services,,Got drugs and happy.,
3,2024-04-17 19:37:41,Kisumu,Quick service delivery,,,Good medication services,Infrastructure development and add machines to...,Quick service delivery,
4,2024-04-17 19:31:12,Kisumu,Good reception from medics,,,Medics are patient and have good listening ear...,Quick service delivery,Good reception from medics,


In [48]:
data.shape

(60749, 9)

In [49]:
data.drop(['Date of feedback collection', 'County'], axis=1, inplace=True)

In [50]:
df = data.dropna()
print('Original dataset shape: ', data.shape)
print('Cleaned dataset shape: ', df.shape)

Original dataset shape:  (60749, 7)
Cleaned dataset shape:  (21123, 7)


In [51]:
df['combined_feedback'] = df[['Positive feedback on services', 'Negative feedback on services', 'Suggestions for improving access to services', 'Positive aspects of facility to emulate', 'General suggestions for improvement']].agg('. '.join, axis=1)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['combined_feedback'] = df[['Positive feedback on services', 'Negative feedback on services', 'Suggestions for improving access to services', 'Positive aspects of facility to emulate', 'General suggestions for improvement']].agg('. '.join, axis=1)


Unnamed: 0,Positive feedback on services,Negative feedback on services,Suggestions for improving access to services,Positive aspects of facility to emulate,General suggestions for improvement,Top 1-3 Positive aspects of care and treatment,Top 1-3 Negative aspects of care and treatment,combined_feedback
14,Good handling from the staff,,More medicine,Good follow up with the clients through phone ...,More medicine,Puctuality of the staff,,Good handling from the staff . None . More med...
15,Good reception from the staff,,More staff,Good communication skills from the staff,More medicine,Good reception from the staff,,Good reception from the staff . None . More st...
16,Good reception from the staff,,More staff,Constant reminder through phone calls on appoi...,More medicine especially septrine,Staff are very reliable,,Good reception from the staff . None . More st...
17,Good handling from the staff,,More medicine,Good reception and communication skills from t...,More medicine especially septrine,Availability of staff in the facility,,Good handling from the staff . None . More med...
18,Staff are very friendly and understanding,,More medicine,Staff are very friendly and welcoming,More medicine especially septrine,Facility is easily accessible,,Staff are very friendly and understanding . No...
...,...,...,...,...,...,...,...,...
60739,Friendly staffs,Referral for the service,Making All services available,Friendly staffs,Nutrition services,"ARTS available, lab services available and sta...","Overcrowded wards for inpatient clients, poor ...",Friendly staffs . Referral for the service . M...
60743,All are good,Nothing wrong,No idea,Good communication skills by staffs and confid...,No idea,Good treatment,Nothing,All are good. Nothing wrong. No idea. Good com...
60745,At least pmtct is screened and nutrition couns...,Nothing,no idea,Good treatment and care,No idea,I like the pmtct part,Nothing,At least pmtct is screened and nutrition couns...
60746,All are well,Nothing,No idea,good care,Nothing,Good treatment,Nothing wrong seen at all services points,All are well. Nothing . No idea. good care. No...


In [53]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the feedback
def tokenize_feedback(text):
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',  # Returns PyTorch tensors
        truncation=True
    )
    return encoded['input_ids'], encoded['attention_mask']

# Apply tokenizer to the DataFrame
df['tokens'] = df['combined_feedback'].apply(lambda x: tokenize_feedback(x))
print(df['tokens'])

ImportError: Unable to convert output to PyTorch tensors format, PyTorch is not installed.

In [None]:
# Load BERT model
model = BertModel.from_pretrained('bert-base-uncased')

# Function to get BERT embeddings
def get_bert_embeddings(tokens):
    model.eval()
    with torch.no_grad():
        outputs = model(tokens[0], attention_mask=tokens[1])
        embeddings = outputs.last_hidden_state[:, 0, :].numpy()
    return embeddings

# Apply the function to get embeddings
df['embeddings'] = df['tokens'].apply(get_bert_embeddings)

# Concatenate all embeddings into a matrix for clustering
all_embeddings = np.vstack(df['embeddings'].values)

# Standardizing the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(all_embeddings)

# Perform PCA to reduce dimensionality for better clustering
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_scaled)

# Clustering
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(X_pca)
df['topic'] = kmeans.labels_

In [None]:
# Placeholder for summary
# This is a pseudo code. You would need a fine-tuned BERT sequence-to-sequence model for summarization
df['summary'] = df['combined_feedback'].apply(lambda x: 'Summarized text here')

In [None]:
# Analyze the topics and summaries
df[['combined_feedback', 'topic', 'summary']]