In [10]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
q1 = pd.read_json('~/thesis/data/processed_data/knowledge_q1_lemmatized.jsonl', orient = 'records', lines = True)
q1 = q1.rename(columns = {'answer': 'q1'})

q2 = pd.read_json('~/thesis/data/processed_data/knowledge_q2_lemmatized.jsonl', orient = 'records', lines = True)
q2 = q2.rename(columns = {'answer': 'q2'})

q3 = pd.read_json('~/thesis/data/processed_data/knowledge_q3_lemmatized.jsonl', orient = 'records', lines = True)
q3 = q3.rename(columns = {'answer': 'q3'})

q4 = pd.read_json('~/thesis/data/processed_data/knowledge_q4_lemmatized.jsonl', orient = 'records', lines = True)
q4 = q4.rename(columns = {'answer': 'q4'})

q5 = pd.read_json('~/thesis/data/processed_data/knowledge_q5_lemmatized.jsonl', orient = 'records', lines = True)
q5 = q5.rename(columns = {'answer': 'q5'})

## 1. Participant-dataset: concatenated answers for questions 1,2,3,4,5

**Why not the sum of BOW?**: The Bag of Words (BoW) vector representation of the concatenation of multiple sentences is indeed the same as the sum of the Bag of Words vectors of each individual sentence before concatenation.

In [12]:
def concatenate_responses(df, question_col):
    return df.groupby('participant_id')[question_col].apply(lambda x: ' '.join(x)).reset_index()

In [13]:
# first, concatenate each response for each questions
q1_grouped = concatenate_responses(q1, 'q1')
q2_grouped = concatenate_responses(q2, 'q2')
q3_grouped = concatenate_responses(q3, 'q3')
q4_grouped = concatenate_responses(q4, 'q4')
q5_grouped = concatenate_responses(q5, 'q5')

In [14]:
# Merge the grouped DataFrames
q_merged = q1_grouped.merge(q2_grouped, on='participant_id', how='outer')
q_merged = q_merged.merge(q3_grouped, on='participant_id', how='outer')
q_merged = q_merged.merge(q4_grouped, on='participant_id', how='outer')
q_merged = q_merged.merge(q5_grouped, on='participant_id', how='outer')

In [15]:
q_merged

Unnamed: 0,participant_id,q1,q2,q3,q4,q5
0,00278,vaccination scheme area increase awareness edu...,equal pay woman men role ensure woman voice he...,look conservation plan ensure poacher prosecut...,make affordable make reliable make accessible ...,better worklife balance flexible working affor...
1,0068d,mobile vaccination van used ask various cleric...,ensure equal salary paid people regardless sex...,hefty fine long prison sentence educating scho...,lower price ask local people decrease gap serv...,provide local event ensure fairness people pos...
2,00f13,get people vaccination travel rural area get r...,ensuring equal pay ensuring fair 5050 split me...,punishing poacher armed guard around habitat i...,better network frequent transport reducing cos...,4 day week getting rid social medium automatin...
3,01cb6,visiting nurse dropin rural clinic evening vis...,employ equal number target desired gender impr...,safe living area tourist based funding preserv...,lower price increase reliability increase freq...,slow reduce working hour garden allotment read...
4,02670,organise targeted campaign rural area engage s...,hire woman nonbinary people organise programme...,ban hunting elephant create sanctuary elephant...,dropping price offer free public transport bui...,promote benefit meditation focus benefit exerc...
...,...,...,...,...,...,...
295,fb2a2,lot clear evidence making easy travel receive ...,equal pay regardless gender listening worker m...,funding security animal harsh punishment hunte...,ventilation available time avoid crowding good...,police available cctv coverage smaller area so...
296,fb9c4,referral scheme patient referred family friend...,offer regular seminar advocate gender equality...,increase sexual reproduction animal zoo harshe...,lower price public transportation ticket bette...,four day working week green space available ed...
297,ff2bf,education benefit vaccination reward incentive...,gender identity workshop staff look current pr...,ban ivory importsexports try breed captivity p...,free travel need le parking private vehicle sa...,kinder educate people mental health issue give...
298,b9593,,firstly pay difference opportunity advancement...,keep trophy hunter away breed confined area ma...,firstly reduce fare increase discount student ...,newspaper headline le depressing full doom glo...


In [16]:
# concatenate each question response by participant
# Concatenate responses for each participant
q_merged['concat'] = q_merged.apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
print(q_merged['concat'])


0      00278 vaccination scheme area increase awarene...
1      0068d mobile vaccination van used ask various ...
2      00f13 get people vaccination travel rural area...
3      01cb6 visiting nurse dropin rural clinic eveni...
4      02670 organise targeted campaign rural area en...
                             ...                        
295    fb2a2 lot clear evidence making easy travel re...
296    fb9c4 referral scheme patient referred family ...
297    ff2bf education benefit vaccination reward inc...
298    b9593 nan firstly pay difference opportunity a...
299    f3885 nan teach school provide benefit opportu...
Name: concat, Length: 300, dtype: object


In [17]:
# Specify the data to analysis
data = q_merged['concat']

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the combined responses
X = vectorizer.fit_transform(data)

# Convert to an array to view the BoW matrix
X_array = X.toarray()

# Feature names (words)
features = vectorizer.get_feature_names_out()

print("Features (Words):", features)
print("Bag of Words Matrix:\n", X_array)


Features (Words): ['00278' '0068d' '00f13' ... 'zone' 'zoo' 'zoossafari']
Bag of Words Matrix:
 [[1 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [19]:
from sklearn.cluster import KMeans

# Apply KMeans clustering
n_clusters = 2  # Example: trying to find 2 clusters
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(X_array)

# Get cluster labels
labels = kmeans.labels_

print("Cluster Labels:", labels)

Cluster Labels: [0 0 0 0 1 0 0 0 1 0 0 0 1 0 1 0 1 0 1 0 1 0 0 0 1 0 1 0 1 1 1 1 1 0 0 0 1
 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 1 0 1 0
 0 0 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 1 0 1 0 0 1 0
 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0
 0 0 0 0 1 1 1 1 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0
 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 0 1 1 0 0 0 0 0 0 0 1 0 0 1 0 0 1 1 1
 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 1 0 1 0 0 1
 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 1 1 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 0]


In [20]:
from sklearn.metrics import silhouette_score

# Calculate the silhouette score
sil_score = silhouette_score(X_array, labels)
print(f"Silhouette Score: {sil_score}")


Silhouette Score: 0.08260337296058397


**Result:** Worse than my clustering result for the participants clustering (sil 0.48)

## 2. Topics-dataset: stacked responses for each questions

### Question 1

In [21]:
# Specify the data to use
data = q1['q1']

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the combined responses
X = vectorizer.fit_transform(data)

# Convert to an array to view the BoW matrix
X_array = X.toarray()

# Feature names (words)
features = vectorizer.get_feature_names_out()

print("Features (Words):", features)
print("Bag of Words Matrix:\n", X_array)


Features (Words): ['12' '18' '24' ... 'wrong' 'year' 'young']
Bag of Words Matrix:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [23]:
from sklearn.cluster import KMeans

# Apply KMeans clustering
n_clusters = 2  # Example: trying to find 2 clusters
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(X_array)

# Get cluster labels
labels = kmeans.labels_

print("Cluster Labels:", labels)

Cluster Labels: [1 0 1 ... 1 0 0]


In [24]:
from sklearn.metrics import silhouette_score

# Calculate the silhouette score
sil_score = silhouette_score(X_array, labels)
print(f"Silhouette Score: {sil_score}")


Silhouette Score: 0.057548109988467405


### Question 2

In [25]:
# Specify the data to use
data = q2['q2']

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the combined responses
X = vectorizer.fit_transform(data)

# Convert to an array to view the BoW matrix
X_array = X.toarray()

# Feature names (words)
features = vectorizer.get_feature_names_out()

print("Features (Words):", features)
print("Bag of Words Matrix:\n", X_array)


Features (Words): ['10' '40' '50' ... 'younger' 'zero' 'zone']
Bag of Words Matrix:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [27]:
from sklearn.cluster import KMeans

# Apply KMeans clustering
n_clusters = 2  # Example: trying to find 2 clusters
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(X_array)

# Get cluster labels
labels = kmeans.labels_

print("Cluster Labels:", labels)

Cluster Labels: [1 1 1 ... 1 1 1]


In [28]:
from sklearn.metrics import silhouette_score

# Calculate the silhouette score
sil_score = silhouette_score(X_array, labels)
print(f"Silhouette Score: {sil_score}")


Silhouette Score: 0.09859761890316641


### Question 3

In [29]:
# Specify the data to use
data = q3['q3']

In [30]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the combined responses
X = vectorizer.fit_transform(data)

# Convert to an array to view the BoW matrix
X_array = X.toarray()

# Feature names (words)
features = vectorizer.get_feature_names_out()

print("Features (Words):", features)
print("Bag of Words Matrix:\n", X_array)


Features (Words): ['10' '20' 'accept' ... 'zone' 'zoo' 'zoossafari']
Bag of Words Matrix:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [31]:
from sklearn.cluster import KMeans

# Apply KMeans clustering
n_clusters = 2  # Example: trying to find 2 clusters
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(X_array)

# Get cluster labels
labels = kmeans.labels_

print("Cluster Labels:", labels)

Cluster Labels: [1 1 1 ... 1 1 1]


In [32]:
from sklearn.metrics import silhouette_score

# Calculate the silhouette score
sil_score = silhouette_score(X_array, labels)
print(f"Silhouette Score: {sil_score}")


Silhouette Score: 0.025451296656610094


### Question 4

In [33]:
# Specify the data to use
data = q4['q4']

In [34]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the combined responses
X = vectorizer.fit_transform(data)

# Convert to an array to view the BoW matrix
X_array = X.toarray()

# Feature names (words)
features = vectorizer.get_feature_names_out()

print("Features (Words):", features)
print("Bag of Words Matrix:\n", X_array)


Features (Words): ['10th' '18' '20' ... 'young' 'younger' 'zone']
Bag of Words Matrix:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [35]:
from sklearn.cluster import KMeans

# Apply KMeans clustering
n_clusters = 2  # Example: trying to find 2 clusters
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(X_array)

# Get cluster labels
labels = kmeans.labels_

print("Cluster Labels:", labels)

Cluster Labels: [1 1 1 ... 1 1 1]


In [36]:
from sklearn.metrics import silhouette_score

# Calculate the silhouette score
sil_score = silhouette_score(X_array, labels)
print(f"Silhouette Score: {sil_score}")


Silhouette Score: 0.0989674907570943


### Question 5

In [37]:
# Specify the data to use
data = q5['q5']

In [38]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the combined responses
X = vectorizer.fit_transform(data)

# Convert to an array to view the BoW matrix
X_array = X.toarray()

# Feature names (words)
features = vectorizer.get_feature_names_out()

print("Features (Words):", features)
print("Bag of Words Matrix:\n", X_array)


Features (Words): ['100' '110mph' '160mins' ... 'young' 'youth' 'zone']
Bag of Words Matrix:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [39]:
from sklearn.cluster import KMeans

# Apply KMeans clustering
n_clusters = 2  # Example: trying to find 2 clusters
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(X_array)

# Get cluster labels
labels = kmeans.labels_

print("Cluster Labels:", labels)

Cluster Labels: [1 0 0 ... 0 0 0]


In [40]:
from sklearn.metrics import silhouette_score

# Calculate the silhouette score
sil_score = silhouette_score(X_array, labels)
print(f"Silhouette Score: {sil_score}")


Silhouette Score: 0.04291646507313858
