# Embedding generation
### Knowledge-explore dataset

In [287]:
import torch

# Check if the GPU is available
# NOTE(review): the flag is only displayed here; none of the cells visible below
# move the model or its inputs to the GPU, so inference appears to run on the CPU — confirm.
use_cuda = torch.cuda.is_available()
use_cuda

True

In [288]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [289]:
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from scipy.spatial.distance import cdist

In [290]:
from transformers import DistilBertTokenizer, DistilBertModel
# The tokenizer and the encoder are both loaded from the same pretrained checkpoint.
checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
model = DistilBertModel.from_pretrained(checkpoint)

### Question 1 
"How could we increase vaccination rates in social areas?"

In [291]:
# Load the lemmatized Question 1 answers (JSON Lines: one record per line).
data = pd.read_json('~/thesis/data/processed_data/knowledge_q1_lemmatized.jsonl', orient='records', lines=True)

In [292]:
# Keep the participant ids aside before `data` is narrowed to the answer column further down.
participant = data['participant_id'].copy()

In [293]:
# Print the positional index of every answer that is an empty string.
# NOTE(review): matches are only reported here; no cell visible in this notebook removes them.
for position, answer in enumerate(data['answer']):
    if answer == '':
        print(position)

655


In [294]:
# Narrow `data` from the loaded table to just its 'answer' column.
# NOTE(review): this rebinds `data`, so re-running this cell without re-running the
# load cell would index the already-extracted column instead of the table — confirm intent.
data = data['answer'].copy()

In [295]:
# Materialise the Question 1 answers as a plain Python list for the tokenizer.
texts = list(data)

In [296]:
# Tokenize every answer in one call so all rows share the same padded length
# (padding=True pads to the longest sequence; truncation=True clips over-long inputs).
encoded_texts = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Run the encoder in mini-batches rather than one forward pass over every answer,
# so peak activation memory is bounded by BATCH_SIZE instead of the dataset size.
BATCH_SIZE = 64
device = next(model.parameters()).device  # feed inputs on whatever device the model lives on

first_token_states = []
with torch.no_grad():  # inference only: no autograd graph is needed
    for start in range(0, len(texts), BATCH_SIZE):
        batch = {name: tensor[start:start + BATCH_SIZE].to(device)
                 for name, tensor in encoded_texts.items()}
        model_output = model(**batch)
        # Keep only the hidden state at sequence position 0 of each answer
        # (presumably the [CLS] slot for this tokenizer) as the sentence embedding.
        first_token_states.append(model_output.last_hidden_state[:, 0, :].cpu())

# Stitch the per-batch results back into one array with a row per answer.
embeddings = torch.cat(first_token_states, dim=0).numpy()

In [297]:
# Sanity check: one row per answer, one column per hidden dimension.
embeddings.shape

(1294, 768)

In [298]:
# Preview the Question 1 embedding matrix.
embeddings

array([[-0.19127376, -0.17733298, -0.26054257, ..., -0.22700275,
         0.01577356,  0.41399235],
       [-0.06792968, -0.13514109, -0.13267083, ..., -0.17478277,
         0.13954921,  0.22890878],
       [-0.10311951, -0.21144652, -0.38806748, ..., -0.29994968,
         0.1419051 ,  0.34876743],
       ...,
       [-0.04628503, -0.13581693, -0.37709054, ..., -0.20757696,
         0.04609798,  0.44425988],
       [-0.33347216, -0.17306219,  0.09216662, ..., -0.25133562,
        -0.05142123,  0.27784327],
       [-0.20325579, -0.15237466, -0.23004963, ..., -0.25612628,
         0.05647596,  0.45491406]], dtype=float32)

In [299]:
# Assemble the Question 1 table: one row per answer with its participant id and embedding.
column_names = ['participant_id', 'answer1', 'embedding1']

# Every answer needs exactly one embedding row before the columns are paired up.
assert len(data) == len(embeddings), "The length of text data and embeddings must match"

# Nested Python lists, so each DataFrame cell holds one whole embedding vector.
embeddings_list = embeddings.tolist()

# Pair each column name with its values, in order.
column_values = (participant, data, embeddings_list)
result = pd.DataFrame(dict(zip(column_names, column_values)))

In [300]:
# Preview the assembled Question 1 table.
result

Unnamed: 0,participant_id,answer1,embedding1
0,00278,vaccination scheme area,"[-0.1912737637758255, -0.17733298242092133, -0..."
1,00278,increase awareness education,"[-0.06792967766523361, -0.13514108955860138, -..."
2,00278,mass vaccination programme,"[-0.10311951488256454, -0.21144652366638184, -..."
3,00278,gp practice discus patient,"[-0.22583332657814026, -0.0020551765337586403,..."
4,00278,advertisement vaccination need availability,"[-0.12228399515151978, 0.03003622218966484, -0..."
...,...,...,...
1289,fb9c4,visiting local school vaccinating child,"[-0.21164929866790771, -0.10677900910377502, -..."
1290,fb9c4,offering door door vaccination programme,"[-0.19278275966644287, -0.1603478342294693, -0..."
1291,ff2bf,education benefit vaccination,"[-0.04628502577543259, -0.13581693172454834, -..."
1292,ff2bf,reward incentive eg cash good,"[-0.3334721624851227, -0.17306219041347504, 0...."


In [301]:
# Persist the Question 1 table (participant id, answer, embedding) to disk.
# NOTE(review): the file is named `.jsonl`, but orient='index' without lines=True writes a
# single JSON object keyed by row label rather than one record per line — confirm that
# downstream readers load it with orient='index'.
result.to_json('~/thesis/embeddings/knowledge_distilbert_answer1.jsonl', orient='index')

### Question 2
"How could we improve gender equality at work?"

In [302]:
# Load the lemmatized Question 2 answers (JSON Lines: one record per line).
data = pd.read_json('~/thesis/data/processed_data/knowledge_q2_lemmatized.jsonl', orient='records', lines=True)

In [303]:
# Keep the participant ids aside before `data` is narrowed to the answer column further down.
participant = data['participant_id'].copy()

In [304]:
# Print the positional index of every answer that is an empty string.
# NOTE(review): matches are only reported here; no cell visible in this notebook removes them.
for position, answer in enumerate(data['answer']):
    if answer == '':
        print(position)

In [305]:
# Narrow `data` from the loaded table to just its 'answer' column.
# NOTE(review): this rebinds `data`, so re-running this cell without re-running the
# load cell would index the already-extracted column instead of the table — confirm intent.
data = data['answer'].copy()

In [306]:
# Materialise the Question 2 answers as a plain Python list for the tokenizer.
texts = list(data)

In [307]:
# Tokenize every answer in one call so all rows share the same padded length
# (padding=True pads to the longest sequence; truncation=True clips over-long inputs).
encoded_texts = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Run the encoder in mini-batches rather than one forward pass over every answer,
# so peak activation memory is bounded by BATCH_SIZE instead of the dataset size.
BATCH_SIZE = 64
device = next(model.parameters()).device  # feed inputs on whatever device the model lives on

first_token_states = []
with torch.no_grad():  # inference only: no autograd graph is needed
    for start in range(0, len(texts), BATCH_SIZE):
        batch = {name: tensor[start:start + BATCH_SIZE].to(device)
                 for name, tensor in encoded_texts.items()}
        model_output = model(**batch)
        # Keep only the hidden state at sequence position 0 of each answer
        # (presumably the [CLS] slot for this tokenizer) as the sentence embedding.
        first_token_states.append(model_output.last_hidden_state[:, 0, :].cpu())

# Stitch the per-batch results back into one array with a row per answer.
embeddings = torch.cat(first_token_states, dim=0).numpy()

In [308]:
# Sanity check: one row per answer, one column per hidden dimension.
embeddings.shape

(1299, 768)

In [309]:
# Preview the Question 2 embedding matrix.
embeddings

array([[-0.27246663, -0.11421003, -0.25762886, ..., -0.25822142,
         0.04988267,  0.33396098],
       [-0.20662355, -0.05778245, -0.02536843, ..., -0.23782359,
         0.12621512,  0.18647127],
       [-0.32578346, -0.05905151, -0.19272937, ..., -0.28107145,
         0.25809512,  0.38366055],
       ...,
       [-0.29391214, -0.01502347, -0.37965733, ..., -0.19332778,
         0.10386197,  0.33840808],
       [-0.24768837, -0.22516981,  0.00928485, ..., -0.2717694 ,
         0.00258983,  0.40271506],
       [-0.3109877 , -0.16754803, -0.3148152 , ..., -0.26551035,
         0.03409884,  0.4636308 ]], dtype=float32)

In [310]:
# Assemble the Question 2 table: one row per answer with its participant id and embedding.
column_names = ['participant_id', 'answer2', 'embedding2']

# Every answer needs exactly one embedding row before the columns are paired up.
assert len(data) == len(embeddings), "The length of text data and embeddings must match"

# Nested Python lists, so each DataFrame cell holds one whole embedding vector.
embeddings_list = embeddings.tolist()

# Pair each column name with its values, in order.
column_values = (participant, data, embeddings_list)
result = pd.DataFrame(dict(zip(column_names, column_values)))

In [311]:
# Preview the assembled Question 2 table.
result

Unnamed: 0,participant_id,answer2,embedding2
0,00278,equal pay woman men role,"[-0.27246662974357605, -0.11421003192663193, -..."
1,00278,ensure woman voice heard,"[-0.20662355422973633, -0.05778244510293007, -..."
2,00278,woman equally considered promotion,"[-0.32578346133232117, -0.0590515062212944, -0..."
3,00278,education sexual harassment,"[-0.15188588201999664, 0.03038639947772026, -0..."
4,00278,woman seen equal men capable,"[-0.2573717534542084, -0.19600646197795868, -0..."
...,...,...,...
1294,fb9c4,female high paying role within company,"[-0.33563175797462463, -0.17630937695503235, -..."
1295,fb9c4,offering male female worker time maternitypate...,"[-0.330673485994339, -0.04863996431231499, -0...."
1296,ff2bf,gender identity workshop staff,"[-0.2939121425151825, -0.015023472718894482, -..."
1297,ff2bf,look current procedure improved,"[-0.24768836796283722, -0.2251698076725006, 0...."


In [312]:
# Persist the Question 2 table (participant id, answer, embedding) to disk.
# NOTE(review): the file is named `.jsonl`, but orient='index' without lines=True writes a
# single JSON object keyed by row label rather than one record per line — confirm that
# downstream readers load it with orient='index'.
result.to_json('~/thesis/embeddings/knowledge_distilbert_answer2.jsonl', orient='index')

### Question 3
"How could we prevent the extinction of the elephant?"

In [313]:
# Load the lemmatized Question 3 answers (JSON Lines: one record per line).
data = pd.read_json('~/thesis/data/processed_data/knowledge_q3_lemmatized.jsonl', orient='records', lines=True)

In [314]:
# Keep the participant ids aside before `data` is narrowed to the answer column further down.
participant = data['participant_id'].copy()

In [315]:
# Print the positional index of every answer that is an empty string.
# NOTE(review): matches are only reported here; no cell visible in this notebook removes them.
for position, answer in enumerate(data['answer']):
    if answer == '':
        print(position)

In [316]:
# Narrow `data` from the loaded table to just its 'answer' column.
# NOTE(review): this rebinds `data`, so re-running this cell without re-running the
# load cell would index the already-extracted column instead of the table — confirm intent.
data = data['answer'].copy()

In [317]:
# Materialise the Question 3 answers as a plain Python list for the tokenizer.
texts = list(data)

In [318]:
# Tokenize every answer in one call so all rows share the same padded length
# (padding=True pads to the longest sequence; truncation=True clips over-long inputs).
encoded_texts = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Run the encoder in mini-batches rather than one forward pass over every answer,
# so peak activation memory is bounded by BATCH_SIZE instead of the dataset size.
BATCH_SIZE = 64
device = next(model.parameters()).device  # feed inputs on whatever device the model lives on

first_token_states = []
with torch.no_grad():  # inference only: no autograd graph is needed
    for start in range(0, len(texts), BATCH_SIZE):
        batch = {name: tensor[start:start + BATCH_SIZE].to(device)
                 for name, tensor in encoded_texts.items()}
        model_output = model(**batch)
        # Keep only the hidden state at sequence position 0 of each answer
        # (presumably the [CLS] slot for this tokenizer) as the sentence embedding.
        first_token_states.append(model_output.last_hidden_state[:, 0, :].cpu())

# Stitch the per-batch results back into one array with a row per answer.
embeddings = torch.cat(first_token_states, dim=0).numpy()

In [319]:
# Sanity check: one row per answer, one column per hidden dimension.
embeddings.shape

(1330, 768)

In [320]:
# Preview the Question 3 embedding matrix.
embeddings

array([[-0.26007342, -0.10248095, -0.06801523, ..., -0.19478488,
         0.07291865,  0.24238463],
       [-0.19300213, -0.09409611, -0.138834  , ..., -0.19442259,
         0.15002112,  0.23779206],
       [ 0.02903957, -0.15060198, -0.21470082, ..., -0.12194512,
         0.06245731,  0.15667342],
       ...,
       [-0.17498773,  0.02680914, -0.18700269, ..., -0.07259836,
         0.25937122,  0.18128693],
       [-0.09044164, -0.19122785, -0.1173988 , ..., -0.15873171,
         0.02188621,  0.2703989 ],
       [-0.23726068, -0.05401746, -0.4242938 , ..., -0.17293528,
         0.15000957,  0.22560188]], dtype=float32)

In [321]:
# Assemble the Question 3 table: one row per answer with its participant id and embedding.
column_names = ['participant_id', 'answer3', 'embedding3']

# Every answer needs exactly one embedding row before the columns are paired up.
assert len(data) == len(embeddings), "The length of text data and embeddings must match"

# Nested Python lists, so each DataFrame cell holds one whole embedding vector.
embeddings_list = embeddings.tolist()

# Pair each column name with its values, in order.
column_values = (participant, data, embeddings_list)
result = pd.DataFrame(dict(zip(column_names, column_values)))

In [322]:
# Preview the assembled Question 3 table.
result

Unnamed: 0,participant_id,answer3,embedding3
0,00278,look conservation plan,"[-0.2600734233856201, -0.102480947971344, -0.0..."
1,00278,ensure poacher prosecuted stopped,"[-0.1930021345615387, -0.0940961092710495, -0...."
2,00278,protect land live,"[0.029039565473794937, -0.15060198307037354, -..."
3,00278,protected area people access,"[-0.2305000275373459, 0.1180906593799591, -0.2..."
4,00278,stop used human entertainmenttourism,"[-0.0794864222407341, -0.07702479511499405, -0..."
...,...,...,...
1325,fb9c4,prosecute profit elephant called sancturies,"[-0.14983583986759186, 0.015399899333715439, -..."
1326,fb9c4,create scheme impregnate lot female elephant,"[-0.29400578141212463, -0.2693106532096863, -0..."
1327,ff2bf,ban ivory importsexports,"[-0.17498773336410522, 0.026809141039848328, -..."
1328,ff2bf,try breed captivity,"[-0.09044164419174194, -0.19122785329818726, -..."


In [323]:
# Persist the Question 3 table (participant id, answer, embedding) to disk.
# NOTE(review): the file is named `.jsonl`, but orient='index' without lines=True writes a
# single JSON object keyed by row label rather than one record per line — confirm that
# downstream readers load it with orient='index'.
result.to_json('~/thesis/embeddings/knowledge_distilbert_answer3.jsonl', orient='index')

### Question 4
"How could we increase the use of public transport?"

In [324]:
# Load the lemmatized Question 4 answers (JSON Lines: one record per line).
data = pd.read_json('~/thesis/data/processed_data/knowledge_q4_lemmatized.jsonl', orient='records', lines=True)

In [325]:
# Keep the participant ids aside before `data` is narrowed to the answer column further down.
participant = data['participant_id'].copy()

In [326]:
# Print the positional index of every answer that is an empty string.
# NOTE(review): matches are only reported here; no cell visible in this notebook removes them.
for position, answer in enumerate(data['answer']):
    if answer == '':
        print(position)

In [327]:
# Narrow `data` from the loaded table to just its 'answer' column.
# NOTE(review): this rebinds `data`, so re-running this cell without re-running the
# load cell would index the already-extracted column instead of the table — confirm intent.
data = data['answer'].copy()

In [328]:
# Materialise the Question 4 answers as a plain Python list for the tokenizer.
texts = list(data)

In [329]:
# Tokenize every answer in one call so all rows share the same padded length
# (padding=True pads to the longest sequence; truncation=True clips over-long inputs).
encoded_texts = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Run the encoder in mini-batches rather than one forward pass over every answer,
# so peak activation memory is bounded by BATCH_SIZE instead of the dataset size.
BATCH_SIZE = 64
device = next(model.parameters()).device  # feed inputs on whatever device the model lives on

first_token_states = []
with torch.no_grad():  # inference only: no autograd graph is needed
    for start in range(0, len(texts), BATCH_SIZE):
        batch = {name: tensor[start:start + BATCH_SIZE].to(device)
                 for name, tensor in encoded_texts.items()}
        model_output = model(**batch)
        # Keep only the hidden state at sequence position 0 of each answer
        # (presumably the [CLS] slot for this tokenizer) as the sentence embedding.
        first_token_states.append(model_output.last_hidden_state[:, 0, :].cpu())

# Stitch the per-batch results back into one array with a row per answer.
embeddings = torch.cat(first_token_states, dim=0).numpy()

In [330]:
# Sanity check: one row per answer, one column per hidden dimension.
embeddings.shape

(1228, 768)

In [331]:
# Preview the Question 4 embedding matrix.
embeddings

array([[-0.1105326 , -0.18945812, -0.03787645, ..., -0.16354793,
         0.17749348,  0.15497705],
       [-0.13820766, -0.03308623,  0.05139754, ..., -0.15541585,
         0.12188373,  0.22924781],
       [-0.19806035, -0.14538558, -0.02777266, ..., -0.12017956,
         0.07377895,  0.33324617],
       ...,
       [-0.09752385,  0.05519659, -0.00851689, ..., -0.20211183,
        -0.03859911,  0.3129117 ],
       [-0.22521718, -0.09770241, -0.13090718, ..., -0.18667755,
         0.02443858,  0.2766239 ],
       [-0.23792972, -0.09585156,  0.01390878, ..., -0.15713789,
         0.20701213,  0.36168784]], dtype=float32)

In [332]:
# Assemble the Question 4 table: one row per answer with its participant id and embedding.
column_names = ['participant_id', 'answer4', 'embedding4']

# Every answer needs exactly one embedding row before the columns are paired up.
assert len(data) == len(embeddings), "The length of text data and embeddings must match"

# Nested Python lists, so each DataFrame cell holds one whole embedding vector.
embeddings_list = embeddings.tolist()

# Pair each column name with its values, in order.
column_values = (participant, data, embeddings_list)
result = pd.DataFrame(dict(zip(column_names, column_values)))

In [333]:
# Preview the assembled Question 4 table.
result

Unnamed: 0,participant_id,answer4,embedding4
0,00278,make affordable,"[-0.11053259670734406, -0.18945811688899994, -..."
1,00278,make reliable,"[-0.13820765912532806, -0.033086229115724564, ..."
2,00278,make accessible,"[-0.19806034862995148, -0.14538557827472687, -..."
3,00278,hire staff support service,"[-0.024831995368003845, -0.03276780620217323, ..."
4,00278,funding newfaster route,"[-0.23124073445796967, -0.12966595590114594, -..."
...,...,...,...
1223,fb9c4,reward scheme often use public transport,"[-0.10024309158325195, -0.01906384527683258, -..."
1224,fb9c4,increase hygiene level bus train,"[-0.06521286815404892, -0.25344136357307434, -..."
1225,ff2bf,free travel need,"[-0.09752384573221207, 0.05519659444689751, -0..."
1226,ff2bf,le parking private vehicle,"[-0.22521717846393585, -0.09770240634679794, -..."


In [334]:
# Persist the Question 4 table (participant id, answer, embedding) to disk.
# NOTE(review): the file is named `.jsonl`, but orient='index' without lines=True writes a
# single JSON object keyed by row label rather than one record per line — confirm that
# downstream readers load it with orient='index'.
result.to_json('~/thesis/embeddings/knowledge_distilbert_answer4.jsonl', orient='index')

### Question 5
"How could we reduce stress in our society?"

In [335]:
# Load the lemmatized Question 5 answers (JSON Lines: one record per line).
data = pd.read_json('~/thesis/data/processed_data/knowledge_q5_lemmatized.jsonl', orient='records', lines=True)

In [336]:
# Keep the participant ids aside before `data` is narrowed to the answer column further down.
participant = data['participant_id'].copy()

In [337]:
# Print the positional index of every answer that is an empty string.
# NOTE(review): matches are only reported here; no cell visible in this notebook removes them.
for position, answer in enumerate(data['answer']):
    if answer == '':
        print(position)

In [338]:
# Narrow `data` from the loaded table to just its 'answer' column.
# NOTE(review): this rebinds `data`, so re-running this cell without re-running the
# load cell would index the already-extracted column instead of the table — confirm intent.
data = data['answer'].copy()

In [339]:
# Materialise the Question 5 answers as a plain Python list for the tokenizer.
texts = list(data)

In [340]:
# Tokenize every answer in one call so all rows share the same padded length
# (padding=True pads to the longest sequence; truncation=True clips over-long inputs).
encoded_texts = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Run the encoder in mini-batches rather than one forward pass over every answer,
# so peak activation memory is bounded by BATCH_SIZE instead of the dataset size.
BATCH_SIZE = 64
device = next(model.parameters()).device  # feed inputs on whatever device the model lives on

first_token_states = []
with torch.no_grad():  # inference only: no autograd graph is needed
    for start in range(0, len(texts), BATCH_SIZE):
        batch = {name: tensor[start:start + BATCH_SIZE].to(device)
                 for name, tensor in encoded_texts.items()}
        model_output = model(**batch)
        # Keep only the hidden state at sequence position 0 of each answer
        # (presumably the [CLS] slot for this tokenizer) as the sentence embedding.
        first_token_states.append(model_output.last_hidden_state[:, 0, :].cpu())

# Stitch the per-batch results back into one array with a row per answer.
embeddings = torch.cat(first_token_states, dim=0).numpy()

In [341]:
# Sanity check: one row per answer, one column per hidden dimension.
embeddings.shape

(1311, 768)

In [342]:
# Preview the Question 5 embedding matrix.
embeddings

array([[-0.18155296,  0.01037041, -0.05033297, ..., -0.2776332 ,
         0.08216145,  0.28639117],
       [-0.25153264, -0.14449532, -0.0904041 , ..., -0.20504676,
        -0.00303022,  0.24036193],
       [-0.2174866 , -0.22824603, -0.14886817, ..., -0.00183848,
        -0.02660225,  0.2756349 ],
       ...,
       [-0.37797144, -0.15594889, -0.02280019, ..., -0.11585557,
         0.29140317,  0.24774489],
       [ 0.00210757, -0.14199375, -0.47691756, ..., -0.15677524,
         0.15974632,  0.3441266 ],
       [-0.15090314,  0.00807869, -0.22196297, ..., -0.29263887,
         0.05773852,  0.16149037]], dtype=float32)

In [343]:
# Assemble the Question 5 table: one row per answer with its participant id and embedding.
column_names = ['participant_id', 'answer5', 'embedding5']

# Every answer needs exactly one embedding row before the columns are paired up.
assert len(data) == len(embeddings), "The length of text data and embeddings must match"

# Nested Python lists, so each DataFrame cell holds one whole embedding vector.
embeddings_list = embeddings.tolist()

# Pair each column name with its values, in order.
column_values = (participant, data, embeddings_list)
result = pd.DataFrame(dict(zip(column_names, column_values)))

In [344]:
# Preview the assembled Question 5 table.
result

Unnamed: 0,participant_id,answer5,embedding5
0,00278,better worklife balance,"[-0.1815529614686966, 0.01037040539085865, -0...."
1,00278,flexible working,"[-0.251532644033432, -0.14449532330036163, -0...."
2,00278,affordable bill housing,"[-0.21748660504817963, -0.2282460331916809, -0..."
3,00278,higher living wage,"[-0.14937050640583038, -0.06709479540586472, -..."
4,00278,accessible healthcare,"[-0.18807484209537506, -0.08514208346605301, -..."
...,...,...,...
1306,fb9c4,creative style based curriculum school,"[-0.11061372607946396, -0.07716570794582367, -..."
1307,fb9c4,advocate importance mental health school workp...,"[-0.10510043799877167, 0.027560219168663025, -..."
1308,ff2bf,kinder,"[-0.37797144055366516, -0.15594889223575592, -..."
1309,ff2bf,educate people mental health issue,"[0.002107567386701703, -0.14199374616146088, -..."


In [345]:
# Persist the Question 5 table (participant id, answer, embedding) to disk.
# NOTE(review): the file is named `.jsonl`, but orient='index' without lines=True writes a
# single JSON object keyed by row label rather than one record per line — confirm that
# downstream readers load it with orient='index'.
result.to_json('~/thesis/embeddings/knowledge_distilbert_answer5.jsonl', orient='index')