In [1]:
import pandas as pd
import json

In [25]:
df = pd.read_csv('sensitive/health_interviews_proc.csv')

# Function to split long answers
def split_long_answers(row, max_length=1000):
    answer = row['answer']
    # print(answer)
    if type(answer) == float:
        return [row]
    if len(answer) <= max_length:
        return [row]
    
    split_rows = []
    base_id = row['id']
    parts = [answer[i:i+max_length] for i in range(0, len(answer), max_length)]
    
    for i, part in enumerate(parts):
        new_row = row.copy()
        new_row['id'] = f"{base_id}-{i+1}"
        new_row['answer'] = part
        split_rows.append(new_row)
    
    return split_rows

# Apply the splitting function
split_df = pd.DataFrame(
    [split_row for _, row in df.iterrows() for split_row in split_long_answers(row)]
)

# Save the modified DataFrame to a new CSV file
split_df.to_csv('llama/health_interview_proc_split.csv', index=False)

In [28]:
ret_text = '''[
  {
    "Original_Text": "I have faith in vaccination and in medical professionals.",
    "Code": "Trust in Vaccination and Medical Professionals",
    "Topic": "Belief System",
    "Keywords": ["Trust", "Vaccination", "Medical Professionals"]
  },
  {
    "Original_Text": "That faith kind of motivates you to seek out that protection, whenever possible, is that, does that sound accurate to you?",
    "Code": "Motivation for Seeking Protection",
    "Topic": "Motivation",
    "Keywords": ["Faith", "Protection"]
  },
  {
    "Original_Text": "Yeah, yeah.",
    "Code": "Agreement with Statement",
    "Topic": "Confirmation",
    "Keywords": ["Agreement", "Confirmation"]
  },
  {
    "Original_Text": "For me it wasn't really a decision I needed to make.",
    "Code": "No Decision Required",
    "Topic": "Decision-Making",
    "Keywords": ["Decision", "Automatic"]
  },
  {
    "Original_Text": "It was a foregone conclusion.",
    "Code": "Foregone Conclusion",
    "Topic": "Certainty",
    "Keywords": ["Conclusion", "Certainty"]
  },
  {
    "Original_Text": "I did check in with what his dad thought, and he was on the same page.",
    "Code": "Parental Agreement",
    "Topic": "Family Dynamics",
    "Keywords": ["Agreement", "Family"]
  },
  {
    "Original_Text": "That was just not really a decision.",
    "Code": "No Real Decision",
    "Topic": "Decision-Making",
    "Keywords": ["Decision", "Conclusion"]
  }
]'''

In [29]:
def convert_to_json(text):
    try:
        # Attempt to parse the text as JSON
        json_data = json.loads(text)
        return json_data
    except json.JSONDecodeError as e:
        # Handle JSON decoding errors
        print(f"JSON decoding error: {e}")
        return None
    except Exception as e:
        # Handle any other exceptions
        print(f"An unexpected error occurred: {e}")
        return None

In [30]:
ret = convert_to_json(ret_text)

In [31]:
ret

[{'Original_Text': 'I have faith in vaccination and in medical professionals.',
  'Code': 'Trust in Vaccination and Medical Professionals',
  'Topic': 'Belief System',
  'Keywords': ['Trust', 'Vaccination', 'Medical Professionals']},
 {'Original_Text': 'That faith kind of motivates you to seek out that protection, whenever possible, is that, does that sound accurate to you?',
  'Code': 'Motivation for Seeking Protection',
  'Topic': 'Motivation',
  'Keywords': ['Faith', 'Protection']},
 {'Original_Text': 'Yeah, yeah.',
  'Code': 'Agreement with Statement',
  'Topic': 'Confirmation',
  'Keywords': ['Agreement', 'Confirmation']},
 {'Original_Text': "For me it wasn't really a decision I needed to make.",
  'Code': 'No Decision Required',
  'Topic': 'Decision-Making',
  'Keywords': ['Decision', 'Automatic']},
 {'Original_Text': 'It was a foregone conclusion.',
  'Code': 'Foregone Conclusion',
  'Topic': 'Certainty',
  'Keywords': ['Conclusion', 'Certainty']},
 {'Original_Text': 'I did chec

In [32]:
import re
import json
import csv

# Example output text containing the JSON list
output_text = '''
Example 1:
       [{"Original_Text": "Do you know anyone whose child has gotten sick with the flu or has [CHILD] gotten the flu before that you know of?", "Code": "Flu Experience", "Topic": "Child's Health", "Keywords": ["Child's", "Flu", "Illness History"]}]
       
                Input:
                Interviewer: Yeah. So when you want to learn more about something like the flu. Where do you go to find that information?
                Answer: I would just Google it. I would look on the...the NHS website. So I would basically always Google it. And I would look at the NHS website. And then...
                JSON Output:
                [{"Original_Text": "Yeah. So when you want to learn more about something like the flu. Where do you go to find that information?", "Code": "Health Information Source", "Topic": "Health Education", "Keywords": ["Flu", "Information Source", "Health"]}]
'''


In [33]:
def validate_json_objects(json_objects):
    # Define the required keys
    required_keys = {"Original_Text", "Code", "Topic", "Keywords"}
    
    # Iterate over each JSON object and validate
    for obj in json_objects:
        # Check if all required keys are present
        if not required_keys.issubset(obj.keys()):
            return None
        
        # Check if 'Keywords' is a list of strings
        if not isinstance(obj['Keywords'], list) or not all(isinstance(keyword, str) for keyword in obj['Keywords']):
            return None
    
    # If all objects pass the validation, return the parsed JSON objects
    return json_objects

In [34]:
def json_match(output_text):
    json_list_pattern = r'\[\{.*?\}\]'
    list_matches = re.findall(json_list_pattern, output_text, re.DOTALL)
    if list_matches:
        json_pattern = r'\{"Original_Text":\s*".+?",\s*"Code":\s*".+?",\s*"Topic":\s*".+?",\s*"Keywords":\s*\[\s*(?:"[^"]*"\s*(?:,\s*)?)*\]\s*\}'
        json_matches = re.findall(json_pattern, list_matches[0], re.DOTALL)
        rets = []
        for match in json_matches:
            try:
                x = json.loads(match)
                rets.append(x)
            except json.JSONDecodeError as e:
                print(f"JSON decoding error: {e}")
        return rets
    return []

In [35]:
import pandas as pd
df = pd.read_csv('Depreciated-interview/all_txt_output.csv')

In [36]:
df.head()

Unnamed: 0,Result
0,"[{""Original_Text"": ""Alright, Okay."", ""Cod..."
1,"[{""Original_Text"": ""I agree"", ""Code"": ""Co..."
2,"[{""Original_Text"": ""I agree."", ""Code"": ""I..."
3,"[\n {""Original_Text"":""..."
4,"[{""Consent"": ""Yes""}]\n Inp..."


In [37]:
for i, obj in enumerate(df['Result']):
    res = json_match(obj)
    print(res)
    print('ORIG::::', obj)
    if i > 5:
        break

[{'Original_Text': 'Alright, Okay.', 'Code': 'Read: Agree Process', 'Topic': 'Consent', 'Keywords': ['Consent']}]
ORIG::::      [{"Original_Text": "Alright, Okay.", "Code": "Read: Agree Process","Topic": "Consent","Keywords": ["Consent"]}]
                
                Input:
                Interviewer: So could you please tell me your name and the region where you live?
                Answer: Yes, my name is [NAME]and we live in England.
                JSON Output:
                [{"Original_Text": "My name is [NAME] and we live in England.", "Code": "Provides Name and Country","Topic": "Demographic", "Keywords": ["Name", "Country", "England"]}]
                
                Input:
                Interviewer: And we're going to start by asking some questions about any child illness history or exposure events.
                Answer: And um, is it about children or is it about adults?
                JSON Output:

[]
ORIG::::      [{"Original_Text": "I agree", "Code": "Confi

In [38]:
df2 = pd.read_csv('gemma_7B_prompt_two_examples.csv')

In [39]:
df2.head()

Unnamed: 0,row,id,prompt,answer,label_json
0,3,ts9_3,I understand the information collected during ...,I agree,"[{'Original_Text': 'I agree.', 'Code': 'Agreed..."
1,4,ts9_4,Understand that information collected during t...,I agree,"[{'Original_Text': 'Agreed', 'Code': 'Agreed',..."
2,7,ts9_7,And I agree to take part in this study.,"Yeah, I agree.",[{'Original_Text': 'And I agree to take part i...
3,8,ts9_8,Excellent. And there's just two last things. S...,"Yeah, that's fine. I know my wife is involved ...","[{'Original_Text': ""So if you'd like to receiv..."
4,9,ts9_9,Excellent. Yep. I can do that. So that'll be i...,Email is fine,"[{'Original_Text': 'Email is fine', 'Code': 'E..."


In [40]:
df_grouped = df2.groupby('id').agg({
    'label_json': lambda x: '[' + ', '.join(x.str.strip('[]')) + ']',
    **{col: 'first' for col in df2.columns if col not in ['id', 'label_json']}
}).reset_index()
df_grouped.sort_values(by='id').head(20)

Unnamed: 0,id,label_json,row,prompt,answer
0,ts10_0,"[{'Original_Text': ""So it's basically just mak...",281,Alright so what we'll start with is I'm going ...,All right.
1,ts10_1,"[{'Original_Text': 'Yes.', 'Code': 'Agreed- In...",282,"Okay, yeah. Alright, so the first one I confir...",Yes.
2,ts10_10,"[{'Original_Text': ""Oh that's great. Two kiddo...",291,"Alright, so that's all that out of the way so ...",I do. Yeah.
3,ts10_11,"[{'Original_Text': 'They are two and five.', '...",292,Yeah. And how old are they?,They are two and five.
4,ts10_12,[{'Original_Text': 'She has yeah she started l...,293,"Yeah. Um, so the five year old has started sch...",She has yeah she started last September.
5,ts10_13,"[{'Original_Text': 'Um yeah, it took her a few...",294,"Yeah, how was how was she liking it?","Um yeah, it took her a few weeks to um, before..."
6,ts10_14,"[{'Original_Text': ""Um okay. It's just been an...",295,"Yeah. And how are, how are things going with h...",Um okay. It's just been an adjustment because ...
7,ts10_15,"[{'Original_Text': ""Yeah, it's nice to have so...",296,"Yeah, it's nice to have some outdoor access, b...","Yeah, yeah."
8,ts10_16,"[{'Original_Text': ""She's five and [Younger ch...",297,So can you tell me a little bit about your chi...,"[Older child] is the elder, she's five and [Yo..."
9,ts10_17,[{'Original_Text': 'With the kids and they do ...,298,"Yeah, very cool. So can you tell me what you k...","Well, I always think, you always, sometimes pe..."


In [41]:
df_grouped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 385 entries, 0 to 384
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          385 non-null    object
 1   label_json  385 non-null    object
 2   row         385 non-null    int64 
 3   prompt      385 non-null    object
 4   answer      385 non-null    object
dtypes: int64(1), object(4)
memory usage: 15.2+ KB


In [42]:
import ast
def convert_to_dict(x):
    try:
        # Try to safely evaluate the string as a Python dictionary
        output = ast.literal_eval(x)

        return output
    except (ValueError, SyntaxError) as e:
        # Print the exception for debugging
        print(f"Didn't work: {x}")
        print(f"Exception: {e}")
        return None
    
df_grouped['tmp'] = df_grouped['label_json'].apply(convert_to_dict)

In [43]:
df_grouped['tmp'].head()[0]

[{'Original_Text': "So it's basically just making sure that you understand what we're doing and that we have your permission for you to take part in the study.",
  'Code': 'Understanding and Permission',
  'Topic': 'Context',
  'Keywords': ['Permission', 'Contact']},
 {'Original_Text': "So I'm going to read you each of these statements and I'll just ask that you say either yes or I agree, or no, I don't agree to each one.",
  'Code': "Yes or I Agree or No I  Don't Agree Option",
  'Topic': 'Context',
  'Keywords': ['Yes', 'No', 'Options']},
 {'Original_Text': 'OK.',
  'Code': 'OK',
  'Topic': 'General',
  'Keywords': ['OK']}]

In [44]:
import numpy as np
from sentence_transformers.util import cos_sim
from sentence_transformers import SentenceTransformer
import seaborn as sns
import matplotlib.pyplot as plt

model_ckpt = 'sentence-transformers/distiluse-base-multilingual-cased-v2'

model = SentenceTransformer(model_ckpt)



In [45]:
# def similarity_scores(model, s1, to_compare):
#     sentences = [s1]+to_compare
#     embeddings = model.encode(sentences)
#     print(embeddings)
#     sim = np.zeros((len(sentences), len(sentences)))
#     print(len(sentences))
#     for i in range(len(embeddings)):
#         sim[i] = cos_sim(embeddings[i], embeddings[i:])
#         break
#     return sim[0][1:]

In [46]:
# similarity_scores(model, 'YEAH', ['Acknowledged', 'Consent', 'Yeah', 'Agreed'])

In [47]:
def similarity_scores(s1, to_compare):
    embeddings = np.concatenate([[s1], to_compare])
    # print(embeddings)
    sim = np.zeros((len(embeddings), len(embeddings)))
    # print(len(embeddings))
    for i in range(len(embeddings)):
        sim[i] = cos_sim(embeddings[i], embeddings[i:])
        break
    return sim[0][1:]

In [48]:
e1 = model.encode('YEAH')
tc = model.encode(['Acknowledged', 'Consent', 'Yeah', 'Agreed'])
similarity_scores(e1, tc)

array([0.41554683, 0.4882232 , 0.87535828, 0.51043856])

In [49]:
from sentence_transformers.util import cos_sim

def similarity_score(model, in_sentence, mod_sentence):
  in_em = model.encode(in_sentence)
  mod_em = model.encode(mod_sentence)
  return cos_sim(in_em, mod_em)

In [50]:
df_grouped['tmp'].head()[0]

[{'Original_Text': "So it's basically just making sure that you understand what we're doing and that we have your permission for you to take part in the study.",
  'Code': 'Understanding and Permission',
  'Topic': 'Context',
  'Keywords': ['Permission', 'Contact']},
 {'Original_Text': "So I'm going to read you each of these statements and I'll just ask that you say either yes or I agree, or no, I don't agree to each one.",
  'Code': "Yes or I Agree or No I  Don't Agree Option",
  'Topic': 'Context',
  'Keywords': ['Yes', 'No', 'Options']},
 {'Original_Text': 'OK.',
  'Code': 'OK',
  'Topic': 'General',
  'Keywords': ['OK']}]

In [51]:
s1 = df_grouped['tmp'].head()[2][3]['Code']
s2 = df_grouped['tmp'].head()[2][4]['Code']
print(s1, s2)
similarity_score(model, s1, s2)

Number of Children Confirmed Parent with 2 Children


tensor([[0.4361]])

In [52]:
# for json_list in df_grouped['tmp'].head(2):
#     for i in range(len(json_list)):
#         # if similarity_score(model, json_list[i]['Code'], 'yeah') > .5:
#         if max(similarity_scores(model, json_list[i]['Topic'], ['Acknowledged', 'Consent', 'Yeah', 'Agreed'])) > .5:
#             print(json_list[i]['Code'], ':::' ,json_list[i]['Original_Text'])

In [53]:
df_grouped.head()

Unnamed: 0,id,label_json,row,prompt,answer,tmp
0,ts10_0,"[{'Original_Text': ""So it's basically just mak...",281,Alright so what we'll start with is I'm going ...,All right.,[{'Original_Text': 'So it's basically just mak...
1,ts10_1,"[{'Original_Text': 'Yes.', 'Code': 'Agreed- In...",282,"Okay, yeah. Alright, so the first one I confir...",Yes.,"[{'Original_Text': 'Yes.', 'Code': 'Agreed- In..."
2,ts10_10,"[{'Original_Text': ""Oh that's great. Two kiddo...",291,"Alright, so that's all that out of the way so ...",I do. Yeah.,[{'Original_Text': 'Oh that's great. Two kiddo...
3,ts10_11,"[{'Original_Text': 'They are two and five.', '...",292,Yeah. And how old are they?,They are two and five.,"[{'Original_Text': 'They are two and five.', '..."
4,ts10_12,[{'Original_Text': 'She has yeah she started l...,293,"Yeah. Um, so the five year old has started sch...",She has yeah she started last September.,[{'Original_Text': 'She has yeah she started l...


In [54]:
def cut_similars(model, json_list):
    to_drop = set()
    for i in range(len(json_list)):
        if i in to_drop:
            continue
        agg_words = model.encode(['Acknowledged', 'Consent', 'Yeah', 'Agreed', 'No Code'])
        c_emb = model.encode(json_list[i]['Code'])
        if max(similarity_scores(c_emb, agg_words)) > .5:
            to_drop.add(i)
            continue
        if max(similarity_scores(model.encode(json_list[i]['Topic']), agg_words)) > .5:
            to_drop.add(i)
            continue
        for j in range(i+1, len(json_list)):
            if cos_sim(c_emb, model.encode(json_list[j]['Code'])) > 0.6:
                # print(json_list[i]['Code'], json_list[j]['Code'])
                to_drop.add(j)
    json_list = [item for idx, item in enumerate(json_list) if idx not in to_drop]
    return json_list

# row = df_grouped['tmp'].head()[2]
# print(row)
# cut_similars(model, row)

# Apply the function to each row in the 'json_list' column
df_grouped['json_list'] = df_grouped['tmp'].apply(lambda x: cut_similars(model, x))

In [55]:
df_grouped.head()

Unnamed: 0,id,label_json,row,prompt,answer,tmp,json_list
0,ts10_0,"[{'Original_Text': ""So it's basically just mak...",281,Alright so what we'll start with is I'm going ...,All right.,[{'Original_Text': 'So it's basically just mak...,[]
1,ts10_1,"[{'Original_Text': 'Yes.', 'Code': 'Agreed- In...",282,"Okay, yeah. Alright, so the first one I confir...",Yes.,"[{'Original_Text': 'Yes.', 'Code': 'Agreed- In...",[]
2,ts10_10,"[{'Original_Text': ""Oh that's great. Two kiddo...",291,"Alright, so that's all that out of the way so ...",I do. Yeah.,[{'Original_Text': 'Oh that's great. Two kiddo...,[{'Original_Text': 'Oh that's great. Two kiddo...
3,ts10_11,"[{'Original_Text': 'They are two and five.', '...",292,Yeah. And how old are they?,They are two and five.,"[{'Original_Text': 'They are two and five.', '...","[{'Original_Text': 'They are two and five.', '..."
4,ts10_12,[{'Original_Text': 'She has yeah she started l...,293,"Yeah. Um, so the five year old has started sch...",She has yeah she started last September.,[{'Original_Text': 'She has yeah she started l...,[{'Original_Text': 'She has yeah she started l...


In [56]:
df4 = df_grouped[df_grouped['json_list'].apply(lambda x: x != [])]

In [57]:
df4.head()

Unnamed: 0,id,label_json,row,prompt,answer,tmp,json_list
2,ts10_10,"[{'Original_Text': ""Oh that's great. Two kiddo...",291,"Alright, so that's all that out of the way so ...",I do. Yeah.,[{'Original_Text': 'Oh that's great. Two kiddo...,[{'Original_Text': 'Oh that's great. Two kiddo...
3,ts10_11,"[{'Original_Text': 'They are two and five.', '...",292,Yeah. And how old are they?,They are two and five.,"[{'Original_Text': 'They are two and five.', '...","[{'Original_Text': 'They are two and five.', '..."
4,ts10_12,[{'Original_Text': 'She has yeah she started l...,293,"Yeah. Um, so the five year old has started sch...",She has yeah she started last September.,[{'Original_Text': 'She has yeah she started l...,[{'Original_Text': 'She has yeah she started l...
5,ts10_13,"[{'Original_Text': 'Um yeah, it took her a few...",294,"Yeah, how was how was she liking it?","Um yeah, it took her a few weeks to um, before...","[{'Original_Text': 'Um yeah, it took her a few...","[{'Original_Text': 'Um yeah, it took her a few..."
6,ts10_14,"[{'Original_Text': ""Um okay. It's just been an...",295,"Yeah. And how are, how are things going with h...",Um okay. It's just been an adjustment because ...,[{'Original_Text': 'Um okay. It's just been an...,[{'Original_Text': 'Um okay. It's just been an...


In [58]:
def extract_codes(json_list):
    return [entry['Code'] for entry in json_list]

all_codes = sum(df4['json_list'].apply(extract_codes), [])

print(len(all_codes))

1088


In [59]:
all_codes[100:150]

['No Availability: Time',
 'Limited by Staff and Premises',
 'Clinic Schedule',
 'Service Limitations',
 'Service is Good',
 'Yes Vaccines Given',
 'Standard Childhood Vaccines',
 'Purpose of Questions',
 'Nurses and Doctors Give Flu Shots',
 'Nurse Acknowledgement of Apprehension',
 'Doctor Sitting with You',
 'Positive Regarding Communication',
 'Transparency',
 'Willingness to Talk',
 'Potential Benefit of Disclosure',
 'Last Year Question',
 'Does Not Prioritize Vaccination',
 'Possible to get in future',
 'Consent to Record',
 'No Flu Experience',
 'Consent to Audio-Video Record Interview',
 'Knowledge mostly in Adults',
 'Flu Not Top of Mind',
 'Flu as Just Something to Face',
 'Complications due to Others',
 'Encourages Thoughts on Flu',
 'Want someone to say I have a flu',
 'Stop Recording',
 'No Further Information Provided',
 'First Answer',
 'Request for Findings',
 'Interested in Findings',
 'Study End Date',
 'Hard Copy Required',
 'Email only',
 'Contact for Additional In

In [60]:
pd.DataFrame(all_codes).to_csv('Depreciated-interview/all_codes.csv', index=False)

In [61]:
code_len = [len(code) for code in all_codes]
print(sum(code_len))

24926


In [62]:
# def json_match(output_text):
#     json_list_pattern = r'\[\{.*?\}\]'
#     list_matches = re.findall(json_list_pattern, output_text, re.DOTALL)
#     if list_matches:
#         json_pattern = r'\{"Original_Text":\s*".+?",\s*"Code":\s*".+?",\s*"Topic":\s*".+?",\s*"Keywords":\s*\[\s*(?:"[^"]*"\s*(?:,\s*)?)*\]\s*\}'
#         json_matches = re.findall(json_pattern, list_matches[0], re.DOTALL)
#         rets = []
#         for match in json_matches:
#             try:
#                 x = json.loads(match)
#                 rets.append(x)
#             except json.JSONDecodeError as e:
#                 print(f"JSON decoding error: {e}")
#         return rets
#     return []

def parse_result(result):
    json_pattern = r'\{\s*"Code":\s*".*?",\s*"Axial_Category":\s*".*?"\s*\}'

    # match = re.search(, result)
    matches = re.findall(json_pattern, result, re.DOTALL)
    results = []
    for match in matches:
        try:
            x = json.loads(match)
            print(x['Axial_Category'])
            results.append(x)
        except json.JSONDecodeError as e:
            print(e)
    return results

In [63]:
mystr = '''
    Certainly! your code is
    {"Code": "Email Preference",
	"Axial_Category": "NA"}
    {"Code":"Would Like a Copy of Report",
	"Axial_Category": "NA"}
'''
print(parse_result(mystr))

NA
NA
[{'Code': 'Email Preference', 'Axial_Category': 'NA'}, {'Code': 'Would Like a Copy of Report', 'Axial_Category': 'NA'}]


In [64]:
ax_set = set()
ax_set.add('NA')
ax_set.add('Email Preference')

In [65]:
pd.DataFrame(ax_set)

Unnamed: 0,0
0,
1,Email Preference


In [66]:
# import ast
# 
# 
# def process_csv(filename):
#     df = pd.read_csv(f'{filename}.csv')
#     df = df.drop_duplicates(subset=['id'])
# 
#     label_col = 'label_json'
#     
#     def convert_to_dict(x):
#         try:
#             # Try to safely evaluate the string as a Python dictionary
#             output = ast.literal_eval(x)
# 
#             return output
#         except (ValueError, SyntaxError) as e:
#             # Print the exception for debugging
#             print(f"Didn't work: {x}")
#             print(f"Exception: {e}")
#             return None
# 
#     df['label_json'] = df[label_col].apply(convert_to_dict)
#     df = df.dropna(subset=['label_json'])
#     expanded_df = df['label_json'].apply(pd.Series)
#     df_expanded = pd.concat([df.drop('label_json', axis=1), expanded_df], axis=1)
#     
#     return df_expanded
# 
# filename = 'gemma_7B_prompt_two_examples'
# process_csv(filename)

In [67]:
import numpy as np
from sentence_transformers.util import cos_sim
from sentence_transformers import SentenceTransformer
import seaborn as sns
import matplotlib.pyplot as plt

model_ckpt = 'sentence-transformers/distiluse-base-multilingual-cased-v2'

model = SentenceTransformer(model_ckpt)



In [68]:
sentences = ['Wanting experiential learning', 'constantly learning', 'working in a good environment', 
             'pioneering social media and easily adapting to change', 'feeling entitled due to unique qualifications, as compared to previous generations',
             'possessing the personal skills and characteristics needed', 'being groomed',
             'Craving immediate feedback and being motivated by feeling appreciated', 'detesting getting called out', 'receiving verbal encouragement and making observations', 'Mind reading and expectations for a miracle worker', 'getting called out', 'not being heard', 'Advocating a work-life balance', 'being cared for as a whole person', 'accommodating interests and preferences']
embeddings = model.encode(sentences)
# embeddings[0]

In [69]:
embeddings = model.encode(all_codes)

In [70]:
from sklearn.cluster import KMeans
import numpy as np

kmeans = KMeans(n_clusters=10, random_state=0, n_init="auto").fit(np.array(embeddings))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [71]:
kmeans.labels_

array([0, 4, 6, ..., 9, 9, 5], dtype=int32)

In [72]:
[(sect, sent) for sect, sent in zip(kmeans.labels_, all_codes)]

[(0, 'Current Residence since 1997'),
 (4, "Dad's Flu Symptoms More Tolerable"),
 (6, 'Number of Children in Household'),
 (6, 'Parent with 2 Children'),
 (4, 'Tried to get Flu Vaccine'),
 (6, '4-yr-old, 7-yr-old'),
 (6, "Children's Age"),
 (0, 'Five and Two Years of Age'),
 (0, 'Start of School Year'),
 (6, 'Child is/was in School'),
 (6, 'Child gets Home-Time'),
 (0, 'Started Last September'),
 (0, 'Time to Settle'),
 (8, 'Ongoing Illness'),
 (5, 'Anticipated Re-Entry'),
 (6, 'Child out on Sick Leave'),
 (5, 'Will be fine on Return'),
 (5, 'Settled Quickly'),
 (5, 'Anticipated Future Stability'),
 (1, 'No Schoolwork'),
 (5, 'Adjusting to Stay Home'),
 (7, 'Question'),
 (1, 'Nice to have but not necessary'),
 (5, 'Accept Current Conditions'),
 (5, 'General Agreement'),
 (9, 'N/A'),
 (5, 'Range of Interests'),
 (6, 'Older Child Interests'),
 (4, 'Get Flu Shots for Kids'),
 (1, 'Usually No Shot'),
 (0, 'Received spray this year'),
 (4, 'Received the Flu Vaccine'),
 (9, 'Spray Type'),
 (

In [73]:
ret_arr = [[] for _ in range(10)]

for sect, sent in zip(kmeans.labels_, all_codes):
    ret_arr[sect].append(sent)

In [74]:
i=0
for sect in ret_arr:
    print(i, sect)
    i+=1

0 ['Current Residence since 1997', 'Five and Two Years of Age', 'Start of School Year', 'Started Last September', 'Time to Settle', 'Received spray this year', 'Limited Time Frame', 'Possible to get in future', 'Study End Date', 'Complete Now', 'Accelerated Aging', 'Greying Over Time', 'Entry to Reception in September', 'She was in school nursery', "She's going to be the oldest", 'in school, but not in school yet', 'School Closing', 'Week to 10 days', 'Current Year', 'Rescheduled to Day Off', 'Within 10 Days of Receiving', 'Late Vax', 'Remembered Late Season', 'Study will End August', 'Appointment Time Early Morning', 'Date on Screen', 'Two, Five Years Old', 'In School', 'Second Year of School', 'Age', 'Started School Reception', 'School Start Date (9/7/19)', 'Within the Cut Off Date', 'Homeschooling is Hard', 'Challenge More than School', 'Now He is Fine', 'October-November Season', 'Nearly Four', '2 Years Old', 'Year-old is Going to School', '5 Year Old', 'Current Grade', 'Before Chr

In [113]:
import numpy as np
from sklearn.preprocessing import normalize

class KMeansCosine:
    def __init__(self, n_clusters=3, max_iter=300, tol=1e-4, random_state=None):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol
        self.random_state = random_state
        self.centroids = None
        self.labels_ = None

    def _cosine_similarity(self, X, Y):
        # Cosine similarity = dot product of normalized vectors
        return np.dot(X, Y.T)

    def _initialize_centroids(self, X):
        if self.random_state:
            np.random.seed(self.random_state)
        random_idxs = np.random.permutation(X.shape[0])
        centroids = X[random_idxs[:self.n_clusters]]
        return centroids

    def fit(self, X):
        X_normalized = normalize(X)
        self.centroids = self._initialize_centroids(X_normalized)

        for _ in range(self.max_iter):
            # Compute similarity between all points and centroids
            similarity = self._cosine_similarity(X_normalized, self.centroids)
            self.labels_ = np.argmax(similarity, axis=1)

            new_centroids = np.array([
                X_normalized[self.labels_ == i].mean(axis=0) 
                for i in range(self.n_clusters)
            ])
            
            # Normalize new centroids
            new_centroids = normalize(new_centroids)

            if np.all(np.abs(self.centroids - new_centroids) < self.tol):
                break
            
            self.centroids = new_centroids

    def predict(self, X):
        X_normalized = normalize(X)
        similarity = self._cosine_similarity(X_normalized, self.centroids)
        return np.argmax(similarity, axis=1)

    def fit_predict(self, X):
        self.fit(X)
        return self.labels_
    
    
X = model.encode(all_codes)

kmeans_cosine = KMeansCosine(n_clusters=13, random_state=42)
kmeans_cosine.fit(X)
labels = kmeans_cosine.labels_

ret_arr = [[] for _ in range(13)]

for sect, sent in zip(labels, all_codes):
    ret_arr[sect].append(sent)
    
i=0
for sect in ret_arr:
    print(i, sect)
    i+=1

0 ['Use of Resources', 'Limited to Low-Risk Info', 'NHS Website Primary Source', 'Information from Health Education', 'GP as Information Source', 'Avoid Unverified Information Sources', 'Advantage of Family Resources Available', "Online 'Experts' for Information", 'Easy Access to Social Support/Experts Online', 'NHS website Primary Source', 'Poor Internet Connection', 'Government Website Primary Source', 'Limited Online Searches', 'NHS Website Primary Source', 'NHS Website as Primary Source (1 instance)', 'Limited Knowledge About Types', 'NHS Website Primary Source', 'Information Source', '111 Service Use', 'NHS Website', 'Trusted Source Required', 'Limited Info', 'Only the NHS Website', 'Information Sources', 'Internet and NHS website as primary source', 'Sometimes Other Sources Used', 'Charities as Secondary Information Source', 'Charities Websites', 'Not All Sources of Info are Equal', 'Online Information as Useful', 'NHS website is good for start', 'Go to NHS UK Website', 'Other Re

In [None]:
# from sklearn.metrics import silhouette_samples, silhouette_score
# import matplotlib.cm as cm
# import matplotlib.pyplot as plt
# 
# X = model.encode(all_codes)
# 
# range_n_clusters = [9, 10, 11, 12, 13, 14, 15]
# 
# for n_clusters in range_n_clusters:
#     # Create a subplot with 1 row and 2 columns
#     fig, (ax1, ax2) = plt.subplots(1, 2)
#     fig.set_size_inches(18, 7)
# 
#     # The 1st subplot is the silhouette plot
#     # The silhouette coefficient can range from -1, 1 but in this example all
#     # lie within [-0.1, 1]
#     ax1.set_xlim([-0.1, 1])
#     # The (n_clusters+1)*10 is for inserting blank space between silhouette
#     # plots of individual clusters, to demarcate them clearly.
#     ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])
# 
#     # Initialize the clusterer with n_clusters value and a random generator
#     # seed of 10 for reproducibility.
#     clusterer = KMeans(n_clusters=n_clusters, random_state=10)
#     cluster_labels = clusterer.fit_predict(X)
# 
#     # The silhouette_score gives the average value for all the samples.
#     # This gives a perspective into the density and separation of the formed
#     # clusters
#     silhouette_avg = silhouette_score(X, cluster_labels)
#     print(
#         "For n_clusters =",
#         n_clusters,
#         "The average silhouette_score is :",
#         silhouette_avg,
#     )
# 
#     # Compute the silhouette scores for each sample
#     sample_silhouette_values = silhouette_samples(X, cluster_labels)
# 
#     y_lower = 10
#     for i in range(n_clusters):
#         # Aggregate the silhouette scores for samples belonging to
#         # cluster i, and sort them
#         ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
# 
#         ith_cluster_silhouette_values.sort()
# 
#         size_cluster_i = ith_cluster_silhouette_values.shape[0]
#         y_upper = y_lower + size_cluster_i
# 
#         color = cm.nipy_spectral(float(i) / n_clusters)
#         ax1.fill_betweenx(
#             np.arange(y_lower, y_upper),
#             0,
#             ith_cluster_silhouette_values,
#             facecolor=color,
#             edgecolor=color,
#             alpha=0.7,
#         )
# 
#         # Label the silhouette plots with their cluster numbers at the middle
#         ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
# 
#         # Compute the new y_lower for next plot
#         y_lower = y_upper + 10  # 10 for the 0 samples
# 
#     ax1.set_title("The silhouette plot for the various clusters.")
#     ax1.set_xlabel("The silhouette coefficient values")
#     ax1.set_ylabel("Cluster label")
# 
#     # The vertical line for average silhouette score of all the values
#     ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
# 
#     ax1.set_yticks([])  # Clear the yaxis labels / ticks
#     ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
# 
#     # 2nd Plot showing the actual clusters formed
#     colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
#     ax2.scatter(
#         X[:, 0], X[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k"
#     )
# 
#     # Labeling the clusters
#     centers = clusterer.cluster_centers_
#     # Draw white circles at cluster centers
#     ax2.scatter(
#         centers[:, 0],
#         centers[:, 1],
#         marker="o",
#         c="white",
#         alpha=1,
#         s=200,
#         edgecolor="k",
#     )
# 
#     for i, c in enumerate(centers):
#         ax2.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50, edgecolor="k")
# 
#     ax2.set_title("The visualization of the clustered data.")
#     ax2.set_xlabel("Feature space for the 1st feature")
#     ax2.set_ylabel("Feature space for the 2nd feature")
# 
#     plt.suptitle(
#         "Silhouette analysis for KMeans clustering on sample data with n_clusters = %d"
#         % n_clusters,
#         fontsize=14,
#         fontweight="bold",
#     )
# 
# plt.show()

In [6]:
import pandas as pd
df3 = pd.read_csv('sensitive/health_interviews.csv', header=None)
df3.columns = df3.iloc[0] + '_' + df3.iloc[1]
df3 = df3.drop([0, 1, 2])
threshold = len(df3.columns) - 2 
df3 = df3.dropna(thresh=threshold).reset_index(drop=True)
df3.head(30)

Unnamed: 0,Q2_Utah Small Area,Q3_Health Problem,Q5_Sex,Q6_Race/Ethnicity - Selected Choice,Q7_Age Group,Q8_Affiliation,Q10_Health Impact,Q11_At-Risk Populations,Q12_Risk Factors,Q13_Root Cause,Q14_Community Strengths,Q15_Resources,Q16_Solutions,Q17_Other Thoughts
0,Payson,Obesity and Overweight,Male,White/Caucasian,45-54,"Resident, Operations Manager for Family Search",Impacts some of our poorest residents harder. ...,Black Women,"Childhood trauma, getting food that is addicti...",Most people struggle to know what foods are be...,"Good friends, finances, outdoor community/comm...",Community resources at the hospital that have ...,Financial assistance and classes (getting the ...,It is a real bedroom community—if people are s...
1,Payson,Obesity and Overweight,Female,White/Caucasian,18-24,College student,Not many extracurricular activities outside of...,40-60 year olds,Not many outside activities for adults\n\nLots...,"In rural parts, there are fewer educated peopl...","Church with finance classes and self-reliance,...","Church with finance classes and self-reliance,...",Having more availability of knowledge and help...,"In general, she believes that people are healt..."
2,Provo East City Center,Obesity/Overweight,Female,White/Caucasian,18-24,Community Member (and volunteer for Volunteer ...,"With the care clinic, you have to be below the...","Definitely people in poverty, low income. I do...",Unhealthy lifestyle I guess. Like very sedenta...,I feel like it all stems from poverty because ...,For free health service like the volunteer car...,Public transportation is a huge perk. Like I f...,If there was a way to help people get healthy ...,
3,Spanish Fork,Depression,Female,White/Caucasian,45-54,Community member,"I guess when you first brought it up, I though...",I’d probably say like teenagers like in high s...,"COVID implications, social media, social life ...",There is a disparity and more so in my little ...,"[For her and her family] Life coaches, acupunc...",,(for kids/teens) [Programs that] taught them t...,Spanish Fork is really good at like the Harves...
4,Pleasant Grove/Lindon,Obesity/overweight,Female,Hispanic/Latino/LatinX,25-34,Community Member,I don't know or interact with many people in m...,Any low-income neighborhood like neighborhoods...,"Lack of walkable community, grocery stores (Sm...",Attitudes of being complacent in habits,Pleasant Grove has a decent REC center; acces...,Sidewalks aren't always up to standard,Infrastructural changes to the sidewalks,"Amazing sense of community, small town feel be..."
5,Pleasant Grove/Lindon,Obesity/Overweight,Male,Native Hawaiian or other Pacific Islander,18-24,Community Member,I grew up playing sports and was surrounded by...,"I noticed, at least in high school, there were...",Lots of food chains around town; also I feel l...,In junior high we watched the documentary of M...,There was a rec center we would go to and ther...,,Build a VASA in PG; Publicize the after school...,Being apart of athletics got me into wanting t...
6,Provo East City Center,Obesity/Overweight,Male,White/Caucasian,45-54,Works directly with community members as pedia...,In my experience working with hospitalized chi...,There are certainly populations that have high...,I think that a lot of people these days aren't...,I think economic factors play a big part. And ...,"In general, Utah has a lot of people who value...",I don't know of any in the Provo area.,I think there are lots of things that could be...,
7,American Fork,Depression,Female,White/Caucasian,35-44,"Communities that Care American Fork, Polaris H...","Well, we can see it with the drug use rates. W...",In wellness rooms I'm seeing a higher populati...,"Well, I think family situations, I see there's...","Lack of Sleep, So this survey goes from sixth,...","Well, our number one thing right now is the we...",Service: So every Thursday we have an organiza...,I feel like raising awareness. I think that if...,
8,American Fork,Depression,Female,White/Caucasian,25-34,The Family Therapy Clinic,I would say we see it a lot in like a general ...,Really the majority of the clients I see are w...,"I think in the American Fork, we probably see ...","I think in the American Fork, we probably see ...","Well, obviously the family therapy clinic that...","Well, obviously the family therapy clinic that...",I think having other resources or supplying ma...,
9,Provo West City Center,Obesity and overweight,Male,Asian or Asian American,35-44,Asian Market employee,"You know, I'm a father with three kids because...",For example the lunch people don't have that m...,"Less movement, you know, they don't spend too ...","Okay, this is easier for people to get food. E...","Okay, you know, we can have more events.\nkind...",Probably have more young people because of the...,"I know I'm almost 40. After 40, if you have th...","You know, for example, for the parks, you know..."


In [16]:
import pandas as pd
import numpy as np
import uuid

def generate_id(index, area, sex):
    # Combine index, first 3 letters of area, and first letter of sex
    # print('hi', index, area, sex)
    return f"{index}_{area[:3].upper()}_{sex[0].upper()}"

def update_id(id, prompt):
    return f'{id}_{prompt[:3].upper()}'

# Assuming df3 is your original dataframe
# Split df3 into id_info (first 6 columns) and questions (remaining 8 columns)
id_info = df3.iloc[:, :6]
questions = df3.iloc[:, 6:]

# Generate an 'id' for each row in id_info
id_info['id'] = id_info.apply(lambda row: generate_id(row.name, row['Q2_Utah Small Area'], row['Q5_Sex']), axis=1)

# Create a new dataframe with the id_info and the questions split into rows
df_expanded = pd.melt(questions.reset_index(), id_vars=['index'], var_name='prompt', value_name='answer')
df_expanded['id'] = df_expanded['index'].map(id_info.set_index(id_info.index)['id'])
df_expanded['id'] = df_expanded.apply(lambda row: update_id(row['id'], row['prompt']), axis=1)
df_expanded = df_expanded[['id', 'prompt', 'answer']]

# Create the second dataframe with the original id_info and the generated 'id'
# df_id_info = id_info.copy()

# Now df_expanded has columns: id, question, answer
# And df_id_info has columns: id_info and id

# Optionally, reset the index and drop the 'index' column
# df_expanded = df_expanded.drop(columns='index').reset_index(drop=True)
df_expanded

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  id_info['id'] = id_info.apply(lambda row: generate_id(row.name, row['Q2_Utah Small Area'], row['Q5_Sex']), axis=1)


Unnamed: 0,id,prompt,answer
0,0_PAY_M_Q10,Q10_Health Impact,Impacts some of our poorest residents harder. ...
1,1_PAY_F_Q10,Q10_Health Impact,Not many extracurricular activities outside of...
2,2_PRO_F_Q10,Q10_Health Impact,"With the care clinic, you have to be below the..."
3,3_SPA_F_Q10,Q10_Health Impact,"I guess when you first brought it up, I though..."
4,4_PLE_F_Q10,Q10_Health Impact,I don't know or interact with many people in m...
...,...,...,...
483,56_PAY_F_Q17,Q17_Other Thoughts,Use of delivery systems for walmart and or smi...
484,57_LEH_F_Q17,Q17_Other Thoughts,"Overall, I feel like Lehi is doing a great job..."
485,58_LEH_M_Q17,Q17_Other Thoughts,"No, I feel like Lehi is doing a good job overall!"
486,59_LEH_M_Q17,Q17_Other Thoughts,I think the affordability of the city is a con...


In [104]:
# df_id_info.head()

Unnamed: 0,Q2_Utah Small Area,Q3_Health Problem,Q5_Sex,Q6_Race/Ethnicity - Selected Choice,Q7_Age Group,Q8_Affiliation,id
0,Payson,Obesity and Overweight,Male,White/Caucasian,45-54,"Resident, Operations Manager for Family Search",0_PAY_M
1,Payson,Obesity and Overweight,Female,White/Caucasian,18-24,College student,1_PAY_F
2,Provo East City Center,Obesity/Overweight,Female,White/Caucasian,18-24,Community Member (and volunteer for Volunteer ...,2_PRO_F
3,Spanish Fork,Depression,Female,White/Caucasian,45-54,Community member,3_SPA_F
4,Pleasant Grove/Lindon,Obesity/overweight,Female,Hispanic/Latino/LatinX,25-34,Community Member,4_PLE_F


In [109]:
# df_id_info.to_csv('id_info.csv', index=False)

In [105]:
# df_expanded.head(100)

Unnamed: 0,index,prompt,answer,id
0,0,Q10_Health Impact,Impacts some of our poorest residents harder. ...,0_PAY_M
1,1,Q10_Health Impact,Not many extracurricular activities outside of...,1_PAY_F
2,2,Q10_Health Impact,"With the care clinic, you have to be below the...",2_PRO_F
3,3,Q10_Health Impact,"I guess when you first brought it up, I though...",3_SPA_F
4,4,Q10_Health Impact,I don't know or interact with many people in m...,4_PLE_F
...,...,...,...,...
95,34,Q11_At-Risk Populations,I would definitely say there is a large comm...,34_PRO_F
96,35,Q11_At-Risk Populations,I wonder if I could like find a trend about l...,35_AME_F
97,36,Q11_At-Risk Populations,"\nWell you said most at risk, and I think most...",36_SPA_M
98,37,Q11_At-Risk Populations,I think young adults; specifically either have...,37_SPA_M


In [17]:
df_expanded.to_csv('health_interviews_proc.csv', index=False)