In [58]:
import numpy as np
import pandas as pd
from sentence_transformers.util import cos_sim
from sentence_transformers import SentenceTransformer
import seaborn as sns
import matplotlib.pyplot as plt
import ast

In [59]:
og = pd.read_csv('sensitive/health_interview_proc_split.csv')
df2 = pd.read_csv('sensitive/llama_prompt_two_examples_neg_temp.csv')

completed = set(df2['id'])
rows_left = og[~og['id'].isin(completed)]
len(og), len(completed), len(rows_left)

print(rows_left['id'])

30     28_PRO_M_Q10-2
51       43_EAG_F_Q10
96       24_PRO_F_Q11
100      28_PRO_M_Q11
141    59_LEH_M_Q11-4
            ...      
537      26_PRO_M_Q17
548      34_PRO_F_Q17
560      45_PRO_F_Q17
561      46_PRO_F_Q17
569      54_SPR_F_Q17
Name: id, Length: 69, dtype: object


In [60]:
ids_with_minus_one = {row['id'].replace('-1', '') for index, row in df2.iterrows() if row['id'].endswith('-1')}
df2 = df2[~df2['id'].isin(ids_with_minus_one)]

#### S-bert cut similars

In [61]:
# df_grouped = df2.groupby(['id', 'prompt']).agg({
#     'label_json': lambda x: '[' + ', '.join(x.str.strip('[]')) + ']',
#     **{col: 'first' for col in df2.columns if col not in ['id', 'prompt', 'label_json']}
# }).reset_index()
# 
# df_grouped.sort_values(by='id').head(20)
#
df_grouped = df2.groupby(['id', 'prompt']).first().reset_index()


def convert_to_dict(x):
    try:
        # Try to safely evaluate the string as a Python dictionary
        output = ast.literal_eval(x)

        return output
    except (ValueError, SyntaxError) as e:
        # Print the exception for debugging
        print(f"Didn't work: {x}")
        print(f"Exception: {e}")
        return None
#     
df_grouped['tmp'] = df_grouped['label_json'].apply(convert_to_dict)

In [62]:
df_grouped

Unnamed: 0,id,prompt,row,answer,label_json,tmp
0,0_PAY_M_Q10,Q10_Health Impact,0,Impacts some of our poorest residents harder. ...,[{'Original_Text': 'Impacts some of our poores...,[{'Original_Text': 'Impacts some of our poores...
1,0_PAY_M_Q11,Q11_At-Risk Populations,61,Black Women,"[{'Original_Text': 'Black Women', 'Code': 'At-...","[{'Original_Text': 'Black Women', 'Code': 'At-..."
2,0_PAY_M_Q12,Q12_Risk Factors,122,"Childhood trauma, getting food that is addicti...","[{'Original_Text': 'Childhood trauma', 'Code':...","[{'Original_Text': 'Childhood trauma', 'Code':..."
3,0_PAY_M_Q13,Q13_Root Cause,183,Most people struggle to know what foods are be...,[{'Original_Text': 'Most people struggle to kn...,[{'Original_Text': 'Most people struggle to kn...
4,0_PAY_M_Q14,Q14_Community Strengths,244,"Good friends, finances, outdoor community/comm...","[{'Original_Text': 'Good friends', 'Code': 'St...","[{'Original_Text': 'Good friends', 'Code': 'St..."
...,...,...,...,...,...,...
513,9_PRO_M_Q13,Q13_Root Cause,192,"Okay, this is easier for people to get food. E...","[{'Original_Text': 'Okay, this is easier for p...","[{'Original_Text': 'Okay, this is easier for p..."
514,9_PRO_M_Q14,Q14_Community Strengths,253,"Okay, you know, we can have more events.\nkind...","[{'Original_Text': 'Okay, you know, we can hav...","[{'Original_Text': 'Okay, you know, we can hav..."
515,9_PRO_M_Q15,Q15_Resources,314,Probably have more young people because of the...,[{'Original_Text': 'Probably have more young p...,[{'Original_Text': 'Probably have more young p...
516,9_PRO_M_Q16,Q16_Solutions,375,"I know I'm almost 40. After 40, if you have th...","[{'Original_Text': ""I know I'm almost 40. Afte...",[{'Original_Text': 'I know I'm almost 40. Afte...


#### Cutting similar codes, removing affimation tags with SBERT

Not sure if this is currently needed, so I'm going to cut it for now.

In [63]:
model_ckpt = 'sentence-transformers/distiluse-base-multilingual-cased-v2'
model = SentenceTransformer(model_ckpt)

def similarity_scores(s1, to_compare):
    embeddings = np.concatenate([[s1], to_compare])
    # print(embeddings)
    sim = np.zeros((len(embeddings), len(embeddings)))
    # print(len(embeddings))
    for i in range(len(embeddings)):
        sim[i] = cos_sim(embeddings[i], embeddings[i:])
        break
    return sim[0][1:]
e1 = model.encode('YEAH')
tc = model.encode(['Acknowledged', 'Consent', 'Yeah', 'Agreed'])
similarity_scores(e1, tc)

array([0.41554677, 0.48822314, 0.87535834, 0.51043856])

In [64]:
def cut_similars(model, json_list):
    to_drop = set()
    agg_words = model.encode(['Acknowledged', 'Consent', 'Yeah', 'Agreed', 'No Code', "I don't know", 'Conclusion', 'Right'])
    affirmation_list = []
    for i in range(len(json_list)):
        if i in to_drop:
            # print(json_list[i])
            continue
        c_emb = model.encode(json_list[i]['Code'])
        # Drop if answer or topic too close to an affirmation
        if max(similarity_scores(c_emb, agg_words)) > .8:
            to_drop.add(i)
            affirmation_list.append(('Code', json_list[i]['Code'], json_list[i]['Original_Text']))
            continue
        if max(similarity_scores(model.encode(json_list[i]['Topic']), agg_words)) > .8:
            to_drop.add(i)
            affirmation_list.append(('Topic', json_list[i]['Topic'], json_list[i]['Original_Text']))
            continue
        ### Dropping codes that are similar to previous ones
        # for j in range(i+1, len(json_list)):
        #     if cos_sim(c_emb, model.encode(json_list[j]['Code'])) > 0.7:
        #         print(json_list[i]['Code'], ' ::: ', json_list[j]['Code'])
        #         print(json_list[i]['Topic'], ' ::: ', json_list[j]['Topic'])
        #         print(json_list[i]['Original_Text'], ' ::: ', json_list[j]['Original_Text'])
        #         print(json_list[i]['Keywords'], ' ::: ', json_list[j]['Keywords'])
        #         print('#'*50)
        #         to_drop.add(j)
    json_list = [item for idx, item in enumerate(json_list) if idx not in to_drop]
    if len(affirmation_list) > 0:
        print(affirmation_list)
    return json_list


# row = df_grouped['tmp'].head()[2]
# print(row)
# cut_similars(model, row)

# Apply the function to each row in the 'json_list' column
df_grouped['json_list'] = df_grouped['tmp'].apply(lambda x: cut_similars(model, x))

[('Topic', 'Conclusion', "That's it."), ('Topic', 'Conclusion', "I don't really have any other thoughts about it.")]
[('Topic', 'Recognition', "Yeah. Um,  and let's celebrate those.")]
[('Topic', 'Acknowledgement', 'Yeah. Yeah it was.')]
[('Code', 'Conclusion', 'So, yeah.')]
[('Topic', 'Conclusion', 'But no, no other thoughts.')]
[('Code', 'Yes', 'Yes.')]
[('Code', "Don't Know", "I don't know because I've n")]
[('Code', 'Right', 'Right, that for me.'), ('Code', 'Right', 'Right?')]


In [65]:
df_grouped.head()

Unnamed: 0,id,prompt,row,answer,label_json,tmp,json_list
0,0_PAY_M_Q10,Q10_Health Impact,0,Impacts some of our poorest residents harder. ...,[{'Original_Text': 'Impacts some of our poores...,[{'Original_Text': 'Impacts some of our poores...,[{'Original_Text': 'Impacts some of our poores...
1,0_PAY_M_Q11,Q11_At-Risk Populations,61,Black Women,"[{'Original_Text': 'Black Women', 'Code': 'At-...","[{'Original_Text': 'Black Women', 'Code': 'At-...","[{'Original_Text': 'Black Women', 'Code': 'At-..."
2,0_PAY_M_Q12,Q12_Risk Factors,122,"Childhood trauma, getting food that is addicti...","[{'Original_Text': 'Childhood trauma', 'Code':...","[{'Original_Text': 'Childhood trauma', 'Code':...","[{'Original_Text': 'Childhood trauma', 'Code':..."
3,0_PAY_M_Q13,Q13_Root Cause,183,Most people struggle to know what foods are be...,[{'Original_Text': 'Most people struggle to kn...,[{'Original_Text': 'Most people struggle to kn...,[{'Original_Text': 'Most people struggle to kn...
4,0_PAY_M_Q14,Q14_Community Strengths,244,"Good friends, finances, outdoor community/comm...","[{'Original_Text': 'Good friends', 'Code': 'St...","[{'Original_Text': 'Good friends', 'Code': 'St...","[{'Original_Text': 'Good friends', 'Code': 'St..."


Group, take only the first (negative temp)

In [66]:
# df_grouped = df2.groupby(['id', 'prompt']).first().reset_index()
# df_grouped['json_list'] = df_grouped['label_json'].apply(convert_to_dict)
# df_grouped.head()

In [67]:
df_grouped = df_grouped[df_grouped['json_list'].apply(lambda x: x != [])]

In [68]:
def extract_codes(json_list):
    return [entry['Code'] for entry in json_list]

df_grouped['tmp'] = df_grouped['json_list'].apply(extract_codes)

In [69]:
df_grouped['tmp'].head()

0    [Disproportionate Impact on Poor, Limited Food...
1                                 [At-Risk Population]
2    [Childhood Trauma, Unhealthy Food Addiction, E...
3    [Lack of Nutritional Knowledge, Genetic Influe...
4    [Strong Social Support, Financial Stability, A...
Name: tmp, dtype: object

In [70]:
# df4.drop(['tmp', 'label_json'], axis=1, inplace=True)
# df4.to_csv('health_combined.csv', index=False)

### Kmeans Clustering


In [71]:
df4 = df_grouped.copy()

In [72]:
# def extract_codes(json_list):
#     return [entry['Code'] for entry in json_list]
# 
# all_codes = sum(df4['json_list'].apply(extract_codes), [])
# 
# print(len(all_codes))

In [73]:
id_info = pd.read_csv('sensitive/id_info.csv')
q3_mapping = {
    'Obesity': 'Obesity',
    'obesity': 'Obesity',
    'Obesity/Overweight': 'Obesity',
    'Obesity and Overweight': 'Obesity',
    'overweight and obesity': 'Obesity',
    'Obesity/overweight': 'Obesity',
    'Obesity and overweight': 'Obesity',
    'Overweight/Obesity': 'Obesity',
    'overweight/obesity': 'Obesity',
    'Obestiy': 'Obesity',
    'Obesity ': 'Obesity',
    'Depression ': 'Depression'
}
id_info['Q3_Health Problem'] = id_info['Q3_Health Problem'].replace(q3_mapping)
id_info['Q3_Health Problem'].value_counts()

Q3_Health Problem
Obesity       43
Depression    18
Name: count, dtype: int64

In [74]:
df4['id_l'] = df4['id']
df4['id'] = df4['id'].str[:-4]
df4.head()

Unnamed: 0,id,prompt,row,answer,label_json,tmp,json_list,id_l
0,0_PAY_M,Q10_Health Impact,0,Impacts some of our poorest residents harder. ...,[{'Original_Text': 'Impacts some of our poores...,"[Disproportionate Impact on Poor, Limited Food...",[{'Original_Text': 'Impacts some of our poores...,0_PAY_M_Q10
1,0_PAY_M,Q11_At-Risk Populations,61,Black Women,"[{'Original_Text': 'Black Women', 'Code': 'At-...",[At-Risk Population],"[{'Original_Text': 'Black Women', 'Code': 'At-...",0_PAY_M_Q11
2,0_PAY_M,Q12_Risk Factors,122,"Childhood trauma, getting food that is addicti...","[{'Original_Text': 'Childhood trauma', 'Code':...","[Childhood Trauma, Unhealthy Food Addiction, E...","[{'Original_Text': 'Childhood trauma', 'Code':...",0_PAY_M_Q12
3,0_PAY_M,Q13_Root Cause,183,Most people struggle to know what foods are be...,[{'Original_Text': 'Most people struggle to kn...,"[Lack of Nutritional Knowledge, Genetic Influe...",[{'Original_Text': 'Most people struggle to kn...,0_PAY_M_Q13
4,0_PAY_M,Q14_Community Strengths,244,"Good friends, finances, outdoor community/comm...","[{'Original_Text': 'Good friends', 'Code': 'St...","[Strong Social Support, Financial Stability, A...","[{'Original_Text': 'Good friends', 'Code': 'St...",0_PAY_M_Q14


In [75]:
df5 = pd.merge(df4, id_info, on='id', how='left')
df5.head()

Unnamed: 0,id,prompt,row,answer,label_json,tmp,json_list,id_l,Q2_Utah Small Area,Q3_Health Problem,Q5_Sex,Q6_Race/Ethnicity - Selected Choice,Q7_Age Group,Q8_Affiliation
0,0_PAY_M,Q10_Health Impact,0,Impacts some of our poorest residents harder. ...,[{'Original_Text': 'Impacts some of our poores...,"[Disproportionate Impact on Poor, Limited Food...",[{'Original_Text': 'Impacts some of our poores...,0_PAY_M_Q10,Payson,Obesity,Male,White/Caucasian,45-54,"Resident, Operations Manager for Family Search"
1,0_PAY_M,Q11_At-Risk Populations,61,Black Women,"[{'Original_Text': 'Black Women', 'Code': 'At-...",[At-Risk Population],"[{'Original_Text': 'Black Women', 'Code': 'At-...",0_PAY_M_Q11,Payson,Obesity,Male,White/Caucasian,45-54,"Resident, Operations Manager for Family Search"
2,0_PAY_M,Q12_Risk Factors,122,"Childhood trauma, getting food that is addicti...","[{'Original_Text': 'Childhood trauma', 'Code':...","[Childhood Trauma, Unhealthy Food Addiction, E...","[{'Original_Text': 'Childhood trauma', 'Code':...",0_PAY_M_Q12,Payson,Obesity,Male,White/Caucasian,45-54,"Resident, Operations Manager for Family Search"
3,0_PAY_M,Q13_Root Cause,183,Most people struggle to know what foods are be...,[{'Original_Text': 'Most people struggle to kn...,"[Lack of Nutritional Knowledge, Genetic Influe...",[{'Original_Text': 'Most people struggle to kn...,0_PAY_M_Q13,Payson,Obesity,Male,White/Caucasian,45-54,"Resident, Operations Manager for Family Search"
4,0_PAY_M,Q14_Community Strengths,244,"Good friends, finances, outdoor community/comm...","[{'Original_Text': 'Good friends', 'Code': 'St...","[Strong Social Support, Financial Stability, A...","[{'Original_Text': 'Good friends', 'Code': 'St...",0_PAY_M_Q14,Payson,Obesity,Male,White/Caucasian,45-54,"Resident, Operations Manager for Family Search"


In [76]:
from collections import defaultdict

master_map = defaultdict(list)

for json_list in df5['json_list']:
    for item in json_list:
        text = item.get('Original_Text')
        code = item.get('Code')
        if text and code:
            master_map[code].append(text)

In [77]:
master_map

defaultdict(list,
            {'Disproportionate Impact on Poor': ['Impacts some of our poorest residents harder.'],
             'Limited Food Access and Trauma': ['Payson has a really active community, but there are those who are poor and do not have good access to food or are just traumatized (from childhood).'],
             'At-Risk Population': ['Black Women'],
             'Childhood Trauma': ['Childhood trauma'],
             'Unhealthy Food Addiction': ['Getting food that is addicting not healthy food'],
             'Economic Factors': ['Economic circumstances',
              'I think economic factors play a big part.'],
             'Lack of Nutritional Knowledge': ['Most people struggle to know what foods are best for them',
              "I think part of it is education, not knowing how foods work, which impacts us negatively, and we don't eat healthily.",
              "Maybe families that they don't know exactly about everything, you know, nutritional based too."],
     

In [78]:
keys_with_multiple_values = {key: values for key, values in master_map.items() if len(set(values)) > 1}
keys_with_multiple_values

{'Economic Factors': ['Economic circumstances',
  'I think economic factors play a big part.'],
 'Lack of Nutritional Knowledge': ['Most people struggle to know what foods are best for them',
  "I think part of it is education, not knowing how foods work, which impacts us negatively, and we don't eat healthily.",
  "Maybe families that they don't know exactly about everything, you know, nutritional based too."],
 'Active Lifestyle': ['Outdoor community/community that supports outdoor activities',
  'Like people are always walking outside or running or going to the gym.',
  'I grew up playing sports and was surrounded by athletes.'],
 'Childhood Obesity': ['And I see these children getting heavier.',
  'Young obesity'],
 'Fast Food Prevalence': ["Every time they put a building up, I'm almost willing to bet every five out of seven are going to be a fast food place.",
  'I am not sure why fast food places seem to come into a City first, but they always do.'],
 'Unhealthy Food Choices': ['

In [79]:
def create_master_map(group):
    master_map = defaultdict(list)
    for json_list in group['json_list']:
        for item in json_list:
            text = item.get('Original_Text')
            code = item.get('Code')
            if text and code:
                master_map[code].append(text)
    return master_map

map_df = df5.groupby(['prompt', 'Q3_Health Problem']).apply(create_master_map).reset_index(name='master_map')

  map_df = df5.groupby(['prompt', 'Q3_Health Problem']).apply(create_master_map).reset_index(name='master_map')


In [82]:
def extract_codes_by_prompt(df):
    grouped = df.groupby(['prompt', 'Q3_Health Problem'])['json_list'].apply(
        lambda json_lists: [entry['Code'] for json_list in json_lists for entry in json_list]
    )
    return grouped



# Apply the function to df4 and get a list of lists
all_codes = extract_codes_by_prompt(df5).reset_index()

all_codes.head()

Unnamed: 0,prompt,Q3_Health Problem,json_list
0,Q10_Health Impact,Depression,"[High Prevalence of Depression, Long-Term Ther..."
1,Q10_Health Impact,Obesity,"[Disproportionate Impact on Poor, Limited Food..."
2,Q11_At-Risk Populations,Depression,"[Teens Most Affected, Young Adults Also Affect..."
3,Q11_At-Risk Populations,Obesity,"[At-Risk Population, Universal Risk for Obesit..."
4,Q12_Risk Factors,Depression,"[Social Media Influence, Emotional Isolation, ..."


In [91]:
myL = np.array([1,2,3,4,5])
labels = np.array([0,0,1,1,1])

np.where(myL == (max(myL[labels==1])))

(array([4]),)

In [108]:
import numpy as np
from sklearn.preprocessing import normalize

class KMeansCosine:
    def __init__(self, n_clusters=3, max_iter=300, tol=1e-4, random_state=None):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.tol = tol
        self.random_state = random_state
        self.centroids = None
        self.labels_ = None
        self.centers = None

    def _cosine_similarity(self, X, Y):
        # Cosine similarity = dot product of normalized vectors
        return np.dot(X, Y.T)

    def _initialize_centroids(self, X):
        if self.random_state:
            np.random.seed(self.random_state)
        random_idxs = np.random.permutation(X.shape[0])
        centroids = X[random_idxs[:self.n_clusters]]
        return centroids

    def fit(self, X):
        X_normalized = normalize(X)
        self.centroids = self._initialize_centroids(X_normalized)

        for _ in range(self.max_iter):
            # Compute similarity between all points and centroids
            similarity = self._cosine_similarity(X_normalized, self.centroids)
            self.labels_ = np.argmax(similarity, axis=1)

            new_centroids = np.array([
                X_normalized[self.labels_ == i].mean(axis=0) 
                for i in range(self.n_clusters)
            ])
            
            # Normalize new centroids
            # new_centroids = normalize(new_centroids)

            if np.all(np.abs(self.centroids - new_centroids) < self.tol):
                break
            
            self.centroids = new_centroids


        self.centers = np.array([
            np.argmax(self._cosine_similarity(X_normalized, centroid))
            for centroid in self.centroids
        ])



    def predict(self, X):
        X_normalized = normalize(X)
        similarity = self._cosine_similarity(X_normalized, self.centroids)
        return np.argmax(similarity, axis=1)

    def fit_predict(self, X):
        self.fit(X)
        return self.labels_



    
    
num_clusters = 3
all_codes['clusters'] = None
new_rows = []

for index, row in all_codes.iterrows():    
    X = model.encode(row['json_list'])
    
    kmeans_cosine = KMeansCosine(n_clusters=num_clusters, random_state=42)
    kmeans_cosine.fit(X)
    labels = kmeans_cosine.labels_
    centers = kmeans_cosine.centers
    print(centers)
    for center in centers:
        print(center)
        print(row['json_list'][center])
    
    ret_arr = [[] for _ in range(num_clusters)]
    
    for sect, sent in zip(labels, row['json_list']):
        ret_arr[sect].append(sent)
           
    i=0
    for sect in ret_arr:
        print(i, sect)
        i+=1
    # all_codes.at[index, 'clusters'] = ret_arr

    for i, sect in enumerate(ret_arr):
        new_row = row.copy()  # Copy the original row
        new_row['clusters'] = sect  # Assign the cluster to the 'clusters' column
        new_row['cluster_id'] = i  # Add a cluster identifier
        new_rows.append(new_row)  # Append the new row to the list
        new_row['center'] = row['json_list'][centers[i]]

# Convert the list of new rows into a DataFrame
clustered_df = pd.DataFrame(new_rows)

# Reset the index for the new DataFrame
clustered_df.reset_index(drop=True, inplace=True)

# Display the new DataFrame
# clustered_df

[33 17 12]
33
Decent Amount of Impact
17
Depression Exacerbates Health Issues
12
Youth Work in Community
0 ['Impact on Some', 'Personal Perspective', 'Limited Experience as Bishop', 'Life Decisions', 'Peer Competition', 'Decent Amount of Impact', 'Yearly Variation', 'Limited Visibility', 'Difficulty Identifying', 'Personal Observation', 'Not Personally Known', 'Sudden Departure', 'Importance of Identifying Underlying Issues', 'Drug Use Rates', 'Substance Use', 'Pandemic Effects', 'Comparison to Others', 'Church Attendance Patterns', 'Parenting Challenges']
1 ['High Prevalence of Depression', 'Long-Term Therapy Experience', 'Recent Increase in Mental Health Issues', 'Multiple Factors Contributing to Mental Health Issues', 'Depression Less Noticed than Anxiety', 'Perception of Community Health', 'Social Withdrawal Post-Covid', 'Compelled Social Withdrawal', 'Higher Instances of Depression, Anxiety, and Suicide', 'Depression Exacerbates Health Issues', 'Societal Pressures', 'High Expectat

In [109]:
clustered_df = clustered_df.merge(map_df, on=['prompt', 'Q3_Health Problem'], how='left')

def map_texts_to_codes(texts, master_map):
    return [master_map[text] for text in texts if text in master_map]

clustered_df['cluster_orig_texts'] = clustered_df.apply(
    lambda x: map_texts_to_codes(x['clusters'], x['master_map']), axis=1
)

In [110]:
def generate_id(q_num, problem, clusterId):
    return f"{q_num[:3].upper()}_{problem[:3]}_{str(clusterId).upper()}"

clustered_df['id'] = clustered_df.apply(lambda row: generate_id(row['prompt'], row['Q3_Health Problem'], row['cluster_id']), axis=1)

In [111]:
def DeList(top_list):
    ret = []
    for x in top_list:
        ret.extend(x)
    return ret

clustered_df['cluster_orig_texts'] = clustered_df.apply(lambda row: DeList(row['cluster_orig_texts']), axis=1)

In [112]:
clustered_df.head()

Unnamed: 0,prompt,Q3_Health Problem,json_list,clusters,cluster_id,center,master_map,cluster_orig_texts,id
0,Q10_Health Impact,Depression,"[High Prevalence of Depression, Long-Term Ther...","[Impact on Some, Personal Perspective, Limited...",0,Decent Amount of Impact,{'High Prevalence of Depression': ['I would sa...,"[It does have an impact on some., Personally, ...",Q10_Dep_0
1,Q10_Health Impact,Depression,"[High Prevalence of Depression, Long-Term Ther...","[High Prevalence of Depression, Long-Term Ther...",1,Depression Exacerbates Health Issues,{'High Prevalence of Depression': ['I would sa...,[I would say there's a lot of depression going...,Q10_Dep_1
2,Q10_Health Impact,Depression,"[High Prevalence of Depression, Long-Term Ther...","[Uncertainty due to Work Schedule, Limited Int...",2,Youth Work in Community,{'High Prevalence of Depression': ['I would sa...,"[I am not super sure because I work a lot., I ...",Q10_Dep_2
3,Q10_Health Impact,Obesity,"[Disproportionate Impact on Poor, Limited Food...","[Disproportionate Impact on Poor, Parental Inf...",0,Community Impact,{'Disproportionate Impact on Poor': ['Impacts ...,[Impacts some of our poorest residents harder....,Q10_Obe_0
4,Q10_Health Impact,Obesity,"[Disproportionate Impact on Poor, Limited Food...","[Limited Food Access and Trauma, Limited Growt...",1,Limited Impact Perception,{'Disproportionate Impact on Poor': ['Impacts ...,"[Payson has a really active community, but the...",Q10_Obe_1


In [113]:
def find_center_text(text_list):
    texts = model.encode(text_list)
    centroid = texts.mean(axis=0)
    max_loc = np.argmax(np.dot(texts, centroid.T))
    return text_list[max_loc]

clustered_df['cluster_orig_texts_center'] = clustered_df.apply(lambda row: find_center_text(row['cluster_orig_texts']), axis=1)

In [114]:
clustered_df.head()

Unnamed: 0,prompt,Q3_Health Problem,json_list,clusters,cluster_id,center,master_map,cluster_orig_texts,id,cluster_orig_texts_center
0,Q10_Health Impact,Depression,"[High Prevalence of Depression, Long-Term Ther...","[Impact on Some, Personal Perspective, Limited...",0,Decent Amount of Impact,{'High Prevalence of Depression': ['I would sa...,"[It does have an impact on some., Personally, ...",Q10_Dep_0,At least from what I've seen
1,Q10_Health Impact,Depression,"[High Prevalence of Depression, Long-Term Ther...","[High Prevalence of Depression, Long-Term Ther...",1,Depression Exacerbates Health Issues,{'High Prevalence of Depression': ['I would sa...,[I would say there's a lot of depression going...,Q10_Dep_1,It seems like there's some amount of depressio...
2,Q10_Health Impact,Depression,"[High Prevalence of Depression, Long-Term Ther...","[Uncertainty due to Work Schedule, Limited Int...",2,Youth Work in Community,{'High Prevalence of Depression': ['I would sa...,"[I am not super sure because I work a lot., I ...",Q10_Dep_2,"For from my perspective, I do a lot of work wi..."
3,Q10_Health Impact,Obesity,"[Disproportionate Impact on Poor, Limited Food...","[Disproportionate Impact on Poor, Parental Inf...",0,Community Impact,{'Disproportionate Impact on Poor': ['Impacts ...,[Impacts some of our poorest residents harder....,Q10_Obe_0,Feels like it could affect the children becaus...
4,Q10_Health Impact,Obesity,"[Disproportionate Impact on Poor, Limited Food...","[Limited Food Access and Trauma, Limited Growt...",1,Limited Impact Perception,{'Disproportionate Impact on Poor': ['Impacts ...,"[Payson has a really active community, but the...",Q10_Obe_1,People don’t do the outside things when they’r...


In [106]:
clustered_df.to_csv('sensitive/clustered_by_codes_with_og_text.csv', index=False)

##### Json Experiments

In [33]:
# import re
# import json
# 
# json_pattern = r'\{\s*"Axial_Category":\s*".*?"\s*\}'
# matches = re.findall(json_pattern, 'blah blah {"Axial_Category: "your mom"} blah blah', re.DOTALL)
# results = []
# for match in matches:
#     try:
#         x = json.loads(match)
#         print(x)
#     except json.JSONDecodeError as e:
#         print(e)
#         

In [34]:
# import json
# import re
# txt = '''
#  [{"Original_Text": "With the care clinic, you have to be below the poverty line to even go there because they can't afford health care.", "Code": "Low-Income Patients", "Topic": "Patient Demographics", "Keywords": ["Poverty", "Healthcare Access"]},
#                  {"Original_Text": "So, you see there is a lot of overweight and obese people who come in with health complications that have developed to the point where now they're like, 'okay, I need to go get this seen'", "Code": "Overweight/Obese Patients with Health Complications", "Topic": "Patient Health Status", "Keywords": ["Obesity", "Health Complications"]},
#                  {"Original_Text": "But then there's only so much we can do at that point. Because it's gotten so far and they didn't get any early treatment, there's diabetes issues and just complications from all of the stuff that can come with being obese and overweight", "Code": "Limited Treatment Options for Late-Stage Health Issues", "Topic": "Treatment Limitations", "Keywords": ["Late-Stage Health Issues", "Treatment Limitations"]},
#                  {"Original_Text": "And then it's like, we can't really do very much about it. At that point, it's just managing rather than treating because we can't really undo the damage that's been done.", "Code": "Managing vs. Treating Late-Stage Health Issues", "Topic": "Treatment Approach", "Keywords": ["Managing", "Treating"]},
#                  {"Original_Text": "This could be stretching, but I think part of the reason we get such a high obese population in the people who come is because they're all so low income.", "Code": "Low-Income Patients and Obesity", "Topic": "Patient Demographics", "Keywords": ["Low-Income", "Obesity"]},
#                  {"Original_Text": "And fast food is cheap, unhealthy food is cheap, and it takes time and effort and money to be active and go to the gym and plan to eat healthy.", "Code": "Unhealthy Food Choices and Lack of Physical Activity", "Topic": "Lifestyle Factors", "Keywords": ["Unhealthy Food", "Lack of Physical Activity"]},
#                  {"Original_Text": "And these people are pretty much working and living paycheck to paycheck. So that's not a luxury they really have.", "Code": "Financial Constraints and Health Behaviors", "Topic": "Financial Constraints", "Keywords": ["Financial Constraints", "Health Behaviors"]}]
#                  '''
# 
# def parse_result(result):
#     # match = re.search(r'\[{\s\S*\}]', result)
#     # if match:
#     #     json_str = match.group(0)
#     #     try:
#     #         return json.loads(json_str)
#     #     except json.JSONDecodeError:
#     #         return None
#     # return None
#     try:
#         # Attempt to parse the text as JSON
#         json_data = json.loads(result)
#         return json_data
#     except json.JSONDecodeError as e:
#         # Handle JSON decoding errors
#         print(f"JSON decoding error: {e}")
#         return None
#     except Exception as e:
#         # Handle any other exceptions
#         print(f"An unexpected error occurred: {e}")
#         return None
# parse_result(txt)