In [3]:
import sqlite3
import pandas as pd

conn = sqlite3.connect('../data_collection/reddit_ed_0.6.0.sqlite3')

cursor = conn.cursor()

query = f"select * from subreddit_submission_metadata"

df = pd.read_sql_query(query,con=conn)
df["num_words"] = df["selftext"].apply(lambda x: len(x.split()))
word_length_cutoff = 20 # set a very low threshold
print(f"Number of short posts: {len(df[df['num_words'] < word_length_cutoff])}")
filtered_df = df[df["num_words"] >= word_length_cutoff]
print(f"Number of filtered posts (by word length): {len(filtered_df)}")
filtered_df["link_flair_text"] = filtered_df["link_flair_text"].apply(lambda x: x.lower().strip() if x is not None else "")
positive_flags = ["story", "progress"]
negative_flags = ["announcement", "educational", "research request", "resources", "advertise"]
uncertain_flags = ["question", "support", "advice"]
import re
positive_flag_regex = ".*(" + ")|(".join(positive_flags) + ").*"
negative_flag_regex = ".*(" + ")|(".join(negative_flags) + ").*"
uncertain_flag_regex = ".*(" + ")|(".join(uncertain_flags) + ").*"
filtered_df["narrative_positive"] = filtered_df["link_flair_text"].apply(lambda x: 1 if re.match(positive_flag_regex, x) else 0)
filtered_df["narrative_negative"] = filtered_df["link_flair_text"].apply(lambda x: 1 if re.match(negative_flag_regex, x) else 0)
filtered_df["narrative_uncertain"] = filtered_df["link_flair_text"].apply(lambda x: 1 if re.match(uncertain_flag_regex, x) else 0)

Number of short posts: 8940
Number of filtered posts (by word length): 15706


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["link_flair_text"] = filtered_df["link_flair_text"].apply(lambda x: x.lower().strip() if x is not None else "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["narrative_positive"] = filtered_df["link_flair_text"].apply(lambda x: 1 if re.match(positive_flag_regex, x) else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide

In [4]:
def get_analysis_info(data: pd.DataFrame, type: str):
    column_name = f"narrative_{type}" 
    print(f"Number of {type}: {sum(data[column_name])} / {len(data)}")
    print(f"Values of flair flags: {data[data[column_name] == 1]['link_flair_text'].unique()}")
    return data[data[column_name] == 1][["title","link_flair_text", "selftext"]]

positive_df = get_analysis_info(filtered_df, "positive")

Number of positive: 159 / 15706
Values of flair flags: ['recovery story' 'story time' ':karma:personal story:karma:' 'my story'
 'progress']


In [28]:
positive_df

Unnamed: 0,title,link_flair_text,selftext
13,ED tip if you are trying to eat more,recovery story,Smoothies really helped me. You can put a bana...
58,Recovery Story,recovery story,\n\nI started getting insecure during lockdown...
109,This has been going on since 2006,recovery story,17 years of ED. It just seems like this will b...
143,so proud of myself,recovery story,I'm six days binge free! Today i didn't rexist...
154,It gets better,recovery story,I'm more than 2 years into recovery. I promise...
...,...,...,...
24089,Life changing!!,progress,"I started Concerta for my ADHD, and taking it ..."
24098,Lost five pounds!!,progress,"I know it’s super small, but omg!! I have hard..."
24099,I resisted an urge today,progress,I told myself two days ago no more doordash be...
24124,Finally Happy with My Life (Vyvanse),progress,"For context, I have always had a binge eating ..."


In [67]:
import openai
import os


# Set up the OpenAI API credentials
openai.api_key = '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'

# Define the prompt for the AI to respond to
instruction_prompt = """I am going to be giving you narratives about people with eating disorders. 
Can you help me identify what helped the user with their eating disorder or what made their eating disorder worse?

I want you to first identify the person that they are writing about. If it's a first person narrative then the person would be the writer,
if it's unknown then say that it's the writer, else if it's second or third person please identify that person.

Then, I want you to identify what has helped or hurt their eating disorder, this will be the treatment. If it's multiple treatments
I want you to provide the answer as a list of strings, else I just want it to be a string. 

I also want you to explain whether that treatment on a person was helpful to the person or harmful for them. This will be the effect. 

The outcome can only be helpful, harmful, or neutral. If the treatment was helpful then it helped the person recover, if it was harmful then it harmed the user
and affected their recovery, if it was neutral it didn't do anything. 

If the narrative isn't related to an eating disorder please just return None for perspective, treatment, effect, and outcome. 

Then I want you to send that answer to me.
IMPORTANT: ONLY SEND ME THE FORMATTED ANSWER.

I want the answer to be formatted like this so it forms a python dictionary:
{'perspective': '[PERSON IDENTIFIED HERE]', 
'treatment': '[WHAT HELPED OR HARMED THEM HERE]', 
'effect': '[EXPLAIN THE EFFECT THE TREATMENT HAS HAD ON THE WRITER]', 
'outcome': '[WHETHER THE TREATMENT WAS HELPFUL OR HARMFUL]'}
"""


def apply_chatgpt(row, prompt):
  narrative = row['selftext']
  max_retries = 2
  retries = 0
  while retries < max_retries:
    try:
      response = openai.ChatCompletion.create(
      model="gpt-3.5-turbo",
      messages=[
            {"role": "user", "content":prompt},
            {"role": "user", "content": narrative},
        ]
    )

      result = eval(response.choices[0].message.content)
      row['perspective'] = result['perspective']
      row['treatment'] = result['treatment']
      row['effect_type'] = result['effect_type']
      row['effect_details'] = result['effect_details']
      return row
    except Exception as e:
        retries += 1
        if retries == max_retries:
          row['perspective'] = None
          row['treatment'] = None
          row['effect_type'] = None
          row['effect_details'] = None
          return row

In [50]:
instruction_df = positive_df.copy(deep=True).reset_index()
# instruction_df = instruction_df.head(20)
instruction_df = instruction_df.apply(apply_chatgpt, axis=1, prompt=instruction_prompt)


In [68]:
automatic_prompt = """
Please analyze the following narrative related to eating disorders and extract the relationships between specific clinical treatments and their positive or negative effects on the individual. 
Focus on identifying pharmaceutical treatments and their impact. For each narrative, provide a summary in the following format: {'perspective': ..., 'treatment': '...' or ['...', '...'], 'effect_type': harmful/helpful/neutral/unknown, 'effect_details': '...'}

'perspective': This refers to the point of view or the person speaking in the narrative. In most cases, it would be 'the writer' as it is their personal experience being shared.

'treatment': This refers to the specific treatment(s) or intervention(s) used by the individual, with a focus on pharmaceutical or clinical treatments. Examples include medications, therapy, or other clinical interventions. 
If only one treatment is mentioned in the narrative, the 'treatment' will be a single string value. If multiple treatments are discussed, 'treatment' will be a list of strings, each representing a different treatment.

'effect_type': For outcome here are the different options to choose from: 	
1. harmful': This outcome indicates that the treatment had a negative impact on the individual's eating disorder or well-being. It may have worsened their condition, led to adverse side effects, or created additional problems.
2. 'helpful': This outcome suggests that the treatment had a positive effect on the individual's eating disorder or overall well-being. It may have improved their condition, helped them manage their symptoms, or led to other beneficial outcomes.
3. 'neutral': This outcome means that the treatment neither helped nor harmed the individual's eating disorder or well-being. It could imply that the treatment had no noticeable effect or that any positive and negative effects balanced each other out.
4. 'unknown': This outcome is used when it's unclear or not explicitly mentioned in the narrative whether the treatment had a positive, negative, or neutral effect on the individual's eating disorder or well-being.

'effect_details': This provides a brief description of the specific effects or consequences the treatment had on the individual's life or eating disorder. It may include improvements or setbacks in their mental health, changes in their eating habits, or other related outcomes.
"""

automatic_prompt_df = positive_df.copy(deep=True).reset_index()
automatic_prompt_df = automatic_prompt_df.apply(apply_chatgpt, axis=1, prompt=automatic_prompt)
automatic_prompt_df.to_csv('automatic_prompt.csv')


In [63]:
automatic_prompt_df.head()

Unnamed: 0,index,title,link_flair_text,selftext,perspective,treatment,effect_type,effect_details
0,13,ED tip if you are trying to eat more,recovery story,Smoothies really helped me. You can put a bana...,the writer,Smoothies,helpful,Improved the writer’s eating habits and aided ...
1,58,Recovery Story,recovery story,\n\nI started getting insecure during lockdown...,,,,
2,109,This has been going on since 2006,recovery story,17 years of ED. It just seems like this will b...,the writer,unknown,unknown,No specific clinical treatments or interventio...
3,143,so proud of myself,recovery story,I'm six days binge free! Today i didn't rexist...,,,,
4,154,It gets better,recovery story,I'm more than 2 years into recovery. I promise...,,,,


In [51]:
# instruction_df.to_csv('instruction_df.csv')

In [25]:
row = instruction_df.iloc[0]
narrative = row['selftext']
response = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=[
        {"role": "user", "content":instruction_prompt},
        {"role": "user", "content": narrative},
    ]
)

result = eval(response.choices[0].message.content)
row['perspective'] = result['perspective']
row['treatment'] = result['treatment']
row['effect'] = result['effect']
row['outcome'] = result['outcome']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row['perspective'] = result['perspective']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row['perspective'] = result['perspective']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row['treatment'] = result['treatment']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row['treatment'] = result['treatment']
A va

In [26]:
result

{'perspective': 'writer',
 'treatment': 'Smoothies',
 'effect': 'Smoothies helped the writer with their recovery from the eating disorder.',
 'outcome': 'Positive'}

In [33]:
eval(response['choices'][0]['message']['content'])

{'perspective': 'Writer',
 'treatment': 'Smoothies',
 'effect': 'Smoothies helped the writer with their eating disorder recovery.'}

'The 2020 World Series was played at Globe Life Field in Arlington, Texas.'

In [6]:
openai.Model.list()


<OpenAIObject list at 0x1344a99f0> JSON: {
  "data": [
    {
      "created": 1649358449,
      "id": "babbage",
      "object": "model",
      "owned_by": "openai",
      "parent": null,
      "permission": [
        {
          "allow_create_engine": false,
          "allow_fine_tuning": false,
          "allow_logprobs": true,
          "allow_sampling": true,
          "allow_search_indices": false,
          "allow_view": true,
          "created": 1669085501,
          "group": null,
          "id": "modelperm-49FUp5v084tBB49tC4z8LPH5",
          "is_blocking": false,
          "object": "model_permission",
          "organization": "*"
        }
      ],
      "root": "babbage"
    },
    {
      "created": 1649359874,
      "id": "davinci",
      "object": "model",
      "owned_by": "openai",
      "parent": null,
      "permission": [
        {
          "allow_create_engine": false,
          "allow_fine_tuning": false,
          "allow_logprobs": true,
          "allow_sampl