In [1]:
### ALL NECESSARY LIBRARIES ###
import pandas as pd
import numpy as np
import random
import itertools
from itertools import chain
import warnings
warnings.simplefilter(action='ignore')

import time
# import transformers
# import torch
import re
import string

# for saving variables
import pickle

# needed for gpt
import openai
from tenacity import retry, wait_random_exponential, stop_after_attempt

# these are needed for preprocessing
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# these are needed for coherence measures
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

from sklearn.preprocessing import Binarizer, normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.manifold import TSNE

# plotting
# import matplotlib
import matplotlib.pyplot as plt
import plotly.express as px

# network libraries
import networkx as nx
# import hypernetx as hnx

# pyarrow and/or multiprocessing (multithreading)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/phyllissern/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/phyllissern/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/phyllissern/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/phyllissern/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Import dataset

In [2]:
# Load the questionnaire item table (one row per instrument item) and preview its first rows.
df = pd.read_csv('cues_list_extended_091823.csv')
df.head()

Unnamed: 0,Instrument,ItemNumber,Dimension,prompt,modification,exclude
0,OAV,1,Impaired control and cognition,I felt like a marionette.,,
1,OAV,2,Spiritual Experience,I had the feeling of being connected to a supe...,,
2,OAV,3,Blissful State,I enjoyed boundless pleasure.,,
3,OAV,4,Elementary imagery,I saw regular patterns in complete darkness or...,,
4,OAV,5,Experience of unity,Everything seemed to unify into a oneness.,,


# Dataset overview and cleaning

In [3]:
# Size of the raw table: number of rows, distinct dimensions and distinct instruments.
raw_instruments = set(df['Instrument'].values)
raw_dimensions = set(df['Dimension'].values)
print(f"n_items:  {len(df)}")
print(f"n_dimensions:  {len(raw_dimensions)}")
print(raw_instruments)
print(f"n_instruments:  {len(raw_instruments)}")

n_items:  1216
n_dimensions:  219
{'FFMQ', 'MNE-92M', 'SSSQ', 'PhCI', '5D-ASC', 'TMS', 'PSI', 'MEQ-Memory', 'ReSQ', 'PANAS', 'SSD', 'MEQ30', 'DAQ', 'PANAS-X', 'EDI', 'DEQ', 'Siclari', 'M-scale', 'NYC-Q', 'OAV', 'ARSQ-1.0', 'ARCI-49', 'HRS', 'TATE', 'SOARS', 'CADSS', 'ARSQ-2.0', 'TAS-20', 'MAAS', 'POMS-A'}
n_instruments:  30


In [6]:
# Rows that carry a `modification` text use it as their prompt; every other row keeps
# its original `prompt`. Both groups are stacked into one table with a fresh index.
shared_cols = ['Instrument', 'ItemNumber', 'Dimension']
is_modified = df['modification'].notna()
mod_prompts = df.loc[is_modified, shared_cols + ['modification', 'exclude']].copy()
prompts = df.loc[~is_modified, shared_cols + ['prompt', 'exclude']].copy()
all_prompts = pd.concat(
    [mod_prompts.rename(columns={'modification': 'prompt'}), prompts],
    ignore_index=True,
)
all_prompts.head()
# len(all_prompts)

Unnamed: 0,Instrument,ItemNumber,Dimension,prompt,exclude
0,MEQ30,1,transcendence of time and space,I lost my usual sense of time.,
1,MEQ30,2,positive mood,I had an experience of amazement.,
2,MEQ30,3,ineffability,I had a sense that my experience cannot be des...,
3,MEQ30,4,mystical,I gained insightful knowledge experienced at a...,
4,MEQ30,5,mystical,I felt that I experienced eternity or infinity.,


In [7]:
# Number of distinct `Dimension` values, counted on the raw table `df`
# (the commented-out variant counts them on `all_prompts` instead).
# print('n_dimensions: ', len(set(all_prompts['Dimension'].values)))
print('n_dimensions: ', len(df['Dimension'].unique()))

n_dimensions:  219


In [8]:
# Cleaning pipeline:
#   1. keep only rows whose `exclude` flag is not 1,
#   2. add `no_punct_prompt`, the prompt with every non-word / non-space character removed,
#   3. drop rows whose punctuation-free prompt duplicates an earlier row,
#   4. renumber the index.
not_excluded = all_prompts['exclude'] != 1
all_prompts_uniques = (
    all_prompts.loc[not_excluded]
    .assign(no_punct_prompt=lambda kept: kept['prompt'].str.replace(r'[^\w\s]', '', regex=True))
    .drop_duplicates(subset='no_punct_prompt')
)
all_prompts = all_prompts_uniques.reset_index(drop=True)
len(all_prompts)

1023

In [9]:
# Check that all prompts are in statement form (as opposed to question form).
# The pattern matches a question mark at the end of the string; `\s*` also catches
# prompts with trailing whitespace after the '?', and `na=False` keeps the mask
# strictly boolean even if a prompt is missing.
question_mark_prompts = all_prompts[all_prompts['prompt'].str.contains(r'\?\s*$', na=False)]
# Display the matching rows (should be none).
print(question_mark_prompts)

Empty DataFrame
Columns: [Instrument, ItemNumber, Dimension, prompt, exclude, no_punct_prompt]
Index: []


In [10]:
# Per-instrument counts of distinct dimensions, item numbers and prompt texts.
for instrument in set(all_prompts.Instrument.values):
    instrument_rows = all_prompts.loc[all_prompts['Instrument'] == instrument]
    distinct_dims = set(instrument_rows.Dimension)
    distinct_items = set(instrument_rows['ItemNumber'].values)
    distinct_prompts = set(instrument_rows['prompt'].values)
    print(
        f"{instrument} n_dims: {len(distinct_dims)} "
        f"n_items: {len(distinct_items)} n_prompts: {len(distinct_prompts)}"
    )

FFMQ n_dims: 5 n_items: 35 n_prompts: 35
MNE-92M n_dims: 14 n_items: 91 n_prompts: 91
SSSQ n_dims: 3 n_items: 24 n_prompts: 24
PhCI n_dims: 21 n_items: 51 n_prompts: 88
5D-ASC n_dims: 3 n_items: 77 n_prompts: 77
TMS n_dims: 2 n_items: 13 n_prompts: 13
PSI n_dims: 7 n_items: 48 n_prompts: 48
MEQ-Memory n_dims: 10 n_items: 31 n_prompts: 31
ReSQ n_dims: 6 n_items: 6 n_prompts: 6
PANAS n_dims: 2 n_items: 16 n_prompts: 16
SSD n_dims: 8 n_items: 56 n_prompts: 56
MEQ30 n_dims: 4 n_items: 30 n_prompts: 30
DAQ n_dims: 12 n_items: 31 n_prompts: 53
PANAS-X n_dims: 10 n_items: 27 n_prompts: 27
EDI n_dims: 2 n_items: 16 n_prompts: 16
DEQ n_dims: 8 n_items: 22 n_prompts: 22
Siclari n_dims: 1 n_items: 7 n_prompts: 7
M-scale n_dims: 8 n_items: 32 n_prompts: 32
NYC-Q n_dims: 8 n_items: 31 n_prompts: 31
OAV n_dims: 11 n_items: 42 n_prompts: 42
ARSQ-1.0 n_dims: 3 n_items: 8 n_prompts: 8
ARCI-49 n_dims: 5 n_items: 49 n_prompts: 49
HRS n_dims: 6 n_items: 70 n_prompts: 70
TATE n_dims: 27 n_items: 52 n_promp

In [11]:
# Same summary as for the raw table, now on the cleaned prompt table.
kept_instruments = set(all_prompts['Instrument'].values)
kept_dimensions = set(all_prompts['Dimension'].values)
print(f"n_items:  {len(all_prompts)}")
print(f"n_dimensions:  {len(kept_dimensions)}")
print(kept_instruments)
print(f"n_instruments:  {len(kept_instruments)}")

n_items:  1023
n_dimensions:  198
{'FFMQ', 'MNE-92M', 'SSSQ', 'PhCI', '5D-ASC', 'TMS', 'PSI', 'MEQ-Memory', 'ReSQ', 'PANAS', 'SSD', 'MEQ30', 'DAQ', 'PANAS-X', 'EDI', 'DEQ', 'Siclari', 'M-scale', 'NYC-Q', 'OAV', 'ARSQ-1.0', 'ARCI-49', 'HRS', 'TATE', 'SOARS', 'CADSS', 'ARSQ-2.0', 'TAS-20', 'MAAS', 'POMS-A'}
n_instruments:  30


In [12]:
# Preview the cleaned prompt table, including the added `no_punct_prompt` column.
all_prompts.head()

Unnamed: 0,Instrument,ItemNumber,Dimension,prompt,exclude,no_punct_prompt
0,MEQ30,1,transcendence of time and space,I lost my usual sense of time.,,I lost my usual sense of time
1,MEQ30,2,positive mood,I had an experience of amazement.,,I had an experience of amazement
2,MEQ30,3,ineffability,I had a sense that my experience cannot be des...,,I had a sense that my experience cannot be des...
3,MEQ30,4,mystical,I gained insightful knowledge experienced at a...,,I gained insightful knowledge experienced at a...
4,MEQ30,5,mystical,I felt that I experienced eternity or infinity.,,I felt that I experienced eternity or infinity


Prompt and collect responses from GPT

Get prompt embeddings

Prompt GPT and collect responses

In [13]:
# OpenAI chat-completions client and model selection.
from openai import OpenAI

# Do not paste the API key into the notebook. When no `api_key` argument is given,
# the client reads the key from the OPENAI_API_KEY environment variable.
client = OpenAI()

# MODEL = "gpt-3.5-turbo"
# MODEL = "gpt-4"
MODEL = "gpt-4-turbo-preview"

@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(10))
def get_response(prompt):
    """Send one chat request and return the text of every sampled completion.

    The system message is taken from the module-level ``profile`` variable, which
    must be assigned before this function is called; ``prompt`` is sent as the
    user message. The ``@retry`` decorator re-invokes the call on failure
    (``wait_random_exponential(min=1, max=20)``, at most 10 attempts).

    Returns:
        list: the ``message.content`` of each returned choice (``n=5`` choices
        are requested per call).
    """
    conversation = [
        {"role": "system", "content": profile},
        {"role": "user", "content": prompt},
    ]
    completion = client.chat.completions.create(
        model=MODEL,
        messages=conversation,
        temperature=1,
        # token counting: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
        max_tokens=100,
        top_p=1,
        frequency_penalty=1,
        presence_penalty=1,
        # seed=42,
        # response_format={ "type": "json_object" },
        n=5,
    )
    return [choice.message.content for choice in completion.choices]

In [14]:
# USED THIS FOR ALL OTHER MODELS
# Task-instruction template. The `{query}` placeholder is filled with the statement
# describing the experience, via `instructions.format(query=...)`.
instructions = """You will be presented with a statement that describes a certain subjective experience.
                For this experience, provide a comma-separated list of up to five most representative characteristics 
                of what it feels like to have that experience.
                Each answer should be about perceptual, emotional, physiological, or cognitive phenomena.
                Each answer should be strictly less than five words.
                Each answer should be unique, rather than a synonym of another.
                Refrain from providing synonyms of the given statement.

                {query}
                """


In [15]:
# Load the BFI-10 questionnaire items (item number, factor, scoring direction, statement text).
trait_df = pd.read_csv('bfi-10_questionnaire.csv')
trait_df

Unnamed: 0,ItemNumber,Factor,Scoring,prompt
0,1,Extraversion,Reversed-scored,Is reserved
1,2,Agreeableness,True-scored,Is generally trusting
2,3,Conscientiousness,Reversed-scored,Tends to be lazy
3,4,Neuroticism,Reversed-scored,"Is relaxed, handles stress well"
4,5,Openness,Reversed-scored,Has few artistic interests
5,6,Extraversion,True-scored,"Is outgoing, sociable"
6,7,Agreeableness,Reversed-scored,Tends to find fault with others
7,8,Conscientiousness,True-scored,Does a thorough job
8,9,Neuroticism,True-scored,Gets nervous easily
9,10,Openness,True-scored,Has an active imagination


In [16]:
# Personality profile scores: one row per named profile, one column per Big Five domain.
pp_df = pd.read_csv('personality_profiles.csv')
pp_df

Unnamed: 0,profile,Extraversion,Agreeableness,Conscientiousness,Neuroticism,Openness
0,Anti-resilients,-0.97,-0.76,-1.28,1.34,0.39
1,Resilients,0.5,0.73,0.92,-1.21,0.5
2,Over-controllers,-0.14,0.56,0.31,0.78,0.55
3,Under-controllers,0.05,-0.56,-0.38,-0.15,-0.7


In [17]:
# Quick look at the two pieces that go into the persona prompt.
print(pp_df.iloc[0][1:])  # domain scores of the first profile (position 0, the profile name, is skipped)
print(trait_df['prompt'][1]) #testing example

Extraversion        -0.97
Agreeableness       -0.76
Conscientiousness   -1.28
Neuroticism          1.34
Openness             0.39
Name: 0, dtype: object
Is generally trusting


In [19]:
# Statements split by scoring direction. They are selected through the questionnaire's
# `Scoring` column instead of hard-coded row labels: the table displayed above has
# labels 0-9 only, so `prompts[10]` raises a KeyError. An optional 11th item is still
# picked up automatically whenever the CSV contains it. Row order is preserved.
prompts = trait_df['prompt']

#true scored
ts = prompts[trait_df['Scoring'] == 'True-scored'].tolist()

#reverse scored
rs = prompts[trait_df['Scoring'] == 'Reversed-scored'].tolist()

In [20]:
#separating prompts by domain
# The statements are grouped on the `Factor` column rather than by hard-coded row
# labels (label 10 does not exist in the 10-row table displayed above), so the lists
# are correct whether the CSV holds the 10 core items or also the optional extra one.
# Within each factor the statements keep their row order.
prompts_by_factor = trait_df.groupby('Factor')['prompt'].apply(list)
E = prompts_by_factor['Extraversion']
A = prompts_by_factor['Agreeableness']
C = prompts_by_factor['Conscientiousness']
N = prompts_by_factor['Neuroticism']
O = prompts_by_factor['Openness']

In [21]:
# System-message template describing the BFI-10 and the persona to adopt.
# Positional placeholders: {0}-{4} are the statements for extraversion, agreeableness,
# conscientiousness, neuroticism and openness; {5} the true-scored statements;
# {6} the reverse-scored statements; {7} the persona's domain scores.
# The reverse-coding rule maps 2 to -2 (the text previously read "2 = -1").
prompt_instructions = """Many important individual differences in people's patterns of thinking, feeling, and behaving
    can be summarized in terms of the Big Five personality domains, which are extraversion, agreeableness, conscientiousness, neuroticism, and openness.
    The Big Five Inventory-10 (BFI-10) is a questionnaire that operationalizes this hierarchical conceptualization of personality structure by assessing the Big Five domains.
    The questionnaire consists of 10 questions and 1 optional additional question where people write a number next to each statement to indicate the extent to which they agree 
    or disagree with the statement about themselves (-2 is disagree strongly, -1 is disagree a little, 0 is neither agree nor disagree, 1 is agree a little, and 2 is agree 
    strongly). All of the statements are preceeded by "I see myself as someone who..." 
    
    Extraversion is measured by the statements {0}. Agreeableness is measured by the statements {1}. Conscientiousness is measured by the statements {2}. Neuroticism is
    measured by the statements {3}. Openness is measured by the statements {4}.

    Once numbers are assigned for all the statements in the questionnaire, here is how the scoring works. For the true scored-statements {5}, the numbers next to each
    statement are left the same. For the reverse-coded statements {6}, the numbers next to each statement is recoded so that -2 = 2, -1 = 1, 0 = 0, 1 = -1, and 2 = -2. Then,
    the average of the numbers for each domain are found to obtain a score in each domain.

    You are a person with these dimensions: {7}

    You just went through two minutes of eyes-closed rest and was asked to observe how your experience felt.
                """

In [None]:
# Collect responses for every personality profile: `all_results` gets one Series per
# row of `pp_df`, indexed like the prompts that were sent.
all_results = []

for i in range(len(pp_df)):
  # `get_response` takes only the user prompt and reads the system message from the
  # global `profile`, so `profile` must be (re)assigned here before any request for
  # this personality profile is sent. Passing it as an extra positional argument
  # raises a TypeError.
  profile = prompt_instructions.format(E, A, C, N, O, ts, rs, pp_df.iloc[i][1:])
  # NOTE: `.head(2)` restricts the run to the first two prompts.
  results = all_prompts['prompt'].head(2).map(lambda prompt : get_response(instructions.format(query=prompt)))
  all_results.append(results)

In [None]:
# Responses collected under the first personality profile (pp_df row 0) for the prompt with index label 1.
all_results[0][1]

['Awestruck, Mind-blown, Overwhelmed, Speechless, Enchanted',
 'Awe-inspiring, jaw-dropping, wonder-filled, overwhelmed, astonished',
 'Wonder, Awe, Astonishment, Overwhelm, Speechless',
 'Awe, Wonder, Astonishment, Jaw-dropping',
 'Sense of wonder, awe, astonishment, disbelief.',
 'Wonder, Astonishment, Awe, Overwhelming, Speechless',
 'Awe-inspiring, overwhelming, wonder-filled, jaw-dropping',
 'Awestruck, Speechless, Overwhelmed, Mind-blowing, Astonishing.',
 'Awe-inspiring, jaw-dropping, overwhelming, stunned, astonished',
 'Awe-inspiring beauty, Overwhelmed with wonder, Mind-blown by the grandeur']

In [None]:
# Same prompt (index label 1) under the fourth personality profile (pp_df row 3), for comparison.
all_results[3][1]

['Awe-inspiring, Jaw-dropping, Overwhelmed, Astonished, Speechless',
 'Awe, Wonder, Jaw-dropping astonishment',
 'Awe-inspiring wonder, Jaw-dropping astonishment, Overwhelmed with disbelief',
 'Wonder, awe, astonishment, admiration, overwhelming',
 'Awe-inspiring visuals, Overwhelming sense of wonder',
 'Surprised, awe-inspired, astonished, wonder-filled.',
 'Wonder, awe, astonishment',
 'Wonder, awe, astonishment',
 'Wonder, awe, astonishment, disbelief',
 'Surprised, Astonished, Overwhelmed, Jaw-dropping']

In [None]:
### SAVE RESPONSES
# Pickle the collected responses (`all_results`: one Series per personality profile).
# `all_prompts` never receives the responses in this notebook, so dumping it would not
# save them. The path is relative to the working directory because a leading '/'
# targets the filesystem root.
with open('persona_gpt3.5-turbo_101323.pkl', 'wb') as f:
    pickle.dump(all_results, f)

all_prompts.head()