In [1]:
#PREPROCESSING RESULTS FROM GENERATION NOTEBOOK

###NECESSARY LIBRARIES###
import pandas as pd
import numpy as np
import random
import itertools
from itertools import chain
import warnings
warnings.simplefilter(action='ignore')

import time
import re
import string

# for saving variables
import pickle

# these are needed for preprocessing
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

# these are needed for coherence measures
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel

from sklearn.preprocessing import Binarizer, normalize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.manifold import TSNE

# plotting
# import matplotlib
import matplotlib.pyplot as plt
import plotly.express as px

# network libraries
import networkx as nx

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/phyllissern/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/phyllissern/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/phyllissern/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/phyllissern/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!




In [2]:
#setting up base all_prompts dataframe
df = pd.read_csv('cues_list_extended_091823.csv')

mod_prompts = df[~df['modification'].isna()][['Instrument', 'ItemNumber', 'Dimension', 'modification', 'exclude']].copy()
prompts = df[df['modification'].isna()][['Instrument', 'ItemNumber', 'Dimension', 'prompt', 'exclude']].copy()
all_prompts = pd.concat([mod_prompts.rename(columns={'modification': 'prompt'}), prompts], ignore_index=True)

print('n_dimensions: ', len(df['Dimension'].unique()))

# some cleaning ...
all_prompts = all_prompts[all_prompts['exclude'] != 1]                           # # Drop rows where 'exclude' has value 1
all_prompts['no_punct_prompt'] = all_prompts['prompt'].str.replace(r'[^\w\s]', '', regex=True)      # remove punctuation from prompt
all_prompts_uniques = all_prompts.drop_duplicates(subset='no_punct_prompt')      # drop duplicate prompts
all_prompts = all_prompts_uniques.reset_index(drop=True)
print(len(all_prompts))

# check if all prompts are in statement form (as opposed to question)
# Use regex to find strings ending with a question mark
question_mark_prompts = all_prompts[all_prompts['prompt'].str.contains(r'\?$')]
# Display the DataFrame with strings ending in a question mark (should be none)
print(question_mark_prompts)

all_prompts.head()

n_dimensions:  219
1023
Empty DataFrame
Columns: [Instrument, ItemNumber, Dimension, prompt, exclude, no_punct_prompt]
Index: []


Unnamed: 0,Instrument,ItemNumber,Dimension,prompt,exclude,no_punct_prompt
0,MEQ30,1,transcendence of time and space,I lost my usual sense of time.,,I lost my usual sense of time
1,MEQ30,2,positive mood,I had an experience of amazement.,,I had an experience of amazement
2,MEQ30,3,ineffability,I had a sense that my experience cannot be des...,,I had a sense that my experience cannot be des...
3,MEQ30,4,mystical,I gained insightful knowledge experienced at a...,,I gained insightful knowledge experienced at a...
4,MEQ30,5,mystical,I felt that I experienced eternity or infinity.,,I felt that I experienced eternity or infinity


In [3]:
#loading in the results
with open('bfi-10_generation_gpt4_030424.pkl', 'rb') as f:
    persona_info = pickle.load(f)

persona_info

[0       [Feeling disoriented, Increased focus on task,...
 1       [Intense curiosity, Emergence of wonder, Emoti...
 2       [Ineffable, Perplexing, Overwhelms my vocabula...
 3       [Eureka sensation, Enhanced understanding, Sud...
 4       [Overwhelming scale, perpetual time loop, limi...
                               ...                        
 1018    [Attention diverted elsewhere, Emotionally det...
 1019    [In control of situation, positive about futur...
 1020    [Feeling of unity with environment, Absence of...
 1021    [Feeling empowered, Sensation of calmness, Inc...
 1022    [Sense of detachment, Loss of personal boundar...
 Name: prompt, Length: 1023, dtype: object,
 0       [Disorientation, Anxiety or unease, Altered pe...
 1       [Senses heightened, time seemed to slow, heart...
 2       [Ineffable, Frustrating language limitations, ...
 3       [Sudden clarity, instinctive understanding, in...
 4       [Timelessness, Overwhelming awe, Perceptual va...
            

In [19]:
type(persona_info[0])

pandas.core.series.Series

In [34]:
persona1 = persona_info[0].tolist()
persona2 = persona_info[1].tolist()
persona3 = persona_info[2].tolist()

persona_combined = []

for sublist1, sublist2, sublist3 in zip(persona1, persona2, persona3):
    combined_sublist = sublist1 + sublist2 + sublist3
    persona_combined.append(combined_sublist)

persona_combined

[['Feeling disoriented, Increased focus on task, Alteration in perception, No awareness of hours passing, Ignoring usual routines',
  'Disoriented, Anxious, Unfocused, In a trance-like state, Feels timeless',
  'Disoriented, Feeling almost dream-like, Out of sync with events, Altered perception of reality, Difficulty tracking sequence of activities',
  'Long gaps between regular tasks, Disorientation in schedule, Feeling of days blending together, Missing deadlines or forgetfulness, Difficulty in duration estimation',
  'Feeling disoriented, Memory difficulties, Lack of focus, Emotional distress, Sense of urgency',
  'Disorientation, Anxiety or unease, Altered perception, Absence of routine awareness, Mental confusion',
  'Disoriented, feeling of endlessness, inability to plan ahead, surprise at actual time, loss of routine',
  'Disoriented, Frustrated, Anxious about being late, Unaware of passing hours, Difficulty scheduling activities',
  'Disoriented, Unaware of passing hours, Misse

In [35]:
all_prompts["generated_text"] = persona_combined
all_prompts.head()

Unnamed: 0,Instrument,ItemNumber,Dimension,prompt,exclude,no_punct_prompt,generated_text
0,MEQ30,1,transcendence of time and space,I lost my usual sense of time.,,I lost my usual sense of time,"[Feeling disoriented, Increased focus on task,..."
1,MEQ30,2,positive mood,I had an experience of amazement.,,I had an experience of amazement,"[Intense curiosity, Emergence of wonder, Emoti..."
2,MEQ30,3,ineffability,I had a sense that my experience cannot be des...,,I had a sense that my experience cannot be des...,"[Ineffable, Perplexing, Overwhelms my vocabula..."
3,MEQ30,4,mystical,I gained insightful knowledge experienced at a...,,I gained insightful knowledge experienced at a...,"[Eureka sensation, Enhanced understanding, Sud..."
4,MEQ30,5,mystical,I felt that I experienced eternity or infinity.,,I felt that I experienced eternity or infinity,"[Overwhelming scale, perpetual time loop, limi..."


In [36]:
# # CHECK IF THERE WERE ANY ANSWERS WITH QUESTIONS
pattern = r'\?'
# pattern = r'\b\d+\b'
# pattern = r'\s-'

def has_question_mark(sublists):
    return any(re.search(pattern, item) for sublist in sublists for item in sublist)

filtered_df = all_prompts[all_prompts['generated_text'].apply(has_question_mark)]
filtered_df

Unnamed: 0,Instrument,ItemNumber,Dimension,prompt,exclude,no_punct_prompt,generated_text
502,PSI,35,delustional thinking,My thoughts were sometimes so strong that I co...,,My thoughts were sometimes so strong that I co...,"[Intense internal monologue, Perceptually vivi..."
968,SSD,11,,It feels as if I am going through the motions ...,,It feels as if I am going through the motions ...,"[Disconnected, Emotionally distanced, Involunt..."


In [39]:
all_prompts.at[502, 'generated_text'].pop(10)
all_prompts['generated_text'][502]

['Intense internal monologue, Perceptually vivid thinking, Overwhelming mental activity, Emotionally charged ideation, Sensory-like cognitive experience',
 'Loud internal dialogue, Sense of auditory hallucination, Overpowering cognitive activity, Strong emotional intensity, Vivid mental imagery',
 'Overpowering cognitive intensity, Sensory-like mental phenomena, Emotional overwhelm, Auditory perceptual experience, High mental activity',
 'Loud internal dialogue, Heightened self-awareness, Mental clarity, Intense focus, Sensation of hearing thoughts',
 'Intense mental imagery, overwhelming thoughts, perceptual involvement, heightened self-awareness, internal sensory overload',
 'Overwhelming mental activity, Sensation of auditory hallucination, Heightened self-awareness, Intense concentration, Elevated emotional responsiveness',
 'Intense mental focus, Intrusive thoughts, Auditory-like perception, Distinct inner voice, Struggle with silent thinking',
 'Intense internal dialogue, Vivid m

In [37]:
all_prompts.at[968, 'generated_text'].pop(3)
all_prompts['generated_text'][968]

['Disconnected, Emotionally distanced, Involuntary actions, Dream-like perception, Absence of self-identity',
 'Emotionally detached, Sensation of unbelonging, Perceived life as unreal, Conscious mental withdrawal, Feeling internally distant.',
 'Detached from surroundings, Unemotional response to events, Absence of personal investment, Difficulty in cognition processing, Lacking physical engagement',
 'Emotionally detached, Dissociative sensation, Feeling of autopilot, Lack of engagement with reality, Disconnected self-awareness',
 'Detached from surroundings, Emotionally numb, Mentally distant, Perceptually disconnected, Physiological autopilot',
 'Disconnected from reality, emotionally numb, lack of enthusiasm, feeling of outer body experience, uninterested in current activities',
 'Distant from personal reality, numbed emotions, detached self-perception, muted physical sensations, lack of cognitive engagement',
 'Disconnected from surroundings, Emotional numbness, Absence of person

In [45]:
import contractions

# clean responses to get list of features
def clean_response(gpt_answer):

    custom_stopwords = ['none', 'n/a', '']

    clean_answer = re.sub(r'\b\w+\b:', '', gpt_answer)      #  remove all words followed by a colon ':'
    # clean_answer = re.sub(r'\([^)]*\)', '', clean_answer)   # remove parentheses
    # clean_answer = re.sub(r'\([^)]+\)', '', clean_answer)   # remove parentheses and words inside it          # not for gpt4; keep words inside.

    #### SOMETIMES /N- SHOULD BE '' ??? CHECK IF THIS WORKS...
    clean_answer = re.sub(r'\n+-', ',', clean_answer)   
    # clean_answer = re.sub(r'\n+-+\s', ',', clean_answer)     
    clean_answer = re.sub(r'\s+-', '-', clean_answer)       
    # clean_answer = re.sub(r'\n', ',', clean_answer)
    # any occurrence of a newline character (\n), a semicolon (;), or a period (.)
    # clean_answer = re.sub(r'[.;]', ',', clean_answer)
    clean_answer = re.sub(r'[\n.;]', ',', clean_answer)

    clean_answer = contractions.fix(clean_answer) 

    # Remove exclamation points and quotation marks
    clean_answer = re.sub(r'[!\'"]', '', clean_answer)
    clean_answer = re.sub(r'[()]', '', clean_answer)   # remove any remaining parentheses

    # # remove numbers
    # clean_answer = re.sub(r'\b\d+\b', '', clean_answer)

    # Convert to list of entries (comma-separated)
    entries = list(clean_answer.lower().split(','))
    
    ## for any string that starts with either a whitespace character (\s) or the words "or" or "and," followed by a whitespace character, remove those characters
    entries = [re.sub(r'^(\s|or|and)\s', '', text.strip()) for text in entries]  

    # entries = [text for text in entries if text != '']
    
    # remove numbers from entries
    entries = [re.sub(r'\d', '', text.strip()) for text in entries]
    
    # REMOVE STOPWORDS
    entries = [text for text in entries if text not in custom_stopwords]

    # replace forward slash with 'or'
    entries = [re.sub(r'/', ' or ', text) for text in entries]

    unique_entries = list(set(entries))

    return unique_entries

all_prompts['cleaned_text'] = all_prompts['generated_text'].map(lambda row: [clean_response(answer) for answer in row])


In [46]:
all_prompts['generated_text'][449]

['Feeling heavy-hearted, experiencing constant sorrow, having negative thoughts, physical discomfort exists, lacking energetic feeling',
 'Overpowering sadness, sense of hopelessness, physical exhaustion, lack of motivation, persistent negative thoughts',
 'Overwhelming sadness, physical heaviness, lack of motivation, pervasive pessimism, extreme fatigue',
 'Overwhelming despair, continual negativity, physical heaviness, lack of enthusiasm, chronic sadness',
 'Overwhelming sadness, physical heaviness, lack of energy, negative thoughts, sense of hopelessness',
 'Overwhelmed by sadness, Physical discomfort, Negatively distorted thoughts, Lack of motivation, Sensation of heaviness',
 'Overwhelmed by sadness, Physical discomfort, Low energy levels, Mental fog , Loss of interest in activities',
 'Overwhelming sadness, physiological discomfort, pessimism for future, low energy levels, inability to enjoy activities',
 'Unhappiness seeping in, sense of isolation, feeling physically drained, co

In [47]:
all_prompts['cleaned_text'][449]

[['lacking energetic feeling',
  'experiencing constant sorrow',
  'having negative thoughts',
  'physical discomfort exists',
  'feeling heavy-hearted'],
 ['physical exhaustion',
  'lack of motivation',
  'persistent negative thoughts',
  'sense of hopelessness',
  'overpowering sadness'],
 ['lack of motivation',
  'overwhelming sadness',
  'pervasive pessimism',
  'physical heaviness',
  'extreme fatigue'],
 ['chronic sadness',
  'continual negativity',
  'physical heaviness',
  'overwhelming despair',
  'lack of enthusiasm'],
 ['lack of energy',
  'negative thoughts',
  'overwhelming sadness',
  'physical heaviness',
  'sense of hopelessness'],
 ['sensation of heaviness',
  'overwhelmed by sadness',
  'negatively distorted thoughts',
  'lack of motivation',
  'physical discomfort'],
 ['mental fog',
  'loss of interest in activities',
  'overwhelmed by sadness',
  'low energy levels',
  'physical discomfort'],
 ['pessimism for future',
  'physiological discomfort',
  'overwhelming sa

In [49]:
pattern = r'\s-'

def has_question_mark(sublists):
    return any(re.search(pattern, item) for sublist in sublists for item in sublist)

filtered_df = all_prompts[all_prompts['cleaned_text'].apply(has_question_mark)]
filtered_df

Unnamed: 0,Instrument,ItemNumber,Dimension,prompt,exclude,no_punct_prompt,generated_text,cleaned_text


SEPARATING ALL WORDS INSTEAD OF KEEPING THEM AS PHRASES

In [50]:
from nltk.corpus import wordnet

# Map POS tags to WordNet POS tags
def get_wordnet_pos(tag):
    # Map POS tag to first character used by WordNetLemmatizer
    tag = tag[0].upper()
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)  # Default to noun if not found

In [51]:
# PREPROCESSING FUNCTION TO GET TOKENS FROM PHRASES (SAME AS THAT IN THE FEATURE CLUSTERING NOTEBOOK...)
# NEED TO DO POS-TAGGING FOR PROPER LEMMATIZATION
# THIS FUNCTION IS ALSO NEEDED TO COMPUTE GENSIM COHERENCE DOWNSTREAM
from unidecode import unidecode

def preprocess_text(text, add_stopwords=None):

    # Tokenize the text
    tokens = word_tokenize(text.lower())
    # Perform Part-of-Speech (POS) tagging
    pos_tags = nltk.pos_tag(tokens)

    # Remove stop words
    stop_words = stopwords.words('english')
    # this is what I had originally...
    stop_words.extend(['', 'etc', 'feel', 'felt', 'feeling'])

    if add_stopwords is not None:
        stop_words.extend(add_stopwords)

    # stop_words.extend(['etc', 'feeling', 'sense', '', 'experience', 'felt', 'feel', 'feels', 'like', 'mind'])

    lemmatizer = WordNetLemmatizer()    # Lemmatize the tokens
    # stemmer = PorterStemmer()
    # tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    # Remove like as a preposition (e.g. "feel like")
    tokens = [lemmatizer.lemmatize(contractions.fix(unidecode(token)), get_wordnet_pos(tag))
              for token, tag in pos_tags
              if not (token == 'like' and tag == 'IN')]

    # tokens = [stemmer.stem(token) for token in tokens]
    tokens = [token for token in tokens if token not in stop_words]

    preprocessed_text = ' '.join(tokens)     # Reconstruct the text from the preprocessed tokens

    return preprocessed_text
    # return tokens

In [52]:
all_prompts['preprocessed_text'] = all_prompts['cleaned_text'].map(
    lambda row: [[preprocess_text(text) for text in sublist] for sublist in row if len(sublist)>0]      # PREPROCESS AND REMOVE EMPTY LISTS
)
all_prompts.head()

Unnamed: 0,Instrument,ItemNumber,Dimension,prompt,exclude,no_punct_prompt,generated_text,cleaned_text,preprocessed_text
0,MEQ30,1,transcendence of time and space,I lost my usual sense of time.,,I lost my usual sense of time,"[Feeling disoriented, Increased focus on task,...","[[increased focus on task, no awareness of hou...","[[increased focus task, awareness hour pass, d..."
1,MEQ30,2,positive mood,I had an experience of amazement.,,I had an experience of amazement,"[Intense curiosity, Emergence of wonder, Emoti...","[[emergence of wonder, desire to explore more,...","[[emergence wonder, desire explore, awe, emoti..."
2,MEQ30,3,ineffability,I had a sense that my experience cannot be des...,,I had a sense that my experience cannot be des...,"[Ineffable, Perplexing, Overwhelms my vocabula...","[[defies interpretation, ineffable, overwhelms...","[[defies interpretation, ineffable, overwhelms..."
3,MEQ30,4,mystical,I gained insightful knowledge experienced at a...,,I gained insightful knowledge experienced at a...,"[Eureka sensation, Enhanced understanding, Sud...","[[mental lightbulb moment, sudden clarity, eur...","[[mental lightbulb moment, sudden clarity, eur..."
4,MEQ30,5,mystical,I felt that I experienced eternity or infinity.,,I felt that I experienced eternity or infinity,"[Overwhelming scale, perpetual time loop, limi...","[[sense of unending space, overwhelming scale,...","[[sense unending space, overwhelm scale, perpe..."


In [53]:
#checking to see the preprocessing so far

print(all_prompts['generated_text'][129])
print(all_prompts['cleaned_text'][129])
print(all_prompts['preprocessed_text'][129])

['Clarity about life goals, Emotional relief, Increased self-awareness, Understanding of personal patterns, Eases decision making', 'Enlightening, resolutions forming, relieving of stress, improved mental clarity, heightened self-awareness', 'Sudden clarity about past events, Understanding unseen connections, Emotional relief from realizations, Motivation for future action, Shift in personal perspective', 'Clear understanding of self, Aha moment, Feeling of relief, Sense of direction, Emotional catharsis', 'Sudden clarity on past events, Emotional relief from worries, Understanding underlying issues, Feeling of progress in life, Increased motivation for action', 'Clarity of thought, heightened self-awareness, emotional release, sense of determination, physical calmness.', 'Clarity about life decisions, Self-awareness increased, Emotional relief from worries, Enhanced motivation and focus, Stronger sense of purpose', 'Clarity about future actions, Reduction of anxiety, Sense of relief, 

In [54]:
# all generated features (not tokenized, as phrases)
all_answers_phrases = all_prompts['cleaned_text'].explode().explode() # if n>1, for list of lists
# all generated features (tokenized)
all_answers_tokenized = all_prompts['preprocessed_text'].explode().explode() # if n>1, for list of lists

print('Number of total phrases generated: ', len(all_answers_phrases))
print('Number of unique phrases generated, not lemmatized: ', len(set(all_answers_phrases.values)))
print('Number of unique phrases generated, lemmatized: ', len(set(all_answers_tokenized.values)))

Number of total phrases generated:  76802
Number of unique phrases generated, not lemmatized:  40745
Number of unique phrases generated, lemmatized:  37082


In [83]:
set1 = set(all_answers_tokenized)
set2 = set(gpt_responses_preprocessed)

difference = set1.difference(set2)

In [84]:
difference

{''}

In [55]:
gpt_responses_preprocessed = [word for word in list(set(all_prompts['preprocessed_text'].explode().explode().values)) if len(word)>0] # if n>1, for list of lists
print('Number of unique phrases generated by GPT, lemmatized: ', len(gpt_responses_preprocessed))

Number of unique phrases generated by GPT, lemmatized:  37081


# FIND AND PRESERVE N-GRAMS

In [56]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, regexp_tokenize, RegexpTokenizer
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

#phrases = list(set(all_answers_tokenized1.values))             # list of all phrases generated, lemmatized

#def custom_tokenizer(text):
#    # Use regular expression tokenizer to split on spaces while preserving hyphenated words
#    tokens = regexp_tokenize(text, pattern=r'\s+|-')
#    return tokens

# Step 1: Tokenize the phrases
# tokens = [word_tokenize(phrase.lower()) for phrase in phrases]
# tokens = [custom_tokenizer(phrase.lower()) for phrase in phrases]
tokenizer = RegexpTokenizer(r'\b[\w-]+\b')
# tokenizer = RegexpTokenizer(r'\s+', gaps=True)
tokens = [tokenizer.tokenize(phrase.lower()) for phrase in gpt_responses_preprocessed]

# Step 2: Filter out irrelevant tokens
#stop_words = set(stopwords.words('english'))
# custom extend stopwords
# stop_words.append('sense')
# filtered_tokens = [[word for word in phrase_tokens if word.isalnum() and word not in stop_words] for phrase_tokens in tokens]
#filtered_tokens = [[word for word in phrase_tokens if word not in stop_words] for phrase_tokens in tokens]

#filtered_tokens_phrases = [' '.join(token) for token in filtered_tokens]
# filtered_tokens_phrases

In [88]:
# BIGRAMS

bigram_measures = BigramAssocMeasures()
bigram_finder = BigramCollocationFinder.from_words([word for phrase_tokens in tokens for word in phrase_tokens])
bigram_finder.apply_freq_filter(11)
bigrams = bigram_finder.nbest(bigram_measures.likelihood_ratio, len(bigram_finder.score_ngrams(bigram_measures.likelihood_ratio)))

# Step 4 and 5: Check for compound words and save meaningful bigrams
compound_bigrams = []
for bigram in bigrams:
    first_word, second_word = bigram
    # String variable with the pattern you want to match
    pattern_string = first_word + second_word
    # Using an f-string to create the regex pattern
    pattern = re.compile(fr"{pattern_string}")

    for phrase in ["".join(phrase_tokens) for phrase_tokens in tokens]:
        if pattern.search(phrase):
            compound_bigrams.append(first_word + " " + second_word)

print(list(set(compound_bigrams)))
print(len(set(compound_bigrams)))

['physical perception', 'inner voice', 'physiological sensation', 'increased focus', 'sensory experience', 'positive emotion', 'thought increase', 'increase emotional', 'shift mood', 'immediate surroundings', 'personal identity', 'emotion sensation', 'emotional reaction', 'body language', 'past self', 'reduce emotional', 'sense profound', 'anticipation future', 'loss personal', 'connection surroundings', 'muscle tension', 'mental picture', 'vivid color', 'memory recall', 'perception surroundings', 'inner calmness', 'mental activity', 'focused attention', 'visual imagery', 'thought become', 'high emotional', 'stress level', 'response time', 'spatial awareness', 'connection past', 'awareness mental', 'sense purpose', 'focus sensation', 'empathy towards', 'familiar surroundings', 'perception shift', 'time passage', 'time distortion', 'without external', 'perception intense', 'time dilation', 'emotional involvement', 'physical presence', 'distorted perception', 'recall past', 'cognitive fo

In [86]:
# BIGRAMS
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

test_range = [*range(5, 15, 3)]
test_range

bigram_measures = BigramAssocMeasures()
bigram_finder = BigramCollocationFinder.from_words([word for phrase_tokens in tokens for word in phrase_tokens])

n_bigrams = []

for freq in test_range:

    bigram_finder.apply_freq_filter(freq)
    bigrams = bigram_finder.nbest(bigram_measures.likelihood_ratio, len(bigram_finder.score_ngrams(bigram_measures.likelihood_ratio)))

    # Step 4 and 5: Check for compound words and save meaningful bigrams
    compound_bigrams = []
    for bigram in bigrams:
        first_word, second_word = bigram
        # String variable with the pattern you want to match
        pattern_string = first_word + second_word
        # Using an f-string to create the regex pattern
        pattern = re.compile(fr"{pattern_string}")

        for phrase in ["".join(phrase_tokens) for phrase_tokens in tokens]:
            if pattern.search(phrase):
                compound_bigrams.append(first_word + " " + second_word)

    n_bigrams.append(len(set(compound_bigrams)))

In [87]:
import plotly.express as px

fig = px.line(x=test_range, y=n_bigrams)
fig.update_layout(xaxis_title="frequency filter",
                  yaxis_title="number of bigrams")
fig.show()

In [89]:
# CHECK IF THESE WORDS WERE INCLUDED IN THE BIGRAMS LIST
# target_list = [('deja', 'vu'), ('mood', 'swings'), ('heart', 'rate')]
target_list = ['deja vu', 'mood swing','heart rate']

def are_tuples_in_list(list_of_tuples, target_list):
    for tup in list_of_tuples:
        if tup in target_list:
          return True
    return False

result = are_tuples_in_list(target_list, compound_bigrams)
print(result) #compound_bigrams

True


In [92]:
# TRIGRAMS
from nltk.collocations import TrigramAssocMeasures, TrigramCollocationFinder

# Step 3: Generate trigrams
trigram_measures = TrigramAssocMeasures()
trigram_finder = TrigramCollocationFinder.from_words([word for phrase_tokens in tokens for word in phrase_tokens])
trigram_finder.apply_freq_filter(5)

# trigrams = trigram_finder.nbest(trigram_measures.pmi, len(trigram_finder.score_ngrams(trigram_measures.pmi)))
trigrams = trigram_finder.nbest(trigram_measures.likelihood_ratio, len(trigram_finder.score_ngrams(trigram_measures.likelihood_ratio)))

# Step 4 and 5: Check for compound words and save meaningful trigrams
compound_trigrams = []
for trigram in trigrams:
    first_word, second_word, third_word = trigram
    # String variable with the pattern you want to match
    pattern_string = first_word + second_word + third_word
    # Using an f-string to create the regex pattern
    pattern = re.compile(fr"{pattern_string}")

    for phrase in ["".join(phrase_tokens) for phrase_tokens in tokens]:
        if pattern.search(phrase):
            compound_trigrams.append(first_word + " " + second_word + " " + third_word)

print(list(set(compound_trigrams)))
print(len(set(compound_trigrams)))

['without external stimulus', 'recall past event', 'beyond physical boundary', 'sense deja vu', 'strong emotional connection', 'deep emotional connection', 'increase empathy towards', 'emotional connection past', 'focus present task', 'focus physical sensation', 'emotional response sound', 'focus present moment', 'heighten awareness certain', 'cognitive shift towards', 'persistent negative thought', 'empathy towards others', 'sense spatial orientation', 'compassion towards others', 'increase heart rate', 'lack emotional connection', 'perception beyond physical', 'heart rate increase', 'time pass quickly', 'focus task hand', 'increased heart rate']
25


In [90]:
# TRIGRAMS
from nltk.collocations import TrigramAssocMeasures, TrigramCollocationFinder

trigram_measures = TrigramAssocMeasures()
trigram_finder = TrigramCollocationFinder.from_words([word for phrase_tokens in tokens for word in phrase_tokens])

n_trigrams = []

for freq in test_range:

    trigram_finder.apply_freq_filter(freq)

    # trigrams = trigram_finder.nbest(trigram_measures.pmi, len(trigram_finder.score_ngrams(trigram_measures.pmi)))
    trigrams = trigram_finder.nbest(trigram_measures.likelihood_ratio, len(trigram_finder.score_ngrams(trigram_measures.likelihood_ratio)))

    # Step 4 and 5: Check for compound words and save meaningful trigrams
    compound_trigrams = []
    for trigram in trigrams:
        first_word, second_word, third_word = trigram
        # String variable with the pattern you want to match
        pattern_string = first_word + second_word + third_word
        # Using an f-string to create the regex pattern
        pattern = re.compile(fr"{pattern_string}")

        for phrase in ["".join(phrase_tokens) for phrase_tokens in tokens]:
            if pattern.search(phrase):
                compound_trigrams.append(first_word + " " + second_word + " " + third_word)

    n_trigrams.append(len(set(compound_trigrams)))

In [91]:
fig = px.line(x=test_range, y=n_trigrams)
fig.update_layout(xaxis_title="frequency filter",
                  yaxis_title="number of trigrams")
fig.show()

In [93]:
# for tokens that were not included as bigrams or trigrams, extract and lemmatize
# Step 8: Combine compound bigrams and trigrams to get all compound words
compound_words = set(compound_bigrams + compound_trigrams)
len(compound_words)
# # Step 9: Lemmatize tokens that are not part of compound words
# non_compound_tokens = [word for phrase_tokens in filtered_tokens for word in phrase_tokens if word not in compound_words]
# lemmatizer = WordNetLemmatizer()
# lemmatized_tokens = [lemmatizer.lemmatize(token) for token in non_compound_tokens]

# print(lemmatized_tokens)

458

In [94]:
# FOR EACH PHRASE/FEATURE AN N-GRAM CAME FROM, EXTRACT LEFTOVER TOKENS AS INDIVIDUAL FEATURES..
# THEN SHOULD EACH PHRASE BE SAVED AS ['TOKEN', 'NGRAM', 'TOKEN']?

# Function to find remaining words in each input string
def find_remaining_words(input_string, given_list):

    # Find the phrases that match any part of the input string
    matching_phrases = [phrase for phrase in given_list if phrase in input_string]

    # Remove the matching phrases from the input string
    for phrase in matching_phrases:
        pattern1 = r'-' + re.escape(phrase)
        pattern2 = re.escape(phrase) + r'-'
        if not re.search(pattern1, input_string) and not re.search(pattern2, input_string):
            input_string = input_string.replace(phrase, '')

    # Remove any leading and trailing whitespaces and split the remaining words
    remaining_words = input_string.strip().split()

    # single_features = [word for word in remaining_words if word not in stop_words]

    # return single_features
    return remaining_words
    # return input_string

# Find remaining words for each input string
remaining_words_list = [find_remaining_words(input_string=input_str, given_list=compound_words) for input_str in gpt_responses_preprocessed]
print(remaining_words_list)

[[], ['fond', 'memory', 'replay'], ['sensation', 'numbness', 'area'], ['less', 'significant'], ['immediate'], ['intense', 'determination'], ['absence', 'breath', 'rhythm'], ['enjoyable', 'sensation'], ['skew'], ['sensory', 'doubt'], ['replay'], [], ['sensation', 'recall'], ['urgency', 'leave', 'situation'], ['uneven'], ['intense', 'alertness'], ['fluid'], ['fragment', 'self-perception'], ['miraculous', 'transcendent'], ['sensory', 'revival'], ['raise'], ['embarrass', 'consider', 'share', 'thought'], ['missed', 'time'], ['euphoric', 'exhilaration'], ['flow'], ['dizzy'], ['recognition', 'subjectivity'], ['moment', 'stretch', 'shrink'], ['detachment', 'emotion'], ['nature'], ['intrusion', 'consciousness'], ['negative', 'mindset'], ['drive'], ['linger', 'sense', 'contentment'], ['irrelevant'], ['reduce'], ['acceptance', 'uncertainty'], [], ['intriguing', 'revelation'], ['calm', 'center'], ['immense', 'optimism'], ['conscious', 'mind', 'engagement'], ['challenge', 'self-concept'], ['unclear

In [95]:
from itertools import chain

single_words = list(set(list(chain(*remaining_words_list))))
len(single_words)

6597

In [96]:
all_features = list(set(compound_words)) + list(set(single_words))
all_features_df = pd.DataFrame(list(set(all_features)), columns=["feature"])
print(len(list(set(all_features))))
all_features_df

7055


Unnamed: 0,feature
0,punishment
1,increased focus
2,arm
3,episode
4,repetitively
...,...
7050,hypnagogic
7051,cognitive engagement
7052,responsiveness
7053,sensually


In [97]:
all_features.index('s')

4921

In [99]:
all_features[1365]

's'

In [66]:
len(list(set(filtered_features)))

In [98]:
#a check
if '' in all_features:
    print(True)
else: print(False)

False


In [99]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Function to convert text data in the DataFrame column into lists of vocabularies
def get_vocabularies(sublists, custom_vocabulary, ngrams=(1,3)):
    vocabularies_list = []
    for sublist in sublists:
        sublist_vocabularies = []
        for text in sublist:
            # Create CountVectorizer instance with custom vocabulary and ngram_range
            # The (?u) flag makes the pattern Unicode-aware. The pattern \b\w[\w-]*\w\b matches words with hyphens as well as alphanumeric words.
            vectorizer = CountVectorizer(vocabulary=custom_vocabulary, ngram_range=ngrams, token_pattern=r"(?u)\b\w[\w-]*\w\b")

            # Transform the text data into a bag-of-words matrix
            bow_matrix = vectorizer.fit_transform([text])

            # Convert the bow_matrix to a list of features (lists of vocabularies)
            features_list = bow_matrix.toarray().tolist()

            # Convert the lists of features into a list of vocabularies
            vocabularies = [vocab for vocab, count in zip(custom_vocabulary, features_list[0]) if count > 0]
            sublist_vocabularies.append(vocabularies)
        vocabularies_list.append(sublist_vocabularies)

    return vocabularies_list

all_prompts['preprocessed_tokens'] = all_prompts['preprocessed_text'].apply(lambda x: get_vocabularies(sublists=x, custom_vocabulary=all_features, ngrams=(1,3)))

In [100]:
# checking to see the contents
all_prompts['preprocessed_tokens'][160][0]

[['perception', 'speed'],
 ['dizziness', 'slight'],
 ['contraction', 'muscle'],
 ['motion'],
 ['spatial awareness', 'spatial', 'awareness', 'alter']]

In [70]:
#exporting to excel file to search for mistakes
all_prompts.to_excel('personascombined.xlsx', index=False)

In [101]:
### EXPORT the preprocessed results - MAKE SURE TO CHANGE FILE NAME ###
with open('preprocessed/personascombined_gpt4_responses_030424.pkl', 'wb') as f:
    pickle.dump(all_prompts, f)

with open('preprocessed/personascombined_gpt4_features_ngrams_030424.pkl', 'wb') as f:
    pickle.dump(all_features_df, f)