# 1 | SETUP

# 1.1  Imports

In [1]:
from pycaret.regression import RegressionExperiment, load_model, predict_model
import os
import pandas as pd
import numpy as np
import openai

from OpenAIVectorizer import OpenAIVectorizer
from embedding_utilities import *
from tqdm import tqdm
tqdm.pandas(desc="Processing")

import random 
import time

# Keep joblib's working files in a project-local folder:
# JOBLIB_TEMP_FOLDER is pointed at an absolute path that is created on demand.
import joblib

cache_dir = os.path.abspath('./.joblib_cache/csv_cache')
os.environ['JOBLIB_TEMP_FOLDER'] = cache_dir
os.makedirs(cache_dir, exist_ok=True)

In [2]:
# 1.2  Global Parameters

In [3]:
# Project layout -- every location below is a relative path
root_directory = '.'               # base folder the data directory is resolved against
data_directory = 'Data'            # folder holding the CSV files to score
models_directory = 'Models'        # folder the trained models are loaded from
embed_path = 'Embeddings/'         # prefix for the embeddings CSV file name
output_path = 'Predictions/'       # folder the prediction CSVs are written to
key_file = 'Keys/key.env'          # env file passed as `key` to apply_embedding

# Absolute path of the data folder
_data_dir_relative = os.path.join(root_directory, data_directory)
DATA_PATH = os.path.abspath(_data_dir_relative)

# 1.3  Prediction Task Parameters

In [4]:
# Base name (without the '.csv' extension) of the file to score
input_data = 'text4scoring_csv-test'

# Models used for the prediction task; each name is also used as the
# column name of that model's predictions
model_names_openai = [
    'openai_extra',
    'openai_openn',
    'openai_consc',
    'openai_agree',
    'openai_neuro',
    'openai_narc',
    'openai_humility',
]

# 2 | LOAD DATASET

In [5]:
# Read the input CSV as UTF-8 and fall back to ISO-8859-1 only when the file
# cannot be decoded. Any other failure (missing file, malformed CSV,
# interrupt) is raised immediately instead of being hidden by a second
# read attempt with a different encoding.
csv_path = os.path.join(DATA_PATH, input_data+'.csv')
try:
    df = pd.read_csv(csv_path, encoding='utf-8')
    print('Read the data usnig utf-8 encoding')
except UnicodeDecodeError:
    df = pd.read_csv(csv_path, encoding='ISO-8859-1')
    print('Read the data using ISO-8859-1 encoding')
df

Read the data usnig utf-8 encoding


Unnamed: 0,textid,fulltext
0,1,"No. These things, of course, are very difficu..."
1,2,In our initial run we did identify a number o...
2,3,"Thank you, Debbie. Can everybody hear me? Is ..."
3,4,"The pieces are pretty actively moving around,..."
4,5,"Well, relative to the current situation with ..."
5,6,"Well, I mean Jamie we continue to work the pi..."
6,7,"Yes, they're pretty much across the board. Mo..."
7,8,"Well, good morning everyone and welcome to ou..."
8,9,"Thank you, Mary Ann. Ladies and gentlemen, go..."
9,10,"Buying back Aon stock, I don't think that's, ..."


In [6]:
# Select the 'fulltext' column; slices of it are later passed to apply_embedding
# as the text to embed
text_col = df['fulltext']
text_col

0     No. These things, of course, are very difficu...
1     In our initial run we did identify a number o...
2     Thank you, Debbie. Can everybody hear me? Is ...
3     The pieces are pretty actively moving around,...
4     Well, relative to the current situation with ...
5     Well, I mean Jamie we continue to work the pi...
6     Yes, they're pretty much across the board. Mo...
7     Well, good morning everyone and welcome to ou...
8     Thank you, Mary Ann. Ladies and gentlemen, go...
9     Buying back Aon stock, I don't think that's, ...
Name: fulltext, dtype: object

# 3 | TRAIT PREDICTION

# 3.1 Load Trained OpenAI GPT Models

In [7]:
from sklearn.pipeline import Pipeline

# Load one trained model per entry in model_names_openai, keeping the same order
models = []
for model_name in model_names_openai:
    models.append(load_model(f'{models_directory}/{model_name}_model'))

# Solve pycaret and joblib cache error: give every loaded Pipeline a
# joblib.Memory located in the notebook's own cache directory
for loaded in models:
    if not isinstance(loaded, Pipeline):
        continue
    loaded.memory = joblib.Memory(location=cache_dir)

Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded


# 3.2 Apply OpenAI GPT Embeddings

In [8]:
# Define the range of observations to score
start_row = 0
end_row = df.shape[0]

# Define file name for sample embeddings
embedding_file_name = embed_path + input_data + '_openai_embeddings_' + str(start_row) + '-' + str(end_row) + '.csv'

# Retry policy for OpenAI API errors.
# The wait doubles with every failed attempt (capped) and gets up to one second
# of random jitter; after MAX_EMBED_ATTEMPTS failures the error is re-raised so a
# permanent problem (e.g. an invalid key) cannot keep the cell looping forever.
# The delay depends on the attempt number, not on start_row: 20 ** start_row was
# constant across retries and grew to hours for any start_row >= 3.
MAX_EMBED_ATTEMPTS = 6
BACKOFF_BASE_SECONDS = 1.0
BACKOFF_CAP_SECONDS = 60.0

# Initialize a flag to keep track of successful embeddings
all_rows_embedded = False
failed_attempts = 0

# Get the OpenAI embeddings for the data  [This may take a minute]
while not all_rows_embedded:
    try:
        embeddings = apply_embedding(df[start_row:end_row], text_col[start_row:end_row], embedding_file_name, None, type='openai', key=key_file)
        all_rows_embedded = True  # Set flag to True if embeddings are successful for all rows
    except openai.OpenAIError as e:
        failed_attempts += 1
        if failed_attempts >= MAX_EMBED_ATTEMPTS:
            # Out of retries: surface the original API error to the reader
            raise
        randomness_collision_avoidance = random.randint(0, 1000) / 1000.0
        backoff = min(BACKOFF_BASE_SECONDS * 2 ** (failed_attempts - 1), BACKOFF_CAP_SECONDS)
        sleep_dur = backoff + randomness_collision_avoidance  # Exponential backoff based on the attempt count
        print(f"Error: {e}. Retrying in {round(sleep_dur, 2)} seconds.")
        time.sleep(sleep_dur)

Python-dotenv could not parse statement starting at line 2
Python-dotenv could not parse statement starting at line 8
Processing:   0%|                                                                                                             | 0/10 [00:00<?, ?it/s]

0.0


Processing: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:11<00:00,  1.18s/it]

embeddings done
embeddings saved to Embeddings/text4scoring_csv-test_openai_embeddings_0-10.csv
OPENAI embeddings applied.





In [9]:
# Read the embeddings back from the CSV at embedding_file_name
# (the log of the embedding cell reports the embeddings being saved to that path)
enc_df_oai = pd.read_csv(embedding_file_name)
enc_df_oai

Unnamed: 0,textid,0,1,2,3,4,5,6,7,8,...,1526,1527,1528,1529,1530,1531,1532,1533,1534,1535
0,1,-0.004683,-0.01401,0.005662,-0.008923,0.012284,0.00764,-0.005017,-0.003163,-0.028378,...,0.012809,-0.005894,0.006294,0.003033,-0.027453,-0.011753,0.018413,-0.012056,-0.007012,0.011497
1,2,-0.014846,-0.009828,0.012392,-0.027066,0.000886,0.023499,-0.006331,-0.031102,-0.007658,...,0.022131,0.00189,0.019574,-0.022076,-0.029195,0.008107,-0.002975,-0.022311,-0.028946,0.004137
2,3,-0.000396,-0.008708,0.028673,-0.030848,-0.026639,0.009873,-0.004855,-0.008192,0.01118,...,0.000609,-0.01291,0.0288,-0.010297,-0.025806,0.026978,0.010396,-0.011321,-0.030368,-0.017656
3,4,0.006894,-0.012019,-0.00082,-0.014417,-0.010663,0.009873,-0.005017,-0.024304,-0.014299,...,0.004003,-0.004866,0.020151,-0.001914,-0.018333,0.010474,-0.003513,-0.007964,-0.024486,-0.02299
4,5,0.006205,-0.019333,0.004606,-0.012523,0.017807,0.011319,0.003509,-0.019361,-0.020873,...,0.019683,-0.015526,0.003337,-0.016785,-0.024835,-0.003335,-0.004823,-0.00547,0.001564,-0.011536
5,6,-0.0098,-0.01631,-0.003602,-0.017837,-0.016157,0.02046,-0.031732,-0.003375,-0.021307,...,0.011278,0.001855,0.004109,-0.011466,-0.032592,0.026818,-0.011653,-0.013138,-0.015435,-0.024555
6,7,-0.036656,-0.017652,0.007326,-0.018962,-0.012714,0.018497,-0.025259,-0.022681,-0.0224,...,0.026724,-0.008699,0.019018,-0.017877,-0.008875,0.015539,2e-06,-0.002251,-0.016116,0.001702
7,8,-0.015074,-0.001719,-0.02449,-0.010345,-0.031745,0.030753,0.004243,-0.009884,-0.009203,...,0.022163,-0.004495,0.004233,-0.024531,-0.02661,0.015583,0.009354,-0.015583,-0.014048,0.006959
8,9,-0.012027,-0.026447,-0.00285,-0.014039,-0.024602,0.020738,0.002156,0.009139,0.003981,...,0.014266,0.013411,0.012836,-0.025952,-0.035031,0.03447,0.017355,-0.025458,-0.010162,-0.002149
9,10,-0.011416,-0.023749,0.006914,-0.016848,-0.002916,-0.001104,-0.003919,-0.021066,-0.016578,...,0.015379,0.001413,0.033911,-0.001926,-0.02143,0.000832,-0.002305,-0.007218,-0.018169,-0.024342


# 3.3 Generate Predicted Scores

In [10]:
# Function to make OpenAI GPT predictions
def predict_attribute_openai(enc_df_oai):
    """Score one embeddings table with every loaded model.

    Reads the module-level ``models`` list and ``model_names_openai`` list.

    Parameters
    ----------
    enc_df_oai : pandas.DataFrame
        Table handed to ``predict_model`` as ``data`` for each model.

    Returns
    -------
    pandas.DataFrame
        The ``prediction_label`` column of every model, placed side by side
        and renamed to the corresponding entry of ``model_names_openai``.
    """
    # First pass: collect the raw prediction column of each model
    per_model = []
    for model in models:
        scored = predict_model(model, data=enc_df_oai)
        per_model.append(scored[['prediction_label']])

    # Second pass: label each column with its model name
    for idx, trait_name in enumerate(model_names_openai):
        per_model[idx] = per_model[idx].rename(columns={'prediction_label': trait_name})

    # Put the per-model columns side by side
    return pd.concat(per_model, axis=1)

In [11]:
# Make predictions: the result has one column per name in model_names_openai
predictions = predict_attribute_openai(enc_df_oai)
predictions

________________________________________________________________________________
[Memory] Calling pycaret.internal.pipeline._full_transform...
_full_transform(Pipeline(memory=Memory(location=/Users/apple/OLCPT-fix/.joblib_cache/csv_cache/joblib),
         steps=[('numerical_imputer',
                 TransformerWrapper(include=['0', '1', '2', '3', '4', '5', '6',
                                             '7', '8', '9', '10', '11', '12',
                                             '13', '14', '15', '16', '17', '18',
                                             '19', '20', '21', '22', '23', '24',
                                             '25', '26', '27', '28', '29', ...],
                                    transformer=SimpleImputer())),
                ('categorical_imputer',
                 TransformerWrapper(include=[],
                                    transformer=SimpleImputer(strategy='most_frequent')))]), 
          0         1         2         3         4         5    

Unnamed: 0,openai_extra,openai_openn,openai_consc,openai_agree,openai_neuro,openai_narc,openai_humility
0,5.124,4.68,4.638,4.74,3.048,3.714,3.606
1,4.812,4.752,4.29,4.932,2.922,4.566,3.552
2,4.68,3.948,3.888,3.99,3.372,4.242,4.32
3,4.17,4.428,4.932,4.872,3.03,3.282,3.81
4,3.996,3.786,3.738,4.8,2.766,4.59,3.276
5,3.804,4.836,4.896,4.283999,3.654,3.498,3.66
6,3.81,3.762,3.876,4.098,4.176,4.026,2.412
7,4.56,3.87,4.092,4.638,2.46,3.498,3.972
8,4.914,4.386,4.674,4.236,3.45,4.776,3.84
9,5.178,4.422,4.656,3.984,3.444,2.922,4.14


In [12]:
# Merge predictions with original data (without "fulltext" column)
# pd.concat(axis=1) aligns on index labels, so both sides are restricted to the
# scored range and re-indexed from 0. Without this, any start_row other than 0
# would attach the scores to the first rows of df instead of the scored rows.
# NOTE(review): assumes apply_embedding keeps the row order of
# df[start_row:end_row] -- confirm against embedding_utilities.
scored_rows = df.iloc[start_row:end_row].reset_index(drop=True)
assert len(scored_rows) == len(predictions), (
    f"{len(predictions)} prediction rows for {len(scored_rows)} input rows"
)
output_df_oai = pd.concat([scored_rows, predictions.reset_index(drop=True)], axis=1)
output_df_oai = output_df_oai.drop(columns=['fulltext'])
output_df_oai

Unnamed: 0,textid,openai_extra,openai_openn,openai_consc,openai_agree,openai_neuro,openai_narc,openai_humility
0,1,5.124,4.68,4.638,4.74,3.048,3.714,3.606
1,2,4.812,4.752,4.29,4.932,2.922,4.566,3.552
2,3,4.68,3.948,3.888,3.99,3.372,4.242,4.32
3,4,4.17,4.428,4.932,4.872,3.03,3.282,3.81
4,5,3.996,3.786,3.738,4.8,2.766,4.59,3.276
5,6,3.804,4.836,4.896,4.283999,3.654,3.498,3.66
6,7,3.81,3.762,3.876,4.098,4.176,4.026,2.412
7,8,4.56,3.87,4.092,4.638,2.46,3.498,3.972
8,9,4.914,4.386,4.674,4.236,3.45,4.776,3.84
9,10,5.178,4.422,4.656,3.984,3.444,2.922,4.14


In [13]:
# Function to save the predicted scores using OpenAI GPT embeddings
def save_output_openai(output_df_oai, output_path, name):
    """Write the scores to a CSV file without overwriting earlier results.

    The file is saved as ``<output_path>/<name>.csv``. If that file already
    exists, the first free name of the form ``<name>_1.csv``, ``<name>_2.csv``,
    ... is used instead. The output directory is created when it is missing.

    Parameters
    ----------
    output_df_oai : pandas.DataFrame
        Table to save; it is written without its index.
    output_path : str
        Directory to write into. An empty string means the current directory.
    name : str
        Base file name, without the ``.csv`` extension.
    """
    # A fresh checkout may not have the output folder yet
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    # Probe <name>.csv, then <name>_1.csv, <name>_2.csv, ... until a free name is found
    target = os.path.join(output_path, name+'.csv')
    suffix = 0
    while os.path.exists(target):
        suffix += 1
        target = os.path.join(output_path, name+'_'+str(suffix)+'.csv')

    output_df_oai.to_csv(target, index=False)

In [14]:
# Save predicted scores for OpenAI GPT models specified
# The file goes under output_path as '<input_data>_predictions_openai.csv';
# save_output_openai switches to a '_1', '_2', ... suffix when that name is taken
name = input_data+'_predictions_openai'
save_output_openai(output_df_oai, output_path, name)