# 1 | SETUP

# 1.1  Imports

In [1]:
from pycaret.regression import RegressionExperiment, load_model, predict_model
import os
import pandas as pd
import numpy as np
import openai

from OpenAIVectorizer import OpenAIVectorizer
from embedding_utilities import *
from tqdm import tqdm
tqdm.pandas(desc="Processing")

from dotenv import load_dotenv

# Joblib cache path
import joblib
cache_dir = os.path.abspath('./.joblib_cache/txt_cache')
os.makedirs(cache_dir, exist_ok=True)
os.environ['JOBLIB_TEMP_FOLDER'] = cache_dir

# 1.2  Global Parameters

In [2]:
# Define Directories
root_directory='.'
data_directory='Data'
models_directory = 'Models'
embed_path = 'Embeddings/'
output_path = 'Predictions/'
key_file = 'Keys/key.env'

# Data Path
DATA_PATH = os.path.abspath(os.path.join(root_directory, data_directory))

# 1.3  Prediction Task Parameters

In [3]:
# Directory with .txt files to score
input_directory = 'text4scoring_txt-test'

# Define models for prediction task
model_names_openai = ['openai_extra', 'openai_openn', 'openai_consc', 'openai_agree', 'openai_neuro','openai_narc','openai_humility']

# 2 | LOAD DATASET

In [4]:
# Function to read all text files from directory and load them in an array
def load_txt_files(directory_path):
    files = os.listdir(directory_path)
    #files.sort()
    texts = []
    for file in files:
        with open(directory_path+file, 'r', encoding='utf-8', errors = 'ignore') as f:
            texts.append(f.read())

    return texts, files

texts, file_names = load_txt_files(data_directory+'/'+input_directory+'/')
texts = np.array(texts)
texts.shape

(11,)

In [5]:
df = pd.DataFrame(file_names, columns=['file_name'])
df

Unnamed: 0,file_name
0,4.txt
1,.DS_Store
2,9.txt
3,8.txt
4,7.txt
5,3.txt
6,10.txt
7,6.txt
8,5.txt
9,2.txt


# 3 | TRAIT PREDICTION

# 3.1 Load Trained OpenAI GPT Models

In [6]:
models = [load_model(f'{models_directory}/{model_name}_model') for model_name in model_names_openai]

# Slove pycaret and joblib cache error
from sklearn.pipeline import Pipeline
for m in models:
    if isinstance(m, Pipeline):
        m.memory = joblib.Memory(location=cache_dir)

Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded


# 3.2 Apply OpenAI GPT Embeddings

In [9]:
# Load OpenAI API key
def load_openai_key(fallback_env_path="Keys/key.env"):
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        if os.path.exists(fallback_env_path):
            load_dotenv(fallback_env_path)
            api_key = os.getenv("OPENAI_API_KEY")
        else:
            raise FileNotFoundError(f"Key file '{fallback_env_path}' not found and no OPENAI_API_KEY in environment.")
    if not api_key:
        raise ValueError("OPENAI_API_KEY not found in environment variables or key file.")
    openai.api_key = api_key

load_openai_key(key_file)

In [10]:
# Trim texts that exceed OpenAI token (word) limit
trimmed_texts = [OpenAIVectorizer.simple_trim(text) for text in texts]

In [11]:
# Get the OpenAI embeddings for the data [This may take a minute]
embeddings = [OpenAIVectorizer.simple_extract_embedding_only(OpenAIVectorizer.simple_encode(text,key)) for text in trimmed_texts]

0.0


In [12]:
enc_dfs = [pd.DataFrame(np.expand_dims(np.array(x), axis=-1).transpose()) for x in embeddings]

In [13]:
enc_df_oai = pd.concat(enc_dfs, axis=0).reset_index(drop=True)
enc_df_oai

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1526,1527,1528,1529,1530,1531,1532,1533,1534,1535
0,-0.019261,-0.016952,0.023624,-0.002961,0.002761,0.007537,-0.005896,-0.014102,-0.030229,-0.026906,...,-0.00438,0.006615,0.010137,-0.009435,-0.012859,0.006767,-0.024056,-0.008739,-0.011204,-0.009833
1,-0.017212,-0.003617,-0.005733,-0.018562,-0.033541,0.014166,-0.021304,-0.002618,-0.000196,-0.015682,...,0.019251,-0.001674,0.022269,-0.022365,-0.005571,0.0409,-0.00751,-0.013491,0.016219,-0.003461
2,-0.001445,-0.01621,0.008953,-0.028643,-0.001764,-0.00725,0.000591,0.001077,-0.018139,-0.018112,...,0.008188,0.004163,0.011771,-0.017988,-0.025734,0.005862,-0.008194,-0.000928,-0.013033,-0.019104
3,-0.016319,-0.014558,0.00488,-0.014405,-0.013629,0.002244,0.006364,-0.016929,-0.014599,-0.032471,...,0.001787,-0.00443,0.008277,-0.005289,-0.038488,0.004024,-0.013871,0.003379,0.008284,-0.018121
4,-0.02041,-0.020219,0.017922,-0.007854,-0.010643,0.013144,-0.002201,1e-06,-0.002821,-0.033357,...,-0.008906,0.001762,0.016337,-0.033083,-0.015147,0.003862,-0.018907,-0.014559,-0.005971,-0.020506
5,-0.010674,-0.015419,0.013091,-0.028619,-0.005426,0.012591,0.003064,-0.007908,-0.010852,-0.014666,...,-0.003632,-0.020088,0.030892,-0.00556,-0.02762,0.009086,-0.016104,-0.001119,-0.011872,-0.011448
6,-0.003891,-0.013435,0.011368,-0.040208,-0.016094,0.021301,0.004706,-0.007692,-0.039233,-0.022972,...,0.008576,-0.000808,0.022791,-0.02112,-0.035307,0.016428,-0.012572,0.00575,-0.002892,-0.012161
7,-0.010288,0.003179,0.019143,0.003058,-0.029017,0.017101,0.00192,-0.020094,-0.03343,-0.030865,...,-0.000362,0.003851,0.012261,-0.006199,-0.019267,0.001803,0.001558,0.009551,-0.00581,-0.023708
8,-0.011264,-0.015404,-0.006323,-0.014859,-0.010249,-0.017311,-0.011352,-0.014219,-0.006132,-0.029855,...,0.004542,-0.010521,0.012156,-0.018223,-0.01964,0.016208,-0.011318,-0.022268,-0.003427,-0.021083
9,-0.014991,-0.009408,0.001684,-0.011648,-0.020581,0.01106,-0.000691,5e-06,-0.016048,-0.027923,...,-0.011775,0.000601,0.024419,-0.014336,-0.026693,0.019926,-0.011327,-0.012858,0.003561,-0.010986


# 3.3 Generate Predicted Scores

In [14]:
# Function to make OpenAI GPT predictions
def predict_attribute_openai(enc_df_oai):
    predictions = [predict_model(model, data=enc_df_oai)[['prediction_label']] for model in models]
    # rename the predictions output to the attribute names
    for i in range(len(model_names_openai)):
        predictions[i] = predictions[i].rename(columns={'prediction_label': model_names_openai[i]})
    # concatenate the predictions
    predictions = pd.concat(predictions, axis=1)
    return predictions

In [15]:
# Make predictions
predictions = predict_attribute_openai(enc_df_oai)
predictions

________________________________________________________________________________
[Memory] Calling pycaret.internal.pipeline._full_transform...
_full_transform(Pipeline(memory=Memory(location=/home/novel2430/distrobox/arch/Mars-CEO/.joblib_cache/joblib),
         steps=[('numerical_imputer',
                 TransformerWrapper(include=['0', '1', '2', '3', '4', '5', '6',
                                             '7', '8', '9', '10', '11', '12',
                                             '13', '14', '15', '16', '17', '18',
                                             '19', '20', '21', '22', '23', '24',
                                             '25', '26', '27', '28', '29', ...],
                                    transformer=SimpleImputer())),
                ('categorical_imputer',
                 TransformerWrapper(include=[],
                                    transformer=SimpleImputer(strategy='most_frequent')))]), 
           0         1         2         3         4      

Unnamed: 0,openai_extra,openai_openn,openai_consc,openai_agree,openai_neuro,openai_narc,openai_humility
0,4.554,4.776,4.452,4.896,3.054,3.954,4.098
1,4.734,4.422,4.836,3.36,4.104,4.962,3.822
2,4.566,5.28,5.28,5.64,2.79,4.164,3.654
3,4.284,4.170001,4.278,4.428,2.85,3.888,3.276
4,6.42,4.596,4.35,4.824,2.766,5.364,1.89
5,5.208,4.11,5.094,4.524,2.784,4.266,3.948
6,3.66,4.47,4.8,4.29,3.66,3.06,3.84
7,4.164,4.692,5.106,4.956,3.39,4.176,3.516
8,4.47,4.98,5.334,3.696,3.078,5.1,4.08
9,5.154,4.92,5.094,4.176,3.432,4.686,3.864


In [16]:
# Merge predictions with original data (without "fulltext" column)
output_df_oai = pd.concat([df, predictions], axis=1)
output_df_oai

Unnamed: 0,file_name,openai_extra,openai_openn,openai_consc,openai_agree,openai_neuro,openai_narc,openai_humility
0,4.txt,4.554,4.776,4.452,4.896,3.054,3.954,4.098
1,.DS_Store,4.734,4.422,4.836,3.36,4.104,4.962,3.822
2,9.txt,4.566,5.28,5.28,5.64,2.79,4.164,3.654
3,8.txt,4.284,4.170001,4.278,4.428,2.85,3.888,3.276
4,7.txt,6.42,4.596,4.35,4.824,2.766,5.364,1.89
5,3.txt,5.208,4.11,5.094,4.524,2.784,4.266,3.948
6,10.txt,3.66,4.47,4.8,4.29,3.66,3.06,3.84
7,6.txt,4.164,4.692,5.106,4.956,3.39,4.176,3.516
8,5.txt,4.47,4.98,5.334,3.696,3.078,5.1,4.08
9,2.txt,5.154,4.92,5.094,4.176,3.432,4.686,3.864


In [17]:
# Function to save the predicted scores using OpenAI GPT embeddings
def save_output_openai(output_df_oai, output_path, name):
    if os.path.exists(os.path.join(output_path, name+'.csv')):
        i = 1
        while os.path.exists(os.path.join(output_path, name+'_'+str(i)+'.csv')):
            i += 1
        output_df_oai.to_csv(os.path.join(output_path, name+'_'+str(i)+'.csv'), index=False)
    else:
        output_df_oai.to_csv(os.path.join(output_path, name+'.csv'), index=False)

In [18]:
# Save predicted scores for OpenAI GPT models specified
name = input_directory+'_predictions_openai'
save_output_openai(output_df_oai, output_path, name)