<a href="https://colab.research.google.com/github/phukon/notebooks/blob/main/text_embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
!pip install openai pandas numpy

In [30]:
from openai import OpenAI
from secretKey import openai_sk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [31]:
# API client built from the key imported from secretKey.
client = OpenAI(
    api_key=openai_sk,
)

# Training split: the 'resume' and 'role' columns are read by later cells.
df_resume = pd.read_csv(filepath_or_buffer='data/train.csv')

# Generate embeddings

In [32]:
def generate_embeddings(text, openai_sk=None, model="text-embedding-3-large", batch_size=2048):
  """Embed one or more texts with the module-level OpenAI ``client``.

  Args:
    text: A single string, or any iterable of strings (list, tuple,
      pandas Series, generator, ...). Iterables are materialised into a
      plain list before being sent, so the request payload never depends
      on how the client library serialises the container type.
    openai_sk: Unused. Requests go through the module-level ``client``;
      the parameter is kept (now optional) so existing two-argument calls
      keep working.
    model: Name of the embedding model to request.
    batch_size: Maximum number of texts sent per request. The API
      reference documents a cap on array inputs (2048 items at the time
      of writing -- confirm against the current docs), so longer inputs
      are split across several requests.

  Returns:
    A list with one item per input text, in input order. Each item is an
    element of a response's ``data`` and exposes the vector as
    ``.embedding``. An empty input yields an empty list without calling
    the API.

  Raises:
    ValueError: If ``batch_size`` is smaller than 1.
  """
  if batch_size < 1:
    raise ValueError("batch_size must be at least 1")

  # A bare string is a single text, not an iterable of characters.
  inputs = [text] if isinstance(text, str) else list(text)

  # Nothing to embed: skip the request instead of sending an empty array.
  if not inputs:
    return []

  embeddings = []
  for start in range(0, len(inputs), batch_size):
    response = client.embeddings.create(
        input=inputs[start:start + batch_size],
        model=model
    )
    # Batches are requested in order, so extending preserves input order.
    embeddings.extend(response.data)

  return embeddings

In [33]:
# Embed the 'resume' column of the training frame.
text_embeddings = generate_embeddings(df_resume['resume'], openai_sk)

# Pull the .embedding attribute off each returned item, preserving order.
text_embeddings_list = [item.embedding for item in text_embeddings]

# Store embeddings in dataframe

In [34]:
# One column per embedding dimension: embeddings_0, embeddings_1, ...
# (width is taken from the first embedding in the list).
column_names = [f"embeddings_{dim}" for dim in range(len(text_embeddings_list[0]))]

# Each embedding becomes one row of the training frame.
df_train = pd.DataFrame(data=text_embeddings_list, columns=column_names)

# Boolean target, appended as the last column: True where role is 'Data Scientist'.
df_train['is_data_scientist'] = df_resume['role'].eq('Data Scientist')

## Model training

In [35]:
# Predictors are every column but the last; the last column is the target.
X, Y = df_train.iloc[:, :-1], df_train.iloc[:, -1]

# Build the random forest (max depth 2, fixed random_state), then fit it on the training data.
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X, Y)

## Evaluation

In [36]:

# Mean accuracy on the data the model was fitted on.
train_accuracy = clf.score(X, Y)
print(train_accuracy)

# AUC on the training data, scored with the probability column at index 1.
train_probabilities = clf.predict_proba(X)
train_auc = roc_auc_score(Y, train_probabilities[:, 1])
print(train_auc)

1.0
1.0


# Load testing data

In [39]:
# NOTE: from here on df_resume and text_embeddings_list hold the test split,
# replacing the training values assigned in earlier cells.
df_resume = pd.read_csv(filepath_or_buffer='data/test.csv')

# Embed the test resumes and keep only each item's .embedding.
text_embeddings_list = [
    item.embedding
    for item in generate_embeddings(df_resume['resume'], openai_sk)
]

# Same column names as the training frame.
df_test = pd.DataFrame(data=text_embeddings_list, columns=column_names)

# Boolean target, appended as the last column.
df_test['is_data_scientist'] = df_resume['role'].eq('Data Scientist')

# Predictors are every column but the last; the last column is the target.
X_test, Y_test = df_test.iloc[:, :-1], df_test.iloc[:, -1]

# Accuracy first, then AUC from the probability column at index 1.
test_accuracy = clf.score(X_test, Y_test)
print(test_accuracy)
test_auc = roc_auc_score(Y_test, clf.predict_proba(X_test)[:, 1])
print(test_auc)

0.98
0.9983333333333333
